1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 #if defined(PETSC_HAVE_XMMINTRIN_H) 5 #include <xmmintrin.h> 6 #endif 7 8 /* 9 Special case where the matrix was ILU(0) factored in the natural 10 ordering. This eliminates the need for the column and row permutation. 11 */ 12 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 15 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *diag = a->diag; 16 const MatScalar *aa = a->a, *v; 17 PetscScalar *x, s1, s2, x1, x2; 18 const PetscScalar *b; 19 PetscInt jdx, idt, idx, nz, i; 20 21 PetscFunctionBegin; 22 PetscCall(VecGetArrayRead(bb, &b)); 23 PetscCall(VecGetArray(xx, &x)); 24 25 /* forward solve the lower triangular */ 26 idx = 0; 27 x[0] = b[0]; 28 x[1] = b[1]; 29 for (i = 1; i < n; i++) { 30 v = aa + 4 * ai[i]; 31 vi = aj + ai[i]; 32 nz = diag[i] - ai[i]; 33 idx += 2; 34 s1 = b[idx]; 35 s2 = b[1 + idx]; 36 while (nz--) { 37 jdx = 2 * (*vi++); 38 x1 = x[jdx]; 39 x2 = x[1 + jdx]; 40 s1 -= v[0] * x1 + v[2] * x2; 41 s2 -= v[1] * x1 + v[3] * x2; 42 v += 4; 43 } 44 x[idx] = s1; 45 x[1 + idx] = s2; 46 } 47 /* backward solve the upper triangular */ 48 for (i = n - 1; i >= 0; i--) { 49 v = aa + 4 * diag[i] + 4; 50 vi = aj + diag[i] + 1; 51 nz = ai[i + 1] - diag[i] - 1; 52 idt = 2 * i; 53 s1 = x[idt]; 54 s2 = x[1 + idt]; 55 while (nz--) { 56 idx = 2 * (*vi++); 57 x1 = x[idx]; 58 x2 = x[1 + idx]; 59 s1 -= v[0] * x1 + v[2] * x2; 60 s2 -= v[1] * x1 + v[3] * x2; 61 v += 4; 62 } 63 v = aa + 4 * diag[i]; 64 x[idt] = v[0] * s1 + v[2] * s2; 65 x[1 + idt] = v[1] * s1 + v[3] * s2; 66 } 67 68 PetscCall(VecRestoreArrayRead(bb, &b)); 69 PetscCall(VecRestoreArray(xx, &x)); 70 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 71 PetscFunctionReturn(PETSC_SUCCESS); 72 } 73 74 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx) 75 { 76 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 77 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 78 PetscInt i, k, nz, idx, idt, jdx; 79 const MatScalar *aa = a->a, *v; 80 PetscScalar *x, s1, s2, x1, x2; 81 const PetscScalar *b; 82 83 PetscFunctionBegin; 84 PetscCall(VecGetArrayRead(bb, &b)); 85 PetscCall(VecGetArray(xx, &x)); 86 /* forward solve the lower triangular */ 87 idx = 0; 88 x[0] = b[idx]; 89 x[1] = b[1 + idx]; 90 for (i = 1; i < n; i++) { 91 v = aa + 4 * ai[i]; 92 vi = aj + ai[i]; 93 nz = ai[i + 1] - ai[i]; 94 idx = 2 * i; 95 s1 = b[idx]; 96 s2 = b[1 + idx]; 97 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); 98 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); 99 for (k = 0; k < nz; k++) { 100 jdx = 2 * vi[k]; 101 x1 = x[jdx]; 102 x2 = x[1 + jdx]; 103 s1 -= v[0] * x1 + v[2] * x2; 104 s2 -= v[1] * x1 + v[3] * x2; 105 v += 4; 106 } 107 x[idx] = s1; 108 x[1 + idx] = s2; 109 } 110 111 /* backward solve the upper triangular */ 112 for (i = n - 1; i >= 0; i--) { 113 v = aa + 4 * (adiag[i + 1] + 1); 114 vi = aj + adiag[i + 1] + 1; 115 nz = adiag[i] - adiag[i + 1] - 1; 116 idt = 2 * i; 117 s1 = x[idt]; 118 s2 = x[1 + idt]; 119 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); 120 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); 121 for (k = 0; k < nz; k++) { 122 idx = 2 * vi[k]; 123 x1 = x[idx]; 124 x2 = x[1 + idx]; 125 s1 -= v[0] * x1 + v[2] * x2; 126 s2 -= v[1] * x1 + v[3] * x2; 127 v += 4; 128 } 129 /* x = inv_diagonal*x */ 130 x[idt] = v[0] * s1 + v[2] * s2; 131 x[1 + idt] = v[1] * s1 + v[3] * s2; 132 } 133 134 PetscCall(VecRestoreArrayRead(bb, &b)); 135 PetscCall(VecRestoreArray(xx, &x)); 136 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 137 PetscFunctionReturn(PETSC_SUCCESS); 138 } 139 140 PetscErrorCode MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx) 141 { 142 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 143 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 144 PetscInt i, k, nz, idx, jdx; 145 const MatScalar *aa = a->a, *v; 146 PetscScalar *x, s1, s2, x1, x2; 147 const PetscScalar *b; 148 149 PetscFunctionBegin; 150 PetscCall(VecGetArrayRead(bb, &b)); 151 PetscCall(VecGetArray(xx, &x)); 152 /* forward solve the lower triangular */ 153 idx = 0; 154 x[0] = b[idx]; 155 x[1] = b[1 + idx]; 156 for (i = 1; i < n; i++) { 157 v = aa + 4 * ai[i]; 158 vi = aj + ai[i]; 159 nz = ai[i + 1] - ai[i]; 160 idx = 2 * i; 161 s1 = b[idx]; 162 s2 = b[1 + idx]; 163 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); 164 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); 165 for (k = 0; k < nz; k++) { 166 jdx = 2 * vi[k]; 167 x1 = x[jdx]; 168 x2 = x[1 + jdx]; 169 s1 -= v[0] * x1 + v[2] * x2; 170 s2 -= v[1] * x1 + v[3] * x2; 171 v += 4; 172 } 173 x[idx] = s1; 174 x[1 + idx] = s2; 175 } 176 177 PetscCall(VecRestoreArrayRead(bb, &b)); 178 PetscCall(VecRestoreArray(xx, &x)); 179 PetscCall(PetscLogFlops(4.0 * (a->nz) - A->cmap->n)); 180 PetscFunctionReturn(PETSC_SUCCESS); 181 } 182 183 PetscErrorCode MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx) 184 { 185 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 186 const PetscInt n = a->mbs, *vi, *aj = a->j, *adiag = a->diag; 187 PetscInt i, k, nz, idx, idt; 188 const MatScalar *aa = a->a, *v; 189 PetscScalar *x, s1, s2, x1, x2; 190 const PetscScalar *b; 191 192 PetscFunctionBegin; 193 PetscCall(VecGetArrayRead(bb, &b)); 194 PetscCall(VecGetArray(xx, &x)); 195 196 /* backward solve the upper triangular */ 197 for (i = n - 1; i >= 0; i--) { 198 v = aa + 4 * (adiag[i + 1] + 1); 199 vi = aj + adiag[i + 1] + 1; 200 nz = adiag[i] - adiag[i + 1] - 1; 201 idt = 2 * i; 202 s1 = b[idt]; 203 s2 = b[1 + idt]; 204 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); 205 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); 206 for (k = 0; k < nz; k++) { 207 idx = 2 * vi[k]; 208 x1 = x[idx]; 209 x2 = x[1 + idx]; 210 s1 -= v[0] * x1 + v[2] * x2; 211 s2 -= v[1] * x1 + v[3] * x2; 212 v += 4; 213 } 214 /* x = inv_diagonal*x */ 215 x[idt] = v[0] * s1 + v[2] * s2; 216 x[1 + idt] = v[1] * s1 + v[3] * s2; 217 } 218 219 PetscCall(VecRestoreArrayRead(bb, &b)); 220 PetscCall(VecRestoreArray(xx, &x)); 221 PetscCall(PetscLogFlops(4.0 * a->nz - A->cmap->n)); 222 PetscFunctionReturn(PETSC_SUCCESS); 223 } 224