1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 /* 5 Special case where the matrix was ILU(0) factored in the natural 6 ordering. This eliminates the need for the column and row permutation. 7 */ 8 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) { 9 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 10 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *diag = a->diag; 11 const MatScalar *aa = a->a, *v; 12 PetscScalar *x, s1, s2, x1, x2; 13 const PetscScalar *b; 14 PetscInt jdx, idt, idx, nz, i; 15 16 PetscFunctionBegin; 17 PetscCall(VecGetArrayRead(bb, &b)); 18 PetscCall(VecGetArray(xx, &x)); 19 20 /* forward solve the lower triangular */ 21 idx = 0; 22 x[0] = b[0]; 23 x[1] = b[1]; 24 for (i = 1; i < n; i++) { 25 v = aa + 4 * ai[i]; 26 vi = aj + ai[i]; 27 nz = diag[i] - ai[i]; 28 idx += 2; 29 s1 = b[idx]; 30 s2 = b[1 + idx]; 31 while (nz--) { 32 jdx = 2 * (*vi++); 33 x1 = x[jdx]; 34 x2 = x[1 + jdx]; 35 s1 -= v[0] * x1 + v[2] * x2; 36 s2 -= v[1] * x1 + v[3] * x2; 37 v += 4; 38 } 39 x[idx] = s1; 40 x[1 + idx] = s2; 41 } 42 /* backward solve the upper triangular */ 43 for (i = n - 1; i >= 0; i--) { 44 v = aa + 4 * diag[i] + 4; 45 vi = aj + diag[i] + 1; 46 nz = ai[i + 1] - diag[i] - 1; 47 idt = 2 * i; 48 s1 = x[idt]; 49 s2 = x[1 + idt]; 50 while (nz--) { 51 idx = 2 * (*vi++); 52 x1 = x[idx]; 53 x2 = x[1 + idx]; 54 s1 -= v[0] * x1 + v[2] * x2; 55 s2 -= v[1] * x1 + v[3] * x2; 56 v += 4; 57 } 58 v = aa + 4 * diag[i]; 59 x[idt] = v[0] * s1 + v[2] * s2; 60 x[1 + idt] = v[1] * s1 + v[3] * s2; 61 } 62 63 PetscCall(VecRestoreArrayRead(bb, &b)); 64 PetscCall(VecRestoreArray(xx, &x)); 65 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 66 PetscFunctionReturn(0); 67 } 68 69 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx) { 70 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 71 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 72 PetscInt i, k, nz, idx, idt, jdx; 73 const MatScalar *aa = a->a, *v; 74 PetscScalar *x, s1, s2, x1, x2; 75 const PetscScalar *b; 76 77 PetscFunctionBegin; 78 PetscCall(VecGetArrayRead(bb, &b)); 79 PetscCall(VecGetArray(xx, &x)); 80 /* forward solve the lower triangular */ 81 idx = 0; 82 x[0] = b[idx]; 83 x[1] = b[1 + idx]; 84 for (i = 1; i < n; i++) { 85 v = aa + 4 * ai[i]; 86 vi = aj + ai[i]; 87 nz = ai[i + 1] - ai[i]; 88 idx = 2 * i; 89 s1 = b[idx]; 90 s2 = b[1 + idx]; 91 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); 92 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); 93 for (k = 0; k < nz; k++) { 94 jdx = 2 * vi[k]; 95 x1 = x[jdx]; 96 x2 = x[1 + jdx]; 97 s1 -= v[0] * x1 + v[2] * x2; 98 s2 -= v[1] * x1 + v[3] * x2; 99 v += 4; 100 } 101 x[idx] = s1; 102 x[1 + idx] = s2; 103 } 104 105 /* backward solve the upper triangular */ 106 for (i = n - 1; i >= 0; i--) { 107 v = aa + 4 * (adiag[i + 1] + 1); 108 vi = aj + adiag[i + 1] + 1; 109 nz = adiag[i] - adiag[i + 1] - 1; 110 idt = 2 * i; 111 s1 = x[idt]; 112 s2 = x[1 + idt]; 113 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); 114 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); 115 for (k = 0; k < nz; k++) { 116 idx = 2 * vi[k]; 117 x1 = x[idx]; 118 x2 = x[1 + idx]; 119 s1 -= v[0] * x1 + v[2] * x2; 120 s2 -= v[1] * x1 + v[3] * x2; 121 v += 4; 122 } 123 /* x = inv_diagonal*x */ 124 x[idt] = v[0] * s1 + v[2] * s2; 125 x[1 + idt] = v[1] * s1 + v[3] * s2; 126 } 127 128 PetscCall(VecRestoreArrayRead(bb, &b)); 129 PetscCall(VecRestoreArray(xx, &x)); 130 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 131 PetscFunctionReturn(0); 132 } 133 134 PetscErrorCode MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx) { 135 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 136 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 137 PetscInt i, k, nz, idx, jdx; 138 const MatScalar *aa = a->a, *v; 139 PetscScalar *x, s1, s2, x1, x2; 140 const PetscScalar *b; 141 142 PetscFunctionBegin; 143 PetscCall(VecGetArrayRead(bb, &b)); 144 PetscCall(VecGetArray(xx, &x)); 145 /* forward solve the lower triangular */ 146 idx = 0; 147 x[0] = b[idx]; 148 x[1] = b[1 + idx]; 149 for (i = 1; i < n; i++) { 150 v = aa + 4 * ai[i]; 151 vi = aj + ai[i]; 152 nz = ai[i + 1] - ai[i]; 153 idx = 2 * i; 154 s1 = b[idx]; 155 s2 = b[1 + idx]; 156 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); 157 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); 158 for (k = 0; k < nz; k++) { 159 jdx = 2 * vi[k]; 160 x1 = x[jdx]; 161 x2 = x[1 + jdx]; 162 s1 -= v[0] * x1 + v[2] * x2; 163 s2 -= v[1] * x1 + v[3] * x2; 164 v += 4; 165 } 166 x[idx] = s1; 167 x[1 + idx] = s2; 168 } 169 170 PetscCall(VecRestoreArrayRead(bb, &b)); 171 PetscCall(VecRestoreArray(xx, &x)); 172 PetscCall(PetscLogFlops(4.0 * (a->nz) - A->cmap->n)); 173 PetscFunctionReturn(0); 174 } 175 176 PetscErrorCode MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx) { 177 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 178 const PetscInt n = a->mbs, *vi, *aj = a->j, *adiag = a->diag; 179 PetscInt i, k, nz, idx, idt; 180 const MatScalar *aa = a->a, *v; 181 PetscScalar *x, s1, s2, x1, x2; 182 const PetscScalar *b; 183 184 PetscFunctionBegin; 185 PetscCall(VecGetArrayRead(bb, &b)); 186 PetscCall(VecGetArray(xx, &x)); 187 188 /* backward solve the upper triangular */ 189 for (i = n - 1; i >= 0; i--) { 190 v = aa + 4 * (adiag[i + 1] + 1); 191 vi = aj + adiag[i + 1] + 1; 192 nz = adiag[i] - adiag[i + 1] - 1; 193 idt = 2 * i; 194 s1 = b[idt]; 195 s2 = b[1 + idt]; 196 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); 197 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); 198 for (k = 0; k < nz; k++) { 199 idx = 2 * vi[k]; 200 x1 = x[idx]; 201 x2 = x[1 + idx]; 202 s1 -= v[0] * x1 + v[2] * x2; 203 s2 -= v[1] * x1 + v[3] * x2; 204 v += 4; 205 } 206 /* x = inv_diagonal*x */ 207 x[idt] = v[0] * s1 + v[2] * s2; 208 x[1 + idt] = v[1] * s1 + v[3] * s2; 209 } 210 211 PetscCall(VecRestoreArrayRead(bb, &b)); 212 PetscCall(VecRestoreArray(xx, &x)); 213 PetscCall(PetscLogFlops(4.0 * a->nz - A->cmap->n)); 214 PetscFunctionReturn(0); 215 } 216