1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx) { 5 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6 IS iscol = a->col, isrow = a->row; 7 const PetscInt *r, *c, *rout, *cout; 8 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *vi; 9 PetscInt i, nz; 10 const PetscInt bs = A->rmap->bs, bs2 = a->bs2; 11 const MatScalar *aa = a->a, *v; 12 PetscScalar *x, *s, *t, *ls; 13 const PetscScalar *b; 14 15 PetscFunctionBegin; 16 PetscCall(VecGetArrayRead(bb, &b)); 17 PetscCall(VecGetArray(xx, &x)); 18 t = a->solve_work; 19 20 PetscCall(ISGetIndices(isrow, &rout)); 21 r = rout; 22 PetscCall(ISGetIndices(iscol, &cout)); 23 c = cout + (n - 1); 24 25 /* forward solve the lower triangular */ 26 PetscCall(PetscArraycpy(t, b + bs * (*r++), bs)); 27 for (i = 1; i < n; i++) { 28 v = aa + bs2 * ai[i]; 29 vi = aj + ai[i]; 30 nz = a->diag[i] - ai[i]; 31 s = t + bs * i; 32 PetscCall(PetscArraycpy(s, b + bs * (*r++), bs)); 33 while (nz--) { 34 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++)); 35 v += bs2; 36 } 37 } 38 /* backward solve the upper triangular */ 39 ls = a->solve_work + A->cmap->n; 40 for (i = n - 1; i >= 0; i--) { 41 v = aa + bs2 * (a->diag[i] + 1); 42 vi = aj + a->diag[i] + 1; 43 nz = ai[i + 1] - a->diag[i] - 1; 44 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 45 while (nz--) { 46 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++)); 47 v += bs2; 48 } 49 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs); 50 PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs)); 51 } 52 53 PetscCall(ISRestoreIndices(isrow, &rout)); 54 PetscCall(ISRestoreIndices(iscol, &cout)); 55 PetscCall(VecRestoreArrayRead(bb, &b)); 56 PetscCall(VecRestoreArray(xx, &x)); 57 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 58 PetscFunctionReturn(0); 59 } 60 61 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx) { 62 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 63 IS iscol = a->col, isrow = a->row; 64 const PetscInt *r, *c, *ai = a->i, *aj = a->j; 65 const PetscInt *rout, *cout, *diag = a->diag, *vi, n = a->mbs; 66 PetscInt i, nz, idx, idt, idc; 67 const MatScalar *aa = a->a, *v; 68 PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 69 const PetscScalar *b; 70 71 PetscFunctionBegin; 72 PetscCall(VecGetArrayRead(bb, &b)); 73 PetscCall(VecGetArray(xx, &x)); 74 t = a->solve_work; 75 76 PetscCall(ISGetIndices(isrow, &rout)); 77 r = rout; 78 PetscCall(ISGetIndices(iscol, &cout)); 79 c = cout + (n - 1); 80 81 /* forward solve the lower triangular */ 82 idx = 7 * (*r++); 83 t[0] = b[idx]; 84 t[1] = b[1 + idx]; 85 t[2] = b[2 + idx]; 86 t[3] = b[3 + idx]; 87 t[4] = b[4 + idx]; 88 t[5] = b[5 + idx]; 89 t[6] = b[6 + idx]; 90 91 for (i = 1; i < n; i++) { 92 v = aa + 49 * ai[i]; 93 vi = aj + ai[i]; 94 nz = diag[i] - ai[i]; 95 idx = 7 * (*r++); 96 s1 = b[idx]; 97 s2 = b[1 + idx]; 98 s3 = b[2 + idx]; 99 s4 = b[3 + idx]; 100 s5 = b[4 + idx]; 101 s6 = b[5 + idx]; 102 s7 = b[6 + idx]; 103 while (nz--) { 104 idx = 7 * (*vi++); 105 x1 = t[idx]; 106 x2 = t[1 + idx]; 107 x3 = t[2 + idx]; 108 x4 = t[3 + idx]; 109 x5 = t[4 + idx]; 110 x6 = t[5 + idx]; 111 x7 = t[6 + idx]; 112 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 113 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 114 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 115 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 116 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 117 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 118 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 119 v += 49; 120 } 121 idx = 7 * i; 122 t[idx] = s1; 123 t[1 + idx] = s2; 124 t[2 + idx] = s3; 125 t[3 + idx] = s4; 126 t[4 + idx] = s5; 127 t[5 + idx] = s6; 128 t[6 + idx] = s7; 129 } 130 /* backward solve the upper triangular */ 131 for (i = n - 1; i >= 0; i--) { 132 v = aa + 49 * diag[i] + 49; 133 vi = aj + diag[i] + 1; 134 nz = ai[i + 1] - diag[i] - 1; 135 idt = 7 * i; 136 s1 = t[idt]; 137 s2 = t[1 + idt]; 138 s3 = t[2 + idt]; 139 s4 = t[3 + idt]; 140 s5 = t[4 + idt]; 141 s6 = t[5 + idt]; 142 s7 = t[6 + idt]; 143 while (nz--) { 144 idx = 7 * (*vi++); 145 x1 = t[idx]; 146 x2 = t[1 + idx]; 147 x3 = t[2 + idx]; 148 x4 = t[3 + idx]; 149 x5 = t[4 + idx]; 150 x6 = t[5 + idx]; 151 x7 = t[6 + idx]; 152 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 153 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 154 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 155 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 156 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 157 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 158 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 159 v += 49; 160 } 161 idc = 7 * (*c--); 162 v = aa + 49 * diag[i]; 163 x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 164 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 165 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 166 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 167 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 168 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 169 x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 170 } 171 172 PetscCall(ISRestoreIndices(isrow, &rout)); 173 PetscCall(ISRestoreIndices(iscol, &cout)); 174 PetscCall(VecRestoreArrayRead(bb, &b)); 175 PetscCall(VecRestoreArray(xx, &x)); 176 PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 177 PetscFunctionReturn(0); 178 } 179 180 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx) { 181 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 182 IS iscol = a->col, isrow = a->row; 183 const PetscInt *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag; 184 const PetscInt n = a->mbs, *rout, *cout, *vi; 185 PetscInt i, nz, idx, idt, idc, m; 186 const MatScalar *aa = a->a, *v; 187 PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 188 const PetscScalar *b; 189 190 PetscFunctionBegin; 191 PetscCall(VecGetArrayRead(bb, &b)); 192 PetscCall(VecGetArray(xx, &x)); 193 t = a->solve_work; 194 195 PetscCall(ISGetIndices(isrow, &rout)); 196 r = rout; 197 PetscCall(ISGetIndices(iscol, &cout)); 198 c = cout; 199 200 /* forward solve the lower triangular */ 201 idx = 7 * r[0]; 202 t[0] = b[idx]; 203 t[1] = b[1 + idx]; 204 t[2] = b[2 + idx]; 205 t[3] = b[3 + idx]; 206 t[4] = b[4 + idx]; 207 t[5] = b[5 + idx]; 208 t[6] = b[6 + idx]; 209 210 for (i = 1; i < n; i++) { 211 v = aa + 49 * ai[i]; 212 vi = aj + ai[i]; 213 nz = ai[i + 1] - ai[i]; 214 idx = 7 * r[i]; 215 s1 = b[idx]; 216 s2 = b[1 + idx]; 217 s3 = b[2 + idx]; 218 s4 = b[3 + idx]; 219 s5 = b[4 + idx]; 220 s6 = b[5 + idx]; 221 s7 = b[6 + idx]; 222 for (m = 0; m < nz; m++) { 223 idx = 7 * vi[m]; 224 x1 = t[idx]; 225 x2 = t[1 + idx]; 226 x3 = t[2 + idx]; 227 x4 = t[3 + idx]; 228 x5 = t[4 + idx]; 229 x6 = t[5 + idx]; 230 x7 = t[6 + idx]; 231 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 232 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 233 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 234 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 235 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 236 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 237 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 238 v += 49; 239 } 240 idx = 7 * i; 241 t[idx] = s1; 242 t[1 + idx] = s2; 243 t[2 + idx] = s3; 244 t[3 + idx] = s4; 245 t[4 + idx] = s5; 246 t[5 + idx] = s6; 247 t[6 + idx] = s7; 248 } 249 /* backward solve the upper triangular */ 250 for (i = n - 1; i >= 0; i--) { 251 v = aa + 49 * (adiag[i + 1] + 1); 252 vi = aj + adiag[i + 1] + 1; 253 nz = adiag[i] - adiag[i + 1] - 1; 254 idt = 7 * i; 255 s1 = t[idt]; 256 s2 = t[1 + idt]; 257 s3 = t[2 + idt]; 258 s4 = t[3 + idt]; 259 s5 = t[4 + idt]; 260 s6 = t[5 + idt]; 261 s7 = t[6 + idt]; 262 for (m = 0; m < nz; m++) { 263 idx = 7 * vi[m]; 264 x1 = t[idx]; 265 x2 = t[1 + idx]; 266 x3 = t[2 + idx]; 267 x4 = t[3 + idx]; 268 x5 = t[4 + idx]; 269 x6 = t[5 + idx]; 270 x7 = t[6 + idx]; 271 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 272 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 273 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 274 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 275 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 276 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 277 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 278 v += 49; 279 } 280 idc = 7 * c[i]; 281 x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 282 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 283 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 284 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 285 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 286 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 287 x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 288 } 289 290 PetscCall(ISRestoreIndices(isrow, &rout)); 291 PetscCall(ISRestoreIndices(iscol, &cout)); 292 PetscCall(VecRestoreArrayRead(bb, &b)); 293 PetscCall(VecRestoreArray(xx, &x)); 294 PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 295 PetscFunctionReturn(0); 296 } 297 298 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx) { 299 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 300 IS iscol = a->col, isrow = a->row; 301 const PetscInt *r, *c, *rout, *cout; 302 const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 303 PetscInt i, nz, idx, idt, idc; 304 const MatScalar *aa = a->a, *v; 305 PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 306 const PetscScalar *b; 307 308 PetscFunctionBegin; 309 PetscCall(VecGetArrayRead(bb, &b)); 310 PetscCall(VecGetArray(xx, &x)); 311 t = a->solve_work; 312 313 PetscCall(ISGetIndices(isrow, &rout)); 314 r = rout; 315 PetscCall(ISGetIndices(iscol, &cout)); 316 c = cout + (n - 1); 317 318 /* forward solve the lower triangular */ 319 idx = 6 * (*r++); 320 t[0] = b[idx]; 321 t[1] = b[1 + idx]; 322 t[2] = b[2 + idx]; 323 t[3] = b[3 + idx]; 324 t[4] = b[4 + idx]; 325 t[5] = b[5 + idx]; 326 for (i = 1; i < n; i++) { 327 v = aa + 36 * ai[i]; 328 vi = aj + ai[i]; 329 nz = diag[i] - ai[i]; 330 idx = 6 * (*r++); 331 s1 = b[idx]; 332 s2 = b[1 + idx]; 333 s3 = b[2 + idx]; 334 s4 = b[3 + idx]; 335 s5 = b[4 + idx]; 336 s6 = b[5 + idx]; 337 while (nz--) { 338 idx = 6 * (*vi++); 339 x1 = t[idx]; 340 x2 = t[1 + idx]; 341 x3 = t[2 + idx]; 342 x4 = t[3 + idx]; 343 x5 = t[4 + idx]; 344 x6 = t[5 + idx]; 345 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 346 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 347 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 348 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 349 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 350 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 351 v += 36; 352 } 353 idx = 6 * i; 354 t[idx] = s1; 355 t[1 + idx] = s2; 356 t[2 + idx] = s3; 357 t[3 + idx] = s4; 358 t[4 + idx] = s5; 359 t[5 + idx] = s6; 360 } 361 /* backward solve the upper triangular */ 362 for (i = n - 1; i >= 0; i--) { 363 v = aa + 36 * diag[i] + 36; 364 vi = aj + diag[i] + 1; 365 nz = ai[i + 1] - diag[i] - 1; 366 idt = 6 * i; 367 s1 = t[idt]; 368 s2 = t[1 + idt]; 369 s3 = t[2 + idt]; 370 s4 = t[3 + idt]; 371 s5 = t[4 + idt]; 372 s6 = t[5 + idt]; 373 while (nz--) { 374 idx = 6 * (*vi++); 375 x1 = t[idx]; 376 x2 = t[1 + idx]; 377 x3 = t[2 + idx]; 378 x4 = t[3 + idx]; 379 x5 = t[4 + idx]; 380 x6 = t[5 + idx]; 381 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 382 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 383 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 384 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 385 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 386 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 387 v += 36; 388 } 389 idc = 6 * (*c--); 390 v = aa + 36 * diag[i]; 391 x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 392 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 393 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 394 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 395 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 396 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 397 } 398 399 PetscCall(ISRestoreIndices(isrow, &rout)); 400 PetscCall(ISRestoreIndices(iscol, &cout)); 401 PetscCall(VecRestoreArrayRead(bb, &b)); 402 PetscCall(VecRestoreArray(xx, &x)); 403 PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 404 PetscFunctionReturn(0); 405 } 406 407 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx) { 408 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 409 IS iscol = a->col, isrow = a->row; 410 const PetscInt *r, *c, *rout, *cout; 411 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 412 PetscInt i, nz, idx, idt, idc, m; 413 const MatScalar *aa = a->a, *v; 414 PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 415 const PetscScalar *b; 416 417 PetscFunctionBegin; 418 PetscCall(VecGetArrayRead(bb, &b)); 419 PetscCall(VecGetArray(xx, &x)); 420 t = a->solve_work; 421 422 PetscCall(ISGetIndices(isrow, &rout)); 423 r = rout; 424 PetscCall(ISGetIndices(iscol, &cout)); 425 c = cout; 426 427 /* forward solve the lower triangular */ 428 idx = 6 * r[0]; 429 t[0] = b[idx]; 430 t[1] = b[1 + idx]; 431 t[2] = b[2 + idx]; 432 t[3] = b[3 + idx]; 433 t[4] = b[4 + idx]; 434 t[5] = b[5 + idx]; 435 for (i = 1; i < n; i++) { 436 v = aa + 36 * ai[i]; 437 vi = aj + ai[i]; 438 nz = ai[i + 1] - ai[i]; 439 idx = 6 * r[i]; 440 s1 = b[idx]; 441 s2 = b[1 + idx]; 442 s3 = b[2 + idx]; 443 s4 = b[3 + idx]; 444 s5 = b[4 + idx]; 445 s6 = b[5 + idx]; 446 for (m = 0; m < nz; m++) { 447 idx = 6 * vi[m]; 448 x1 = t[idx]; 449 x2 = t[1 + idx]; 450 x3 = t[2 + idx]; 451 x4 = t[3 + idx]; 452 x5 = t[4 + idx]; 453 x6 = t[5 + idx]; 454 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 455 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 456 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 457 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 458 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 459 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 460 v += 36; 461 } 462 idx = 6 * i; 463 t[idx] = s1; 464 t[1 + idx] = s2; 465 t[2 + idx] = s3; 466 t[3 + idx] = s4; 467 t[4 + idx] = s5; 468 t[5 + idx] = s6; 469 } 470 /* backward solve the upper triangular */ 471 for (i = n - 1; i >= 0; i--) { 472 v = aa + 36 * (adiag[i + 1] + 1); 473 vi = aj + adiag[i + 1] + 1; 474 nz = adiag[i] - adiag[i + 1] - 1; 475 idt = 6 * i; 476 s1 = t[idt]; 477 s2 = t[1 + idt]; 478 s3 = t[2 + idt]; 479 s4 = t[3 + idt]; 480 s5 = t[4 + idt]; 481 s6 = t[5 + idt]; 482 for (m = 0; m < nz; m++) { 483 idx = 6 * vi[m]; 484 x1 = t[idx]; 485 x2 = t[1 + idx]; 486 x3 = t[2 + idx]; 487 x4 = t[3 + idx]; 488 x5 = t[4 + idx]; 489 x6 = t[5 + idx]; 490 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 491 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 492 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 493 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 494 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 495 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 496 v += 36; 497 } 498 idc = 6 * c[i]; 499 x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 500 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 501 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 502 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 503 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 504 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 505 } 506 507 PetscCall(ISRestoreIndices(isrow, &rout)); 508 PetscCall(ISRestoreIndices(iscol, &cout)); 509 PetscCall(VecRestoreArrayRead(bb, &b)); 510 PetscCall(VecRestoreArray(xx, &x)); 511 PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 512 PetscFunctionReturn(0); 513 } 514 515 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx) { 516 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 517 IS iscol = a->col, isrow = a->row; 518 const PetscInt *r, *c, *rout, *cout, *diag = a->diag; 519 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 520 PetscInt i, nz, idx, idt, idc; 521 const MatScalar *aa = a->a, *v; 522 PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 523 const PetscScalar *b; 524 525 PetscFunctionBegin; 526 PetscCall(VecGetArrayRead(bb, &b)); 527 PetscCall(VecGetArray(xx, &x)); 528 t = a->solve_work; 529 530 PetscCall(ISGetIndices(isrow, &rout)); 531 r = rout; 532 PetscCall(ISGetIndices(iscol, &cout)); 533 c = cout + (n - 1); 534 535 /* forward solve the lower triangular */ 536 idx = 5 * (*r++); 537 t[0] = b[idx]; 538 t[1] = b[1 + idx]; 539 t[2] = b[2 + idx]; 540 t[3] = b[3 + idx]; 541 t[4] = b[4 + idx]; 542 for (i = 1; i < n; i++) { 543 v = aa + 25 * ai[i]; 544 vi = aj + ai[i]; 545 nz = diag[i] - ai[i]; 546 idx = 5 * (*r++); 547 s1 = b[idx]; 548 s2 = b[1 + idx]; 549 s3 = b[2 + idx]; 550 s4 = b[3 + idx]; 551 s5 = b[4 + idx]; 552 while (nz--) { 553 idx = 5 * (*vi++); 554 x1 = t[idx]; 555 x2 = t[1 + idx]; 556 x3 = t[2 + idx]; 557 x4 = t[3 + idx]; 558 x5 = t[4 + idx]; 559 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 560 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 561 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 562 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 563 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 564 v += 25; 565 } 566 idx = 5 * i; 567 t[idx] = s1; 568 t[1 + idx] = s2; 569 t[2 + idx] = s3; 570 t[3 + idx] = s4; 571 t[4 + idx] = s5; 572 } 573 /* backward solve the upper triangular */ 574 for (i = n - 1; i >= 0; i--) { 575 v = aa + 25 * diag[i] + 25; 576 vi = aj + diag[i] + 1; 577 nz = ai[i + 1] - diag[i] - 1; 578 idt = 5 * i; 579 s1 = t[idt]; 580 s2 = t[1 + idt]; 581 s3 = t[2 + idt]; 582 s4 = t[3 + idt]; 583 s5 = t[4 + idt]; 584 while (nz--) { 585 idx = 5 * (*vi++); 586 x1 = t[idx]; 587 x2 = t[1 + idx]; 588 x3 = t[2 + idx]; 589 x4 = t[3 + idx]; 590 x5 = t[4 + idx]; 591 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 592 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 593 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 594 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 595 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 596 v += 25; 597 } 598 idc = 5 * (*c--); 599 v = aa + 25 * diag[i]; 600 x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 601 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 602 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 603 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 604 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 605 } 606 607 PetscCall(ISRestoreIndices(isrow, &rout)); 608 PetscCall(ISRestoreIndices(iscol, &cout)); 609 PetscCall(VecRestoreArrayRead(bb, &b)); 610 PetscCall(VecRestoreArray(xx, &x)); 611 PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 612 PetscFunctionReturn(0); 613 } 614 615 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx) { 616 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 617 IS iscol = a->col, isrow = a->row; 618 const PetscInt *r, *c, *rout, *cout; 619 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 620 PetscInt i, nz, idx, idt, idc, m; 621 const MatScalar *aa = a->a, *v; 622 PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 623 const PetscScalar *b; 624 625 PetscFunctionBegin; 626 PetscCall(VecGetArrayRead(bb, &b)); 627 PetscCall(VecGetArray(xx, &x)); 628 t = a->solve_work; 629 630 PetscCall(ISGetIndices(isrow, &rout)); 631 r = rout; 632 PetscCall(ISGetIndices(iscol, &cout)); 633 c = cout; 634 635 /* forward solve the lower triangular */ 636 idx = 5 * r[0]; 637 t[0] = b[idx]; 638 t[1] = b[1 + idx]; 639 t[2] = b[2 + idx]; 640 t[3] = b[3 + idx]; 641 t[4] = b[4 + idx]; 642 for (i = 1; i < n; i++) { 643 v = aa + 25 * ai[i]; 644 vi = aj + ai[i]; 645 nz = ai[i + 1] - ai[i]; 646 idx = 5 * r[i]; 647 s1 = b[idx]; 648 s2 = b[1 + idx]; 649 s3 = b[2 + idx]; 650 s4 = b[3 + idx]; 651 s5 = b[4 + idx]; 652 for (m = 0; m < nz; m++) { 653 idx = 5 * vi[m]; 654 x1 = t[idx]; 655 x2 = t[1 + idx]; 656 x3 = t[2 + idx]; 657 x4 = t[3 + idx]; 658 x5 = t[4 + idx]; 659 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 660 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 661 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 662 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 663 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 664 v += 25; 665 } 666 idx = 5 * i; 667 t[idx] = s1; 668 t[1 + idx] = s2; 669 t[2 + idx] = s3; 670 t[3 + idx] = s4; 671 t[4 + idx] = s5; 672 } 673 /* backward solve the upper triangular */ 674 for (i = n - 1; i >= 0; i--) { 675 v = aa + 25 * (adiag[i + 1] + 1); 676 vi = aj + adiag[i + 1] + 1; 677 nz = adiag[i] - adiag[i + 1] - 1; 678 idt = 5 * i; 679 s1 = t[idt]; 680 s2 = t[1 + idt]; 681 s3 = t[2 + idt]; 682 s4 = t[3 + idt]; 683 s5 = t[4 + idt]; 684 for (m = 0; m < nz; m++) { 685 idx = 5 * vi[m]; 686 x1 = t[idx]; 687 x2 = t[1 + idx]; 688 x3 = t[2 + idx]; 689 x4 = t[3 + idx]; 690 x5 = t[4 + idx]; 691 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 692 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 693 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 694 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 695 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 696 v += 25; 697 } 698 idc = 5 * c[i]; 699 x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 700 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 701 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 702 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 703 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 704 } 705 706 PetscCall(ISRestoreIndices(isrow, &rout)); 707 PetscCall(ISRestoreIndices(iscol, &cout)); 708 PetscCall(VecRestoreArrayRead(bb, &b)); 709 PetscCall(VecRestoreArray(xx, &x)); 710 PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 711 PetscFunctionReturn(0); 712 } 713 714 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx) { 715 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 716 IS iscol = a->col, isrow = a->row; 717 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 718 PetscInt i, nz, idx, idt, idc; 719 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 720 const MatScalar *aa = a->a, *v; 721 PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 722 const PetscScalar *b; 723 724 PetscFunctionBegin; 725 PetscCall(VecGetArrayRead(bb, &b)); 726 PetscCall(VecGetArray(xx, &x)); 727 t = a->solve_work; 728 729 PetscCall(ISGetIndices(isrow, &rout)); 730 r = rout; 731 PetscCall(ISGetIndices(iscol, &cout)); 732 c = cout + (n - 1); 733 734 /* forward solve the lower triangular */ 735 idx = 4 * (*r++); 736 t[0] = b[idx]; 737 t[1] = b[1 + idx]; 738 t[2] = b[2 + idx]; 739 t[3] = b[3 + idx]; 740 for (i = 1; i < n; i++) { 741 v = aa + 16 * ai[i]; 742 vi = aj + ai[i]; 743 nz = diag[i] - ai[i]; 744 idx = 4 * (*r++); 745 s1 = b[idx]; 746 s2 = b[1 + idx]; 747 s3 = b[2 + idx]; 748 s4 = b[3 + idx]; 749 while (nz--) { 750 idx = 4 * (*vi++); 751 x1 = t[idx]; 752 x2 = t[1 + idx]; 753 x3 = t[2 + idx]; 754 x4 = t[3 + idx]; 755 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 756 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 757 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 758 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 759 v += 16; 760 } 761 idx = 4 * i; 762 t[idx] = s1; 763 t[1 + idx] = s2; 764 t[2 + idx] = s3; 765 t[3 + idx] = s4; 766 } 767 /* backward solve the upper triangular */ 768 for (i = n - 1; i >= 0; i--) { 769 v = aa + 16 * diag[i] + 16; 770 vi = aj + diag[i] + 1; 771 nz = ai[i + 1] - diag[i] - 1; 772 idt = 4 * i; 773 s1 = t[idt]; 774 s2 = t[1 + idt]; 775 s3 = t[2 + idt]; 776 s4 = t[3 + idt]; 777 while (nz--) { 778 idx = 4 * (*vi++); 779 x1 = t[idx]; 780 x2 = t[1 + idx]; 781 x3 = t[2 + idx]; 782 x4 = t[3 + idx]; 783 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 784 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 785 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 786 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 787 v += 16; 788 } 789 idc = 4 * (*c--); 790 v = aa + 16 * diag[i]; 791 x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 792 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 793 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 794 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 795 } 796 797 PetscCall(ISRestoreIndices(isrow, &rout)); 798 PetscCall(ISRestoreIndices(iscol, &cout)); 799 PetscCall(VecRestoreArrayRead(bb, &b)); 800 PetscCall(VecRestoreArray(xx, &x)); 801 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 802 PetscFunctionReturn(0); 803 } 804 805 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx) { 806 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 807 IS iscol = a->col, isrow = a->row; 808 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 809 PetscInt i, nz, idx, idt, idc, m; 810 const PetscInt *r, *c, *rout, *cout; 811 const MatScalar *aa = a->a, *v; 812 PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 813 const PetscScalar *b; 814 815 PetscFunctionBegin; 816 PetscCall(VecGetArrayRead(bb, &b)); 817 PetscCall(VecGetArray(xx, &x)); 818 t = a->solve_work; 819 820 PetscCall(ISGetIndices(isrow, &rout)); 821 r = rout; 822 PetscCall(ISGetIndices(iscol, &cout)); 823 c = cout; 824 825 /* forward solve the lower triangular */ 826 idx = 4 * r[0]; 827 t[0] = b[idx]; 828 t[1] = b[1 + idx]; 829 t[2] = b[2 + idx]; 830 t[3] = b[3 + idx]; 831 for (i = 1; i < n; i++) { 832 v = aa + 16 * ai[i]; 833 vi = aj + ai[i]; 834 nz = ai[i + 1] - ai[i]; 835 idx = 4 * r[i]; 836 s1 = b[idx]; 837 s2 = b[1 + idx]; 838 s3 = b[2 + idx]; 839 s4 = b[3 + idx]; 840 for (m = 0; m < nz; m++) { 841 idx = 4 * vi[m]; 842 x1 = t[idx]; 843 x2 = t[1 + idx]; 844 x3 = t[2 + idx]; 845 x4 = t[3 + idx]; 846 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 847 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 848 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 849 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 850 v += 16; 851 } 852 idx = 4 * i; 853 t[idx] = s1; 854 t[1 + idx] = s2; 855 t[2 + idx] = s3; 856 t[3 + idx] = s4; 857 } 858 /* backward solve the upper triangular */ 859 for (i = n - 1; i >= 0; i--) { 860 v = aa + 16 * (adiag[i + 1] + 1); 861 vi = aj + adiag[i + 1] + 1; 862 nz = adiag[i] - adiag[i + 1] - 1; 863 idt = 4 * i; 864 s1 = t[idt]; 865 s2 = t[1 + idt]; 866 s3 = t[2 + idt]; 867 s4 = t[3 + idt]; 868 for (m = 0; m < nz; m++) { 869 idx = 4 * vi[m]; 870 x1 = t[idx]; 871 x2 = t[1 + idx]; 872 x3 = t[2 + idx]; 873 x4 = t[3 + idx]; 874 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 875 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 876 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 877 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 878 v += 16; 879 } 880 idc = 4 * c[i]; 881 x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 882 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 883 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 884 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 885 } 886 887 PetscCall(ISRestoreIndices(isrow, &rout)); 888 PetscCall(ISRestoreIndices(iscol, &cout)); 889 PetscCall(VecRestoreArrayRead(bb, &b)); 890 PetscCall(VecRestoreArray(xx, &x)); 891 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 892 PetscFunctionReturn(0); 893 } 894 895 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx) { 896 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 897 IS iscol = a->col, isrow = a->row; 898 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 899 PetscInt i, nz, idx, idt, idc; 900 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 901 const MatScalar *aa = a->a, *v; 902 MatScalar s1, s2, s3, s4, x1, x2, x3, x4, *t; 903 PetscScalar *x; 904 const PetscScalar *b; 905 906 PetscFunctionBegin; 907 PetscCall(VecGetArrayRead(bb, &b)); 908 PetscCall(VecGetArray(xx, &x)); 909 t = (MatScalar *)a->solve_work; 910 911 PetscCall(ISGetIndices(isrow, &rout)); 912 r = rout; 913 PetscCall(ISGetIndices(iscol, &cout)); 914 c = cout + (n - 1); 915 916 /* forward solve the lower triangular */ 917 idx = 4 * (*r++); 918 t[0] = (MatScalar)b[idx]; 919 t[1] = (MatScalar)b[1 + idx]; 920 t[2] = (MatScalar)b[2 + idx]; 921 t[3] = (MatScalar)b[3 + idx]; 922 for (i = 1; i < n; i++) { 923 v = aa + 16 * ai[i]; 924 vi = aj + ai[i]; 925 nz = diag[i] - ai[i]; 926 idx = 4 * (*r++); 927 s1 = (MatScalar)b[idx]; 928 s2 = (MatScalar)b[1 + idx]; 929 s3 = (MatScalar)b[2 + idx]; 930 s4 = (MatScalar)b[3 + idx]; 931 while (nz--) { 932 idx = 4 * (*vi++); 933 x1 = t[idx]; 934 x2 = t[1 + idx]; 935 x3 = t[2 + idx]; 936 x4 = t[3 + idx]; 937 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 938 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 939 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 940 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 941 v += 16; 942 } 943 idx = 4 * i; 944 t[idx] = s1; 945 t[1 + idx] = s2; 946 t[2 + idx] = s3; 947 t[3 + idx] = s4; 948 } 949 /* backward solve the upper triangular */ 950 for (i = n - 1; i >= 0; i--) { 951 v = aa + 16 * diag[i] + 16; 952 vi = aj + diag[i] + 1; 953 nz = ai[i + 1] - diag[i] - 1; 954 idt = 4 * i; 955 s1 = t[idt]; 956 s2 = t[1 + idt]; 957 s3 = t[2 + idt]; 958 s4 = t[3 + idt]; 959 while (nz--) { 960 idx = 4 * (*vi++); 961 x1 = t[idx]; 962 x2 = t[1 + idx]; 963 x3 = t[2 + idx]; 964 x4 = t[3 + idx]; 965 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 966 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 967 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 968 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 969 v += 16; 970 } 971 idc = 4 * (*c--); 972 v = aa + 16 * diag[i]; 973 t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 974 t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 975 t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 976 t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 977 x[idc] = (PetscScalar)t[idt]; 978 x[1 + idc] = (PetscScalar)t[1 + idt]; 979 x[2 + idc] = (PetscScalar)t[2 + idt]; 980 x[3 + idc] = (PetscScalar)t[3 + idt]; 981 } 982 983 PetscCall(ISRestoreIndices(isrow, &rout)); 984 PetscCall(ISRestoreIndices(iscol, &cout)); 985 PetscCall(VecRestoreArrayRead(bb, &b)); 986 PetscCall(VecRestoreArray(xx, &x)); 987 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 988 PetscFunctionReturn(0); 989 } 990 991 #if defined(PETSC_HAVE_SSE) 992 993 #include PETSC_HAVE_SSE 994 995 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx) { 996 /* 997 Note: This code uses demotion of double 998 to float when performing the mixed-mode computation. 999 This may not be numerically reasonable for all applications. 1000 */ 1001 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1002 IS iscol = a->col, isrow = a->row; 1003 PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16; 1004 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1005 MatScalar *aa = a->a, *v; 1006 PetscScalar *x, *b, *t; 1007 1008 /* Make space in temp stack for 16 Byte Aligned arrays */ 1009 float ssealignedspace[11], *tmps, *tmpx; 1010 unsigned long offset; 1011 1012 PetscFunctionBegin; 1013 SSE_SCOPE_BEGIN; 1014 1015 offset = (unsigned long)ssealignedspace % 16; 1016 if (offset) offset = (16 - offset) / 4; 1017 tmps = &ssealignedspace[offset]; 1018 tmpx = &ssealignedspace[offset + 4]; 1019 PREFETCH_NTA(aa + 16 * ai[1]); 1020 1021 PetscCall(VecGetArray(bb, &b)); 1022 PetscCall(VecGetArray(xx, &x)); 1023 t = a->solve_work; 1024 1025 PetscCall(ISGetIndices(isrow, &rout)); 1026 r = rout; 1027 PetscCall(ISGetIndices(iscol, &cout)); 1028 c = cout + (n - 1); 1029 1030 /* forward solve the lower triangular */ 1031 idx = 4 * (*r++); 1032 t[0] = b[idx]; 1033 t[1] = b[1 + idx]; 1034 t[2] = b[2 + idx]; 1035 t[3] = b[3 + idx]; 1036 v = aa + 16 * ai[1]; 1037 1038 for (i = 1; i < n;) { 1039 PREFETCH_NTA(&v[8]); 1040 vi = aj + ai[i]; 1041 nz = diag[i] - ai[i]; 1042 idx = 4 * (*r++); 1043 1044 /* Demote sum from double to float */ 1045 CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]); 1046 LOAD_PS(tmps, XMM7); 1047 1048 while (nz--) { 1049 PREFETCH_NTA(&v[16]); 1050 idx = 4 * (*vi++); 1051 1052 /* Demote solution (so far) from double to float */ 1053 CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]); 1054 1055 /* 4x4 Matrix-Vector product with negative accumulation: */ 1056 SSE_INLINE_BEGIN_2(tmpx, v) 1057 SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 1058 1059 /* First Column */ 1060 SSE_COPY_PS(XMM0, XMM6) 1061 SSE_SHUFFLE(XMM0, XMM0, 0x00) 1062 SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 1063 SSE_SUB_PS(XMM7, XMM0) 1064 1065 /* Second Column */ 1066 SSE_COPY_PS(XMM1, XMM6) 1067 SSE_SHUFFLE(XMM1, XMM1, 0x55) 1068 SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 1069 SSE_SUB_PS(XMM7, XMM1) 1070 1071 SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 1072 1073 /* Third Column */ 1074 SSE_COPY_PS(XMM2, XMM6) 1075 SSE_SHUFFLE(XMM2, XMM2, 0xAA) 1076 SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 1077 SSE_SUB_PS(XMM7, XMM2) 1078 1079 /* Fourth Column */ 1080 SSE_COPY_PS(XMM3, XMM6) 1081 SSE_SHUFFLE(XMM3, XMM3, 0xFF) 1082 SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 1083 SSE_SUB_PS(XMM7, XMM3) 1084 SSE_INLINE_END_2 1085 1086 v += 16; 1087 } 1088 idx = 4 * i; 1089 v = aa + 16 * ai[++i]; 1090 PREFETCH_NTA(v); 1091 STORE_PS(tmps, XMM7); 1092 1093 /* Promote result from float to double */ 1094 CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps); 1095 } 1096 /* backward solve the upper triangular */ 1097 idt = 4 * (n - 1); 1098 ai16 = 16 * diag[n - 1]; 1099 v = aa + ai16 + 16; 1100 for (i = n - 1; i >= 0;) { 1101 PREFETCH_NTA(&v[8]); 1102 vi = aj + diag[i] + 1; 1103 nz = ai[i + 1] - diag[i] - 1; 1104 1105 /* Demote accumulator from double to float */ 1106 CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]); 1107 LOAD_PS(tmps, XMM7); 1108 1109 while (nz--) { 1110 PREFETCH_NTA(&v[16]); 1111 idx = 4 * (*vi++); 1112 1113 /* Demote solution (so far) from double to float */ 1114 CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]); 1115 1116 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1117 SSE_INLINE_BEGIN_2(tmpx, v) 1118 SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 1119 1120 /* First Column */ 1121 SSE_COPY_PS(XMM0, XMM6) 1122 SSE_SHUFFLE(XMM0, XMM0, 0x00) 1123 SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 1124 SSE_SUB_PS(XMM7, XMM0) 1125 1126 /* Second Column */ 1127 SSE_COPY_PS(XMM1, XMM6) 1128 SSE_SHUFFLE(XMM1, XMM1, 0x55) 1129 SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 1130 SSE_SUB_PS(XMM7, XMM1) 1131 1132 SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 1133 1134 /* Third Column */ 1135 SSE_COPY_PS(XMM2, XMM6) 1136 SSE_SHUFFLE(XMM2, XMM2, 0xAA) 1137 SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 1138 SSE_SUB_PS(XMM7, XMM2) 1139 1140 /* Fourth Column */ 1141 SSE_COPY_PS(XMM3, XMM6) 1142 SSE_SHUFFLE(XMM3, XMM3, 0xFF) 1143 SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 1144 SSE_SUB_PS(XMM7, XMM3) 1145 SSE_INLINE_END_2 1146 v += 16; 1147 } 1148 v = aa + ai16; 1149 ai16 = 16 * diag[--i]; 1150 PREFETCH_NTA(aa + ai16 + 16); 1151 /* 1152 Scale the result by the diagonal 4x4 block, 1153 which was inverted as part of the factorization 1154 */ 1155 SSE_INLINE_BEGIN_3(v, tmps, aa + ai16) 1156 /* First Column */ 1157 SSE_COPY_PS(XMM0, XMM7) 1158 SSE_SHUFFLE(XMM0, XMM0, 0x00) 1159 SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0) 1160 1161 /* Second Column */ 1162 SSE_COPY_PS(XMM1, XMM7) 1163 SSE_SHUFFLE(XMM1, XMM1, 0x55) 1164 SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4) 1165 SSE_ADD_PS(XMM0, XMM1) 1166 1167 SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24) 1168 1169 /* Third Column */ 1170 SSE_COPY_PS(XMM2, XMM7) 1171 SSE_SHUFFLE(XMM2, XMM2, 0xAA) 1172 SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8) 1173 SSE_ADD_PS(XMM0, XMM2) 1174 1175 /* Fourth Column */ 1176 SSE_COPY_PS(XMM3, XMM7) 1177 SSE_SHUFFLE(XMM3, XMM3, 0xFF) 1178 SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12) 1179 SSE_ADD_PS(XMM0, XMM3) 1180 1181 SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0) 1182 SSE_INLINE_END_3 1183 1184 /* Promote solution from float to double */ 1185 CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps); 1186 1187 /* Apply reordering to t and stream into x. */ 1188 /* This way, x doesn't pollute the cache. */ 1189 /* Be careful with size: 2 doubles = 4 floats! */ 1190 idc = 4 * (*c--); 1191 SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc]) 1192 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 1193 SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0) 1194 SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0) 1195 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 1196 SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1) 1197 SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1) 1198 SSE_INLINE_END_2 1199 v = aa + ai16 + 16; 1200 idt -= 4; 1201 } 1202 1203 PetscCall(ISRestoreIndices(isrow, &rout)); 1204 PetscCall(ISRestoreIndices(iscol, &cout)); 1205 PetscCall(VecRestoreArray(bb, &b)); 1206 PetscCall(VecRestoreArray(xx, &x)); 1207 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 1208 SSE_SCOPE_END; 1209 PetscFunctionReturn(0); 1210 } 1211 1212 #endif 1213 1214 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx) { 1215 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1216 IS iscol = a->col, isrow = a->row; 1217 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 1218 PetscInt i, nz, idx, idt, idc; 1219 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1220 const MatScalar *aa = a->a, *v; 1221 PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 1222 const PetscScalar *b; 1223 1224 PetscFunctionBegin; 1225 PetscCall(VecGetArrayRead(bb, &b)); 1226 PetscCall(VecGetArray(xx, &x)); 1227 t = a->solve_work; 1228 1229 PetscCall(ISGetIndices(isrow, &rout)); 1230 r = rout; 1231 PetscCall(ISGetIndices(iscol, &cout)); 1232 c = cout + (n - 1); 1233 1234 /* forward solve the lower triangular */ 1235 idx = 3 * (*r++); 1236 t[0] = b[idx]; 1237 t[1] = b[1 + idx]; 1238 t[2] = b[2 + idx]; 1239 for (i = 1; i < n; i++) { 1240 v = aa + 9 * ai[i]; 1241 vi = aj + ai[i]; 1242 nz = diag[i] - ai[i]; 1243 idx = 3 * (*r++); 1244 s1 = b[idx]; 1245 s2 = b[1 + idx]; 1246 s3 = b[2 + idx]; 1247 while (nz--) { 1248 idx = 3 * (*vi++); 1249 x1 = t[idx]; 1250 x2 = t[1 + idx]; 1251 x3 = t[2 + idx]; 1252 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1253 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1254 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1255 v += 9; 1256 } 1257 idx = 3 * i; 1258 t[idx] = s1; 1259 t[1 + idx] = s2; 1260 t[2 + idx] = s3; 1261 } 1262 /* backward solve the upper triangular */ 1263 for (i = n - 1; i >= 0; i--) { 1264 v = aa + 9 * diag[i] + 9; 1265 vi = aj + diag[i] + 1; 1266 nz = ai[i + 1] - diag[i] - 1; 1267 idt = 3 * i; 1268 s1 = t[idt]; 1269 s2 = t[1 + idt]; 1270 s3 = t[2 + idt]; 1271 while (nz--) { 1272 idx = 3 * (*vi++); 1273 x1 = t[idx]; 1274 x2 = t[1 + idx]; 1275 x3 = t[2 + idx]; 1276 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1277 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1278 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1279 v += 9; 1280 } 1281 idc = 3 * (*c--); 1282 v = aa + 9 * diag[i]; 1283 x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 1284 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 1285 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 1286 } 1287 PetscCall(ISRestoreIndices(isrow, &rout)); 1288 PetscCall(ISRestoreIndices(iscol, &cout)); 1289 PetscCall(VecRestoreArrayRead(bb, &b)); 1290 PetscCall(VecRestoreArray(xx, &x)); 1291 PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 1292 PetscFunctionReturn(0); 1293 } 1294 1295 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx) { 1296 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1297 IS iscol = a->col, isrow = a->row; 1298 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 1299 PetscInt i, nz, idx, idt, idc, m; 1300 const PetscInt *r, *c, *rout, *cout; 1301 const MatScalar *aa = a->a, *v; 1302 PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 1303 const PetscScalar *b; 1304 1305 PetscFunctionBegin; 1306 PetscCall(VecGetArrayRead(bb, &b)); 1307 PetscCall(VecGetArray(xx, &x)); 1308 t = a->solve_work; 1309 1310 PetscCall(ISGetIndices(isrow, &rout)); 1311 r = rout; 1312 PetscCall(ISGetIndices(iscol, &cout)); 1313 c = cout; 1314 1315 /* forward solve the lower triangular */ 1316 idx = 3 * r[0]; 1317 t[0] = b[idx]; 1318 t[1] = b[1 + idx]; 1319 t[2] = b[2 + idx]; 1320 for (i = 1; i < n; i++) { 1321 v = aa + 9 * ai[i]; 1322 vi = aj + ai[i]; 1323 nz = ai[i + 1] - ai[i]; 1324 idx = 3 * r[i]; 1325 s1 = b[idx]; 1326 s2 = b[1 + idx]; 1327 s3 = b[2 + idx]; 1328 for (m = 0; m < nz; m++) { 1329 idx = 3 * vi[m]; 1330 x1 = t[idx]; 1331 x2 = t[1 + idx]; 1332 x3 = t[2 + idx]; 1333 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1334 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1335 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1336 v += 9; 1337 } 1338 idx = 3 * i; 1339 t[idx] = s1; 1340 t[1 + idx] = s2; 1341 t[2 + idx] = s3; 1342 } 1343 /* backward solve the upper triangular */ 1344 for (i = n - 1; i >= 0; i--) { 1345 v = aa + 9 * (adiag[i + 1] + 1); 1346 vi = aj + adiag[i + 1] + 1; 1347 nz = adiag[i] - adiag[i + 1] - 1; 1348 idt = 3 * i; 1349 s1 = t[idt]; 1350 s2 = t[1 + idt]; 1351 s3 = t[2 + idt]; 1352 for (m = 0; m < nz; m++) { 1353 idx = 3 * vi[m]; 1354 x1 = t[idx]; 1355 x2 = t[1 + idx]; 1356 x3 = t[2 + idx]; 1357 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1358 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1359 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1360 v += 9; 1361 } 1362 idc = 3 * c[i]; 1363 x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 1364 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 1365 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 1366 } 1367 PetscCall(ISRestoreIndices(isrow, &rout)); 1368 PetscCall(ISRestoreIndices(iscol, &cout)); 1369 PetscCall(VecRestoreArrayRead(bb, &b)); 1370 PetscCall(VecRestoreArray(xx, &x)); 1371 PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 1372 PetscFunctionReturn(0); 1373 } 1374 1375 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx) { 1376 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1377 IS iscol = a->col, isrow = a->row; 1378 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 1379 PetscInt i, nz, idx, idt, idc; 1380 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1381 const MatScalar *aa = a->a, *v; 1382 PetscScalar *x, s1, s2, x1, x2, *t; 1383 const PetscScalar *b; 1384 1385 PetscFunctionBegin; 1386 PetscCall(VecGetArrayRead(bb, &b)); 1387 PetscCall(VecGetArray(xx, &x)); 1388 t = a->solve_work; 1389 1390 PetscCall(ISGetIndices(isrow, &rout)); 1391 r = rout; 1392 PetscCall(ISGetIndices(iscol, &cout)); 1393 c = cout + (n - 1); 1394 1395 /* forward solve the lower triangular */ 1396 idx = 2 * (*r++); 1397 t[0] = b[idx]; 1398 t[1] = b[1 + idx]; 1399 for (i = 1; i < n; i++) { 1400 v = aa + 4 * ai[i]; 1401 vi = aj + ai[i]; 1402 nz = diag[i] - ai[i]; 1403 idx = 2 * (*r++); 1404 s1 = b[idx]; 1405 s2 = b[1 + idx]; 1406 while (nz--) { 1407 idx = 2 * (*vi++); 1408 x1 = t[idx]; 1409 x2 = t[1 + idx]; 1410 s1 -= v[0] * x1 + v[2] * x2; 1411 s2 -= v[1] * x1 + v[3] * x2; 1412 v += 4; 1413 } 1414 idx = 2 * i; 1415 t[idx] = s1; 1416 t[1 + idx] = s2; 1417 } 1418 /* backward solve the upper triangular */ 1419 for (i = n - 1; i >= 0; i--) { 1420 v = aa + 4 * diag[i] + 4; 1421 vi = aj + diag[i] + 1; 1422 nz = ai[i + 1] - diag[i] - 1; 1423 idt = 2 * i; 1424 s1 = t[idt]; 1425 s2 = t[1 + idt]; 1426 while (nz--) { 1427 idx = 2 * (*vi++); 1428 x1 = t[idx]; 1429 x2 = t[1 + idx]; 1430 s1 -= v[0] * x1 + v[2] * x2; 1431 s2 -= v[1] * x1 + v[3] * x2; 1432 v += 4; 1433 } 1434 idc = 2 * (*c--); 1435 v = aa + 4 * diag[i]; 1436 x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 1437 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 1438 } 1439 PetscCall(ISRestoreIndices(isrow, &rout)); 1440 PetscCall(ISRestoreIndices(iscol, &cout)); 1441 PetscCall(VecRestoreArrayRead(bb, &b)); 1442 PetscCall(VecRestoreArray(xx, &x)); 1443 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 1444 PetscFunctionReturn(0); 1445 } 1446 1447 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx) { 1448 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1449 IS iscol = a->col, isrow = a->row; 1450 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 1451 PetscInt i, nz, idx, jdx, idt, idc, m; 1452 const PetscInt *r, *c, *rout, *cout; 1453 const MatScalar *aa = a->a, *v; 1454 PetscScalar *x, s1, s2, x1, x2, *t; 1455 const PetscScalar *b; 1456 1457 PetscFunctionBegin; 1458 PetscCall(VecGetArrayRead(bb, &b)); 1459 PetscCall(VecGetArray(xx, &x)); 1460 t = a->solve_work; 1461 1462 PetscCall(ISGetIndices(isrow, &rout)); 1463 r = rout; 1464 PetscCall(ISGetIndices(iscol, &cout)); 1465 c = cout; 1466 1467 /* forward solve the lower triangular */ 1468 idx = 2 * r[0]; 1469 t[0] = b[idx]; 1470 t[1] = b[1 + idx]; 1471 for (i = 1; i < n; i++) { 1472 v = aa + 4 * ai[i]; 1473 vi = aj + ai[i]; 1474 nz = ai[i + 1] - ai[i]; 1475 idx = 2 * r[i]; 1476 s1 = b[idx]; 1477 s2 = b[1 + idx]; 1478 for (m = 0; m < nz; m++) { 1479 jdx = 2 * vi[m]; 1480 x1 = t[jdx]; 1481 x2 = t[1 + jdx]; 1482 s1 -= v[0] * x1 + v[2] * x2; 1483 s2 -= v[1] * x1 + v[3] * x2; 1484 v += 4; 1485 } 1486 idx = 2 * i; 1487 t[idx] = s1; 1488 t[1 + idx] = s2; 1489 } 1490 /* backward solve the upper triangular */ 1491 for (i = n - 1; i >= 0; i--) { 1492 v = aa + 4 * (adiag[i + 1] + 1); 1493 vi = aj + adiag[i + 1] + 1; 1494 nz = adiag[i] - adiag[i + 1] - 1; 1495 idt = 2 * i; 1496 s1 = t[idt]; 1497 s2 = t[1 + idt]; 1498 for (m = 0; m < nz; m++) { 1499 idx = 2 * vi[m]; 1500 x1 = t[idx]; 1501 x2 = t[1 + idx]; 1502 s1 -= v[0] * x1 + v[2] * x2; 1503 s2 -= v[1] * x1 + v[3] * x2; 1504 v += 4; 1505 } 1506 idc = 2 * c[i]; 1507 x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 1508 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 1509 } 1510 PetscCall(ISRestoreIndices(isrow, &rout)); 1511 PetscCall(ISRestoreIndices(iscol, &cout)); 1512 PetscCall(VecRestoreArrayRead(bb, &b)); 1513 PetscCall(VecRestoreArray(xx, &x)); 1514 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 1515 PetscFunctionReturn(0); 1516 } 1517 1518 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx) { 1519 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1520 IS iscol = a->col, isrow = a->row; 1521 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 1522 PetscInt i, nz; 1523 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1524 const MatScalar *aa = a->a, *v; 1525 PetscScalar *x, s1, *t; 1526 const PetscScalar *b; 1527 1528 PetscFunctionBegin; 1529 if (!n) PetscFunctionReturn(0); 1530 1531 PetscCall(VecGetArrayRead(bb, &b)); 1532 PetscCall(VecGetArray(xx, &x)); 1533 t = a->solve_work; 1534 1535 PetscCall(ISGetIndices(isrow, &rout)); 1536 r = rout; 1537 PetscCall(ISGetIndices(iscol, &cout)); 1538 c = cout + (n - 1); 1539 1540 /* forward solve the lower triangular */ 1541 t[0] = b[*r++]; 1542 for (i = 1; i < n; i++) { 1543 v = aa + ai[i]; 1544 vi = aj + ai[i]; 1545 nz = diag[i] - ai[i]; 1546 s1 = b[*r++]; 1547 while (nz--) { s1 -= (*v++) * t[*vi++]; } 1548 t[i] = s1; 1549 } 1550 /* backward solve the upper triangular */ 1551 for (i = n - 1; i >= 0; i--) { 1552 v = aa + diag[i] + 1; 1553 vi = aj + diag[i] + 1; 1554 nz = ai[i + 1] - diag[i] - 1; 1555 s1 = t[i]; 1556 while (nz--) { s1 -= (*v++) * t[*vi++]; } 1557 x[*c--] = t[i] = aa[diag[i]] * s1; 1558 } 1559 1560 PetscCall(ISRestoreIndices(isrow, &rout)); 1561 PetscCall(ISRestoreIndices(iscol, &cout)); 1562 PetscCall(VecRestoreArrayRead(bb, &b)); 1563 PetscCall(VecRestoreArray(xx, &x)); 1564 PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n)); 1565 PetscFunctionReturn(0); 1566 } 1567 1568 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx) { 1569 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1570 IS iscol = a->col, isrow = a->row; 1571 PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz; 1572 const PetscInt *rout, *cout, *r, *c; 1573 PetscScalar *x, *tmp, sum; 1574 const PetscScalar *b; 1575 const MatScalar *aa = a->a, *v; 1576 1577 PetscFunctionBegin; 1578 if (!n) PetscFunctionReturn(0); 1579 1580 PetscCall(VecGetArrayRead(bb, &b)); 1581 PetscCall(VecGetArray(xx, &x)); 1582 tmp = a->solve_work; 1583 1584 PetscCall(ISGetIndices(isrow, &rout)); 1585 r = rout; 1586 PetscCall(ISGetIndices(iscol, &cout)); 1587 c = cout; 1588 1589 /* forward solve the lower triangular */ 1590 tmp[0] = b[r[0]]; 1591 v = aa; 1592 vi = aj; 1593 for (i = 1; i < n; i++) { 1594 nz = ai[i + 1] - ai[i]; 1595 sum = b[r[i]]; 1596 PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 1597 tmp[i] = sum; 1598 v += nz; 1599 vi += nz; 1600 } 1601 1602 /* backward solve the upper triangular */ 1603 for (i = n - 1; i >= 0; i--) { 1604 v = aa + adiag[i + 1] + 1; 1605 vi = aj + adiag[i + 1] + 1; 1606 nz = adiag[i] - adiag[i + 1] - 1; 1607 sum = tmp[i]; 1608 PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 1609 x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */ 1610 } 1611 1612 PetscCall(ISRestoreIndices(isrow, &rout)); 1613 PetscCall(ISRestoreIndices(iscol, &cout)); 1614 PetscCall(VecRestoreArrayRead(bb, &b)); 1615 PetscCall(VecRestoreArray(xx, &x)); 1616 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1617 PetscFunctionReturn(0); 1618 } 1619