1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 5 { 6 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 7 IS iscol = a->col, isrow = a->row; 8 const PetscInt *r, *c, *rout, *cout; 9 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *vi; 10 PetscInt i, nz; 11 const PetscInt bs = A->rmap->bs, bs2 = a->bs2; 12 const MatScalar *aa = a->a, *v; 13 PetscScalar *x, *s, *t, *ls; 14 const PetscScalar *b; 15 16 PetscFunctionBegin; 17 PetscCall(VecGetArrayRead(bb, &b)); 18 PetscCall(VecGetArray(xx, &x)); 19 t = a->solve_work; 20 21 PetscCall(ISGetIndices(isrow, &rout)); 22 r = rout; 23 PetscCall(ISGetIndices(iscol, &cout)); 24 c = cout + (n - 1); 25 26 /* forward solve the lower triangular */ 27 PetscCall(PetscArraycpy(t, b + bs * (*r++), bs)); 28 for (i = 1; i < n; i++) { 29 v = aa + bs2 * ai[i]; 30 vi = aj + ai[i]; 31 nz = a->diag[i] - ai[i]; 32 s = t + bs * i; 33 PetscCall(PetscArraycpy(s, b + bs * (*r++), bs)); 34 while (nz--) { 35 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++)); 36 v += bs2; 37 } 38 } 39 /* backward solve the upper triangular */ 40 ls = a->solve_work + A->cmap->n; 41 for (i = n - 1; i >= 0; i--) { 42 v = aa + bs2 * (a->diag[i] + 1); 43 vi = aj + a->diag[i] + 1; 44 nz = ai[i + 1] - a->diag[i] - 1; 45 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 46 while (nz--) { 47 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++)); 48 v += bs2; 49 } 50 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs); 51 PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs)); 52 } 53 54 PetscCall(ISRestoreIndices(isrow, &rout)); 55 PetscCall(ISRestoreIndices(iscol, &cout)); 56 PetscCall(VecRestoreArrayRead(bb, &b)); 57 PetscCall(VecRestoreArray(xx, &x)); 58 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 59 PetscFunctionReturn(PETSC_SUCCESS); 60 } 61 62 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx) 63 { 64 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 65 IS iscol = a->col, isrow = a->row; 66 const PetscInt *r, *c, *ai = a->i, *aj = a->j; 67 const PetscInt *rout, *cout, *diag = a->diag, *vi, n = a->mbs; 68 PetscInt i, nz, idx, idt, idc; 69 const MatScalar *aa = a->a, *v; 70 PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 71 const PetscScalar *b; 72 73 PetscFunctionBegin; 74 PetscCall(VecGetArrayRead(bb, &b)); 75 PetscCall(VecGetArray(xx, &x)); 76 t = a->solve_work; 77 78 PetscCall(ISGetIndices(isrow, &rout)); 79 r = rout; 80 PetscCall(ISGetIndices(iscol, &cout)); 81 c = cout + (n - 1); 82 83 /* forward solve the lower triangular */ 84 idx = 7 * (*r++); 85 t[0] = b[idx]; 86 t[1] = b[1 + idx]; 87 t[2] = b[2 + idx]; 88 t[3] = b[3 + idx]; 89 t[4] = b[4 + idx]; 90 t[5] = b[5 + idx]; 91 t[6] = b[6 + idx]; 92 93 for (i = 1; i < n; i++) { 94 v = aa + 49 * ai[i]; 95 vi = aj + ai[i]; 96 nz = diag[i] - ai[i]; 97 idx = 7 * (*r++); 98 s1 = b[idx]; 99 s2 = b[1 + idx]; 100 s3 = b[2 + idx]; 101 s4 = b[3 + idx]; 102 s5 = b[4 + idx]; 103 s6 = b[5 + idx]; 104 s7 = b[6 + idx]; 105 while (nz--) { 106 idx = 7 * (*vi++); 107 x1 = t[idx]; 108 x2 = t[1 + idx]; 109 x3 = t[2 + idx]; 110 x4 = t[3 + idx]; 111 x5 = t[4 + idx]; 112 x6 = t[5 + idx]; 113 x7 = t[6 + idx]; 114 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 115 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 116 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 117 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 118 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 119 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 120 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 121 v += 49; 122 } 123 idx = 7 * i; 124 t[idx] = s1; 125 t[1 + idx] = s2; 126 t[2 + idx] = s3; 127 t[3 + idx] = s4; 128 t[4 + idx] = s5; 129 t[5 + idx] = s6; 130 t[6 + idx] = s7; 131 } 132 /* backward solve the upper triangular */ 133 for (i = n - 1; i >= 0; i--) { 134 v = aa + 49 * diag[i] + 49; 135 vi = aj + diag[i] + 1; 136 nz = ai[i + 1] - diag[i] - 1; 137 idt = 7 * i; 138 s1 = t[idt]; 139 s2 = t[1 + idt]; 140 s3 = t[2 + idt]; 141 s4 = t[3 + idt]; 142 s5 = t[4 + idt]; 143 s6 = t[5 + idt]; 144 s7 = t[6 + idt]; 145 while (nz--) { 146 idx = 7 * (*vi++); 147 x1 = t[idx]; 148 x2 = t[1 + idx]; 149 x3 = t[2 + idx]; 150 x4 = t[3 + idx]; 151 x5 = t[4 + idx]; 152 x6 = t[5 + idx]; 153 x7 = t[6 + idx]; 154 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 155 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 156 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 157 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 158 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 159 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 160 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 161 v += 49; 162 } 163 idc = 7 * (*c--); 164 v = aa + 49 * diag[i]; 165 x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 166 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 167 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 168 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 169 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 170 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 171 x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 172 } 173 174 PetscCall(ISRestoreIndices(isrow, &rout)); 175 PetscCall(ISRestoreIndices(iscol, &cout)); 176 PetscCall(VecRestoreArrayRead(bb, &b)); 177 PetscCall(VecRestoreArray(xx, &x)); 178 PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 179 PetscFunctionReturn(PETSC_SUCCESS); 180 } 181 182 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx) 183 { 184 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 185 IS iscol = a->col, isrow = a->row; 186 const PetscInt *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag; 187 const PetscInt n = a->mbs, *rout, *cout, *vi; 188 PetscInt i, nz, idx, idt, idc, m; 189 const MatScalar *aa = a->a, *v; 190 PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 191 const PetscScalar *b; 192 193 PetscFunctionBegin; 194 PetscCall(VecGetArrayRead(bb, &b)); 195 PetscCall(VecGetArray(xx, &x)); 196 t = a->solve_work; 197 198 PetscCall(ISGetIndices(isrow, &rout)); 199 r = rout; 200 PetscCall(ISGetIndices(iscol, &cout)); 201 c = cout; 202 203 /* forward solve the lower triangular */ 204 idx = 7 * r[0]; 205 t[0] = b[idx]; 206 t[1] = b[1 + idx]; 207 t[2] = b[2 + idx]; 208 t[3] = b[3 + idx]; 209 t[4] = b[4 + idx]; 210 t[5] = b[5 + idx]; 211 t[6] = b[6 + idx]; 212 213 for (i = 1; i < n; i++) { 214 v = aa + 49 * ai[i]; 215 vi = aj + ai[i]; 216 nz = ai[i + 1] - ai[i]; 217 idx = 7 * r[i]; 218 s1 = b[idx]; 219 s2 = b[1 + idx]; 220 s3 = b[2 + idx]; 221 s4 = b[3 + idx]; 222 s5 = b[4 + idx]; 223 s6 = b[5 + idx]; 224 s7 = b[6 + idx]; 225 for (m = 0; m < nz; m++) { 226 idx = 7 * vi[m]; 227 x1 = t[idx]; 228 x2 = t[1 + idx]; 229 x3 = t[2 + idx]; 230 x4 = t[3 + idx]; 231 x5 = t[4 + idx]; 232 x6 = t[5 + idx]; 233 x7 = t[6 + idx]; 234 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 235 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 236 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 237 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 238 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 239 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 240 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 241 v += 49; 242 } 243 idx = 7 * i; 244 t[idx] = s1; 245 t[1 + idx] = s2; 246 t[2 + idx] = s3; 247 t[3 + idx] = s4; 248 t[4 + idx] = s5; 249 t[5 + idx] = s6; 250 t[6 + idx] = s7; 251 } 252 /* backward solve the upper triangular */ 253 for (i = n - 1; i >= 0; i--) { 254 v = aa + 49 * (adiag[i + 1] + 1); 255 vi = aj + adiag[i + 1] + 1; 256 nz = adiag[i] - adiag[i + 1] - 1; 257 idt = 7 * i; 258 s1 = t[idt]; 259 s2 = t[1 + idt]; 260 s3 = t[2 + idt]; 261 s4 = t[3 + idt]; 262 s5 = t[4 + idt]; 263 s6 = t[5 + idt]; 264 s7 = t[6 + idt]; 265 for (m = 0; m < nz; m++) { 266 idx = 7 * vi[m]; 267 x1 = t[idx]; 268 x2 = t[1 + idx]; 269 x3 = t[2 + idx]; 270 x4 = t[3 + idx]; 271 x5 = t[4 + idx]; 272 x6 = t[5 + idx]; 273 x7 = t[6 + idx]; 274 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 275 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 276 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 277 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 278 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 279 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 280 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 281 v += 49; 282 } 283 idc = 7 * c[i]; 284 x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 285 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 286 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 287 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 288 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 289 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 290 x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 291 } 292 293 PetscCall(ISRestoreIndices(isrow, &rout)); 294 PetscCall(ISRestoreIndices(iscol, &cout)); 295 PetscCall(VecRestoreArrayRead(bb, &b)); 296 PetscCall(VecRestoreArray(xx, &x)); 297 PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 298 PetscFunctionReturn(PETSC_SUCCESS); 299 } 300 301 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx) 302 { 303 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 304 IS iscol = a->col, isrow = a->row; 305 const PetscInt *r, *c, *rout, *cout; 306 const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 307 PetscInt i, nz, idx, idt, idc; 308 const MatScalar *aa = a->a, *v; 309 PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 310 const PetscScalar *b; 311 312 PetscFunctionBegin; 313 PetscCall(VecGetArrayRead(bb, &b)); 314 PetscCall(VecGetArray(xx, &x)); 315 t = a->solve_work; 316 317 PetscCall(ISGetIndices(isrow, &rout)); 318 r = rout; 319 PetscCall(ISGetIndices(iscol, &cout)); 320 c = cout + (n - 1); 321 322 /* forward solve the lower triangular */ 323 idx = 6 * (*r++); 324 t[0] = b[idx]; 325 t[1] = b[1 + idx]; 326 t[2] = b[2 + idx]; 327 t[3] = b[3 + idx]; 328 t[4] = b[4 + idx]; 329 t[5] = b[5 + idx]; 330 for (i = 1; i < n; i++) { 331 v = aa + 36 * ai[i]; 332 vi = aj + ai[i]; 333 nz = diag[i] - ai[i]; 334 idx = 6 * (*r++); 335 s1 = b[idx]; 336 s2 = b[1 + idx]; 337 s3 = b[2 + idx]; 338 s4 = b[3 + idx]; 339 s5 = b[4 + idx]; 340 s6 = b[5 + idx]; 341 while (nz--) { 342 idx = 6 * (*vi++); 343 x1 = t[idx]; 344 x2 = t[1 + idx]; 345 x3 = t[2 + idx]; 346 x4 = t[3 + idx]; 347 x5 = t[4 + idx]; 348 x6 = t[5 + idx]; 349 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 350 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 351 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 352 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 353 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 354 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 355 v += 36; 356 } 357 idx = 6 * i; 358 t[idx] = s1; 359 t[1 + idx] = s2; 360 t[2 + idx] = s3; 361 t[3 + idx] = s4; 362 t[4 + idx] = s5; 363 t[5 + idx] = s6; 364 } 365 /* backward solve the upper triangular */ 366 for (i = n - 1; i >= 0; i--) { 367 v = aa + 36 * diag[i] + 36; 368 vi = aj + diag[i] + 1; 369 nz = ai[i + 1] - diag[i] - 1; 370 idt = 6 * i; 371 s1 = t[idt]; 372 s2 = t[1 + idt]; 373 s3 = t[2 + idt]; 374 s4 = t[3 + idt]; 375 s5 = t[4 + idt]; 376 s6 = t[5 + idt]; 377 while (nz--) { 378 idx = 6 * (*vi++); 379 x1 = t[idx]; 380 x2 = t[1 + idx]; 381 x3 = t[2 + idx]; 382 x4 = t[3 + idx]; 383 x5 = t[4 + idx]; 384 x6 = t[5 + idx]; 385 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 386 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 387 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 388 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 389 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 390 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 391 v += 36; 392 } 393 idc = 6 * (*c--); 394 v = aa + 36 * diag[i]; 395 x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 396 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 397 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 398 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 399 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 400 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 401 } 402 403 PetscCall(ISRestoreIndices(isrow, &rout)); 404 PetscCall(ISRestoreIndices(iscol, &cout)); 405 PetscCall(VecRestoreArrayRead(bb, &b)); 406 PetscCall(VecRestoreArray(xx, &x)); 407 PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 408 PetscFunctionReturn(PETSC_SUCCESS); 409 } 410 411 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx) 412 { 413 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 414 IS iscol = a->col, isrow = a->row; 415 const PetscInt *r, *c, *rout, *cout; 416 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 417 PetscInt i, nz, idx, idt, idc, m; 418 const MatScalar *aa = a->a, *v; 419 PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 420 const PetscScalar *b; 421 422 PetscFunctionBegin; 423 PetscCall(VecGetArrayRead(bb, &b)); 424 PetscCall(VecGetArray(xx, &x)); 425 t = a->solve_work; 426 427 PetscCall(ISGetIndices(isrow, &rout)); 428 r = rout; 429 PetscCall(ISGetIndices(iscol, &cout)); 430 c = cout; 431 432 /* forward solve the lower triangular */ 433 idx = 6 * r[0]; 434 t[0] = b[idx]; 435 t[1] = b[1 + idx]; 436 t[2] = b[2 + idx]; 437 t[3] = b[3 + idx]; 438 t[4] = b[4 + idx]; 439 t[5] = b[5 + idx]; 440 for (i = 1; i < n; i++) { 441 v = aa + 36 * ai[i]; 442 vi = aj + ai[i]; 443 nz = ai[i + 1] - ai[i]; 444 idx = 6 * r[i]; 445 s1 = b[idx]; 446 s2 = b[1 + idx]; 447 s3 = b[2 + idx]; 448 s4 = b[3 + idx]; 449 s5 = b[4 + idx]; 450 s6 = b[5 + idx]; 451 for (m = 0; m < nz; m++) { 452 idx = 6 * vi[m]; 453 x1 = t[idx]; 454 x2 = t[1 + idx]; 455 x3 = t[2 + idx]; 456 x4 = t[3 + idx]; 457 x5 = t[4 + idx]; 458 x6 = t[5 + idx]; 459 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 460 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 461 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 462 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 463 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 464 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 465 v += 36; 466 } 467 idx = 6 * i; 468 t[idx] = s1; 469 t[1 + idx] = s2; 470 t[2 + idx] = s3; 471 t[3 + idx] = s4; 472 t[4 + idx] = s5; 473 t[5 + idx] = s6; 474 } 475 /* backward solve the upper triangular */ 476 for (i = n - 1; i >= 0; i--) { 477 v = aa + 36 * (adiag[i + 1] + 1); 478 vi = aj + adiag[i + 1] + 1; 479 nz = adiag[i] - adiag[i + 1] - 1; 480 idt = 6 * i; 481 s1 = t[idt]; 482 s2 = t[1 + idt]; 483 s3 = t[2 + idt]; 484 s4 = t[3 + idt]; 485 s5 = t[4 + idt]; 486 s6 = t[5 + idt]; 487 for (m = 0; m < nz; m++) { 488 idx = 6 * vi[m]; 489 x1 = t[idx]; 490 x2 = t[1 + idx]; 491 x3 = t[2 + idx]; 492 x4 = t[3 + idx]; 493 x5 = t[4 + idx]; 494 x6 = t[5 + idx]; 495 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 496 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 497 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 498 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 499 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 500 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 501 v += 36; 502 } 503 idc = 6 * c[i]; 504 x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 505 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 506 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 507 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 508 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 509 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 510 } 511 512 PetscCall(ISRestoreIndices(isrow, &rout)); 513 PetscCall(ISRestoreIndices(iscol, &cout)); 514 PetscCall(VecRestoreArrayRead(bb, &b)); 515 PetscCall(VecRestoreArray(xx, &x)); 516 PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 517 PetscFunctionReturn(PETSC_SUCCESS); 518 } 519 520 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx) 521 { 522 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 523 IS iscol = a->col, isrow = a->row; 524 const PetscInt *r, *c, *rout, *cout, *diag = a->diag; 525 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 526 PetscInt i, nz, idx, idt, idc; 527 const MatScalar *aa = a->a, *v; 528 PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 529 const PetscScalar *b; 530 531 PetscFunctionBegin; 532 PetscCall(VecGetArrayRead(bb, &b)); 533 PetscCall(VecGetArray(xx, &x)); 534 t = a->solve_work; 535 536 PetscCall(ISGetIndices(isrow, &rout)); 537 r = rout; 538 PetscCall(ISGetIndices(iscol, &cout)); 539 c = cout + (n - 1); 540 541 /* forward solve the lower triangular */ 542 idx = 5 * (*r++); 543 t[0] = b[idx]; 544 t[1] = b[1 + idx]; 545 t[2] = b[2 + idx]; 546 t[3] = b[3 + idx]; 547 t[4] = b[4 + idx]; 548 for (i = 1; i < n; i++) { 549 v = aa + 25 * ai[i]; 550 vi = aj + ai[i]; 551 nz = diag[i] - ai[i]; 552 idx = 5 * (*r++); 553 s1 = b[idx]; 554 s2 = b[1 + idx]; 555 s3 = b[2 + idx]; 556 s4 = b[3 + idx]; 557 s5 = b[4 + idx]; 558 while (nz--) { 559 idx = 5 * (*vi++); 560 x1 = t[idx]; 561 x2 = t[1 + idx]; 562 x3 = t[2 + idx]; 563 x4 = t[3 + idx]; 564 x5 = t[4 + idx]; 565 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 566 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 567 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 568 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 569 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 570 v += 25; 571 } 572 idx = 5 * i; 573 t[idx] = s1; 574 t[1 + idx] = s2; 575 t[2 + idx] = s3; 576 t[3 + idx] = s4; 577 t[4 + idx] = s5; 578 } 579 /* backward solve the upper triangular */ 580 for (i = n - 1; i >= 0; i--) { 581 v = aa + 25 * diag[i] + 25; 582 vi = aj + diag[i] + 1; 583 nz = ai[i + 1] - diag[i] - 1; 584 idt = 5 * i; 585 s1 = t[idt]; 586 s2 = t[1 + idt]; 587 s3 = t[2 + idt]; 588 s4 = t[3 + idt]; 589 s5 = t[4 + idt]; 590 while (nz--) { 591 idx = 5 * (*vi++); 592 x1 = t[idx]; 593 x2 = t[1 + idx]; 594 x3 = t[2 + idx]; 595 x4 = t[3 + idx]; 596 x5 = t[4 + idx]; 597 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 598 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 599 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 600 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 601 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 602 v += 25; 603 } 604 idc = 5 * (*c--); 605 v = aa + 25 * diag[i]; 606 x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 607 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 608 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 609 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 610 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 611 } 612 613 PetscCall(ISRestoreIndices(isrow, &rout)); 614 PetscCall(ISRestoreIndices(iscol, &cout)); 615 PetscCall(VecRestoreArrayRead(bb, &b)); 616 PetscCall(VecRestoreArray(xx, &x)); 617 PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 618 PetscFunctionReturn(PETSC_SUCCESS); 619 } 620 621 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx) 622 { 623 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 624 IS iscol = a->col, isrow = a->row; 625 const PetscInt *r, *c, *rout, *cout; 626 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 627 PetscInt i, nz, idx, idt, idc, m; 628 const MatScalar *aa = a->a, *v; 629 PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 630 const PetscScalar *b; 631 632 PetscFunctionBegin; 633 PetscCall(VecGetArrayRead(bb, &b)); 634 PetscCall(VecGetArray(xx, &x)); 635 t = a->solve_work; 636 637 PetscCall(ISGetIndices(isrow, &rout)); 638 r = rout; 639 PetscCall(ISGetIndices(iscol, &cout)); 640 c = cout; 641 642 /* forward solve the lower triangular */ 643 idx = 5 * r[0]; 644 t[0] = b[idx]; 645 t[1] = b[1 + idx]; 646 t[2] = b[2 + idx]; 647 t[3] = b[3 + idx]; 648 t[4] = b[4 + idx]; 649 for (i = 1; i < n; i++) { 650 v = aa + 25 * ai[i]; 651 vi = aj + ai[i]; 652 nz = ai[i + 1] - ai[i]; 653 idx = 5 * r[i]; 654 s1 = b[idx]; 655 s2 = b[1 + idx]; 656 s3 = b[2 + idx]; 657 s4 = b[3 + idx]; 658 s5 = b[4 + idx]; 659 for (m = 0; m < nz; m++) { 660 idx = 5 * vi[m]; 661 x1 = t[idx]; 662 x2 = t[1 + idx]; 663 x3 = t[2 + idx]; 664 x4 = t[3 + idx]; 665 x5 = t[4 + idx]; 666 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 667 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 668 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 669 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 670 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 671 v += 25; 672 } 673 idx = 5 * i; 674 t[idx] = s1; 675 t[1 + idx] = s2; 676 t[2 + idx] = s3; 677 t[3 + idx] = s4; 678 t[4 + idx] = s5; 679 } 680 /* backward solve the upper triangular */ 681 for (i = n - 1; i >= 0; i--) { 682 v = aa + 25 * (adiag[i + 1] + 1); 683 vi = aj + adiag[i + 1] + 1; 684 nz = adiag[i] - adiag[i + 1] - 1; 685 idt = 5 * i; 686 s1 = t[idt]; 687 s2 = t[1 + idt]; 688 s3 = t[2 + idt]; 689 s4 = t[3 + idt]; 690 s5 = t[4 + idt]; 691 for (m = 0; m < nz; m++) { 692 idx = 5 * vi[m]; 693 x1 = t[idx]; 694 x2 = t[1 + idx]; 695 x3 = t[2 + idx]; 696 x4 = t[3 + idx]; 697 x5 = t[4 + idx]; 698 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 699 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 700 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 701 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 702 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 703 v += 25; 704 } 705 idc = 5 * c[i]; 706 x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 707 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 708 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 709 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 710 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 711 } 712 713 PetscCall(ISRestoreIndices(isrow, &rout)); 714 PetscCall(ISRestoreIndices(iscol, &cout)); 715 PetscCall(VecRestoreArrayRead(bb, &b)); 716 PetscCall(VecRestoreArray(xx, &x)); 717 PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 718 PetscFunctionReturn(PETSC_SUCCESS); 719 } 720 721 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx) 722 { 723 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 724 IS iscol = a->col, isrow = a->row; 725 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 726 PetscInt i, nz, idx, idt, idc; 727 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 728 const MatScalar *aa = a->a, *v; 729 PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 730 const PetscScalar *b; 731 732 PetscFunctionBegin; 733 PetscCall(VecGetArrayRead(bb, &b)); 734 PetscCall(VecGetArray(xx, &x)); 735 t = a->solve_work; 736 737 PetscCall(ISGetIndices(isrow, &rout)); 738 r = rout; 739 PetscCall(ISGetIndices(iscol, &cout)); 740 c = cout + (n - 1); 741 742 /* forward solve the lower triangular */ 743 idx = 4 * (*r++); 744 t[0] = b[idx]; 745 t[1] = b[1 + idx]; 746 t[2] = b[2 + idx]; 747 t[3] = b[3 + idx]; 748 for (i = 1; i < n; i++) { 749 v = aa + 16 * ai[i]; 750 vi = aj + ai[i]; 751 nz = diag[i] - ai[i]; 752 idx = 4 * (*r++); 753 s1 = b[idx]; 754 s2 = b[1 + idx]; 755 s3 = b[2 + idx]; 756 s4 = b[3 + idx]; 757 while (nz--) { 758 idx = 4 * (*vi++); 759 x1 = t[idx]; 760 x2 = t[1 + idx]; 761 x3 = t[2 + idx]; 762 x4 = t[3 + idx]; 763 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 764 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 765 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 766 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 767 v += 16; 768 } 769 idx = 4 * i; 770 t[idx] = s1; 771 t[1 + idx] = s2; 772 t[2 + idx] = s3; 773 t[3 + idx] = s4; 774 } 775 /* backward solve the upper triangular */ 776 for (i = n - 1; i >= 0; i--) { 777 v = aa + 16 * diag[i] + 16; 778 vi = aj + diag[i] + 1; 779 nz = ai[i + 1] - diag[i] - 1; 780 idt = 4 * i; 781 s1 = t[idt]; 782 s2 = t[1 + idt]; 783 s3 = t[2 + idt]; 784 s4 = t[3 + idt]; 785 while (nz--) { 786 idx = 4 * (*vi++); 787 x1 = t[idx]; 788 x2 = t[1 + idx]; 789 x3 = t[2 + idx]; 790 x4 = t[3 + idx]; 791 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 792 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 793 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 794 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 795 v += 16; 796 } 797 idc = 4 * (*c--); 798 v = aa + 16 * diag[i]; 799 x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 800 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 801 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 802 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 803 } 804 805 PetscCall(ISRestoreIndices(isrow, &rout)); 806 PetscCall(ISRestoreIndices(iscol, &cout)); 807 PetscCall(VecRestoreArrayRead(bb, &b)); 808 PetscCall(VecRestoreArray(xx, &x)); 809 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 810 PetscFunctionReturn(PETSC_SUCCESS); 811 } 812 813 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx) 814 { 815 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 816 IS iscol = a->col, isrow = a->row; 817 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 818 PetscInt i, nz, idx, idt, idc, m; 819 const PetscInt *r, *c, *rout, *cout; 820 const MatScalar *aa = a->a, *v; 821 PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 822 const PetscScalar *b; 823 824 PetscFunctionBegin; 825 PetscCall(VecGetArrayRead(bb, &b)); 826 PetscCall(VecGetArray(xx, &x)); 827 t = a->solve_work; 828 829 PetscCall(ISGetIndices(isrow, &rout)); 830 r = rout; 831 PetscCall(ISGetIndices(iscol, &cout)); 832 c = cout; 833 834 /* forward solve the lower triangular */ 835 idx = 4 * r[0]; 836 t[0] = b[idx]; 837 t[1] = b[1 + idx]; 838 t[2] = b[2 + idx]; 839 t[3] = b[3 + idx]; 840 for (i = 1; i < n; i++) { 841 v = aa + 16 * ai[i]; 842 vi = aj + ai[i]; 843 nz = ai[i + 1] - ai[i]; 844 idx = 4 * r[i]; 845 s1 = b[idx]; 846 s2 = b[1 + idx]; 847 s3 = b[2 + idx]; 848 s4 = b[3 + idx]; 849 for (m = 0; m < nz; m++) { 850 idx = 4 * vi[m]; 851 x1 = t[idx]; 852 x2 = t[1 + idx]; 853 x3 = t[2 + idx]; 854 x4 = t[3 + idx]; 855 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 856 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 857 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 858 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 859 v += 16; 860 } 861 idx = 4 * i; 862 t[idx] = s1; 863 t[1 + idx] = s2; 864 t[2 + idx] = s3; 865 t[3 + idx] = s4; 866 } 867 /* backward solve the upper triangular */ 868 for (i = n - 1; i >= 0; i--) { 869 v = aa + 16 * (adiag[i + 1] + 1); 870 vi = aj + adiag[i + 1] + 1; 871 nz = adiag[i] - adiag[i + 1] - 1; 872 idt = 4 * i; 873 s1 = t[idt]; 874 s2 = t[1 + idt]; 875 s3 = t[2 + idt]; 876 s4 = t[3 + idt]; 877 for (m = 0; m < nz; m++) { 878 idx = 4 * vi[m]; 879 x1 = t[idx]; 880 x2 = t[1 + idx]; 881 x3 = t[2 + idx]; 882 x4 = t[3 + idx]; 883 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 884 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 885 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 886 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 887 v += 16; 888 } 889 idc = 4 * c[i]; 890 x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 891 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 892 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 893 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 894 } 895 896 PetscCall(ISRestoreIndices(isrow, &rout)); 897 PetscCall(ISRestoreIndices(iscol, &cout)); 898 PetscCall(VecRestoreArrayRead(bb, &b)); 899 PetscCall(VecRestoreArray(xx, &x)); 900 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 901 PetscFunctionReturn(PETSC_SUCCESS); 902 } 903 904 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx) 905 { 906 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 907 IS iscol = a->col, isrow = a->row; 908 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 909 PetscInt i, nz, idx, idt, idc; 910 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 911 const MatScalar *aa = a->a, *v; 912 MatScalar s1, s2, s3, s4, x1, x2, x3, x4, *t; 913 PetscScalar *x; 914 const PetscScalar *b; 915 916 PetscFunctionBegin; 917 PetscCall(VecGetArrayRead(bb, &b)); 918 PetscCall(VecGetArray(xx, &x)); 919 t = (MatScalar *)a->solve_work; 920 921 PetscCall(ISGetIndices(isrow, &rout)); 922 r = rout; 923 PetscCall(ISGetIndices(iscol, &cout)); 924 c = cout + (n - 1); 925 926 /* forward solve the lower triangular */ 927 idx = 4 * (*r++); 928 t[0] = (MatScalar)b[idx]; 929 t[1] = (MatScalar)b[1 + idx]; 930 t[2] = (MatScalar)b[2 + idx]; 931 t[3] = (MatScalar)b[3 + idx]; 932 for (i = 1; i < n; i++) { 933 v = aa + 16 * ai[i]; 934 vi = aj + ai[i]; 935 nz = diag[i] - ai[i]; 936 idx = 4 * (*r++); 937 s1 = (MatScalar)b[idx]; 938 s2 = (MatScalar)b[1 + idx]; 939 s3 = (MatScalar)b[2 + idx]; 940 s4 = (MatScalar)b[3 + idx]; 941 while (nz--) { 942 idx = 4 * (*vi++); 943 x1 = t[idx]; 944 x2 = t[1 + idx]; 945 x3 = t[2 + idx]; 946 x4 = t[3 + idx]; 947 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 948 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 949 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 950 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 951 v += 16; 952 } 953 idx = 4 * i; 954 t[idx] = s1; 955 t[1 + idx] = s2; 956 t[2 + idx] = s3; 957 t[3 + idx] = s4; 958 } 959 /* backward solve the upper triangular */ 960 for (i = n - 1; i >= 0; i--) { 961 v = aa + 16 * diag[i] + 16; 962 vi = aj + diag[i] + 1; 963 nz = ai[i + 1] - diag[i] - 1; 964 idt = 4 * i; 965 s1 = t[idt]; 966 s2 = t[1 + idt]; 967 s3 = t[2 + idt]; 968 s4 = t[3 + idt]; 969 while (nz--) { 970 idx = 4 * (*vi++); 971 x1 = t[idx]; 972 x2 = t[1 + idx]; 973 x3 = t[2 + idx]; 974 x4 = t[3 + idx]; 975 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 976 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 977 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 978 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 979 v += 16; 980 } 981 idc = 4 * (*c--); 982 v = aa + 16 * diag[i]; 983 t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 984 t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 985 t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 986 t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 987 x[idc] = (PetscScalar)t[idt]; 988 x[1 + idc] = (PetscScalar)t[1 + idt]; 989 x[2 + idc] = (PetscScalar)t[2 + idt]; 990 x[3 + idc] = (PetscScalar)t[3 + idt]; 991 } 992 993 PetscCall(ISRestoreIndices(isrow, &rout)); 994 PetscCall(ISRestoreIndices(iscol, &cout)); 995 PetscCall(VecRestoreArrayRead(bb, &b)); 996 PetscCall(VecRestoreArray(xx, &x)); 997 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 998 PetscFunctionReturn(PETSC_SUCCESS); 999 } 1000 1001 #if defined(PETSC_HAVE_SSE) 1002 1003 #include PETSC_HAVE_SSE 1004 1005 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx) 1006 { 1007 /* 1008 Note: This code uses demotion of double 1009 to float when performing the mixed-mode computation. 1010 This may not be numerically reasonable for all applications. 1011 */ 1012 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1013 IS iscol = a->col, isrow = a->row; 1014 PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16; 1015 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1016 MatScalar *aa = a->a, *v; 1017 PetscScalar *x, *b, *t; 1018 1019 /* Make space in temp stack for 16 Byte Aligned arrays */ 1020 float ssealignedspace[11], *tmps, *tmpx; 1021 unsigned long offset; 1022 1023 PetscFunctionBegin; 1024 SSE_SCOPE_BEGIN; 1025 1026 offset = (unsigned long)ssealignedspace % 16; 1027 if (offset) offset = (16 - offset) / 4; 1028 tmps = &ssealignedspace[offset]; 1029 tmpx = &ssealignedspace[offset + 4]; 1030 PREFETCH_NTA(aa + 16 * ai[1]); 1031 1032 PetscCall(VecGetArray(bb, &b)); 1033 PetscCall(VecGetArray(xx, &x)); 1034 t = a->solve_work; 1035 1036 PetscCall(ISGetIndices(isrow, &rout)); 1037 r = rout; 1038 PetscCall(ISGetIndices(iscol, &cout)); 1039 c = cout + (n - 1); 1040 1041 /* forward solve the lower triangular */ 1042 idx = 4 * (*r++); 1043 t[0] = b[idx]; 1044 t[1] = b[1 + idx]; 1045 t[2] = b[2 + idx]; 1046 t[3] = b[3 + idx]; 1047 v = aa + 16 * ai[1]; 1048 1049 for (i = 1; i < n;) { 1050 PREFETCH_NTA(&v[8]); 1051 vi = aj + ai[i]; 1052 nz = diag[i] - ai[i]; 1053 idx = 4 * (*r++); 1054 1055 /* Demote sum from double to float */ 1056 CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]); 1057 LOAD_PS(tmps, XMM7); 1058 1059 while (nz--) { 1060 PREFETCH_NTA(&v[16]); 1061 idx = 4 * (*vi++); 1062 1063 /* Demote solution (so far) from double to float */ 1064 CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]); 1065 1066 /* 4x4 Matrix-Vector product with negative accumulation: */ 1067 SSE_INLINE_BEGIN_2(tmpx, v) 1068 SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 1069 1070 /* First Column */ 1071 SSE_COPY_PS(XMM0, XMM6) 1072 SSE_SHUFFLE(XMM0, XMM0, 0x00) 1073 SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 1074 SSE_SUB_PS(XMM7, XMM0) 1075 1076 /* Second Column */ 1077 SSE_COPY_PS(XMM1, XMM6) 1078 SSE_SHUFFLE(XMM1, XMM1, 0x55) 1079 SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 1080 SSE_SUB_PS(XMM7, XMM1) 1081 1082 SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 1083 1084 /* Third Column */ 1085 SSE_COPY_PS(XMM2, XMM6) 1086 SSE_SHUFFLE(XMM2, XMM2, 0xAA) 1087 SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 1088 SSE_SUB_PS(XMM7, XMM2) 1089 1090 /* Fourth Column */ 1091 SSE_COPY_PS(XMM3, XMM6) 1092 SSE_SHUFFLE(XMM3, XMM3, 0xFF) 1093 SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 1094 SSE_SUB_PS(XMM7, XMM3) 1095 SSE_INLINE_END_2 1096 1097 v += 16; 1098 } 1099 idx = 4 * i; 1100 v = aa + 16 * ai[++i]; 1101 PREFETCH_NTA(v); 1102 STORE_PS(tmps, XMM7); 1103 1104 /* Promote result from float to double */ 1105 CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps); 1106 } 1107 /* backward solve the upper triangular */ 1108 idt = 4 * (n - 1); 1109 ai16 = 16 * diag[n - 1]; 1110 v = aa + ai16 + 16; 1111 for (i = n - 1; i >= 0;) { 1112 PREFETCH_NTA(&v[8]); 1113 vi = aj + diag[i] + 1; 1114 nz = ai[i + 1] - diag[i] - 1; 1115 1116 /* Demote accumulator from double to float */ 1117 CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]); 1118 LOAD_PS(tmps, XMM7); 1119 1120 while (nz--) { 1121 PREFETCH_NTA(&v[16]); 1122 idx = 4 * (*vi++); 1123 1124 /* Demote solution (so far) from double to float */ 1125 CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]); 1126 1127 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1128 SSE_INLINE_BEGIN_2(tmpx, v) 1129 SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 1130 1131 /* First Column */ 1132 SSE_COPY_PS(XMM0, XMM6) 1133 SSE_SHUFFLE(XMM0, XMM0, 0x00) 1134 SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 1135 SSE_SUB_PS(XMM7, XMM0) 1136 1137 /* Second Column */ 1138 SSE_COPY_PS(XMM1, XMM6) 1139 SSE_SHUFFLE(XMM1, XMM1, 0x55) 1140 SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 1141 SSE_SUB_PS(XMM7, XMM1) 1142 1143 SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 1144 1145 /* Third Column */ 1146 SSE_COPY_PS(XMM2, XMM6) 1147 SSE_SHUFFLE(XMM2, XMM2, 0xAA) 1148 SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 1149 SSE_SUB_PS(XMM7, XMM2) 1150 1151 /* Fourth Column */ 1152 SSE_COPY_PS(XMM3, XMM6) 1153 SSE_SHUFFLE(XMM3, XMM3, 0xFF) 1154 SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 1155 SSE_SUB_PS(XMM7, XMM3) 1156 SSE_INLINE_END_2 1157 v += 16; 1158 } 1159 v = aa + ai16; 1160 ai16 = 16 * diag[--i]; 1161 PREFETCH_NTA(aa + ai16 + 16); 1162 /* 1163 Scale the result by the diagonal 4x4 block, 1164 which was inverted as part of the factorization 1165 */ 1166 SSE_INLINE_BEGIN_3(v, tmps, aa + ai16) 1167 /* First Column */ 1168 SSE_COPY_PS(XMM0, XMM7) 1169 SSE_SHUFFLE(XMM0, XMM0, 0x00) 1170 SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0) 1171 1172 /* Second Column */ 1173 SSE_COPY_PS(XMM1, XMM7) 1174 SSE_SHUFFLE(XMM1, XMM1, 0x55) 1175 SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4) 1176 SSE_ADD_PS(XMM0, XMM1) 1177 1178 SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24) 1179 1180 /* Third Column */ 1181 SSE_COPY_PS(XMM2, XMM7) 1182 SSE_SHUFFLE(XMM2, XMM2, 0xAA) 1183 SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8) 1184 SSE_ADD_PS(XMM0, XMM2) 1185 1186 /* Fourth Column */ 1187 SSE_COPY_PS(XMM3, XMM7) 1188 SSE_SHUFFLE(XMM3, XMM3, 0xFF) 1189 SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12) 1190 SSE_ADD_PS(XMM0, XMM3) 1191 1192 SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0) 1193 SSE_INLINE_END_3 1194 1195 /* Promote solution from float to double */ 1196 CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps); 1197 1198 /* Apply reordering to t and stream into x. */ 1199 /* This way, x doesn't pollute the cache. */ 1200 /* Be careful with size: 2 doubles = 4 floats! */ 1201 idc = 4 * (*c--); 1202 SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc]) 1203 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 1204 SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0) 1205 SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0) 1206 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 1207 SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1) 1208 SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1) 1209 SSE_INLINE_END_2 1210 v = aa + ai16 + 16; 1211 idt -= 4; 1212 } 1213 1214 PetscCall(ISRestoreIndices(isrow, &rout)); 1215 PetscCall(ISRestoreIndices(iscol, &cout)); 1216 PetscCall(VecRestoreArray(bb, &b)); 1217 PetscCall(VecRestoreArray(xx, &x)); 1218 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 1219 SSE_SCOPE_END; 1220 PetscFunctionReturn(PETSC_SUCCESS); 1221 } 1222 1223 #endif 1224 1225 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx) 1226 { 1227 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1228 IS iscol = a->col, isrow = a->row; 1229 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 1230 PetscInt i, nz, idx, idt, idc; 1231 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1232 const MatScalar *aa = a->a, *v; 1233 PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 1234 const PetscScalar *b; 1235 1236 PetscFunctionBegin; 1237 PetscCall(VecGetArrayRead(bb, &b)); 1238 PetscCall(VecGetArray(xx, &x)); 1239 t = a->solve_work; 1240 1241 PetscCall(ISGetIndices(isrow, &rout)); 1242 r = rout; 1243 PetscCall(ISGetIndices(iscol, &cout)); 1244 c = cout + (n - 1); 1245 1246 /* forward solve the lower triangular */ 1247 idx = 3 * (*r++); 1248 t[0] = b[idx]; 1249 t[1] = b[1 + idx]; 1250 t[2] = b[2 + idx]; 1251 for (i = 1; i < n; i++) { 1252 v = aa + 9 * ai[i]; 1253 vi = aj + ai[i]; 1254 nz = diag[i] - ai[i]; 1255 idx = 3 * (*r++); 1256 s1 = b[idx]; 1257 s2 = b[1 + idx]; 1258 s3 = b[2 + idx]; 1259 while (nz--) { 1260 idx = 3 * (*vi++); 1261 x1 = t[idx]; 1262 x2 = t[1 + idx]; 1263 x3 = t[2 + idx]; 1264 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1265 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1266 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1267 v += 9; 1268 } 1269 idx = 3 * i; 1270 t[idx] = s1; 1271 t[1 + idx] = s2; 1272 t[2 + idx] = s3; 1273 } 1274 /* backward solve the upper triangular */ 1275 for (i = n - 1; i >= 0; i--) { 1276 v = aa + 9 * diag[i] + 9; 1277 vi = aj + diag[i] + 1; 1278 nz = ai[i + 1] - diag[i] - 1; 1279 idt = 3 * i; 1280 s1 = t[idt]; 1281 s2 = t[1 + idt]; 1282 s3 = t[2 + idt]; 1283 while (nz--) { 1284 idx = 3 * (*vi++); 1285 x1 = t[idx]; 1286 x2 = t[1 + idx]; 1287 x3 = t[2 + idx]; 1288 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1289 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1290 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1291 v += 9; 1292 } 1293 idc = 3 * (*c--); 1294 v = aa + 9 * diag[i]; 1295 x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 1296 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 1297 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 1298 } 1299 PetscCall(ISRestoreIndices(isrow, &rout)); 1300 PetscCall(ISRestoreIndices(iscol, &cout)); 1301 PetscCall(VecRestoreArrayRead(bb, &b)); 1302 PetscCall(VecRestoreArray(xx, &x)); 1303 PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 1304 PetscFunctionReturn(PETSC_SUCCESS); 1305 } 1306 1307 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx) 1308 { 1309 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1310 IS iscol = a->col, isrow = a->row; 1311 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 1312 PetscInt i, nz, idx, idt, idc, m; 1313 const PetscInt *r, *c, *rout, *cout; 1314 const MatScalar *aa = a->a, *v; 1315 PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 1316 const PetscScalar *b; 1317 1318 PetscFunctionBegin; 1319 PetscCall(VecGetArrayRead(bb, &b)); 1320 PetscCall(VecGetArray(xx, &x)); 1321 t = a->solve_work; 1322 1323 PetscCall(ISGetIndices(isrow, &rout)); 1324 r = rout; 1325 PetscCall(ISGetIndices(iscol, &cout)); 1326 c = cout; 1327 1328 /* forward solve the lower triangular */ 1329 idx = 3 * r[0]; 1330 t[0] = b[idx]; 1331 t[1] = b[1 + idx]; 1332 t[2] = b[2 + idx]; 1333 for (i = 1; i < n; i++) { 1334 v = aa + 9 * ai[i]; 1335 vi = aj + ai[i]; 1336 nz = ai[i + 1] - ai[i]; 1337 idx = 3 * r[i]; 1338 s1 = b[idx]; 1339 s2 = b[1 + idx]; 1340 s3 = b[2 + idx]; 1341 for (m = 0; m < nz; m++) { 1342 idx = 3 * vi[m]; 1343 x1 = t[idx]; 1344 x2 = t[1 + idx]; 1345 x3 = t[2 + idx]; 1346 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1347 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1348 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1349 v += 9; 1350 } 1351 idx = 3 * i; 1352 t[idx] = s1; 1353 t[1 + idx] = s2; 1354 t[2 + idx] = s3; 1355 } 1356 /* backward solve the upper triangular */ 1357 for (i = n - 1; i >= 0; i--) { 1358 v = aa + 9 * (adiag[i + 1] + 1); 1359 vi = aj + adiag[i + 1] + 1; 1360 nz = adiag[i] - adiag[i + 1] - 1; 1361 idt = 3 * i; 1362 s1 = t[idt]; 1363 s2 = t[1 + idt]; 1364 s3 = t[2 + idt]; 1365 for (m = 0; m < nz; m++) { 1366 idx = 3 * vi[m]; 1367 x1 = t[idx]; 1368 x2 = t[1 + idx]; 1369 x3 = t[2 + idx]; 1370 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1371 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1372 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1373 v += 9; 1374 } 1375 idc = 3 * c[i]; 1376 x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 1377 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 1378 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 1379 } 1380 PetscCall(ISRestoreIndices(isrow, &rout)); 1381 PetscCall(ISRestoreIndices(iscol, &cout)); 1382 PetscCall(VecRestoreArrayRead(bb, &b)); 1383 PetscCall(VecRestoreArray(xx, &x)); 1384 PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 1385 PetscFunctionReturn(PETSC_SUCCESS); 1386 } 1387 1388 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx) 1389 { 1390 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1391 IS iscol = a->col, isrow = a->row; 1392 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 1393 PetscInt i, nz, idx, idt, idc; 1394 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1395 const MatScalar *aa = a->a, *v; 1396 PetscScalar *x, s1, s2, x1, x2, *t; 1397 const PetscScalar *b; 1398 1399 PetscFunctionBegin; 1400 PetscCall(VecGetArrayRead(bb, &b)); 1401 PetscCall(VecGetArray(xx, &x)); 1402 t = a->solve_work; 1403 1404 PetscCall(ISGetIndices(isrow, &rout)); 1405 r = rout; 1406 PetscCall(ISGetIndices(iscol, &cout)); 1407 c = cout + (n - 1); 1408 1409 /* forward solve the lower triangular */ 1410 idx = 2 * (*r++); 1411 t[0] = b[idx]; 1412 t[1] = b[1 + idx]; 1413 for (i = 1; i < n; i++) { 1414 v = aa + 4 * ai[i]; 1415 vi = aj + ai[i]; 1416 nz = diag[i] - ai[i]; 1417 idx = 2 * (*r++); 1418 s1 = b[idx]; 1419 s2 = b[1 + idx]; 1420 while (nz--) { 1421 idx = 2 * (*vi++); 1422 x1 = t[idx]; 1423 x2 = t[1 + idx]; 1424 s1 -= v[0] * x1 + v[2] * x2; 1425 s2 -= v[1] * x1 + v[3] * x2; 1426 v += 4; 1427 } 1428 idx = 2 * i; 1429 t[idx] = s1; 1430 t[1 + idx] = s2; 1431 } 1432 /* backward solve the upper triangular */ 1433 for (i = n - 1; i >= 0; i--) { 1434 v = aa + 4 * diag[i] + 4; 1435 vi = aj + diag[i] + 1; 1436 nz = ai[i + 1] - diag[i] - 1; 1437 idt = 2 * i; 1438 s1 = t[idt]; 1439 s2 = t[1 + idt]; 1440 while (nz--) { 1441 idx = 2 * (*vi++); 1442 x1 = t[idx]; 1443 x2 = t[1 + idx]; 1444 s1 -= v[0] * x1 + v[2] * x2; 1445 s2 -= v[1] * x1 + v[3] * x2; 1446 v += 4; 1447 } 1448 idc = 2 * (*c--); 1449 v = aa + 4 * diag[i]; 1450 x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 1451 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 1452 } 1453 PetscCall(ISRestoreIndices(isrow, &rout)); 1454 PetscCall(ISRestoreIndices(iscol, &cout)); 1455 PetscCall(VecRestoreArrayRead(bb, &b)); 1456 PetscCall(VecRestoreArray(xx, &x)); 1457 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 1458 PetscFunctionReturn(PETSC_SUCCESS); 1459 } 1460 1461 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx) 1462 { 1463 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1464 IS iscol = a->col, isrow = a->row; 1465 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 1466 PetscInt i, nz, idx, jdx, idt, idc, m; 1467 const PetscInt *r, *c, *rout, *cout; 1468 const MatScalar *aa = a->a, *v; 1469 PetscScalar *x, s1, s2, x1, x2, *t; 1470 const PetscScalar *b; 1471 1472 PetscFunctionBegin; 1473 PetscCall(VecGetArrayRead(bb, &b)); 1474 PetscCall(VecGetArray(xx, &x)); 1475 t = a->solve_work; 1476 1477 PetscCall(ISGetIndices(isrow, &rout)); 1478 r = rout; 1479 PetscCall(ISGetIndices(iscol, &cout)); 1480 c = cout; 1481 1482 /* forward solve the lower triangular */ 1483 idx = 2 * r[0]; 1484 t[0] = b[idx]; 1485 t[1] = b[1 + idx]; 1486 for (i = 1; i < n; i++) { 1487 v = aa + 4 * ai[i]; 1488 vi = aj + ai[i]; 1489 nz = ai[i + 1] - ai[i]; 1490 idx = 2 * r[i]; 1491 s1 = b[idx]; 1492 s2 = b[1 + idx]; 1493 for (m = 0; m < nz; m++) { 1494 jdx = 2 * vi[m]; 1495 x1 = t[jdx]; 1496 x2 = t[1 + jdx]; 1497 s1 -= v[0] * x1 + v[2] * x2; 1498 s2 -= v[1] * x1 + v[3] * x2; 1499 v += 4; 1500 } 1501 idx = 2 * i; 1502 t[idx] = s1; 1503 t[1 + idx] = s2; 1504 } 1505 /* backward solve the upper triangular */ 1506 for (i = n - 1; i >= 0; i--) { 1507 v = aa + 4 * (adiag[i + 1] + 1); 1508 vi = aj + adiag[i + 1] + 1; 1509 nz = adiag[i] - adiag[i + 1] - 1; 1510 idt = 2 * i; 1511 s1 = t[idt]; 1512 s2 = t[1 + idt]; 1513 for (m = 0; m < nz; m++) { 1514 idx = 2 * vi[m]; 1515 x1 = t[idx]; 1516 x2 = t[1 + idx]; 1517 s1 -= v[0] * x1 + v[2] * x2; 1518 s2 -= v[1] * x1 + v[3] * x2; 1519 v += 4; 1520 } 1521 idc = 2 * c[i]; 1522 x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 1523 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 1524 } 1525 PetscCall(ISRestoreIndices(isrow, &rout)); 1526 PetscCall(ISRestoreIndices(iscol, &cout)); 1527 PetscCall(VecRestoreArrayRead(bb, &b)); 1528 PetscCall(VecRestoreArray(xx, &x)); 1529 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 1530 PetscFunctionReturn(PETSC_SUCCESS); 1531 } 1532 1533 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1534 { 1535 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1536 IS iscol = a->col, isrow = a->row; 1537 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 1538 PetscInt i, nz; 1539 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1540 const MatScalar *aa = a->a, *v; 1541 PetscScalar *x, s1, *t; 1542 const PetscScalar *b; 1543 1544 PetscFunctionBegin; 1545 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 1546 1547 PetscCall(VecGetArrayRead(bb, &b)); 1548 PetscCall(VecGetArray(xx, &x)); 1549 t = a->solve_work; 1550 1551 PetscCall(ISGetIndices(isrow, &rout)); 1552 r = rout; 1553 PetscCall(ISGetIndices(iscol, &cout)); 1554 c = cout + (n - 1); 1555 1556 /* forward solve the lower triangular */ 1557 t[0] = b[*r++]; 1558 for (i = 1; i < n; i++) { 1559 v = aa + ai[i]; 1560 vi = aj + ai[i]; 1561 nz = diag[i] - ai[i]; 1562 s1 = b[*r++]; 1563 while (nz--) s1 -= (*v++) * t[*vi++]; 1564 t[i] = s1; 1565 } 1566 /* backward solve the upper triangular */ 1567 for (i = n - 1; i >= 0; i--) { 1568 v = aa + diag[i] + 1; 1569 vi = aj + diag[i] + 1; 1570 nz = ai[i + 1] - diag[i] - 1; 1571 s1 = t[i]; 1572 while (nz--) s1 -= (*v++) * t[*vi++]; 1573 x[*c--] = t[i] = aa[diag[i]] * s1; 1574 } 1575 1576 PetscCall(ISRestoreIndices(isrow, &rout)); 1577 PetscCall(ISRestoreIndices(iscol, &cout)); 1578 PetscCall(VecRestoreArrayRead(bb, &b)); 1579 PetscCall(VecRestoreArray(xx, &x)); 1580 PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n)); 1581 PetscFunctionReturn(PETSC_SUCCESS); 1582 } 1583 1584 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx) 1585 { 1586 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1587 IS iscol = a->col, isrow = a->row; 1588 PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz; 1589 const PetscInt *rout, *cout, *r, *c; 1590 PetscScalar *x, *tmp, sum; 1591 const PetscScalar *b; 1592 const MatScalar *aa = a->a, *v; 1593 1594 PetscFunctionBegin; 1595 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 1596 1597 PetscCall(VecGetArrayRead(bb, &b)); 1598 PetscCall(VecGetArray(xx, &x)); 1599 tmp = a->solve_work; 1600 1601 PetscCall(ISGetIndices(isrow, &rout)); 1602 r = rout; 1603 PetscCall(ISGetIndices(iscol, &cout)); 1604 c = cout; 1605 1606 /* forward solve the lower triangular */ 1607 tmp[0] = b[r[0]]; 1608 v = aa; 1609 vi = aj; 1610 for (i = 1; i < n; i++) { 1611 nz = ai[i + 1] - ai[i]; 1612 sum = b[r[i]]; 1613 PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 1614 tmp[i] = sum; 1615 v += nz; 1616 vi += nz; 1617 } 1618 1619 /* backward solve the upper triangular */ 1620 for (i = n - 1; i >= 0; i--) { 1621 v = aa + adiag[i + 1] + 1; 1622 vi = aj + adiag[i + 1] + 1; 1623 nz = adiag[i] - adiag[i + 1] - 1; 1624 sum = tmp[i]; 1625 PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 1626 x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */ 1627 } 1628 1629 PetscCall(ISRestoreIndices(isrow, &rout)); 1630 PetscCall(ISRestoreIndices(iscol, &cout)); 1631 PetscCall(VecRestoreArrayRead(bb, &b)); 1632 PetscCall(VecRestoreArray(xx, &x)); 1633 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1634 PetscFunctionReturn(PETSC_SUCCESS); 1635 } 1636