1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 5 { 6 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 7 IS iscol = a->col, isrow = a->row; 8 const PetscInt *r, *c, *rout, *cout; 9 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *vi; 10 PetscInt i, nz; 11 const PetscInt bs = A->rmap->bs, bs2 = a->bs2; 12 const MatScalar *aa = a->a, *v; 13 PetscScalar *x, *s, *t, *ls; 14 const PetscScalar *b; 15 16 PetscFunctionBegin; 17 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 18 PetscCall(VecGetArrayRead(bb, &b)); 19 PetscCall(VecGetArray(xx, &x)); 20 t = a->solve_work; 21 22 PetscCall(ISGetIndices(isrow, &rout)); 23 r = rout; 24 PetscCall(ISGetIndices(iscol, &cout)); 25 c = cout + (n - 1); 26 27 /* forward solve the lower triangular */ 28 PetscCall(PetscArraycpy(t, b + bs * (*r++), bs)); 29 for (i = 1; i < n; i++) { 30 v = aa + bs2 * ai[i]; 31 vi = aj + ai[i]; 32 nz = a->diag[i] - ai[i]; 33 s = t + bs * i; 34 PetscCall(PetscArraycpy(s, b + bs * (*r++), bs)); 35 while (nz--) { 36 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++)); 37 v += bs2; 38 } 39 } 40 /* backward solve the upper triangular */ 41 ls = a->solve_work + A->cmap->n; 42 for (i = n - 1; i >= 0; i--) { 43 v = aa + bs2 * (a->diag[i] + 1); 44 vi = aj + a->diag[i] + 1; 45 nz = ai[i + 1] - a->diag[i] - 1; 46 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 47 while (nz--) { 48 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++)); 49 v += bs2; 50 } 51 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs); 52 PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs)); 53 } 54 55 PetscCall(ISRestoreIndices(isrow, &rout)); 56 PetscCall(ISRestoreIndices(iscol, &cout)); 57 PetscCall(VecRestoreArrayRead(bb, &b)); 58 PetscCall(VecRestoreArray(xx, &x)); 59 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 60 PetscFunctionReturn(PETSC_SUCCESS); 61 } 62 63 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx) 64 { 65 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 66 IS iscol = a->col, isrow = a->row; 67 const PetscInt *r, *c, *ai = a->i, *aj = a->j; 68 const PetscInt *rout, *cout, *diag = a->diag, *vi, n = a->mbs; 69 PetscInt i, nz, idx, idt, idc; 70 const MatScalar *aa = a->a, *v; 71 PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 72 const PetscScalar *b; 73 74 PetscFunctionBegin; 75 PetscCall(VecGetArrayRead(bb, &b)); 76 PetscCall(VecGetArray(xx, &x)); 77 t = a->solve_work; 78 79 PetscCall(ISGetIndices(isrow, &rout)); 80 r = rout; 81 PetscCall(ISGetIndices(iscol, &cout)); 82 c = cout + (n - 1); 83 84 /* forward solve the lower triangular */ 85 idx = 7 * (*r++); 86 t[0] = b[idx]; 87 t[1] = b[1 + idx]; 88 t[2] = b[2 + idx]; 89 t[3] = b[3 + idx]; 90 t[4] = b[4 + idx]; 91 t[5] = b[5 + idx]; 92 t[6] = b[6 + idx]; 93 94 for (i = 1; i < n; i++) { 95 v = aa + 49 * ai[i]; 96 vi = aj + ai[i]; 97 nz = diag[i] - ai[i]; 98 idx = 7 * (*r++); 99 s1 = b[idx]; 100 s2 = b[1 + idx]; 101 s3 = b[2 + idx]; 102 s4 = b[3 + idx]; 103 s5 = b[4 + idx]; 104 s6 = b[5 + idx]; 105 s7 = b[6 + idx]; 106 while (nz--) { 107 idx = 7 * (*vi++); 108 x1 = t[idx]; 109 x2 = t[1 + idx]; 110 x3 = t[2 + idx]; 111 x4 = t[3 + idx]; 112 x5 = t[4 + idx]; 113 x6 = t[5 + idx]; 114 x7 = t[6 + idx]; 115 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 116 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 117 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 118 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 119 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 120 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 121 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 122 v += 49; 123 } 124 idx = 7 * i; 125 t[idx] = s1; 126 t[1 + idx] = s2; 127 t[2 + idx] = s3; 128 t[3 + idx] = s4; 129 t[4 + idx] = s5; 130 t[5 + idx] = s6; 131 t[6 + idx] = s7; 132 } 133 /* backward solve the upper triangular */ 134 for (i = n - 1; i >= 0; i--) { 135 v = aa + 49 * diag[i] + 49; 136 vi = aj + diag[i] + 1; 137 nz = ai[i + 1] - diag[i] - 1; 138 idt = 7 * i; 139 s1 = t[idt]; 140 s2 = t[1 + idt]; 141 s3 = t[2 + idt]; 142 s4 = t[3 + idt]; 143 s5 = t[4 + idt]; 144 s6 = t[5 + idt]; 145 s7 = t[6 + idt]; 146 while (nz--) { 147 idx = 7 * (*vi++); 148 x1 = t[idx]; 149 x2 = t[1 + idx]; 150 x3 = t[2 + idx]; 151 x4 = t[3 + idx]; 152 x5 = t[4 + idx]; 153 x6 = t[5 + idx]; 154 x7 = t[6 + idx]; 155 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 156 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 157 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 158 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 159 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 160 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 161 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 162 v += 49; 163 } 164 idc = 7 * (*c--); 165 v = aa + 49 * diag[i]; 166 x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 167 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 168 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 169 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 170 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 171 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 172 x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 173 } 174 175 PetscCall(ISRestoreIndices(isrow, &rout)); 176 PetscCall(ISRestoreIndices(iscol, &cout)); 177 PetscCall(VecRestoreArrayRead(bb, &b)); 178 PetscCall(VecRestoreArray(xx, &x)); 179 PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 180 PetscFunctionReturn(PETSC_SUCCESS); 181 } 182 183 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx) 184 { 185 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 186 IS iscol = a->col, isrow = a->row; 187 const PetscInt *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag; 188 const PetscInt n = a->mbs, *rout, *cout, *vi; 189 PetscInt i, nz, idx, idt, idc, m; 190 const MatScalar *aa = a->a, *v; 191 PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 192 const PetscScalar *b; 193 194 PetscFunctionBegin; 195 PetscCall(VecGetArrayRead(bb, &b)); 196 PetscCall(VecGetArray(xx, &x)); 197 t = a->solve_work; 198 199 PetscCall(ISGetIndices(isrow, &rout)); 200 r = rout; 201 PetscCall(ISGetIndices(iscol, &cout)); 202 c = cout; 203 204 /* forward solve the lower triangular */ 205 idx = 7 * r[0]; 206 t[0] = b[idx]; 207 t[1] = b[1 + idx]; 208 t[2] = b[2 + idx]; 209 t[3] = b[3 + idx]; 210 t[4] = b[4 + idx]; 211 t[5] = b[5 + idx]; 212 t[6] = b[6 + idx]; 213 214 for (i = 1; i < n; i++) { 215 v = aa + 49 * ai[i]; 216 vi = aj + ai[i]; 217 nz = ai[i + 1] - ai[i]; 218 idx = 7 * r[i]; 219 s1 = b[idx]; 220 s2 = b[1 + idx]; 221 s3 = b[2 + idx]; 222 s4 = b[3 + idx]; 223 s5 = b[4 + idx]; 224 s6 = b[5 + idx]; 225 s7 = b[6 + idx]; 226 for (m = 0; m < nz; m++) { 227 idx = 7 * vi[m]; 228 x1 = t[idx]; 229 x2 = t[1 + idx]; 230 x3 = t[2 + idx]; 231 x4 = t[3 + idx]; 232 x5 = t[4 + idx]; 233 x6 = t[5 + idx]; 234 x7 = t[6 + idx]; 235 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 236 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 237 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 238 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 239 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 240 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 241 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 242 v += 49; 243 } 244 idx = 7 * i; 245 t[idx] = s1; 246 t[1 + idx] = s2; 247 t[2 + idx] = s3; 248 t[3 + idx] = s4; 249 t[4 + idx] = s5; 250 t[5 + idx] = s6; 251 t[6 + idx] = s7; 252 } 253 /* backward solve the upper triangular */ 254 for (i = n - 1; i >= 0; i--) { 255 v = aa + 49 * (adiag[i + 1] + 1); 256 vi = aj + adiag[i + 1] + 1; 257 nz = adiag[i] - adiag[i + 1] - 1; 258 idt = 7 * i; 259 s1 = t[idt]; 260 s2 = t[1 + idt]; 261 s3 = t[2 + idt]; 262 s4 = t[3 + idt]; 263 s5 = t[4 + idt]; 264 s6 = t[5 + idt]; 265 s7 = t[6 + idt]; 266 for (m = 0; m < nz; m++) { 267 idx = 7 * vi[m]; 268 x1 = t[idx]; 269 x2 = t[1 + idx]; 270 x3 = t[2 + idx]; 271 x4 = t[3 + idx]; 272 x5 = t[4 + idx]; 273 x6 = t[5 + idx]; 274 x7 = t[6 + idx]; 275 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 276 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 277 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 278 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 279 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 280 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 281 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 282 v += 49; 283 } 284 idc = 7 * c[i]; 285 x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 286 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 287 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 288 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 289 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 290 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 291 x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 292 } 293 294 PetscCall(ISRestoreIndices(isrow, &rout)); 295 PetscCall(ISRestoreIndices(iscol, &cout)); 296 PetscCall(VecRestoreArrayRead(bb, &b)); 297 PetscCall(VecRestoreArray(xx, &x)); 298 PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 299 PetscFunctionReturn(PETSC_SUCCESS); 300 } 301 302 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx) 303 { 304 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 305 IS iscol = a->col, isrow = a->row; 306 const PetscInt *r, *c, *rout, *cout; 307 const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 308 PetscInt i, nz, idx, idt, idc; 309 const MatScalar *aa = a->a, *v; 310 PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 311 const PetscScalar *b; 312 313 PetscFunctionBegin; 314 PetscCall(VecGetArrayRead(bb, &b)); 315 PetscCall(VecGetArray(xx, &x)); 316 t = a->solve_work; 317 318 PetscCall(ISGetIndices(isrow, &rout)); 319 r = rout; 320 PetscCall(ISGetIndices(iscol, &cout)); 321 c = cout + (n - 1); 322 323 /* forward solve the lower triangular */ 324 idx = 6 * (*r++); 325 t[0] = b[idx]; 326 t[1] = b[1 + idx]; 327 t[2] = b[2 + idx]; 328 t[3] = b[3 + idx]; 329 t[4] = b[4 + idx]; 330 t[5] = b[5 + idx]; 331 for (i = 1; i < n; i++) { 332 v = aa + 36 * ai[i]; 333 vi = aj + ai[i]; 334 nz = diag[i] - ai[i]; 335 idx = 6 * (*r++); 336 s1 = b[idx]; 337 s2 = b[1 + idx]; 338 s3 = b[2 + idx]; 339 s4 = b[3 + idx]; 340 s5 = b[4 + idx]; 341 s6 = b[5 + idx]; 342 while (nz--) { 343 idx = 6 * (*vi++); 344 x1 = t[idx]; 345 x2 = t[1 + idx]; 346 x3 = t[2 + idx]; 347 x4 = t[3 + idx]; 348 x5 = t[4 + idx]; 349 x6 = t[5 + idx]; 350 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 351 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 352 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 353 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 354 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 355 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 356 v += 36; 357 } 358 idx = 6 * i; 359 t[idx] = s1; 360 t[1 + idx] = s2; 361 t[2 + idx] = s3; 362 t[3 + idx] = s4; 363 t[4 + idx] = s5; 364 t[5 + idx] = s6; 365 } 366 /* backward solve the upper triangular */ 367 for (i = n - 1; i >= 0; i--) { 368 v = aa + 36 * diag[i] + 36; 369 vi = aj + diag[i] + 1; 370 nz = ai[i + 1] - diag[i] - 1; 371 idt = 6 * i; 372 s1 = t[idt]; 373 s2 = t[1 + idt]; 374 s3 = t[2 + idt]; 375 s4 = t[3 + idt]; 376 s5 = t[4 + idt]; 377 s6 = t[5 + idt]; 378 while (nz--) { 379 idx = 6 * (*vi++); 380 x1 = t[idx]; 381 x2 = t[1 + idx]; 382 x3 = t[2 + idx]; 383 x4 = t[3 + idx]; 384 x5 = t[4 + idx]; 385 x6 = t[5 + idx]; 386 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 387 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 388 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 389 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 390 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 391 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 392 v += 36; 393 } 394 idc = 6 * (*c--); 395 v = aa + 36 * diag[i]; 396 x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 397 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 398 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 399 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 400 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 401 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 402 } 403 404 PetscCall(ISRestoreIndices(isrow, &rout)); 405 PetscCall(ISRestoreIndices(iscol, &cout)); 406 PetscCall(VecRestoreArrayRead(bb, &b)); 407 PetscCall(VecRestoreArray(xx, &x)); 408 PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 409 PetscFunctionReturn(PETSC_SUCCESS); 410 } 411 412 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx) 413 { 414 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 415 IS iscol = a->col, isrow = a->row; 416 const PetscInt *r, *c, *rout, *cout; 417 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 418 PetscInt i, nz, idx, idt, idc, m; 419 const MatScalar *aa = a->a, *v; 420 PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 421 const PetscScalar *b; 422 423 PetscFunctionBegin; 424 PetscCall(VecGetArrayRead(bb, &b)); 425 PetscCall(VecGetArray(xx, &x)); 426 t = a->solve_work; 427 428 PetscCall(ISGetIndices(isrow, &rout)); 429 r = rout; 430 PetscCall(ISGetIndices(iscol, &cout)); 431 c = cout; 432 433 /* forward solve the lower triangular */ 434 idx = 6 * r[0]; 435 t[0] = b[idx]; 436 t[1] = b[1 + idx]; 437 t[2] = b[2 + idx]; 438 t[3] = b[3 + idx]; 439 t[4] = b[4 + idx]; 440 t[5] = b[5 + idx]; 441 for (i = 1; i < n; i++) { 442 v = aa + 36 * ai[i]; 443 vi = aj + ai[i]; 444 nz = ai[i + 1] - ai[i]; 445 idx = 6 * r[i]; 446 s1 = b[idx]; 447 s2 = b[1 + idx]; 448 s3 = b[2 + idx]; 449 s4 = b[3 + idx]; 450 s5 = b[4 + idx]; 451 s6 = b[5 + idx]; 452 for (m = 0; m < nz; m++) { 453 idx = 6 * vi[m]; 454 x1 = t[idx]; 455 x2 = t[1 + idx]; 456 x3 = t[2 + idx]; 457 x4 = t[3 + idx]; 458 x5 = t[4 + idx]; 459 x6 = t[5 + idx]; 460 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 461 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 462 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 463 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 464 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 465 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 466 v += 36; 467 } 468 idx = 6 * i; 469 t[idx] = s1; 470 t[1 + idx] = s2; 471 t[2 + idx] = s3; 472 t[3 + idx] = s4; 473 t[4 + idx] = s5; 474 t[5 + idx] = s6; 475 } 476 /* backward solve the upper triangular */ 477 for (i = n - 1; i >= 0; i--) { 478 v = aa + 36 * (adiag[i + 1] + 1); 479 vi = aj + adiag[i + 1] + 1; 480 nz = adiag[i] - adiag[i + 1] - 1; 481 idt = 6 * i; 482 s1 = t[idt]; 483 s2 = t[1 + idt]; 484 s3 = t[2 + idt]; 485 s4 = t[3 + idt]; 486 s5 = t[4 + idt]; 487 s6 = t[5 + idt]; 488 for (m = 0; m < nz; m++) { 489 idx = 6 * vi[m]; 490 x1 = t[idx]; 491 x2 = t[1 + idx]; 492 x3 = t[2 + idx]; 493 x4 = t[3 + idx]; 494 x5 = t[4 + idx]; 495 x6 = t[5 + idx]; 496 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 497 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 498 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 499 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 500 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 501 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 502 v += 36; 503 } 504 idc = 6 * c[i]; 505 x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 506 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 507 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 508 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 509 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 510 x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 511 } 512 513 PetscCall(ISRestoreIndices(isrow, &rout)); 514 PetscCall(ISRestoreIndices(iscol, &cout)); 515 PetscCall(VecRestoreArrayRead(bb, &b)); 516 PetscCall(VecRestoreArray(xx, &x)); 517 PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 518 PetscFunctionReturn(PETSC_SUCCESS); 519 } 520 521 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx) 522 { 523 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 524 IS iscol = a->col, isrow = a->row; 525 const PetscInt *r, *c, *rout, *cout, *diag = a->diag; 526 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 527 PetscInt i, nz, idx, idt, idc; 528 const MatScalar *aa = a->a, *v; 529 PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 530 const PetscScalar *b; 531 532 PetscFunctionBegin; 533 PetscCall(VecGetArrayRead(bb, &b)); 534 PetscCall(VecGetArray(xx, &x)); 535 t = a->solve_work; 536 537 PetscCall(ISGetIndices(isrow, &rout)); 538 r = rout; 539 PetscCall(ISGetIndices(iscol, &cout)); 540 c = cout + (n - 1); 541 542 /* forward solve the lower triangular */ 543 idx = 5 * (*r++); 544 t[0] = b[idx]; 545 t[1] = b[1 + idx]; 546 t[2] = b[2 + idx]; 547 t[3] = b[3 + idx]; 548 t[4] = b[4 + idx]; 549 for (i = 1; i < n; i++) { 550 v = aa + 25 * ai[i]; 551 vi = aj + ai[i]; 552 nz = diag[i] - ai[i]; 553 idx = 5 * (*r++); 554 s1 = b[idx]; 555 s2 = b[1 + idx]; 556 s3 = b[2 + idx]; 557 s4 = b[3 + idx]; 558 s5 = b[4 + idx]; 559 while (nz--) { 560 idx = 5 * (*vi++); 561 x1 = t[idx]; 562 x2 = t[1 + idx]; 563 x3 = t[2 + idx]; 564 x4 = t[3 + idx]; 565 x5 = t[4 + idx]; 566 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 567 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 568 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 569 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 570 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 571 v += 25; 572 } 573 idx = 5 * i; 574 t[idx] = s1; 575 t[1 + idx] = s2; 576 t[2 + idx] = s3; 577 t[3 + idx] = s4; 578 t[4 + idx] = s5; 579 } 580 /* backward solve the upper triangular */ 581 for (i = n - 1; i >= 0; i--) { 582 v = aa + 25 * diag[i] + 25; 583 vi = aj + diag[i] + 1; 584 nz = ai[i + 1] - diag[i] - 1; 585 idt = 5 * i; 586 s1 = t[idt]; 587 s2 = t[1 + idt]; 588 s3 = t[2 + idt]; 589 s4 = t[3 + idt]; 590 s5 = t[4 + idt]; 591 while (nz--) { 592 idx = 5 * (*vi++); 593 x1 = t[idx]; 594 x2 = t[1 + idx]; 595 x3 = t[2 + idx]; 596 x4 = t[3 + idx]; 597 x5 = t[4 + idx]; 598 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 599 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 600 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 601 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 602 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 603 v += 25; 604 } 605 idc = 5 * (*c--); 606 v = aa + 25 * diag[i]; 607 x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 608 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 609 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 610 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 611 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 612 } 613 614 PetscCall(ISRestoreIndices(isrow, &rout)); 615 PetscCall(ISRestoreIndices(iscol, &cout)); 616 PetscCall(VecRestoreArrayRead(bb, &b)); 617 PetscCall(VecRestoreArray(xx, &x)); 618 PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 619 PetscFunctionReturn(PETSC_SUCCESS); 620 } 621 622 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx) 623 { 624 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 625 IS iscol = a->col, isrow = a->row; 626 const PetscInt *r, *c, *rout, *cout; 627 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 628 PetscInt i, nz, idx, idt, idc, m; 629 const MatScalar *aa = a->a, *v; 630 PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 631 const PetscScalar *b; 632 633 PetscFunctionBegin; 634 PetscCall(VecGetArrayRead(bb, &b)); 635 PetscCall(VecGetArray(xx, &x)); 636 t = a->solve_work; 637 638 PetscCall(ISGetIndices(isrow, &rout)); 639 r = rout; 640 PetscCall(ISGetIndices(iscol, &cout)); 641 c = cout; 642 643 /* forward solve the lower triangular */ 644 idx = 5 * r[0]; 645 t[0] = b[idx]; 646 t[1] = b[1 + idx]; 647 t[2] = b[2 + idx]; 648 t[3] = b[3 + idx]; 649 t[4] = b[4 + idx]; 650 for (i = 1; i < n; i++) { 651 v = aa + 25 * ai[i]; 652 vi = aj + ai[i]; 653 nz = ai[i + 1] - ai[i]; 654 idx = 5 * r[i]; 655 s1 = b[idx]; 656 s2 = b[1 + idx]; 657 s3 = b[2 + idx]; 658 s4 = b[3 + idx]; 659 s5 = b[4 + idx]; 660 for (m = 0; m < nz; m++) { 661 idx = 5 * vi[m]; 662 x1 = t[idx]; 663 x2 = t[1 + idx]; 664 x3 = t[2 + idx]; 665 x4 = t[3 + idx]; 666 x5 = t[4 + idx]; 667 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 668 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 669 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 670 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 671 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 672 v += 25; 673 } 674 idx = 5 * i; 675 t[idx] = s1; 676 t[1 + idx] = s2; 677 t[2 + idx] = s3; 678 t[3 + idx] = s4; 679 t[4 + idx] = s5; 680 } 681 /* backward solve the upper triangular */ 682 for (i = n - 1; i >= 0; i--) { 683 v = aa + 25 * (adiag[i + 1] + 1); 684 vi = aj + adiag[i + 1] + 1; 685 nz = adiag[i] - adiag[i + 1] - 1; 686 idt = 5 * i; 687 s1 = t[idt]; 688 s2 = t[1 + idt]; 689 s3 = t[2 + idt]; 690 s4 = t[3 + idt]; 691 s5 = t[4 + idt]; 692 for (m = 0; m < nz; m++) { 693 idx = 5 * vi[m]; 694 x1 = t[idx]; 695 x2 = t[1 + idx]; 696 x3 = t[2 + idx]; 697 x4 = t[3 + idx]; 698 x5 = t[4 + idx]; 699 s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 700 s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 701 s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 702 s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 703 s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 704 v += 25; 705 } 706 idc = 5 * c[i]; 707 x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 708 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 709 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 710 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 711 x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 712 } 713 714 PetscCall(ISRestoreIndices(isrow, &rout)); 715 PetscCall(ISRestoreIndices(iscol, &cout)); 716 PetscCall(VecRestoreArrayRead(bb, &b)); 717 PetscCall(VecRestoreArray(xx, &x)); 718 PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 719 PetscFunctionReturn(PETSC_SUCCESS); 720 } 721 722 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx) 723 { 724 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 725 IS iscol = a->col, isrow = a->row; 726 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 727 PetscInt i, nz, idx, idt, idc; 728 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 729 const MatScalar *aa = a->a, *v; 730 PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 731 const PetscScalar *b; 732 733 PetscFunctionBegin; 734 PetscCall(VecGetArrayRead(bb, &b)); 735 PetscCall(VecGetArray(xx, &x)); 736 t = a->solve_work; 737 738 PetscCall(ISGetIndices(isrow, &rout)); 739 r = rout; 740 PetscCall(ISGetIndices(iscol, &cout)); 741 c = cout + (n - 1); 742 743 /* forward solve the lower triangular */ 744 idx = 4 * (*r++); 745 t[0] = b[idx]; 746 t[1] = b[1 + idx]; 747 t[2] = b[2 + idx]; 748 t[3] = b[3 + idx]; 749 for (i = 1; i < n; i++) { 750 v = aa + 16 * ai[i]; 751 vi = aj + ai[i]; 752 nz = diag[i] - ai[i]; 753 idx = 4 * (*r++); 754 s1 = b[idx]; 755 s2 = b[1 + idx]; 756 s3 = b[2 + idx]; 757 s4 = b[3 + idx]; 758 while (nz--) { 759 idx = 4 * (*vi++); 760 x1 = t[idx]; 761 x2 = t[1 + idx]; 762 x3 = t[2 + idx]; 763 x4 = t[3 + idx]; 764 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 765 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 766 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 767 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 768 v += 16; 769 } 770 idx = 4 * i; 771 t[idx] = s1; 772 t[1 + idx] = s2; 773 t[2 + idx] = s3; 774 t[3 + idx] = s4; 775 } 776 /* backward solve the upper triangular */ 777 for (i = n - 1; i >= 0; i--) { 778 v = aa + 16 * diag[i] + 16; 779 vi = aj + diag[i] + 1; 780 nz = ai[i + 1] - diag[i] - 1; 781 idt = 4 * i; 782 s1 = t[idt]; 783 s2 = t[1 + idt]; 784 s3 = t[2 + idt]; 785 s4 = t[3 + idt]; 786 while (nz--) { 787 idx = 4 * (*vi++); 788 x1 = t[idx]; 789 x2 = t[1 + idx]; 790 x3 = t[2 + idx]; 791 x4 = t[3 + idx]; 792 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 793 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 794 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 795 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 796 v += 16; 797 } 798 idc = 4 * (*c--); 799 v = aa + 16 * diag[i]; 800 x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 801 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 802 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 803 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 804 } 805 806 PetscCall(ISRestoreIndices(isrow, &rout)); 807 PetscCall(ISRestoreIndices(iscol, &cout)); 808 PetscCall(VecRestoreArrayRead(bb, &b)); 809 PetscCall(VecRestoreArray(xx, &x)); 810 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 811 PetscFunctionReturn(PETSC_SUCCESS); 812 } 813 814 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx) 815 { 816 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 817 IS iscol = a->col, isrow = a->row; 818 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 819 PetscInt i, nz, idx, idt, idc, m; 820 const PetscInt *r, *c, *rout, *cout; 821 const MatScalar *aa = a->a, *v; 822 PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 823 const PetscScalar *b; 824 825 PetscFunctionBegin; 826 PetscCall(VecGetArrayRead(bb, &b)); 827 PetscCall(VecGetArray(xx, &x)); 828 t = a->solve_work; 829 830 PetscCall(ISGetIndices(isrow, &rout)); 831 r = rout; 832 PetscCall(ISGetIndices(iscol, &cout)); 833 c = cout; 834 835 /* forward solve the lower triangular */ 836 idx = 4 * r[0]; 837 t[0] = b[idx]; 838 t[1] = b[1 + idx]; 839 t[2] = b[2 + idx]; 840 t[3] = b[3 + idx]; 841 for (i = 1; i < n; i++) { 842 v = aa + 16 * ai[i]; 843 vi = aj + ai[i]; 844 nz = ai[i + 1] - ai[i]; 845 idx = 4 * r[i]; 846 s1 = b[idx]; 847 s2 = b[1 + idx]; 848 s3 = b[2 + idx]; 849 s4 = b[3 + idx]; 850 for (m = 0; m < nz; m++) { 851 idx = 4 * vi[m]; 852 x1 = t[idx]; 853 x2 = t[1 + idx]; 854 x3 = t[2 + idx]; 855 x4 = t[3 + idx]; 856 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 857 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 858 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 859 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 860 v += 16; 861 } 862 idx = 4 * i; 863 t[idx] = s1; 864 t[1 + idx] = s2; 865 t[2 + idx] = s3; 866 t[3 + idx] = s4; 867 } 868 /* backward solve the upper triangular */ 869 for (i = n - 1; i >= 0; i--) { 870 v = aa + 16 * (adiag[i + 1] + 1); 871 vi = aj + adiag[i + 1] + 1; 872 nz = adiag[i] - adiag[i + 1] - 1; 873 idt = 4 * i; 874 s1 = t[idt]; 875 s2 = t[1 + idt]; 876 s3 = t[2 + idt]; 877 s4 = t[3 + idt]; 878 for (m = 0; m < nz; m++) { 879 idx = 4 * vi[m]; 880 x1 = t[idx]; 881 x2 = t[1 + idx]; 882 x3 = t[2 + idx]; 883 x4 = t[3 + idx]; 884 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 885 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 886 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 887 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 888 v += 16; 889 } 890 idc = 4 * c[i]; 891 x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 892 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 893 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 894 x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 895 } 896 897 PetscCall(ISRestoreIndices(isrow, &rout)); 898 PetscCall(ISRestoreIndices(iscol, &cout)); 899 PetscCall(VecRestoreArrayRead(bb, &b)); 900 PetscCall(VecRestoreArray(xx, &x)); 901 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 902 PetscFunctionReturn(PETSC_SUCCESS); 903 } 904 905 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx) 906 { 907 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 908 IS iscol = a->col, isrow = a->row; 909 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 910 PetscInt i, nz, idx, idt, idc; 911 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 912 const MatScalar *aa = a->a, *v; 913 MatScalar s1, s2, s3, s4, x1, x2, x3, x4, *t; 914 PetscScalar *x; 915 const PetscScalar *b; 916 917 PetscFunctionBegin; 918 PetscCall(VecGetArrayRead(bb, &b)); 919 PetscCall(VecGetArray(xx, &x)); 920 t = (MatScalar *)a->solve_work; 921 922 PetscCall(ISGetIndices(isrow, &rout)); 923 r = rout; 924 PetscCall(ISGetIndices(iscol, &cout)); 925 c = cout + (n - 1); 926 927 /* forward solve the lower triangular */ 928 idx = 4 * (*r++); 929 t[0] = (MatScalar)b[idx]; 930 t[1] = (MatScalar)b[1 + idx]; 931 t[2] = (MatScalar)b[2 + idx]; 932 t[3] = (MatScalar)b[3 + idx]; 933 for (i = 1; i < n; i++) { 934 v = aa + 16 * ai[i]; 935 vi = aj + ai[i]; 936 nz = diag[i] - ai[i]; 937 idx = 4 * (*r++); 938 s1 = (MatScalar)b[idx]; 939 s2 = (MatScalar)b[1 + idx]; 940 s3 = (MatScalar)b[2 + idx]; 941 s4 = (MatScalar)b[3 + idx]; 942 while (nz--) { 943 idx = 4 * (*vi++); 944 x1 = t[idx]; 945 x2 = t[1 + idx]; 946 x3 = t[2 + idx]; 947 x4 = t[3 + idx]; 948 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 949 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 950 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 951 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 952 v += 16; 953 } 954 idx = 4 * i; 955 t[idx] = s1; 956 t[1 + idx] = s2; 957 t[2 + idx] = s3; 958 t[3 + idx] = s4; 959 } 960 /* backward solve the upper triangular */ 961 for (i = n - 1; i >= 0; i--) { 962 v = aa + 16 * diag[i] + 16; 963 vi = aj + diag[i] + 1; 964 nz = ai[i + 1] - diag[i] - 1; 965 idt = 4 * i; 966 s1 = t[idt]; 967 s2 = t[1 + idt]; 968 s3 = t[2 + idt]; 969 s4 = t[3 + idt]; 970 while (nz--) { 971 idx = 4 * (*vi++); 972 x1 = t[idx]; 973 x2 = t[1 + idx]; 974 x3 = t[2 + idx]; 975 x4 = t[3 + idx]; 976 s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 977 s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 978 s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 979 s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 980 v += 16; 981 } 982 idc = 4 * (*c--); 983 v = aa + 16 * diag[i]; 984 t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 985 t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 986 t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 987 t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 988 x[idc] = (PetscScalar)t[idt]; 989 x[1 + idc] = (PetscScalar)t[1 + idt]; 990 x[2 + idc] = (PetscScalar)t[2 + idt]; 991 x[3 + idc] = (PetscScalar)t[3 + idt]; 992 } 993 994 PetscCall(ISRestoreIndices(isrow, &rout)); 995 PetscCall(ISRestoreIndices(iscol, &cout)); 996 PetscCall(VecRestoreArrayRead(bb, &b)); 997 PetscCall(VecRestoreArray(xx, &x)); 998 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 999 PetscFunctionReturn(PETSC_SUCCESS); 1000 } 1001 1002 #if defined(PETSC_HAVE_SSE) 1003 1004 #include PETSC_HAVE_SSE 1005 1006 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx) 1007 { 1008 /* 1009 Note: This code uses demotion of double 1010 to float when performing the mixed-mode computation. 1011 This may not be numerically reasonable for all applications. 1012 */ 1013 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1014 IS iscol = a->col, isrow = a->row; 1015 PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16; 1016 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1017 MatScalar *aa = a->a, *v; 1018 PetscScalar *x, *b, *t; 1019 1020 /* Make space in temp stack for 16 Byte Aligned arrays */ 1021 float ssealignedspace[11], *tmps, *tmpx; 1022 unsigned long offset; 1023 1024 PetscFunctionBegin; 1025 SSE_SCOPE_BEGIN; 1026 1027 offset = (unsigned long)ssealignedspace % 16; 1028 if (offset) offset = (16 - offset) / 4; 1029 tmps = &ssealignedspace[offset]; 1030 tmpx = &ssealignedspace[offset + 4]; 1031 PREFETCH_NTA(aa + 16 * ai[1]); 1032 1033 PetscCall(VecGetArray(bb, &b)); 1034 PetscCall(VecGetArray(xx, &x)); 1035 t = a->solve_work; 1036 1037 PetscCall(ISGetIndices(isrow, &rout)); 1038 r = rout; 1039 PetscCall(ISGetIndices(iscol, &cout)); 1040 c = cout + (n - 1); 1041 1042 /* forward solve the lower triangular */ 1043 idx = 4 * (*r++); 1044 t[0] = b[idx]; 1045 t[1] = b[1 + idx]; 1046 t[2] = b[2 + idx]; 1047 t[3] = b[3 + idx]; 1048 v = aa + 16 * ai[1]; 1049 1050 for (i = 1; i < n;) { 1051 PREFETCH_NTA(&v[8]); 1052 vi = aj + ai[i]; 1053 nz = diag[i] - ai[i]; 1054 idx = 4 * (*r++); 1055 1056 /* Demote sum from double to float */ 1057 CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]); 1058 LOAD_PS(tmps, XMM7); 1059 1060 while (nz--) { 1061 PREFETCH_NTA(&v[16]); 1062 idx = 4 * (*vi++); 1063 1064 /* Demote solution (so far) from double to float */ 1065 CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]); 1066 1067 /* 4x4 Matrix-Vector product with negative accumulation: */ 1068 SSE_INLINE_BEGIN_2(tmpx, v) 1069 SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 1070 1071 /* First Column */ 1072 SSE_COPY_PS(XMM0, XMM6) 1073 SSE_SHUFFLE(XMM0, XMM0, 0x00) 1074 SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 1075 SSE_SUB_PS(XMM7, XMM0) 1076 1077 /* Second Column */ 1078 SSE_COPY_PS(XMM1, XMM6) 1079 SSE_SHUFFLE(XMM1, XMM1, 0x55) 1080 SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 1081 SSE_SUB_PS(XMM7, XMM1) 1082 1083 SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 1084 1085 /* Third Column */ 1086 SSE_COPY_PS(XMM2, XMM6) 1087 SSE_SHUFFLE(XMM2, XMM2, 0xAA) 1088 SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 1089 SSE_SUB_PS(XMM7, XMM2) 1090 1091 /* Fourth Column */ 1092 SSE_COPY_PS(XMM3, XMM6) 1093 SSE_SHUFFLE(XMM3, XMM3, 0xFF) 1094 SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 1095 SSE_SUB_PS(XMM7, XMM3) 1096 SSE_INLINE_END_2 1097 1098 v += 16; 1099 } 1100 idx = 4 * i; 1101 v = aa + 16 * ai[++i]; 1102 PREFETCH_NTA(v); 1103 STORE_PS(tmps, XMM7); 1104 1105 /* Promote result from float to double */ 1106 CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps); 1107 } 1108 /* backward solve the upper triangular */ 1109 idt = 4 * (n - 1); 1110 ai16 = 16 * diag[n - 1]; 1111 v = aa + ai16 + 16; 1112 for (i = n - 1; i >= 0;) { 1113 PREFETCH_NTA(&v[8]); 1114 vi = aj + diag[i] + 1; 1115 nz = ai[i + 1] - diag[i] - 1; 1116 1117 /* Demote accumulator from double to float */ 1118 CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]); 1119 LOAD_PS(tmps, XMM7); 1120 1121 while (nz--) { 1122 PREFETCH_NTA(&v[16]); 1123 idx = 4 * (*vi++); 1124 1125 /* Demote solution (so far) from double to float */ 1126 CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]); 1127 1128 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1129 SSE_INLINE_BEGIN_2(tmpx, v) 1130 SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 1131 1132 /* First Column */ 1133 SSE_COPY_PS(XMM0, XMM6) 1134 SSE_SHUFFLE(XMM0, XMM0, 0x00) 1135 SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 1136 SSE_SUB_PS(XMM7, XMM0) 1137 1138 /* Second Column */ 1139 SSE_COPY_PS(XMM1, XMM6) 1140 SSE_SHUFFLE(XMM1, XMM1, 0x55) 1141 SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 1142 SSE_SUB_PS(XMM7, XMM1) 1143 1144 SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 1145 1146 /* Third Column */ 1147 SSE_COPY_PS(XMM2, XMM6) 1148 SSE_SHUFFLE(XMM2, XMM2, 0xAA) 1149 SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 1150 SSE_SUB_PS(XMM7, XMM2) 1151 1152 /* Fourth Column */ 1153 SSE_COPY_PS(XMM3, XMM6) 1154 SSE_SHUFFLE(XMM3, XMM3, 0xFF) 1155 SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 1156 SSE_SUB_PS(XMM7, XMM3) 1157 SSE_INLINE_END_2 1158 v += 16; 1159 } 1160 v = aa + ai16; 1161 ai16 = 16 * diag[--i]; 1162 PREFETCH_NTA(aa + ai16 + 16); 1163 /* 1164 Scale the result by the diagonal 4x4 block, 1165 which was inverted as part of the factorization 1166 */ 1167 SSE_INLINE_BEGIN_3(v, tmps, aa + ai16) 1168 /* First Column */ 1169 SSE_COPY_PS(XMM0, XMM7) 1170 SSE_SHUFFLE(XMM0, XMM0, 0x00) 1171 SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0) 1172 1173 /* Second Column */ 1174 SSE_COPY_PS(XMM1, XMM7) 1175 SSE_SHUFFLE(XMM1, XMM1, 0x55) 1176 SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4) 1177 SSE_ADD_PS(XMM0, XMM1) 1178 1179 SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24) 1180 1181 /* Third Column */ 1182 SSE_COPY_PS(XMM2, XMM7) 1183 SSE_SHUFFLE(XMM2, XMM2, 0xAA) 1184 SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8) 1185 SSE_ADD_PS(XMM0, XMM2) 1186 1187 /* Fourth Column */ 1188 SSE_COPY_PS(XMM3, XMM7) 1189 SSE_SHUFFLE(XMM3, XMM3, 0xFF) 1190 SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12) 1191 SSE_ADD_PS(XMM0, XMM3) 1192 1193 SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0) 1194 SSE_INLINE_END_3 1195 1196 /* Promote solution from float to double */ 1197 CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps); 1198 1199 /* Apply reordering to t and stream into x. */ 1200 /* This way, x doesn't pollute the cache. */ 1201 /* Be careful with size: 2 doubles = 4 floats! */ 1202 idc = 4 * (*c--); 1203 SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc]) 1204 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 1205 SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0) 1206 SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0) 1207 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 1208 SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1) 1209 SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1) 1210 SSE_INLINE_END_2 1211 v = aa + ai16 + 16; 1212 idt -= 4; 1213 } 1214 1215 PetscCall(ISRestoreIndices(isrow, &rout)); 1216 PetscCall(ISRestoreIndices(iscol, &cout)); 1217 PetscCall(VecRestoreArray(bb, &b)); 1218 PetscCall(VecRestoreArray(xx, &x)); 1219 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 1220 SSE_SCOPE_END; 1221 PetscFunctionReturn(PETSC_SUCCESS); 1222 } 1223 1224 #endif 1225 1226 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx) 1227 { 1228 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1229 IS iscol = a->col, isrow = a->row; 1230 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 1231 PetscInt i, nz, idx, idt, idc; 1232 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1233 const MatScalar *aa = a->a, *v; 1234 PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 1235 const PetscScalar *b; 1236 1237 PetscFunctionBegin; 1238 PetscCall(VecGetArrayRead(bb, &b)); 1239 PetscCall(VecGetArray(xx, &x)); 1240 t = a->solve_work; 1241 1242 PetscCall(ISGetIndices(isrow, &rout)); 1243 r = rout; 1244 PetscCall(ISGetIndices(iscol, &cout)); 1245 c = cout + (n - 1); 1246 1247 /* forward solve the lower triangular */ 1248 idx = 3 * (*r++); 1249 t[0] = b[idx]; 1250 t[1] = b[1 + idx]; 1251 t[2] = b[2 + idx]; 1252 for (i = 1; i < n; i++) { 1253 v = aa + 9 * ai[i]; 1254 vi = aj + ai[i]; 1255 nz = diag[i] - ai[i]; 1256 idx = 3 * (*r++); 1257 s1 = b[idx]; 1258 s2 = b[1 + idx]; 1259 s3 = b[2 + idx]; 1260 while (nz--) { 1261 idx = 3 * (*vi++); 1262 x1 = t[idx]; 1263 x2 = t[1 + idx]; 1264 x3 = t[2 + idx]; 1265 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1266 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1267 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1268 v += 9; 1269 } 1270 idx = 3 * i; 1271 t[idx] = s1; 1272 t[1 + idx] = s2; 1273 t[2 + idx] = s3; 1274 } 1275 /* backward solve the upper triangular */ 1276 for (i = n - 1; i >= 0; i--) { 1277 v = aa + 9 * diag[i] + 9; 1278 vi = aj + diag[i] + 1; 1279 nz = ai[i + 1] - diag[i] - 1; 1280 idt = 3 * i; 1281 s1 = t[idt]; 1282 s2 = t[1 + idt]; 1283 s3 = t[2 + idt]; 1284 while (nz--) { 1285 idx = 3 * (*vi++); 1286 x1 = t[idx]; 1287 x2 = t[1 + idx]; 1288 x3 = t[2 + idx]; 1289 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1290 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1291 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1292 v += 9; 1293 } 1294 idc = 3 * (*c--); 1295 v = aa + 9 * diag[i]; 1296 x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 1297 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 1298 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 1299 } 1300 PetscCall(ISRestoreIndices(isrow, &rout)); 1301 PetscCall(ISRestoreIndices(iscol, &cout)); 1302 PetscCall(VecRestoreArrayRead(bb, &b)); 1303 PetscCall(VecRestoreArray(xx, &x)); 1304 PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 1305 PetscFunctionReturn(PETSC_SUCCESS); 1306 } 1307 1308 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx) 1309 { 1310 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1311 IS iscol = a->col, isrow = a->row; 1312 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 1313 PetscInt i, nz, idx, idt, idc, m; 1314 const PetscInt *r, *c, *rout, *cout; 1315 const MatScalar *aa = a->a, *v; 1316 PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 1317 const PetscScalar *b; 1318 1319 PetscFunctionBegin; 1320 PetscCall(VecGetArrayRead(bb, &b)); 1321 PetscCall(VecGetArray(xx, &x)); 1322 t = a->solve_work; 1323 1324 PetscCall(ISGetIndices(isrow, &rout)); 1325 r = rout; 1326 PetscCall(ISGetIndices(iscol, &cout)); 1327 c = cout; 1328 1329 /* forward solve the lower triangular */ 1330 idx = 3 * r[0]; 1331 t[0] = b[idx]; 1332 t[1] = b[1 + idx]; 1333 t[2] = b[2 + idx]; 1334 for (i = 1; i < n; i++) { 1335 v = aa + 9 * ai[i]; 1336 vi = aj + ai[i]; 1337 nz = ai[i + 1] - ai[i]; 1338 idx = 3 * r[i]; 1339 s1 = b[idx]; 1340 s2 = b[1 + idx]; 1341 s3 = b[2 + idx]; 1342 for (m = 0; m < nz; m++) { 1343 idx = 3 * vi[m]; 1344 x1 = t[idx]; 1345 x2 = t[1 + idx]; 1346 x3 = t[2 + idx]; 1347 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1348 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1349 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1350 v += 9; 1351 } 1352 idx = 3 * i; 1353 t[idx] = s1; 1354 t[1 + idx] = s2; 1355 t[2 + idx] = s3; 1356 } 1357 /* backward solve the upper triangular */ 1358 for (i = n - 1; i >= 0; i--) { 1359 v = aa + 9 * (adiag[i + 1] + 1); 1360 vi = aj + adiag[i + 1] + 1; 1361 nz = adiag[i] - adiag[i + 1] - 1; 1362 idt = 3 * i; 1363 s1 = t[idt]; 1364 s2 = t[1 + idt]; 1365 s3 = t[2 + idt]; 1366 for (m = 0; m < nz; m++) { 1367 idx = 3 * vi[m]; 1368 x1 = t[idx]; 1369 x2 = t[1 + idx]; 1370 x3 = t[2 + idx]; 1371 s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 1372 s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 1373 s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 1374 v += 9; 1375 } 1376 idc = 3 * c[i]; 1377 x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 1378 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 1379 x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 1380 } 1381 PetscCall(ISRestoreIndices(isrow, &rout)); 1382 PetscCall(ISRestoreIndices(iscol, &cout)); 1383 PetscCall(VecRestoreArrayRead(bb, &b)); 1384 PetscCall(VecRestoreArray(xx, &x)); 1385 PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 1386 PetscFunctionReturn(PETSC_SUCCESS); 1387 } 1388 1389 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx) 1390 { 1391 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1392 IS iscol = a->col, isrow = a->row; 1393 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 1394 PetscInt i, nz, idx, idt, idc; 1395 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1396 const MatScalar *aa = a->a, *v; 1397 PetscScalar *x, s1, s2, x1, x2, *t; 1398 const PetscScalar *b; 1399 1400 PetscFunctionBegin; 1401 PetscCall(VecGetArrayRead(bb, &b)); 1402 PetscCall(VecGetArray(xx, &x)); 1403 t = a->solve_work; 1404 1405 PetscCall(ISGetIndices(isrow, &rout)); 1406 r = rout; 1407 PetscCall(ISGetIndices(iscol, &cout)); 1408 c = cout + (n - 1); 1409 1410 /* forward solve the lower triangular */ 1411 idx = 2 * (*r++); 1412 t[0] = b[idx]; 1413 t[1] = b[1 + idx]; 1414 for (i = 1; i < n; i++) { 1415 v = aa + 4 * ai[i]; 1416 vi = aj + ai[i]; 1417 nz = diag[i] - ai[i]; 1418 idx = 2 * (*r++); 1419 s1 = b[idx]; 1420 s2 = b[1 + idx]; 1421 while (nz--) { 1422 idx = 2 * (*vi++); 1423 x1 = t[idx]; 1424 x2 = t[1 + idx]; 1425 s1 -= v[0] * x1 + v[2] * x2; 1426 s2 -= v[1] * x1 + v[3] * x2; 1427 v += 4; 1428 } 1429 idx = 2 * i; 1430 t[idx] = s1; 1431 t[1 + idx] = s2; 1432 } 1433 /* backward solve the upper triangular */ 1434 for (i = n - 1; i >= 0; i--) { 1435 v = aa + 4 * diag[i] + 4; 1436 vi = aj + diag[i] + 1; 1437 nz = ai[i + 1] - diag[i] - 1; 1438 idt = 2 * i; 1439 s1 = t[idt]; 1440 s2 = t[1 + idt]; 1441 while (nz--) { 1442 idx = 2 * (*vi++); 1443 x1 = t[idx]; 1444 x2 = t[1 + idx]; 1445 s1 -= v[0] * x1 + v[2] * x2; 1446 s2 -= v[1] * x1 + v[3] * x2; 1447 v += 4; 1448 } 1449 idc = 2 * (*c--); 1450 v = aa + 4 * diag[i]; 1451 x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 1452 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 1453 } 1454 PetscCall(ISRestoreIndices(isrow, &rout)); 1455 PetscCall(ISRestoreIndices(iscol, &cout)); 1456 PetscCall(VecRestoreArrayRead(bb, &b)); 1457 PetscCall(VecRestoreArray(xx, &x)); 1458 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 1459 PetscFunctionReturn(PETSC_SUCCESS); 1460 } 1461 1462 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx) 1463 { 1464 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1465 IS iscol = a->col, isrow = a->row; 1466 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 1467 PetscInt i, nz, idx, jdx, idt, idc, m; 1468 const PetscInt *r, *c, *rout, *cout; 1469 const MatScalar *aa = a->a, *v; 1470 PetscScalar *x, s1, s2, x1, x2, *t; 1471 const PetscScalar *b; 1472 1473 PetscFunctionBegin; 1474 PetscCall(VecGetArrayRead(bb, &b)); 1475 PetscCall(VecGetArray(xx, &x)); 1476 t = a->solve_work; 1477 1478 PetscCall(ISGetIndices(isrow, &rout)); 1479 r = rout; 1480 PetscCall(ISGetIndices(iscol, &cout)); 1481 c = cout; 1482 1483 /* forward solve the lower triangular */ 1484 idx = 2 * r[0]; 1485 t[0] = b[idx]; 1486 t[1] = b[1 + idx]; 1487 for (i = 1; i < n; i++) { 1488 v = aa + 4 * ai[i]; 1489 vi = aj + ai[i]; 1490 nz = ai[i + 1] - ai[i]; 1491 idx = 2 * r[i]; 1492 s1 = b[idx]; 1493 s2 = b[1 + idx]; 1494 for (m = 0; m < nz; m++) { 1495 jdx = 2 * vi[m]; 1496 x1 = t[jdx]; 1497 x2 = t[1 + jdx]; 1498 s1 -= v[0] * x1 + v[2] * x2; 1499 s2 -= v[1] * x1 + v[3] * x2; 1500 v += 4; 1501 } 1502 idx = 2 * i; 1503 t[idx] = s1; 1504 t[1 + idx] = s2; 1505 } 1506 /* backward solve the upper triangular */ 1507 for (i = n - 1; i >= 0; i--) { 1508 v = aa + 4 * (adiag[i + 1] + 1); 1509 vi = aj + adiag[i + 1] + 1; 1510 nz = adiag[i] - adiag[i + 1] - 1; 1511 idt = 2 * i; 1512 s1 = t[idt]; 1513 s2 = t[1 + idt]; 1514 for (m = 0; m < nz; m++) { 1515 idx = 2 * vi[m]; 1516 x1 = t[idx]; 1517 x2 = t[1 + idx]; 1518 s1 -= v[0] * x1 + v[2] * x2; 1519 s2 -= v[1] * x1 + v[3] * x2; 1520 v += 4; 1521 } 1522 idc = 2 * c[i]; 1523 x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 1524 x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 1525 } 1526 PetscCall(ISRestoreIndices(isrow, &rout)); 1527 PetscCall(ISRestoreIndices(iscol, &cout)); 1528 PetscCall(VecRestoreArrayRead(bb, &b)); 1529 PetscCall(VecRestoreArray(xx, &x)); 1530 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 1531 PetscFunctionReturn(PETSC_SUCCESS); 1532 } 1533 1534 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1535 { 1536 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1537 IS iscol = a->col, isrow = a->row; 1538 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 1539 PetscInt i, nz; 1540 const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 1541 const MatScalar *aa = a->a, *v; 1542 PetscScalar *x, s1, *t; 1543 const PetscScalar *b; 1544 1545 PetscFunctionBegin; 1546 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 1547 1548 PetscCall(VecGetArrayRead(bb, &b)); 1549 PetscCall(VecGetArray(xx, &x)); 1550 t = a->solve_work; 1551 1552 PetscCall(ISGetIndices(isrow, &rout)); 1553 r = rout; 1554 PetscCall(ISGetIndices(iscol, &cout)); 1555 c = cout + (n - 1); 1556 1557 /* forward solve the lower triangular */ 1558 t[0] = b[*r++]; 1559 for (i = 1; i < n; i++) { 1560 v = aa + ai[i]; 1561 vi = aj + ai[i]; 1562 nz = diag[i] - ai[i]; 1563 s1 = b[*r++]; 1564 while (nz--) s1 -= (*v++) * t[*vi++]; 1565 t[i] = s1; 1566 } 1567 /* backward solve the upper triangular */ 1568 for (i = n - 1; i >= 0; i--) { 1569 v = aa + diag[i] + 1; 1570 vi = aj + diag[i] + 1; 1571 nz = ai[i + 1] - diag[i] - 1; 1572 s1 = t[i]; 1573 while (nz--) s1 -= (*v++) * t[*vi++]; 1574 x[*c--] = t[i] = aa[diag[i]] * s1; 1575 } 1576 1577 PetscCall(ISRestoreIndices(isrow, &rout)); 1578 PetscCall(ISRestoreIndices(iscol, &cout)); 1579 PetscCall(VecRestoreArrayRead(bb, &b)); 1580 PetscCall(VecRestoreArray(xx, &x)); 1581 PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n)); 1582 PetscFunctionReturn(PETSC_SUCCESS); 1583 } 1584 1585 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx) 1586 { 1587 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1588 IS iscol = a->col, isrow = a->row; 1589 PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz; 1590 const PetscInt *rout, *cout, *r, *c; 1591 PetscScalar *x, *tmp, sum; 1592 const PetscScalar *b; 1593 const MatScalar *aa = a->a, *v; 1594 1595 PetscFunctionBegin; 1596 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 1597 1598 PetscCall(VecGetArrayRead(bb, &b)); 1599 PetscCall(VecGetArray(xx, &x)); 1600 tmp = a->solve_work; 1601 1602 PetscCall(ISGetIndices(isrow, &rout)); 1603 r = rout; 1604 PetscCall(ISGetIndices(iscol, &cout)); 1605 c = cout; 1606 1607 /* forward solve the lower triangular */ 1608 tmp[0] = b[r[0]]; 1609 v = aa; 1610 vi = aj; 1611 for (i = 1; i < n; i++) { 1612 nz = ai[i + 1] - ai[i]; 1613 sum = b[r[i]]; 1614 PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 1615 tmp[i] = sum; 1616 v += nz; 1617 vi += nz; 1618 } 1619 1620 /* backward solve the upper triangular */ 1621 for (i = n - 1; i >= 0; i--) { 1622 v = aa + adiag[i + 1] + 1; 1623 vi = aj + adiag[i + 1] + 1; 1624 nz = adiag[i] - adiag[i + 1] - 1; 1625 sum = tmp[i]; 1626 PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 1627 x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */ 1628 } 1629 1630 PetscCall(ISRestoreIndices(isrow, &rout)); 1631 PetscCall(ISRestoreIndices(iscol, &cout)); 1632 PetscCall(VecRestoreArrayRead(bb, &b)); 1633 PetscCall(VecRestoreArray(xx, &x)); 1634 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1635 PetscFunctionReturn(PETSC_SUCCESS); 1636 } 1637