1 /* 2 Factorization code for BAIJ format. 3 */ 4 #include <../src/mat/impls/baij/seq/baij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 7 /* 8 Version for when blocks are 3 by 3 9 */ 10 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info) 11 { 12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 13 IS isrow = b->row, isicol = b->icol; 14 const PetscInt *r, *ic; 15 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 16 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 17 const PetscInt *diag_offset; 18 PetscInt idx, *pj; 19 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 20 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 21 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9; 22 MatScalar *ba = b->a, *aa = a->a; 23 PetscReal shift = info->shiftamount; 24 PetscBool allowzeropivot, zeropivotdetected; 25 26 PetscFunctionBegin; 27 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */ 28 A->factortype = MAT_FACTOR_NONE; 29 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL)); 30 A->factortype = MAT_FACTOR_ILU; 31 PetscCall(ISGetIndices(isrow, &r)); 32 PetscCall(ISGetIndices(isicol, &ic)); 33 PetscCall(PetscMalloc1(9 * (n + 1), &rtmp)); 34 allowzeropivot = PetscNot(A->erroriffailure); 35 36 for (i = 0; i < n; i++) { 37 nz = bi[i + 1] - bi[i]; 38 ajtmp = bj + bi[i]; 39 for (j = 0; j < nz; j++) { 40 x = rtmp + 9 * ajtmp[j]; 41 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0; 42 } 43 /* load in initial (unfactored row) */ 44 idx = r[i]; 45 nz = ai[idx + 1] - ai[idx]; 46 ajtmpold = aj + ai[idx]; 47 v = aa + 9 * ai[idx]; 48 for (j = 0; j < nz; j++) { 49 x = rtmp + 9 * ic[ajtmpold[j]]; 50 x[0] = v[0]; 51 x[1] = v[1]; 52 x[2] = v[2]; 53 x[3] = v[3]; 54 x[4] = v[4]; 55 x[5] = v[5]; 56 x[6] = v[6]; 57 x[7] = v[7]; 58 x[8] = v[8]; 59 v += 9; 60 } 61 row = *ajtmp++; 62 while (row < i) { 63 pc = rtmp + 9 * row; 64 p1 = pc[0]; 65 p2 = pc[1]; 66 p3 = pc[2]; 67 p4 = pc[3]; 68 p5 = pc[4]; 69 p6 = pc[5]; 70 p7 = pc[6]; 71 p8 = pc[7]; 72 p9 = pc[8]; 73 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) { 74 pv = ba + 9 * diag_offset[row]; 75 pj = bj + diag_offset[row] + 1; 76 x1 = pv[0]; 77 x2 = pv[1]; 78 x3 = pv[2]; 79 x4 = pv[3]; 80 x5 = pv[4]; 81 x6 = pv[5]; 82 x7 = pv[6]; 83 x8 = pv[7]; 84 x9 = pv[8]; 85 pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3; 86 pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3; 87 pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3; 88 89 pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6; 90 pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6; 91 pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6; 92 93 pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9; 94 pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9; 95 pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9; 96 nz = bi[row + 1] - diag_offset[row] - 1; 97 pv += 9; 98 for (j = 0; j < nz; j++) { 99 x1 = pv[0]; 100 x2 = pv[1]; 101 x3 = pv[2]; 102 x4 = pv[3]; 103 x5 = pv[4]; 104 x6 = pv[5]; 105 x7 = pv[6]; 106 x8 = pv[7]; 107 x9 = pv[8]; 108 x = rtmp + 9 * pj[j]; 109 x[0] -= m1 * x1 + m4 * x2 + m7 * x3; 110 x[1] -= m2 * x1 + m5 * x2 + m8 * x3; 111 x[2] -= m3 * x1 + m6 * x2 + m9 * x3; 112 113 x[3] -= m1 * x4 + m4 * x5 + m7 * x6; 114 x[4] -= m2 * x4 + m5 * x5 + m8 * x6; 115 x[5] -= m3 * x4 + m6 * x5 + m9 * x6; 116 117 x[6] -= m1 * x7 + m4 * x8 + m7 * x9; 118 x[7] -= m2 * x7 + m5 * x8 + m8 * x9; 119 x[8] -= m3 * x7 + m6 * x8 + m9 * x9; 120 pv += 9; 121 } 122 PetscCall(PetscLogFlops(54.0 * nz + 36.0)); 123 } 124 row = *ajtmp++; 125 } 126 /* finished row so stick it into b->a */ 127 pv = ba + 9 * bi[i]; 128 pj = bj + bi[i]; 129 nz = bi[i + 1] - bi[i]; 130 for (j = 0; j < nz; j++) { 131 x = rtmp + 9 * pj[j]; 132 pv[0] = x[0]; 133 pv[1] = x[1]; 134 pv[2] = x[2]; 135 pv[3] = x[3]; 136 pv[4] = x[4]; 137 pv[5] = x[5]; 138 pv[6] = x[6]; 139 pv[7] = x[7]; 140 pv[8] = x[8]; 141 pv += 9; 142 } 143 /* invert diagonal block */ 144 w = ba + 9 * diag_offset[i]; 145 PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected)); 146 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 147 } 148 149 PetscCall(PetscFree(rtmp)); 150 PetscCall(ISRestoreIndices(isicol, &ic)); 151 PetscCall(ISRestoreIndices(isrow, &r)); 152 153 C->ops->solve = MatSolve_SeqBAIJ_3_inplace; 154 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace; 155 C->assembled = PETSC_TRUE; 156 157 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */ 158 PetscFunctionReturn(PETSC_SUCCESS); 159 } 160 161 /* MatLUFactorNumeric_SeqBAIJ_3 - 162 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 163 PetscKernel_A_gets_A_times_B() 164 PetscKernel_A_gets_A_minus_B_times_C() 165 PetscKernel_A_gets_inverse_A() 166 */ 167 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info) 168 { 169 Mat C = B; 170 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 171 IS isrow = b->row, isicol = b->icol; 172 const PetscInt *r, *ic; 173 PetscInt i, j, k, nz, nzL, row; 174 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 175 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 176 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 177 PetscInt flg; 178 PetscReal shift = info->shiftamount; 179 PetscBool allowzeropivot, zeropivotdetected; 180 181 PetscFunctionBegin; 182 PetscCall(ISGetIndices(isrow, &r)); 183 PetscCall(ISGetIndices(isicol, &ic)); 184 allowzeropivot = PetscNot(A->erroriffailure); 185 186 /* generate work space needed by the factorization */ 187 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 188 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 189 190 for (i = 0; i < n; i++) { 191 /* zero rtmp */ 192 /* L part */ 193 nz = bi[i + 1] - bi[i]; 194 bjtmp = bj + bi[i]; 195 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 196 197 /* U part */ 198 nz = bdiag[i] - bdiag[i + 1]; 199 bjtmp = bj + bdiag[i + 1] + 1; 200 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 201 202 /* load in initial (unfactored row) */ 203 nz = ai[r[i] + 1] - ai[r[i]]; 204 ajtmp = aj + ai[r[i]]; 205 v = aa + bs2 * ai[r[i]]; 206 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 207 208 /* elimination */ 209 bjtmp = bj + bi[i]; 210 nzL = bi[i + 1] - bi[i]; 211 for (k = 0; k < nzL; k++) { 212 row = bjtmp[k]; 213 pc = rtmp + bs2 * row; 214 for (flg = 0, j = 0; j < bs2; j++) { 215 if (pc[j] != 0.0) { 216 flg = 1; 217 break; 218 } 219 } 220 if (flg) { 221 pv = b->a + bs2 * bdiag[row]; 222 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 223 PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork)); 224 225 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 226 pv = b->a + bs2 * (bdiag[row + 1] + 1); 227 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 228 for (j = 0; j < nz; j++) { 229 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 230 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 231 v = rtmp + bs2 * pj[j]; 232 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv)); 233 pv += bs2; 234 } 235 PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 236 } 237 } 238 239 /* finished row so stick it into b->a */ 240 /* L part */ 241 pv = b->a + bs2 * bi[i]; 242 pj = b->j + bi[i]; 243 nz = bi[i + 1] - bi[i]; 244 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 245 246 /* Mark diagonal and invert diagonal for simpler triangular solves */ 247 pv = b->a + bs2 * bdiag[i]; 248 pj = b->j + bdiag[i]; 249 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 250 PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected)); 251 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 252 253 /* U part */ 254 pj = b->j + bdiag[i + 1] + 1; 255 pv = b->a + bs2 * (bdiag[i + 1] + 1); 256 nz = bdiag[i] - bdiag[i + 1] - 1; 257 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 258 } 259 260 PetscCall(PetscFree2(rtmp, mwork)); 261 PetscCall(ISRestoreIndices(isicol, &ic)); 262 PetscCall(ISRestoreIndices(isrow, &r)); 263 264 C->ops->solve = MatSolve_SeqBAIJ_3; 265 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 266 C->assembled = PETSC_TRUE; 267 268 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */ 269 PetscFunctionReturn(PETSC_SUCCESS); 270 } 271 272 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 273 { 274 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 275 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 276 PetscInt *ajtmpold, *ajtmp, nz, row; 277 const PetscInt *diag_offset; 278 PetscInt *ai = a->i, *aj = a->j, *pj; 279 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 280 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 281 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9; 282 MatScalar *ba = b->a, *aa = a->a; 283 PetscReal shift = info->shiftamount; 284 PetscBool allowzeropivot, zeropivotdetected; 285 286 PetscFunctionBegin; 287 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */ 288 A->factortype = MAT_FACTOR_NONE; 289 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL)); 290 A->factortype = MAT_FACTOR_ILU; 291 PetscCall(PetscMalloc1(9 * (n + 1), &rtmp)); 292 allowzeropivot = PetscNot(A->erroriffailure); 293 294 for (i = 0; i < n; i++) { 295 nz = bi[i + 1] - bi[i]; 296 ajtmp = bj + bi[i]; 297 for (j = 0; j < nz; j++) { 298 x = rtmp + 9 * ajtmp[j]; 299 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0; 300 } 301 /* load in initial (unfactored row) */ 302 nz = ai[i + 1] - ai[i]; 303 ajtmpold = aj + ai[i]; 304 v = aa + 9 * ai[i]; 305 for (j = 0; j < nz; j++) { 306 x = rtmp + 9 * ajtmpold[j]; 307 x[0] = v[0]; 308 x[1] = v[1]; 309 x[2] = v[2]; 310 x[3] = v[3]; 311 x[4] = v[4]; 312 x[5] = v[5]; 313 x[6] = v[6]; 314 x[7] = v[7]; 315 x[8] = v[8]; 316 v += 9; 317 } 318 row = *ajtmp++; 319 while (row < i) { 320 pc = rtmp + 9 * row; 321 p1 = pc[0]; 322 p2 = pc[1]; 323 p3 = pc[2]; 324 p4 = pc[3]; 325 p5 = pc[4]; 326 p6 = pc[5]; 327 p7 = pc[6]; 328 p8 = pc[7]; 329 p9 = pc[8]; 330 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) { 331 pv = ba + 9 * diag_offset[row]; 332 pj = bj + diag_offset[row] + 1; 333 x1 = pv[0]; 334 x2 = pv[1]; 335 x3 = pv[2]; 336 x4 = pv[3]; 337 x5 = pv[4]; 338 x6 = pv[5]; 339 x7 = pv[6]; 340 x8 = pv[7]; 341 x9 = pv[8]; 342 pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3; 343 pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3; 344 pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3; 345 346 pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6; 347 pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6; 348 pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6; 349 350 pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9; 351 pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9; 352 pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9; 353 354 nz = bi[row + 1] - diag_offset[row] - 1; 355 pv += 9; 356 for (j = 0; j < nz; j++) { 357 x1 = pv[0]; 358 x2 = pv[1]; 359 x3 = pv[2]; 360 x4 = pv[3]; 361 x5 = pv[4]; 362 x6 = pv[5]; 363 x7 = pv[6]; 364 x8 = pv[7]; 365 x9 = pv[8]; 366 x = rtmp + 9 * pj[j]; 367 x[0] -= m1 * x1 + m4 * x2 + m7 * x3; 368 x[1] -= m2 * x1 + m5 * x2 + m8 * x3; 369 x[2] -= m3 * x1 + m6 * x2 + m9 * x3; 370 371 x[3] -= m1 * x4 + m4 * x5 + m7 * x6; 372 x[4] -= m2 * x4 + m5 * x5 + m8 * x6; 373 x[5] -= m3 * x4 + m6 * x5 + m9 * x6; 374 375 x[6] -= m1 * x7 + m4 * x8 + m7 * x9; 376 x[7] -= m2 * x7 + m5 * x8 + m8 * x9; 377 x[8] -= m3 * x7 + m6 * x8 + m9 * x9; 378 pv += 9; 379 } 380 PetscCall(PetscLogFlops(54.0 * nz + 36.0)); 381 } 382 row = *ajtmp++; 383 } 384 /* finished row so stick it into b->a */ 385 pv = ba + 9 * bi[i]; 386 pj = bj + bi[i]; 387 nz = bi[i + 1] - bi[i]; 388 for (j = 0; j < nz; j++) { 389 x = rtmp + 9 * pj[j]; 390 pv[0] = x[0]; 391 pv[1] = x[1]; 392 pv[2] = x[2]; 393 pv[3] = x[3]; 394 pv[4] = x[4]; 395 pv[5] = x[5]; 396 pv[6] = x[6]; 397 pv[7] = x[7]; 398 pv[8] = x[8]; 399 pv += 9; 400 } 401 /* invert diagonal block */ 402 w = ba + 9 * diag_offset[i]; 403 PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected)); 404 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 405 } 406 407 PetscCall(PetscFree(rtmp)); 408 409 C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace; 410 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace; 411 C->assembled = PETSC_TRUE; 412 413 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */ 414 PetscFunctionReturn(PETSC_SUCCESS); 415 } 416 417 /* 418 MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering - 419 copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace() 420 */ 421 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 422 { 423 Mat C = B; 424 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 425 PetscInt i, j, k, nz, nzL, row; 426 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 427 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 428 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 429 PetscInt flg; 430 PetscReal shift = info->shiftamount; 431 PetscBool allowzeropivot, zeropivotdetected; 432 433 PetscFunctionBegin; 434 allowzeropivot = PetscNot(A->erroriffailure); 435 436 /* generate work space needed by the factorization */ 437 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 438 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 439 440 for (i = 0; i < n; i++) { 441 /* zero rtmp */ 442 /* L part */ 443 nz = bi[i + 1] - bi[i]; 444 bjtmp = bj + bi[i]; 445 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 446 447 /* U part */ 448 nz = bdiag[i] - bdiag[i + 1]; 449 bjtmp = bj + bdiag[i + 1] + 1; 450 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 451 452 /* load in initial (unfactored row) */ 453 nz = ai[i + 1] - ai[i]; 454 ajtmp = aj + ai[i]; 455 v = aa + bs2 * ai[i]; 456 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 457 458 /* elimination */ 459 bjtmp = bj + bi[i]; 460 nzL = bi[i + 1] - bi[i]; 461 for (k = 0; k < nzL; k++) { 462 row = bjtmp[k]; 463 pc = rtmp + bs2 * row; 464 for (flg = 0, j = 0; j < bs2; j++) { 465 if (pc[j] != 0.0) { 466 flg = 1; 467 break; 468 } 469 } 470 if (flg) { 471 pv = b->a + bs2 * bdiag[row]; 472 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 473 PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork)); 474 475 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 476 pv = b->a + bs2 * (bdiag[row + 1] + 1); 477 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 478 for (j = 0; j < nz; j++) { 479 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 480 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 481 v = rtmp + bs2 * pj[j]; 482 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv)); 483 pv += bs2; 484 } 485 PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 486 } 487 } 488 489 /* finished row so stick it into b->a */ 490 /* L part */ 491 pv = b->a + bs2 * bi[i]; 492 pj = b->j + bi[i]; 493 nz = bi[i + 1] - bi[i]; 494 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 495 496 /* Mark diagonal and invert diagonal for simpler triangular solves */ 497 pv = b->a + bs2 * bdiag[i]; 498 pj = b->j + bdiag[i]; 499 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 500 PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected)); 501 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 502 503 /* U part */ 504 pv = b->a + bs2 * (bdiag[i + 1] + 1); 505 pj = b->j + bdiag[i + 1] + 1; 506 nz = bdiag[i] - bdiag[i + 1] - 1; 507 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 508 } 509 PetscCall(PetscFree2(rtmp, mwork)); 510 511 C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 512 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_3_NaturalOrdering; 513 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering; 514 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 515 C->assembled = PETSC_TRUE; 516 517 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */ 518 PetscFunctionReturn(PETSC_SUCCESS); 519 } 520