1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 8 /* 9 Version for when blocks are 3 by 3 10 */ 11 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info) { 12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 13 IS isrow = b->row, isicol = b->icol; 14 const PetscInt *r, *ic; 15 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 16 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 17 PetscInt *diag_offset = b->diag, idx, *pj; 18 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 19 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 20 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9; 21 MatScalar *ba = b->a, *aa = a->a; 22 PetscReal shift = info->shiftamount; 23 PetscBool allowzeropivot, zeropivotdetected; 24 25 PetscFunctionBegin; 26 PetscCall(ISGetIndices(isrow, &r)); 27 PetscCall(ISGetIndices(isicol, &ic)); 28 PetscCall(PetscMalloc1(9 * (n + 1), &rtmp)); 29 allowzeropivot = PetscNot(A->erroriffailure); 30 31 for (i = 0; i < n; i++) { 32 nz = bi[i + 1] - bi[i]; 33 ajtmp = bj + bi[i]; 34 for (j = 0; j < nz; j++) { 35 x = rtmp + 9 * ajtmp[j]; 36 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0; 37 } 38 /* load in initial (unfactored row) */ 39 idx = r[i]; 40 nz = ai[idx + 1] - ai[idx]; 41 ajtmpold = aj + ai[idx]; 42 v = aa + 9 * ai[idx]; 43 for (j = 0; j < nz; j++) { 44 x = rtmp + 9 * ic[ajtmpold[j]]; 45 x[0] = v[0]; 46 x[1] = v[1]; 47 x[2] = v[2]; 48 x[3] = v[3]; 49 x[4] = v[4]; 50 x[5] = v[5]; 51 x[6] = v[6]; 52 x[7] = v[7]; 53 x[8] = v[8]; 54 v += 9; 55 } 56 row = *ajtmp++; 57 while (row < i) { 58 pc = rtmp + 9 * row; 59 p1 = pc[0]; 60 p2 = pc[1]; 61 p3 = pc[2]; 62 p4 = pc[3]; 63 p5 = pc[4]; 64 p6 = pc[5]; 65 p7 = pc[6]; 66 p8 = pc[7]; 67 p9 = pc[8]; 68 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) { 69 pv = ba + 9 * diag_offset[row]; 70 pj = bj + diag_offset[row] + 1; 71 x1 = pv[0]; 72 x2 = pv[1]; 73 x3 = pv[2]; 74 x4 = pv[3]; 75 x5 = pv[4]; 76 x6 = pv[5]; 77 x7 = pv[6]; 78 x8 = pv[7]; 79 x9 = pv[8]; 80 pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3; 81 pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3; 82 pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3; 83 84 pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6; 85 pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6; 86 pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6; 87 88 pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9; 89 pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9; 90 pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9; 91 nz = bi[row + 1] - diag_offset[row] - 1; 92 pv += 9; 93 for (j = 0; j < nz; j++) { 94 x1 = pv[0]; 95 x2 = pv[1]; 96 x3 = pv[2]; 97 x4 = pv[3]; 98 x5 = pv[4]; 99 x6 = pv[5]; 100 x7 = pv[6]; 101 x8 = pv[7]; 102 x9 = pv[8]; 103 x = rtmp + 9 * pj[j]; 104 x[0] -= m1 * x1 + m4 * x2 + m7 * x3; 105 x[1] -= m2 * x1 + m5 * x2 + m8 * x3; 106 x[2] -= m3 * x1 + m6 * x2 + m9 * x3; 107 108 x[3] -= m1 * x4 + m4 * x5 + m7 * x6; 109 x[4] -= m2 * x4 + m5 * x5 + m8 * x6; 110 x[5] -= m3 * x4 + m6 * x5 + m9 * x6; 111 112 x[6] -= m1 * x7 + m4 * x8 + m7 * x9; 113 x[7] -= m2 * x7 + m5 * x8 + m8 * x9; 114 x[8] -= m3 * x7 + m6 * x8 + m9 * x9; 115 pv += 9; 116 } 117 PetscCall(PetscLogFlops(54.0 * nz + 36.0)); 118 } 119 row = *ajtmp++; 120 } 121 /* finished row so stick it into b->a */ 122 pv = ba + 9 * bi[i]; 123 pj = bj + bi[i]; 124 nz = bi[i + 1] - bi[i]; 125 for (j = 0; j < nz; j++) { 126 x = rtmp + 9 * pj[j]; 127 pv[0] = x[0]; 128 pv[1] = x[1]; 129 pv[2] = x[2]; 130 pv[3] = x[3]; 131 pv[4] = x[4]; 132 pv[5] = x[5]; 133 pv[6] = x[6]; 134 pv[7] = x[7]; 135 pv[8] = x[8]; 136 pv += 9; 137 } 138 /* invert diagonal block */ 139 w = ba + 9 * diag_offset[i]; 140 PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected)); 141 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 142 } 143 144 PetscCall(PetscFree(rtmp)); 145 PetscCall(ISRestoreIndices(isicol, &ic)); 146 PetscCall(ISRestoreIndices(isrow, &r)); 147 148 C->ops->solve = MatSolve_SeqBAIJ_3_inplace; 149 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace; 150 C->assembled = PETSC_TRUE; 151 152 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */ 153 PetscFunctionReturn(0); 154 } 155 156 /* MatLUFactorNumeric_SeqBAIJ_3 - 157 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 158 PetscKernel_A_gets_A_times_B() 159 PetscKernel_A_gets_A_minus_B_times_C() 160 PetscKernel_A_gets_inverse_A() 161 */ 162 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info) { 163 Mat C = B; 164 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 165 IS isrow = b->row, isicol = b->icol; 166 const PetscInt *r, *ic; 167 PetscInt i, j, k, nz, nzL, row; 168 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 169 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 170 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 171 PetscInt flg; 172 PetscReal shift = info->shiftamount; 173 PetscBool allowzeropivot, zeropivotdetected; 174 175 PetscFunctionBegin; 176 PetscCall(ISGetIndices(isrow, &r)); 177 PetscCall(ISGetIndices(isicol, &ic)); 178 allowzeropivot = PetscNot(A->erroriffailure); 179 180 /* generate work space needed by the factorization */ 181 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 182 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 183 184 for (i = 0; i < n; i++) { 185 /* zero rtmp */ 186 /* L part */ 187 nz = bi[i + 1] - bi[i]; 188 bjtmp = bj + bi[i]; 189 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 190 191 /* U part */ 192 nz = bdiag[i] - bdiag[i + 1]; 193 bjtmp = bj + bdiag[i + 1] + 1; 194 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 195 196 /* load in initial (unfactored row) */ 197 nz = ai[r[i] + 1] - ai[r[i]]; 198 ajtmp = aj + ai[r[i]]; 199 v = aa + bs2 * ai[r[i]]; 200 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); } 201 202 /* elimination */ 203 bjtmp = bj + bi[i]; 204 nzL = bi[i + 1] - bi[i]; 205 for (k = 0; k < nzL; k++) { 206 row = bjtmp[k]; 207 pc = rtmp + bs2 * row; 208 for (flg = 0, j = 0; j < bs2; j++) { 209 if (pc[j] != 0.0) { 210 flg = 1; 211 break; 212 } 213 } 214 if (flg) { 215 pv = b->a + bs2 * bdiag[row]; 216 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 217 PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork)); 218 219 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 220 pv = b->a + bs2 * (bdiag[row + 1] + 1); 221 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 222 for (j = 0; j < nz; j++) { 223 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 224 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 225 v = rtmp + bs2 * pj[j]; 226 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv)); 227 pv += bs2; 228 } 229 PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 230 } 231 } 232 233 /* finished row so stick it into b->a */ 234 /* L part */ 235 pv = b->a + bs2 * bi[i]; 236 pj = b->j + bi[i]; 237 nz = bi[i + 1] - bi[i]; 238 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 239 240 /* Mark diagonal and invert diagonal for simpler triangular solves */ 241 pv = b->a + bs2 * bdiag[i]; 242 pj = b->j + bdiag[i]; 243 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 244 PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected)); 245 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 246 247 /* U part */ 248 pj = b->j + bdiag[i + 1] + 1; 249 pv = b->a + bs2 * (bdiag[i + 1] + 1); 250 nz = bdiag[i] - bdiag[i + 1] - 1; 251 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 252 } 253 254 PetscCall(PetscFree2(rtmp, mwork)); 255 PetscCall(ISRestoreIndices(isicol, &ic)); 256 PetscCall(ISRestoreIndices(isrow, &r)); 257 258 C->ops->solve = MatSolve_SeqBAIJ_3; 259 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 260 C->assembled = PETSC_TRUE; 261 262 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */ 263 PetscFunctionReturn(0); 264 } 265 266 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) { 267 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 268 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 269 PetscInt *ajtmpold, *ajtmp, nz, row; 270 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 271 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 272 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 273 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9; 274 MatScalar *ba = b->a, *aa = a->a; 275 PetscReal shift = info->shiftamount; 276 PetscBool allowzeropivot, zeropivotdetected; 277 278 PetscFunctionBegin; 279 PetscCall(PetscMalloc1(9 * (n + 1), &rtmp)); 280 allowzeropivot = PetscNot(A->erroriffailure); 281 282 for (i = 0; i < n; i++) { 283 nz = bi[i + 1] - bi[i]; 284 ajtmp = bj + bi[i]; 285 for (j = 0; j < nz; j++) { 286 x = rtmp + 9 * ajtmp[j]; 287 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0; 288 } 289 /* load in initial (unfactored row) */ 290 nz = ai[i + 1] - ai[i]; 291 ajtmpold = aj + ai[i]; 292 v = aa + 9 * ai[i]; 293 for (j = 0; j < nz; j++) { 294 x = rtmp + 9 * ajtmpold[j]; 295 x[0] = v[0]; 296 x[1] = v[1]; 297 x[2] = v[2]; 298 x[3] = v[3]; 299 x[4] = v[4]; 300 x[5] = v[5]; 301 x[6] = v[6]; 302 x[7] = v[7]; 303 x[8] = v[8]; 304 v += 9; 305 } 306 row = *ajtmp++; 307 while (row < i) { 308 pc = rtmp + 9 * row; 309 p1 = pc[0]; 310 p2 = pc[1]; 311 p3 = pc[2]; 312 p4 = pc[3]; 313 p5 = pc[4]; 314 p6 = pc[5]; 315 p7 = pc[6]; 316 p8 = pc[7]; 317 p9 = pc[8]; 318 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) { 319 pv = ba + 9 * diag_offset[row]; 320 pj = bj + diag_offset[row] + 1; 321 x1 = pv[0]; 322 x2 = pv[1]; 323 x3 = pv[2]; 324 x4 = pv[3]; 325 x5 = pv[4]; 326 x6 = pv[5]; 327 x7 = pv[6]; 328 x8 = pv[7]; 329 x9 = pv[8]; 330 pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3; 331 pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3; 332 pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3; 333 334 pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6; 335 pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6; 336 pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6; 337 338 pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9; 339 pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9; 340 pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9; 341 342 nz = bi[row + 1] - diag_offset[row] - 1; 343 pv += 9; 344 for (j = 0; j < nz; j++) { 345 x1 = pv[0]; 346 x2 = pv[1]; 347 x3 = pv[2]; 348 x4 = pv[3]; 349 x5 = pv[4]; 350 x6 = pv[5]; 351 x7 = pv[6]; 352 x8 = pv[7]; 353 x9 = pv[8]; 354 x = rtmp + 9 * pj[j]; 355 x[0] -= m1 * x1 + m4 * x2 + m7 * x3; 356 x[1] -= m2 * x1 + m5 * x2 + m8 * x3; 357 x[2] -= m3 * x1 + m6 * x2 + m9 * x3; 358 359 x[3] -= m1 * x4 + m4 * x5 + m7 * x6; 360 x[4] -= m2 * x4 + m5 * x5 + m8 * x6; 361 x[5] -= m3 * x4 + m6 * x5 + m9 * x6; 362 363 x[6] -= m1 * x7 + m4 * x8 + m7 * x9; 364 x[7] -= m2 * x7 + m5 * x8 + m8 * x9; 365 x[8] -= m3 * x7 + m6 * x8 + m9 * x9; 366 pv += 9; 367 } 368 PetscCall(PetscLogFlops(54.0 * nz + 36.0)); 369 } 370 row = *ajtmp++; 371 } 372 /* finished row so stick it into b->a */ 373 pv = ba + 9 * bi[i]; 374 pj = bj + bi[i]; 375 nz = bi[i + 1] - bi[i]; 376 for (j = 0; j < nz; j++) { 377 x = rtmp + 9 * pj[j]; 378 pv[0] = x[0]; 379 pv[1] = x[1]; 380 pv[2] = x[2]; 381 pv[3] = x[3]; 382 pv[4] = x[4]; 383 pv[5] = x[5]; 384 pv[6] = x[6]; 385 pv[7] = x[7]; 386 pv[8] = x[8]; 387 pv += 9; 388 } 389 /* invert diagonal block */ 390 w = ba + 9 * diag_offset[i]; 391 PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected)); 392 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 393 } 394 395 PetscCall(PetscFree(rtmp)); 396 397 C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace; 398 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace; 399 C->assembled = PETSC_TRUE; 400 401 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */ 402 PetscFunctionReturn(0); 403 } 404 405 /* 406 MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering - 407 copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace() 408 */ 409 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) { 410 Mat C = B; 411 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 412 PetscInt i, j, k, nz, nzL, row; 413 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 414 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 415 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 416 PetscInt flg; 417 PetscReal shift = info->shiftamount; 418 PetscBool allowzeropivot, zeropivotdetected; 419 420 PetscFunctionBegin; 421 allowzeropivot = PetscNot(A->erroriffailure); 422 423 /* generate work space needed by the factorization */ 424 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 425 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 426 427 for (i = 0; i < n; i++) { 428 /* zero rtmp */ 429 /* L part */ 430 nz = bi[i + 1] - bi[i]; 431 bjtmp = bj + bi[i]; 432 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 433 434 /* U part */ 435 nz = bdiag[i] - bdiag[i + 1]; 436 bjtmp = bj + bdiag[i + 1] + 1; 437 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 438 439 /* load in initial (unfactored row) */ 440 nz = ai[i + 1] - ai[i]; 441 ajtmp = aj + ai[i]; 442 v = aa + bs2 * ai[i]; 443 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); } 444 445 /* elimination */ 446 bjtmp = bj + bi[i]; 447 nzL = bi[i + 1] - bi[i]; 448 for (k = 0; k < nzL; k++) { 449 row = bjtmp[k]; 450 pc = rtmp + bs2 * row; 451 for (flg = 0, j = 0; j < bs2; j++) { 452 if (pc[j] != 0.0) { 453 flg = 1; 454 break; 455 } 456 } 457 if (flg) { 458 pv = b->a + bs2 * bdiag[row]; 459 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 460 PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork)); 461 462 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 463 pv = b->a + bs2 * (bdiag[row + 1] + 1); 464 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 465 for (j = 0; j < nz; j++) { 466 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 467 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 468 v = rtmp + bs2 * pj[j]; 469 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv)); 470 pv += bs2; 471 } 472 PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 473 } 474 } 475 476 /* finished row so stick it into b->a */ 477 /* L part */ 478 pv = b->a + bs2 * bi[i]; 479 pj = b->j + bi[i]; 480 nz = bi[i + 1] - bi[i]; 481 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 482 483 /* Mark diagonal and invert diagonal for simpler triangular solves */ 484 pv = b->a + bs2 * bdiag[i]; 485 pj = b->j + bdiag[i]; 486 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 487 PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected)); 488 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 489 490 /* U part */ 491 pv = b->a + bs2 * (bdiag[i + 1] + 1); 492 pj = b->j + bdiag[i + 1] + 1; 493 nz = bdiag[i] - bdiag[i + 1] - 1; 494 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 495 } 496 PetscCall(PetscFree2(rtmp, mwork)); 497 498 C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 499 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_3_NaturalOrdering; 500 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering; 501 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 502 C->assembled = PETSC_TRUE; 503 504 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */ 505 PetscFunctionReturn(0); 506 } 507