1 /* 2 Factorization code for BAIJ format. 3 */ 4 #include <../src/mat/impls/baij/seq/baij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 7 /* 8 Version for when blocks are 4 by 4 9 */ 10 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info) 11 { 12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 13 IS isrow = b->row, isicol = b->icol; 14 const PetscInt *r, *ic; 15 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 16 PetscInt *ajtmpold, *ajtmp, nz, row; 17 const PetscInt *diag_offset; 18 PetscInt idx, *ai = a->i, *aj = a->j, *pj; 19 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 20 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 21 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 22 MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 23 MatScalar m13, m14, m15, m16; 24 MatScalar *ba = b->a, *aa = a->a; 25 PetscBool pivotinblocks = b->pivotinblocks; 26 PetscReal shift = info->shiftamount; 27 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 28 29 PetscFunctionBegin; 30 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */ 31 A->factortype = MAT_FACTOR_NONE; 32 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL)); 33 A->factortype = MAT_FACTOR_ILU; 34 PetscCall(ISGetIndices(isrow, &r)); 35 PetscCall(ISGetIndices(isicol, &ic)); 36 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 37 allowzeropivot = PetscNot(A->erroriffailure); 38 39 for (i = 0; i < n; i++) { 40 nz = bi[i + 1] - bi[i]; 41 ajtmp = bj + bi[i]; 42 for (j = 0; j < nz; j++) { 43 x = rtmp + 16 * ajtmp[j]; 44 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 45 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 46 } 47 /* load in initial (unfactored row) */ 48 idx = r[i]; 49 nz = ai[idx + 1] - ai[idx]; 50 ajtmpold = aj + ai[idx]; 51 v = aa + 16 * ai[idx]; 52 for (j = 0; j < nz; j++) { 53 x = rtmp + 16 * ic[ajtmpold[j]]; 54 x[0] = v[0]; 55 x[1] = v[1]; 56 x[2] = v[2]; 57 x[3] = v[3]; 58 x[4] = v[4]; 59 x[5] = v[5]; 60 x[6] = v[6]; 61 x[7] = v[7]; 62 x[8] = v[8]; 63 x[9] = v[9]; 64 x[10] = v[10]; 65 x[11] = v[11]; 66 x[12] = v[12]; 67 x[13] = v[13]; 68 x[14] = v[14]; 69 x[15] = v[15]; 70 v += 16; 71 } 72 row = *ajtmp++; 73 while (row < i) { 74 pc = rtmp + 16 * row; 75 p1 = pc[0]; 76 p2 = pc[1]; 77 p3 = pc[2]; 78 p4 = pc[3]; 79 p5 = pc[4]; 80 p6 = pc[5]; 81 p7 = pc[6]; 82 p8 = pc[7]; 83 p9 = pc[8]; 84 p10 = pc[9]; 85 p11 = pc[10]; 86 p12 = pc[11]; 87 p13 = pc[12]; 88 p14 = pc[13]; 89 p15 = pc[14]; 90 p16 = pc[15]; 91 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 92 pv = ba + 16 * diag_offset[row]; 93 pj = bj + diag_offset[row] + 1; 94 x1 = pv[0]; 95 x2 = pv[1]; 96 x3 = pv[2]; 97 x4 = pv[3]; 98 x5 = pv[4]; 99 x6 = pv[5]; 100 x7 = pv[6]; 101 x8 = pv[7]; 102 x9 = pv[8]; 103 x10 = pv[9]; 104 x11 = pv[10]; 105 x12 = pv[11]; 106 x13 = pv[12]; 107 x14 = pv[13]; 108 x15 = pv[14]; 109 x16 = pv[15]; 110 pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 111 pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 112 pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 113 pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 114 115 pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 116 pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 117 pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 118 pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 119 120 pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 121 pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 122 pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12; 123 pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 124 125 pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 126 pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 127 pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 128 pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 129 130 nz = bi[row + 1] - diag_offset[row] - 1; 131 pv += 16; 132 for (j = 0; j < nz; j++) { 133 x1 = pv[0]; 134 x2 = pv[1]; 135 x3 = pv[2]; 136 x4 = pv[3]; 137 x5 = pv[4]; 138 x6 = pv[5]; 139 x7 = pv[6]; 140 x8 = pv[7]; 141 x9 = pv[8]; 142 x10 = pv[9]; 143 x11 = pv[10]; 144 x12 = pv[11]; 145 x13 = pv[12]; 146 x14 = pv[13]; 147 x15 = pv[14]; 148 x16 = pv[15]; 149 x = rtmp + 16 * pj[j]; 150 x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 151 x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 152 x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 153 x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 154 155 x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 156 x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 157 x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 158 x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 159 160 x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 161 x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 162 x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 163 x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 164 165 x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 166 x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 167 x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 168 x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 169 170 pv += 16; 171 } 172 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 173 } 174 row = *ajtmp++; 175 } 176 /* finished row so stick it into b->a */ 177 pv = ba + 16 * bi[i]; 178 pj = bj + bi[i]; 179 nz = bi[i + 1] - bi[i]; 180 for (j = 0; j < nz; j++) { 181 x = rtmp + 16 * pj[j]; 182 pv[0] = x[0]; 183 pv[1] = x[1]; 184 pv[2] = x[2]; 185 pv[3] = x[3]; 186 pv[4] = x[4]; 187 pv[5] = x[5]; 188 pv[6] = x[6]; 189 pv[7] = x[7]; 190 pv[8] = x[8]; 191 pv[9] = x[9]; 192 pv[10] = x[10]; 193 pv[11] = x[11]; 194 pv[12] = x[12]; 195 pv[13] = x[13]; 196 pv[14] = x[14]; 197 pv[15] = x[15]; 198 pv += 16; 199 } 200 /* invert diagonal block */ 201 w = ba + 16 * diag_offset[i]; 202 if (pivotinblocks) { 203 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 204 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 205 } else { 206 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 207 } 208 } 209 210 PetscCall(PetscFree(rtmp)); 211 PetscCall(ISRestoreIndices(isicol, &ic)); 212 PetscCall(ISRestoreIndices(isrow, &r)); 213 214 C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 215 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 216 C->assembled = PETSC_TRUE; 217 218 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 219 PetscFunctionReturn(PETSC_SUCCESS); 220 } 221 222 /* MatLUFactorNumeric_SeqBAIJ_4 - 223 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 224 PetscKernel_A_gets_A_times_B() 225 PetscKernel_A_gets_A_minus_B_times_C() 226 PetscKernel_A_gets_inverse_A() 227 */ 228 229 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info) 230 { 231 Mat C = B; 232 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 233 IS isrow = b->row, isicol = b->icol; 234 const PetscInt *r, *ic; 235 PetscInt i, j, k, nz, nzL, row; 236 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 237 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 238 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 239 PetscInt flg; 240 PetscReal shift; 241 PetscBool allowzeropivot, zeropivotdetected; 242 243 PetscFunctionBegin; 244 allowzeropivot = PetscNot(A->erroriffailure); 245 PetscCall(ISGetIndices(isrow, &r)); 246 PetscCall(ISGetIndices(isicol, &ic)); 247 248 if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) { 249 shift = 0; 250 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 251 shift = info->shiftamount; 252 } 253 254 /* generate work space needed by the factorization */ 255 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 256 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 257 258 for (i = 0; i < n; i++) { 259 /* zero rtmp */ 260 /* L part */ 261 nz = bi[i + 1] - bi[i]; 262 bjtmp = bj + bi[i]; 263 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 264 265 /* U part */ 266 nz = bdiag[i] - bdiag[i + 1]; 267 bjtmp = bj + bdiag[i + 1] + 1; 268 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 269 270 /* load in initial (unfactored row) */ 271 nz = ai[r[i] + 1] - ai[r[i]]; 272 ajtmp = aj + ai[r[i]]; 273 v = aa + bs2 * ai[r[i]]; 274 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 275 276 /* elimination */ 277 bjtmp = bj + bi[i]; 278 nzL = bi[i + 1] - bi[i]; 279 for (k = 0; k < nzL; k++) { 280 row = bjtmp[k]; 281 pc = rtmp + bs2 * row; 282 for (flg = 0, j = 0; j < bs2; j++) { 283 if (pc[j] != 0.0) { 284 flg = 1; 285 break; 286 } 287 } 288 if (flg) { 289 pv = b->a + bs2 * bdiag[row]; 290 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 291 PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork)); 292 293 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 294 pv = b->a + bs2 * (bdiag[row + 1] + 1); 295 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 296 for (j = 0; j < nz; j++) { 297 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 298 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 299 v = rtmp + bs2 * pj[j]; 300 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv)); 301 pv += bs2; 302 } 303 PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 304 } 305 } 306 307 /* finished row so stick it into b->a */ 308 /* L part */ 309 pv = b->a + bs2 * bi[i]; 310 pj = b->j + bi[i]; 311 nz = bi[i + 1] - bi[i]; 312 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 313 314 /* Mark diagonal and invert diagonal for simpler triangular solves */ 315 pv = b->a + bs2 * bdiag[i]; 316 pj = b->j + bdiag[i]; 317 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 318 PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected)); 319 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 320 321 /* U part */ 322 pv = b->a + bs2 * (bdiag[i + 1] + 1); 323 pj = b->j + bdiag[i + 1] + 1; 324 nz = bdiag[i] - bdiag[i + 1] - 1; 325 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 326 } 327 328 PetscCall(PetscFree2(rtmp, mwork)); 329 PetscCall(ISRestoreIndices(isicol, &ic)); 330 PetscCall(ISRestoreIndices(isrow, &r)); 331 332 C->ops->solve = MatSolve_SeqBAIJ_4; 333 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 334 C->assembled = PETSC_TRUE; 335 336 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */ 337 PetscFunctionReturn(PETSC_SUCCESS); 338 } 339 340 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 341 { 342 /* 343 Default Version for when blocks are 4 by 4 Using natural ordering 344 */ 345 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 346 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 347 PetscInt *ajtmpold, *ajtmp, nz, row; 348 const PetscInt *diag_offset; 349 PetscInt *ai = a->i, *aj = a->j, *pj; 350 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 351 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 352 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 353 MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 354 MatScalar m13, m14, m15, m16; 355 MatScalar *ba = b->a, *aa = a->a; 356 PetscBool pivotinblocks = b->pivotinblocks; 357 PetscReal shift = info->shiftamount; 358 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 359 360 PetscFunctionBegin; 361 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */ 362 A->factortype = MAT_FACTOR_NONE; 363 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL)); 364 A->factortype = MAT_FACTOR_ILU; 365 allowzeropivot = PetscNot(A->erroriffailure); 366 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 367 368 for (i = 0; i < n; i++) { 369 nz = bi[i + 1] - bi[i]; 370 ajtmp = bj + bi[i]; 371 for (j = 0; j < nz; j++) { 372 x = rtmp + 16 * ajtmp[j]; 373 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 374 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 375 } 376 /* load in initial (unfactored row) */ 377 nz = ai[i + 1] - ai[i]; 378 ajtmpold = aj + ai[i]; 379 v = aa + 16 * ai[i]; 380 for (j = 0; j < nz; j++) { 381 x = rtmp + 16 * ajtmpold[j]; 382 x[0] = v[0]; 383 x[1] = v[1]; 384 x[2] = v[2]; 385 x[3] = v[3]; 386 x[4] = v[4]; 387 x[5] = v[5]; 388 x[6] = v[6]; 389 x[7] = v[7]; 390 x[8] = v[8]; 391 x[9] = v[9]; 392 x[10] = v[10]; 393 x[11] = v[11]; 394 x[12] = v[12]; 395 x[13] = v[13]; 396 x[14] = v[14]; 397 x[15] = v[15]; 398 v += 16; 399 } 400 row = *ajtmp++; 401 while (row < i) { 402 pc = rtmp + 16 * row; 403 p1 = pc[0]; 404 p2 = pc[1]; 405 p3 = pc[2]; 406 p4 = pc[3]; 407 p5 = pc[4]; 408 p6 = pc[5]; 409 p7 = pc[6]; 410 p8 = pc[7]; 411 p9 = pc[8]; 412 p10 = pc[9]; 413 p11 = pc[10]; 414 p12 = pc[11]; 415 p13 = pc[12]; 416 p14 = pc[13]; 417 p15 = pc[14]; 418 p16 = pc[15]; 419 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 420 pv = ba + 16 * diag_offset[row]; 421 pj = bj + diag_offset[row] + 1; 422 x1 = pv[0]; 423 x2 = pv[1]; 424 x3 = pv[2]; 425 x4 = pv[3]; 426 x5 = pv[4]; 427 x6 = pv[5]; 428 x7 = pv[6]; 429 x8 = pv[7]; 430 x9 = pv[8]; 431 x10 = pv[9]; 432 x11 = pv[10]; 433 x12 = pv[11]; 434 x13 = pv[12]; 435 x14 = pv[13]; 436 x15 = pv[14]; 437 x16 = pv[15]; 438 pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 439 pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 440 pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 441 pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 442 443 pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 444 pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 445 pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 446 pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 447 448 pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 449 pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 450 pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12; 451 pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 452 453 pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 454 pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 455 pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 456 pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 457 nz = bi[row + 1] - diag_offset[row] - 1; 458 pv += 16; 459 for (j = 0; j < nz; j++) { 460 x1 = pv[0]; 461 x2 = pv[1]; 462 x3 = pv[2]; 463 x4 = pv[3]; 464 x5 = pv[4]; 465 x6 = pv[5]; 466 x7 = pv[6]; 467 x8 = pv[7]; 468 x9 = pv[8]; 469 x10 = pv[9]; 470 x11 = pv[10]; 471 x12 = pv[11]; 472 x13 = pv[12]; 473 x14 = pv[13]; 474 x15 = pv[14]; 475 x16 = pv[15]; 476 x = rtmp + 16 * pj[j]; 477 x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 478 x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 479 x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 480 x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 481 482 x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 483 x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 484 x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 485 x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 486 487 x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 488 x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 489 x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 490 x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 491 492 x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 493 x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 494 x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 495 x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 496 497 pv += 16; 498 } 499 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 500 } 501 row = *ajtmp++; 502 } 503 /* finished row so stick it into b->a */ 504 pv = ba + 16 * bi[i]; 505 pj = bj + bi[i]; 506 nz = bi[i + 1] - bi[i]; 507 for (j = 0; j < nz; j++) { 508 x = rtmp + 16 * pj[j]; 509 pv[0] = x[0]; 510 pv[1] = x[1]; 511 pv[2] = x[2]; 512 pv[3] = x[3]; 513 pv[4] = x[4]; 514 pv[5] = x[5]; 515 pv[6] = x[6]; 516 pv[7] = x[7]; 517 pv[8] = x[8]; 518 pv[9] = x[9]; 519 pv[10] = x[10]; 520 pv[11] = x[11]; 521 pv[12] = x[12]; 522 pv[13] = x[13]; 523 pv[14] = x[14]; 524 pv[15] = x[15]; 525 pv += 16; 526 } 527 /* invert diagonal block */ 528 w = ba + 16 * diag_offset[i]; 529 if (pivotinblocks) { 530 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 531 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 532 } else { 533 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 534 } 535 } 536 537 PetscCall(PetscFree(rtmp)); 538 539 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 540 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 541 C->assembled = PETSC_TRUE; 542 543 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 544 PetscFunctionReturn(PETSC_SUCCESS); 545 } 546 547 /* 548 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 549 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 550 */ 551 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 552 { 553 Mat C = B; 554 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 555 PetscInt i, j, k, nz, nzL, row; 556 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 557 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 558 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 559 PetscInt flg; 560 PetscReal shift; 561 PetscBool allowzeropivot, zeropivotdetected; 562 563 PetscFunctionBegin; 564 allowzeropivot = PetscNot(A->erroriffailure); 565 566 /* generate work space needed by the factorization */ 567 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 568 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 569 570 if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) { 571 shift = 0; 572 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 573 shift = info->shiftamount; 574 } 575 576 for (i = 0; i < n; i++) { 577 /* zero rtmp */ 578 /* L part */ 579 nz = bi[i + 1] - bi[i]; 580 bjtmp = bj + bi[i]; 581 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 582 583 /* U part */ 584 nz = bdiag[i] - bdiag[i + 1]; 585 bjtmp = bj + bdiag[i + 1] + 1; 586 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 587 588 /* load in initial (unfactored row) */ 589 nz = ai[i + 1] - ai[i]; 590 ajtmp = aj + ai[i]; 591 v = aa + bs2 * ai[i]; 592 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 593 594 /* elimination */ 595 bjtmp = bj + bi[i]; 596 nzL = bi[i + 1] - bi[i]; 597 for (k = 0; k < nzL; k++) { 598 row = bjtmp[k]; 599 pc = rtmp + bs2 * row; 600 for (flg = 0, j = 0; j < bs2; j++) { 601 if (pc[j] != 0.0) { 602 flg = 1; 603 break; 604 } 605 } 606 if (flg) { 607 pv = b->a + bs2 * bdiag[row]; 608 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 609 PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork)); 610 611 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 612 pv = b->a + bs2 * (bdiag[row + 1] + 1); 613 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 614 for (j = 0; j < nz; j++) { 615 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 616 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 617 v = rtmp + bs2 * pj[j]; 618 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv)); 619 pv += bs2; 620 } 621 PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 622 } 623 } 624 625 /* finished row so stick it into b->a */ 626 /* L part */ 627 pv = b->a + bs2 * bi[i]; 628 pj = b->j + bi[i]; 629 nz = bi[i + 1] - bi[i]; 630 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 631 632 /* Mark diagonal and invert diagonal for simpler triangular solves */ 633 pv = b->a + bs2 * bdiag[i]; 634 pj = b->j + bdiag[i]; 635 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 636 PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected)); 637 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 638 639 /* U part */ 640 pv = b->a + bs2 * (bdiag[i + 1] + 1); 641 pj = b->j + bdiag[i + 1] + 1; 642 nz = bdiag[i] - bdiag[i + 1] - 1; 643 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 644 } 645 PetscCall(PetscFree2(rtmp, mwork)); 646 647 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 648 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 649 C->assembled = PETSC_TRUE; 650 651 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */ 652 PetscFunctionReturn(PETSC_SUCCESS); 653 } 654