1 /* 2 Factorization code for BAIJ format. 3 */ 4 #include <../src/mat/impls/baij/seq/baij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 7 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) 8 { 9 Mat C = B; 10 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 11 IS isrow = b->row, isicol = b->icol; 12 const PetscInt *r, *ic; 13 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 14 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 15 MatScalar *rtmp, *pc, *mwork, *pv; 16 MatScalar *aa = a->a, *v; 17 PetscReal shift = info->shiftamount; 18 const PetscBool allowzeropivot = PetscNot(A->erroriffailure); 19 20 PetscFunctionBegin; 21 PetscCall(ISGetIndices(isrow, &r)); 22 PetscCall(ISGetIndices(isicol, &ic)); 23 24 /* generate work space needed by the factorization */ 25 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 26 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 27 28 for (PetscInt i = 0; i < n; i++) { 29 /* zero rtmp */ 30 /* L part */ 31 PetscInt *pj; 32 PetscInt nzL, nz = bi[i + 1] - bi[i]; 33 bjtmp = bj + bi[i]; 34 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 35 36 /* U part */ 37 nz = bdiag[i] - bdiag[i + 1]; 38 bjtmp = bj + bdiag[i + 1] + 1; 39 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 40 41 /* load in initial (unfactored row) */ 42 nz = ai[r[i] + 1] - ai[r[i]]; 43 ajtmp = aj + ai[r[i]]; 44 v = aa + bs2 * ai[r[i]]; 45 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 46 47 /* elimination */ 48 bjtmp = bj + bi[i]; 49 nzL = bi[i + 1] - bi[i]; 50 for (PetscInt k = 0; k < nzL; k++) { 51 PetscBool flg = PETSC_FALSE; 52 PetscInt row = bjtmp[k]; 53 54 pc = rtmp + bs2 * row; 55 for (PetscInt j = 0; j < bs2; j++) { 56 if (pc[j] != (PetscScalar)0.0) { 57 flg = PETSC_TRUE; 58 break; 59 } 60 } 61 if (flg) { 62 pv = b->a + bs2 * bdiag[row]; 63 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 64 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 65 66 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 67 pv = b->a + bs2 * (bdiag[row + 1] + 1); 68 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 69 for (PetscInt j = 0; j < nz; j++) { 70 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 71 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 72 v = rtmp + 4 * pj[j]; 73 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 74 pv += 4; 75 } 76 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 77 } 78 } 79 80 /* finished row so stick it into b->a */ 81 /* L part */ 82 pv = b->a + bs2 * bi[i]; 83 pj = b->j + bi[i]; 84 nz = bi[i + 1] - bi[i]; 85 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 86 87 /* Mark diagonal and invert diagonal for simpler triangular solves */ 88 pv = b->a + bs2 * bdiag[i]; 89 pj = b->j + bdiag[i]; 90 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 91 { 92 PetscBool zeropivotdetected; 93 94 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 95 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 96 } 97 98 /* U part */ 99 pv = b->a + bs2 * (bdiag[i + 1] + 1); 100 pj = b->j + bdiag[i + 1] + 1; 101 nz = bdiag[i] - bdiag[i + 1] - 1; 102 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 103 } 104 105 PetscCall(PetscFree2(rtmp, mwork)); 106 PetscCall(ISRestoreIndices(isicol, &ic)); 107 PetscCall(ISRestoreIndices(isrow, &r)); 108 109 C->ops->solve = MatSolve_SeqBAIJ_2; 110 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 111 C->assembled = PETSC_TRUE; 112 113 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 114 PetscFunctionReturn(PETSC_SUCCESS); 115 } 116 117 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 118 { 119 Mat C = B; 120 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 121 PetscInt i, j, k, nz, nzL, row, *pj; 122 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 123 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 124 MatScalar *rtmp, *pc, *mwork, *pv; 125 MatScalar *aa = a->a, *v; 126 PetscInt flg; 127 PetscReal shift = info->shiftamount; 128 PetscBool allowzeropivot, zeropivotdetected; 129 130 PetscFunctionBegin; 131 allowzeropivot = PetscNot(A->erroriffailure); 132 133 /* generate work space needed by the factorization */ 134 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 135 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 136 137 for (i = 0; i < n; i++) { 138 /* zero rtmp */ 139 /* L part */ 140 nz = bi[i + 1] - bi[i]; 141 bjtmp = bj + bi[i]; 142 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 143 144 /* U part */ 145 nz = bdiag[i] - bdiag[i + 1]; 146 bjtmp = bj + bdiag[i + 1] + 1; 147 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 148 149 /* load in initial (unfactored row) */ 150 nz = ai[i + 1] - ai[i]; 151 ajtmp = aj + ai[i]; 152 v = aa + bs2 * ai[i]; 153 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 154 155 /* elimination */ 156 bjtmp = bj + bi[i]; 157 nzL = bi[i + 1] - bi[i]; 158 for (k = 0; k < nzL; k++) { 159 row = bjtmp[k]; 160 pc = rtmp + bs2 * row; 161 for (flg = 0, j = 0; j < bs2; j++) { 162 if (pc[j] != (PetscScalar)0.0) { 163 flg = 1; 164 break; 165 } 166 } 167 if (flg) { 168 pv = b->a + bs2 * bdiag[row]; 169 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 170 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 171 172 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 173 pv = b->a + bs2 * (bdiag[row + 1] + 1); 174 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 175 for (j = 0; j < nz; j++) { 176 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 177 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 178 v = rtmp + 4 * pj[j]; 179 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 180 pv += 4; 181 } 182 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 183 } 184 } 185 186 /* finished row so stick it into b->a */ 187 /* L part */ 188 pv = b->a + bs2 * bi[i]; 189 pj = b->j + bi[i]; 190 nz = bi[i + 1] - bi[i]; 191 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 192 193 /* Mark diagonal and invert diagonal for simpler triangular solves */ 194 pv = b->a + bs2 * bdiag[i]; 195 pj = b->j + bdiag[i]; 196 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 197 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 198 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 199 200 /* U part */ 201 /* 202 pv = b->a + bs2*bi[2*n-i]; 203 pj = b->j + bi[2*n-i]; 204 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 205 */ 206 pv = b->a + bs2 * (bdiag[i + 1] + 1); 207 pj = b->j + bdiag[i + 1] + 1; 208 nz = bdiag[i] - bdiag[i + 1] - 1; 209 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 210 } 211 PetscCall(PetscFree2(rtmp, mwork)); 212 213 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 214 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering; 215 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering; 216 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 217 C->assembled = PETSC_TRUE; 218 219 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 220 PetscFunctionReturn(PETSC_SUCCESS); 221 } 222 223 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) 224 { 225 Mat C = B; 226 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 227 IS isrow = b->row, isicol = b->icol; 228 const PetscInt *r, *ic; 229 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 230 PetscInt *ajtmpold, *ajtmp, nz, row; 231 PetscInt idx, *ai = a->i, *aj = a->j, *pj; 232 const PetscInt *diag_offset; 233 MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4; 234 MatScalar p1, p2, p3, p4; 235 MatScalar *ba = b->a, *aa = a->a; 236 PetscReal shift = info->shiftamount; 237 PetscBool allowzeropivot, zeropivotdetected; 238 239 PetscFunctionBegin; 240 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */ 241 A->factortype = MAT_FACTOR_NONE; 242 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL)); 243 A->factortype = MAT_FACTOR_ILU; 244 allowzeropivot = PetscNot(A->erroriffailure); 245 PetscCall(ISGetIndices(isrow, &r)); 246 PetscCall(ISGetIndices(isicol, &ic)); 247 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 248 249 for (i = 0; i < n; i++) { 250 nz = bi[i + 1] - bi[i]; 251 ajtmp = bj + bi[i]; 252 for (j = 0; j < nz; j++) { 253 x = rtmp + 4 * ajtmp[j]; 254 x[0] = x[1] = x[2] = x[3] = 0.0; 255 } 256 /* load in initial (unfactored row) */ 257 idx = r[i]; 258 nz = ai[idx + 1] - ai[idx]; 259 ajtmpold = aj + ai[idx]; 260 v = aa + 4 * ai[idx]; 261 for (j = 0; j < nz; j++) { 262 x = rtmp + 4 * ic[ajtmpold[j]]; 263 x[0] = v[0]; 264 x[1] = v[1]; 265 x[2] = v[2]; 266 x[3] = v[3]; 267 v += 4; 268 } 269 row = *ajtmp++; 270 while (row < i) { 271 pc = rtmp + 4 * row; 272 p1 = pc[0]; 273 p2 = pc[1]; 274 p3 = pc[2]; 275 p4 = pc[3]; 276 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 277 pv = ba + 4 * diag_offset[row]; 278 pj = bj + diag_offset[row] + 1; 279 x1 = pv[0]; 280 x2 = pv[1]; 281 x3 = pv[2]; 282 x4 = pv[3]; 283 pc[0] = m1 = p1 * x1 + p3 * x2; 284 pc[1] = m2 = p2 * x1 + p4 * x2; 285 pc[2] = m3 = p1 * x3 + p3 * x4; 286 pc[3] = m4 = p2 * x3 + p4 * x4; 287 nz = bi[row + 1] - diag_offset[row] - 1; 288 pv += 4; 289 for (j = 0; j < nz; j++) { 290 x1 = pv[0]; 291 x2 = pv[1]; 292 x3 = pv[2]; 293 x4 = pv[3]; 294 x = rtmp + 4 * pj[j]; 295 x[0] -= m1 * x1 + m3 * x2; 296 x[1] -= m2 * x1 + m4 * x2; 297 x[2] -= m1 * x3 + m3 * x4; 298 x[3] -= m2 * x3 + m4 * x4; 299 pv += 4; 300 } 301 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 302 } 303 row = *ajtmp++; 304 } 305 /* finished row so stick it into b->a */ 306 pv = ba + 4 * bi[i]; 307 pj = bj + bi[i]; 308 nz = bi[i + 1] - bi[i]; 309 for (j = 0; j < nz; j++) { 310 x = rtmp + 4 * pj[j]; 311 pv[0] = x[0]; 312 pv[1] = x[1]; 313 pv[2] = x[2]; 314 pv[3] = x[3]; 315 pv += 4; 316 } 317 /* invert diagonal block */ 318 w = ba + 4 * diag_offset[i]; 319 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 320 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 321 } 322 323 PetscCall(PetscFree(rtmp)); 324 PetscCall(ISRestoreIndices(isicol, &ic)); 325 PetscCall(ISRestoreIndices(isrow, &r)); 326 327 C->ops->solve = MatSolve_SeqBAIJ_2_inplace; 328 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace; 329 C->assembled = PETSC_TRUE; 330 331 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 332 PetscFunctionReturn(PETSC_SUCCESS); 333 } 334 /* 335 Version for when blocks are 2 by 2 Using natural ordering 336 */ 337 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 338 { 339 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 340 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 341 PetscInt *ajtmpold, *ajtmp, nz, row; 342 PetscInt *ai = a->i, *aj = a->j, *pj; 343 const PetscInt *diag_offset; 344 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 345 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4; 346 MatScalar *ba = b->a, *aa = a->a; 347 PetscReal shift = info->shiftamount; 348 PetscBool allowzeropivot, zeropivotdetected; 349 350 PetscFunctionBegin; 351 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */ 352 A->factortype = MAT_FACTOR_NONE; 353 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL)); 354 A->factortype = MAT_FACTOR_ILU; 355 allowzeropivot = PetscNot(A->erroriffailure); 356 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 357 for (i = 0; i < n; i++) { 358 nz = bi[i + 1] - bi[i]; 359 ajtmp = bj + bi[i]; 360 for (j = 0; j < nz; j++) { 361 x = rtmp + 4 * ajtmp[j]; 362 x[0] = x[1] = x[2] = x[3] = 0.0; 363 } 364 /* load in initial (unfactored row) */ 365 nz = ai[i + 1] - ai[i]; 366 ajtmpold = aj + ai[i]; 367 v = aa + 4 * ai[i]; 368 for (j = 0; j < nz; j++) { 369 x = rtmp + 4 * ajtmpold[j]; 370 x[0] = v[0]; 371 x[1] = v[1]; 372 x[2] = v[2]; 373 x[3] = v[3]; 374 v += 4; 375 } 376 row = *ajtmp++; 377 while (row < i) { 378 pc = rtmp + 4 * row; 379 p1 = pc[0]; 380 p2 = pc[1]; 381 p3 = pc[2]; 382 p4 = pc[3]; 383 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 384 pv = ba + 4 * diag_offset[row]; 385 pj = bj + diag_offset[row] + 1; 386 x1 = pv[0]; 387 x2 = pv[1]; 388 x3 = pv[2]; 389 x4 = pv[3]; 390 pc[0] = m1 = p1 * x1 + p3 * x2; 391 pc[1] = m2 = p2 * x1 + p4 * x2; 392 pc[2] = m3 = p1 * x3 + p3 * x4; 393 pc[3] = m4 = p2 * x3 + p4 * x4; 394 nz = bi[row + 1] - diag_offset[row] - 1; 395 pv += 4; 396 for (j = 0; j < nz; j++) { 397 x1 = pv[0]; 398 x2 = pv[1]; 399 x3 = pv[2]; 400 x4 = pv[3]; 401 x = rtmp + 4 * pj[j]; 402 x[0] -= m1 * x1 + m3 * x2; 403 x[1] -= m2 * x1 + m4 * x2; 404 x[2] -= m1 * x3 + m3 * x4; 405 x[3] -= m2 * x3 + m4 * x4; 406 pv += 4; 407 } 408 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 409 } 410 row = *ajtmp++; 411 } 412 /* finished row so stick it into b->a */ 413 pv = ba + 4 * bi[i]; 414 pj = bj + bi[i]; 415 nz = bi[i + 1] - bi[i]; 416 for (j = 0; j < nz; j++) { 417 x = rtmp + 4 * pj[j]; 418 pv[0] = x[0]; 419 pv[1] = x[1]; 420 pv[2] = x[2]; 421 pv[3] = x[3]; 422 /* 423 printf(" col %d:",pj[j]); 424 PetscInt j1; 425 for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1)); 426 printf("\n"); 427 */ 428 pv += 4; 429 } 430 /* invert diagonal block */ 431 w = ba + 4 * diag_offset[i]; 432 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 433 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 434 } 435 436 PetscCall(PetscFree(rtmp)); 437 438 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace; 439 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace; 440 C->assembled = PETSC_TRUE; 441 442 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 443 PetscFunctionReturn(PETSC_SUCCESS); 444 } 445 446 /* 447 Version for when blocks are 1 by 1. 448 */ 449 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) 450 { 451 Mat C = B; 452 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 453 IS isrow = b->row, isicol = b->icol; 454 const PetscInt *r, *ic, *ics; 455 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 456 PetscInt i, j, k, nz, nzL, row, *pj; 457 const PetscInt *ajtmp, *bjtmp; 458 MatScalar *rtmp, *pc, multiplier, *pv; 459 const MatScalar *aa = a->a, *v; 460 PetscBool row_identity, col_identity; 461 FactorShiftCtx sctx; 462 const PetscInt *ddiag; 463 PetscReal rs; 464 MatScalar d; 465 466 PetscFunctionBegin; 467 /* MatPivotSetUp(): initialize shift context sctx */ 468 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 469 470 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 471 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &ddiag, NULL)); 472 sctx.shift_top = info->zeropivot; 473 for (i = 0; i < n; i++) { 474 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 475 d = (aa)[ddiag[i]]; 476 rs = -PetscAbsScalar(d) - PetscRealPart(d); 477 v = aa + ai[i]; 478 nz = ai[i + 1] - ai[i]; 479 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 480 if (rs > sctx.shift_top) sctx.shift_top = rs; 481 } 482 sctx.shift_top *= 1.1; 483 sctx.nshift_max = 5; 484 sctx.shift_lo = 0.; 485 sctx.shift_hi = 1.; 486 } 487 488 PetscCall(ISGetIndices(isrow, &r)); 489 PetscCall(ISGetIndices(isicol, &ic)); 490 PetscCall(PetscMalloc1(n + 1, &rtmp)); 491 ics = ic; 492 493 do { 494 sctx.newshift = PETSC_FALSE; 495 for (i = 0; i < n; i++) { 496 /* zero rtmp */ 497 /* L part */ 498 nz = bi[i + 1] - bi[i]; 499 bjtmp = bj + bi[i]; 500 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 501 502 /* U part */ 503 nz = bdiag[i] - bdiag[i + 1]; 504 bjtmp = bj + bdiag[i + 1] + 1; 505 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 506 507 /* load in initial (unfactored row) */ 508 nz = ai[r[i] + 1] - ai[r[i]]; 509 ajtmp = aj + ai[r[i]]; 510 v = aa + ai[r[i]]; 511 for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j]; 512 513 /* ZeropivotApply() */ 514 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 515 516 /* elimination */ 517 bjtmp = bj + bi[i]; 518 row = *bjtmp++; 519 nzL = bi[i + 1] - bi[i]; 520 for (k = 0; k < nzL; k++) { 521 pc = rtmp + row; 522 if (*pc != (PetscScalar)0.0) { 523 pv = b->a + bdiag[row]; 524 multiplier = *pc * (*pv); 525 *pc = multiplier; 526 527 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 528 pv = b->a + bdiag[row + 1] + 1; 529 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 530 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 531 PetscCall(PetscLogFlops(2.0 * nz)); 532 } 533 row = *bjtmp++; 534 } 535 536 /* finished row so stick it into b->a */ 537 rs = 0.0; 538 /* L part */ 539 pv = b->a + bi[i]; 540 pj = b->j + bi[i]; 541 nz = bi[i + 1] - bi[i]; 542 for (j = 0; j < nz; j++) { 543 pv[j] = rtmp[pj[j]]; 544 rs += PetscAbsScalar(pv[j]); 545 } 546 547 /* U part */ 548 pv = b->a + bdiag[i + 1] + 1; 549 pj = b->j + bdiag[i + 1] + 1; 550 nz = bdiag[i] - bdiag[i + 1] - 1; 551 for (j = 0; j < nz; j++) { 552 pv[j] = rtmp[pj[j]]; 553 rs += PetscAbsScalar(pv[j]); 554 } 555 556 sctx.rs = rs; 557 sctx.pv = rtmp[i]; 558 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 559 if (sctx.newshift) break; /* break for-loop */ 560 rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */ 561 562 /* Mark diagonal and invert diagonal for simpler triangular solves */ 563 pv = b->a + bdiag[i]; 564 *pv = (PetscScalar)1.0 / rtmp[i]; 565 566 } /* endof for (i=0; i<n; i++) { */ 567 568 /* MatPivotRefine() */ 569 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 570 /* 571 * if no shift in this attempt & shifting & started shifting & can refine, 572 * then try lower shift 573 */ 574 sctx.shift_hi = sctx.shift_fraction; 575 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 576 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 577 sctx.newshift = PETSC_TRUE; 578 sctx.nshift++; 579 } 580 } while (sctx.newshift); 581 582 PetscCall(PetscFree(rtmp)); 583 PetscCall(ISRestoreIndices(isicol, &ic)); 584 PetscCall(ISRestoreIndices(isrow, &r)); 585 586 PetscCall(ISIdentity(isrow, &row_identity)); 587 PetscCall(ISIdentity(isicol, &col_identity)); 588 if (row_identity && col_identity) { 589 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 590 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering; 591 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering; 592 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 593 } else { 594 C->ops->solve = MatSolve_SeqBAIJ_1; 595 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 596 } 597 C->assembled = PETSC_TRUE; 598 PetscCall(PetscLogFlops(C->cmap->n)); 599 600 /* MatShiftView(A,info,&sctx) */ 601 if (sctx.nshift) { 602 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 603 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 604 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 605 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 606 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 607 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 608 } 609 } 610 PetscFunctionReturn(PETSC_SUCCESS); 611 } 612 613 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) 614 { 615 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 616 IS isrow = b->row, isicol = b->icol; 617 const PetscInt *r, *ic; 618 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 619 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 620 PetscInt diag, *pj; 621 const PetscInt *diag_offset; 622 MatScalar *pv, *v, *rtmp, multiplier, *pc; 623 MatScalar *ba = b->a, *aa = a->a; 624 PetscBool row_identity, col_identity; 625 626 PetscFunctionBegin; 627 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */ 628 A->factortype = MAT_FACTOR_NONE; 629 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL)); 630 A->factortype = MAT_FACTOR_ILU; 631 PetscCall(ISGetIndices(isrow, &r)); 632 PetscCall(ISGetIndices(isicol, &ic)); 633 PetscCall(PetscMalloc1(n + 1, &rtmp)); 634 635 for (i = 0; i < n; i++) { 636 nz = bi[i + 1] - bi[i]; 637 ajtmp = bj + bi[i]; 638 for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0; 639 640 /* load in initial (unfactored row) */ 641 nz = ai[r[i] + 1] - ai[r[i]]; 642 ajtmpold = aj + ai[r[i]]; 643 v = aa + ai[r[i]]; 644 for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 645 646 row = *ajtmp++; 647 while (row < i) { 648 pc = rtmp + row; 649 if (*pc != 0.0) { 650 pv = ba + diag_offset[row]; 651 pj = bj + diag_offset[row] + 1; 652 multiplier = *pc * *pv++; 653 *pc = multiplier; 654 nz = bi[row + 1] - diag_offset[row] - 1; 655 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 656 PetscCall(PetscLogFlops(1.0 + 2.0 * nz)); 657 } 658 row = *ajtmp++; 659 } 660 /* finished row so stick it into b->a */ 661 pv = ba + bi[i]; 662 pj = bj + bi[i]; 663 nz = bi[i + 1] - bi[i]; 664 for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]]; 665 diag = diag_offset[i] - bi[i]; 666 /* check pivot entry for current row */ 667 PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 668 pv[diag] = 1.0 / pv[diag]; 669 } 670 671 PetscCall(PetscFree(rtmp)); 672 PetscCall(ISRestoreIndices(isicol, &ic)); 673 PetscCall(ISRestoreIndices(isrow, &r)); 674 PetscCall(ISIdentity(isrow, &row_identity)); 675 PetscCall(ISIdentity(isicol, &col_identity)); 676 if (row_identity && col_identity) { 677 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace; 678 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace; 679 } else { 680 C->ops->solve = MatSolve_SeqBAIJ_1_inplace; 681 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace; 682 } 683 C->assembled = PETSC_TRUE; 684 PetscCall(PetscLogFlops(C->cmap->n)); 685 PetscFunctionReturn(PETSC_SUCCESS); 686 } 687 688 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) 689 { 690 PetscFunctionBegin; 691 *type = MATSOLVERPETSC; 692 PetscFunctionReturn(PETSC_SUCCESS); 693 } 694 695 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) 696 { 697 PetscInt n = A->rmap->n; 698 699 PetscFunctionBegin; 700 #if defined(PETSC_USE_COMPLEX) 701 PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported"); 702 #endif 703 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 704 PetscCall(MatSetSizes(*B, n, n, n, n)); 705 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 706 PetscCall(MatSetType(*B, MATSEQBAIJ)); 707 708 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ; 709 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ; 710 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 711 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 712 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 713 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 714 PetscCall(MatSetType(*B, MATSEQSBAIJ)); 715 PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL)); 716 717 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ; 718 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ; 719 /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */ 720 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 721 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 722 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported"); 723 (*B)->factortype = ftype; 724 (*B)->canuseordering = PETSC_TRUE; 725 726 PetscCall(PetscFree((*B)->solvertype)); 727 PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype)); 728 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc)); 729 PetscFunctionReturn(PETSC_SUCCESS); 730 } 731 732 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) 733 { 734 Mat C; 735 736 PetscFunctionBegin; 737 PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C)); 738 PetscCall(MatLUFactorSymbolic(C, A, row, col, info)); 739 PetscCall(MatLUFactorNumeric(C, A, info)); 740 741 A->ops->solve = C->ops->solve; 742 A->ops->solvetranspose = C->ops->solvetranspose; 743 744 PetscCall(MatHeaderMerge(A, &C)); 745 PetscFunctionReturn(PETSC_SUCCESS); 746 } 747 748 #include <../src/mat/impls/sbaij/seq/sbaij.h> 749 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) 750 { 751 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 752 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 753 IS ip = b->row; 754 const PetscInt *rip; 755 PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol; 756 PetscInt *ai = a->i, *aj = a->j; 757 PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz; 758 MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi; 759 PetscReal rs; 760 FactorShiftCtx sctx; 761 762 PetscFunctionBegin; 763 if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */ 764 if (!a->sbaijMat) { 765 A->symmetric = PETSC_BOOL3_TRUE; 766 if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE; 767 PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 768 } 769 PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info)); 770 PetscCall(MatDestroy(&a->sbaijMat)); 771 PetscFunctionReturn(PETSC_SUCCESS); 772 } 773 774 /* MatPivotSetUp(): initialize shift context sctx */ 775 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 776 777 PetscCall(ISGetIndices(ip, &rip)); 778 PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl)); 779 780 sctx.shift_amount = 0.; 781 sctx.nshift = 0; 782 do { 783 sctx.newshift = PETSC_FALSE; 784 for (i = 0; i < mbs; i++) { 785 rtmp[i] = 0.0; 786 jl[i] = mbs; 787 il[0] = 0; 788 } 789 790 for (k = 0; k < mbs; k++) { 791 bval = ba + bi[k]; 792 /* initialize k-th row by the perm[k]-th row of A */ 793 jmin = ai[rip[k]]; 794 jmax = ai[rip[k] + 1]; 795 for (j = jmin; j < jmax; j++) { 796 col = rip[aj[j]]; 797 if (col >= k) { /* only take upper triangular entry */ 798 rtmp[col] = aa[j]; 799 *bval++ = 0.0; /* for in-place factorization */ 800 } 801 } 802 803 /* shift the diagonal of the matrix */ 804 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 805 806 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 807 dk = rtmp[k]; 808 i = jl[k]; /* first row to be added to k_th row */ 809 810 while (i < k) { 811 nexti = jl[i]; /* next row to be added to k_th row */ 812 813 /* compute multiplier, update diag(k) and U(i,k) */ 814 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 815 uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */ 816 dk += uikdi * ba[ili]; 817 ba[ili] = uikdi; /* -U(i,k) */ 818 819 /* add multiple of row i to k-th row */ 820 jmin = ili + 1; 821 jmax = bi[i + 1]; 822 if (jmin < jmax) { 823 for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j]; 824 /* update il and jl for row i */ 825 il[i] = jmin; 826 j = bj[jmin]; 827 jl[i] = jl[j]; 828 jl[j] = i; 829 } 830 i = nexti; 831 } 832 833 /* shift the diagonals when zero pivot is detected */ 834 /* compute rs=sum of abs(off-diagonal) */ 835 rs = 0.0; 836 jmin = bi[k] + 1; 837 nz = bi[k + 1] - jmin; 838 if (nz) { 839 bcol = bj + jmin; 840 while (nz--) { 841 rs += PetscAbsScalar(rtmp[*bcol]); 842 bcol++; 843 } 844 } 845 846 sctx.rs = rs; 847 sctx.pv = dk; 848 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 849 if (sctx.newshift) break; 850 dk = sctx.pv; 851 852 /* copy data into U(k,:) */ 853 ba[bi[k]] = 1.0 / dk; /* U(k,k) */ 854 jmin = bi[k] + 1; 855 jmax = bi[k + 1]; 856 if (jmin < jmax) { 857 for (j = jmin; j < jmax; j++) { 858 col = bj[j]; 859 ba[j] = rtmp[col]; 860 rtmp[col] = 0.0; 861 } 862 /* add the k-th row into il and jl */ 863 il[k] = jmin; 864 i = bj[jmin]; 865 jl[k] = jl[i]; 866 jl[i] = k; 867 } 868 } 869 } while (sctx.newshift); 870 PetscCall(PetscFree3(rtmp, il, jl)); 871 872 PetscCall(ISRestoreIndices(ip, &rip)); 873 874 C->assembled = PETSC_TRUE; 875 C->preallocated = PETSC_TRUE; 876 877 PetscCall(PetscLogFlops(C->rmap->N)); 878 if (sctx.nshift) { 879 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 880 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 881 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 882 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 883 } 884 } 885 PetscFunctionReturn(PETSC_SUCCESS); 886 } 887 888 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) 889 { 890 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 891 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 892 PetscInt i, j, am = a->mbs; 893 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 894 PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz; 895 MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval; 896 PetscReal rs; 897 FactorShiftCtx sctx; 898 899 PetscFunctionBegin; 900 /* MatPivotSetUp(): initialize shift context sctx */ 901 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 902 903 PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl)); 904 905 do { 906 sctx.newshift = PETSC_FALSE; 907 for (i = 0; i < am; i++) { 908 rtmp[i] = 0.0; 909 jl[i] = am; 910 il[0] = 0; 911 } 912 913 for (k = 0; k < am; k++) { 914 /* initialize k-th row with elements nonzero in row perm(k) of A */ 915 nz = ai[k + 1] - ai[k]; 916 acol = aj + ai[k]; 917 aval = aa + ai[k]; 918 bval = ba + bi[k]; 919 while (nz--) { 920 if (*acol < k) { /* skip lower triangular entries */ 921 acol++; 922 aval++; 923 } else { 924 rtmp[*acol++] = *aval++; 925 *bval++ = 0.0; /* for in-place factorization */ 926 } 927 } 928 929 /* shift the diagonal of the matrix */ 930 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 931 932 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 933 dk = rtmp[k]; 934 i = jl[k]; /* first row to be added to k_th row */ 935 936 while (i < k) { 937 nexti = jl[i]; /* next row to be added to k_th row */ 938 /* compute multiplier, update D(k) and U(i,k) */ 939 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 940 uikdi = -ba[ili] * ba[bi[i]]; 941 dk += uikdi * ba[ili]; 942 ba[ili] = uikdi; /* -U(i,k) */ 943 944 /* add multiple of row i to k-th row ... */ 945 jmin = ili + 1; 946 nz = bi[i + 1] - jmin; 947 if (nz > 0) { 948 bcol = bj + jmin; 949 bval = ba + jmin; 950 while (nz--) rtmp[*bcol++] += uikdi * (*bval++); 951 /* update il and jl for i-th row */ 952 il[i] = jmin; 953 j = bj[jmin]; 954 jl[i] = jl[j]; 955 jl[j] = i; 956 } 957 i = nexti; 958 } 959 960 /* shift the diagonals when zero pivot is detected */ 961 /* compute rs=sum of abs(off-diagonal) */ 962 rs = 0.0; 963 jmin = bi[k] + 1; 964 nz = bi[k + 1] - jmin; 965 if (nz) { 966 bcol = bj + jmin; 967 while (nz--) { 968 rs += PetscAbsScalar(rtmp[*bcol]); 969 bcol++; 970 } 971 } 972 973 sctx.rs = rs; 974 sctx.pv = dk; 975 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 976 if (sctx.newshift) break; /* sctx.shift_amount is updated */ 977 dk = sctx.pv; 978 979 /* copy data into U(k,:) */ 980 ba[bi[k]] = 1.0 / dk; 981 jmin = bi[k] + 1; 982 nz = bi[k + 1] - jmin; 983 if (nz) { 984 bcol = bj + jmin; 985 bval = ba + jmin; 986 while (nz--) { 987 *bval++ = rtmp[*bcol]; 988 rtmp[*bcol++] = 0.0; 989 } 990 /* add k-th row into il and jl */ 991 il[k] = jmin; 992 i = bj[jmin]; 993 jl[k] = jl[i]; 994 jl[i] = k; 995 } 996 } 997 } while (sctx.newshift); 998 PetscCall(PetscFree3(rtmp, il, jl)); 999 1000 C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1001 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1002 C->assembled = PETSC_TRUE; 1003 C->preallocated = PETSC_TRUE; 1004 1005 PetscCall(PetscLogFlops(C->rmap->N)); 1006 if (sctx.nshift) { 1007 if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1008 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1009 } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1010 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1011 } 1012 } 1013 PetscFunctionReturn(PETSC_SUCCESS); 1014 } 1015 1016 #include <petscbt.h> 1017 #include <../src/mat/utils/freespace.h> 1018 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1019 { 1020 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1021 Mat_SeqSBAIJ *b; 1022 Mat B; 1023 PetscBool perm_identity; 1024 PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui; 1025 const PetscInt *rip; 1026 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 1027 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 1028 PetscReal fill = info->fill, levels = info->levels; 1029 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1030 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 1031 PetscBT lnkbt; 1032 PetscBool diagDense; 1033 const PetscInt *adiag; 1034 1035 PetscFunctionBegin; 1036 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense)); 1037 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry"); 1038 1039 if (bs > 1) { 1040 if (!a->sbaijMat) { 1041 A->symmetric = PETSC_BOOL3_TRUE; 1042 if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE; 1043 PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1044 } 1045 fact->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1046 1047 PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info)); 1048 PetscFunctionReturn(PETSC_SUCCESS); 1049 } 1050 1051 PetscCall(ISIdentity(perm, &perm_identity)); 1052 PetscCall(ISGetIndices(perm, &rip)); 1053 1054 /* special case that simply copies fill pattern */ 1055 if (!levels && perm_identity) { 1056 PetscCall(PetscMalloc1(am + 1, &ui)); 1057 for (i = 0; i < am; i++) ui[i] = ai[i + 1] - adiag[i]; /* ui: rowlengths - changes when !perm_identity */ 1058 B = fact; 1059 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui)); 1060 1061 b = (Mat_SeqSBAIJ *)B->data; 1062 uj = b->j; 1063 for (i = 0; i < am; i++) { 1064 aj = a->j + adiag[i]; 1065 for (j = 0; j < ui[i]; j++) *uj++ = *aj++; 1066 b->ilen[i] = ui[i]; 1067 } 1068 PetscCall(PetscFree(ui)); 1069 1070 B->factortype = MAT_FACTOR_NONE; 1071 1072 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1073 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1074 B->factortype = MAT_FACTOR_ICC; 1075 1076 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1077 PetscFunctionReturn(PETSC_SUCCESS); 1078 } 1079 1080 /* initialization */ 1081 PetscCall(PetscMalloc1(am + 1, &ui)); 1082 ui[0] = 0; 1083 PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl)); 1084 1085 /* jl: linked list for storing indices of the pivot rows 1086 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 1087 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 1088 for (i = 0; i < am; i++) { 1089 jl[i] = am; 1090 il[i] = 0; 1091 } 1092 1093 /* create and initialize a linked list for storing column indices of the active row k */ 1094 nlnk = am + 1; 1095 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 1096 1097 /* initial FreeSpace size is fill*(ai[am]+am)/2 */ 1098 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space)); 1099 1100 current_space = free_space; 1101 1102 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl)); 1103 current_space_lvl = free_space_lvl; 1104 1105 for (k = 0; k < am; k++) { /* for each active row k */ 1106 /* initialize lnk by the column indices of row rip[k] of A */ 1107 nzk = 0; 1108 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1109 ncols_upper = 0; 1110 cols = cols_lvl + am; 1111 for (j = 0; j < ncols; j++) { 1112 i = rip[*(aj + ai[rip[k]] + j)]; 1113 if (i >= k) { /* only take upper triangular entry */ 1114 cols[ncols_upper] = i; 1115 cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */ 1116 ncols_upper++; 1117 } 1118 } 1119 PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1120 nzk += nlnk; 1121 1122 /* update lnk by computing fill-in for each pivot row to be merged in */ 1123 prow = jl[k]; /* 1st pivot row */ 1124 1125 while (prow < k) { 1126 nextprow = jl[prow]; 1127 1128 /* merge prow into k-th row */ 1129 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 1130 jmax = ui[prow + 1]; 1131 ncols = jmax - jmin; 1132 i = jmin - ui[prow]; 1133 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 1134 for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j); 1135 PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1136 nzk += nlnk; 1137 1138 /* update il and jl for prow */ 1139 if (jmin < jmax) { 1140 il[prow] = jmin; 1141 1142 j = *cols; 1143 jl[prow] = jl[j]; 1144 jl[j] = prow; 1145 } 1146 prow = nextprow; 1147 } 1148 1149 /* if free space is not available, make more free space */ 1150 if (current_space->local_remaining < nzk) { 1151 i = am - k + 1; /* num of unfactored rows */ 1152 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1153 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1154 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 1155 reallocs++; 1156 } 1157 1158 /* copy data into free_space and free_space_lvl, then initialize lnk */ 1159 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 1160 1161 /* add the k-th row into il and jl */ 1162 if (nzk - 1 > 0) { 1163 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 1164 jl[k] = jl[i]; 1165 jl[i] = k; 1166 il[k] = ui[k] + 1; 1167 } 1168 uj_ptr[k] = current_space->array; 1169 uj_lvl_ptr[k] = current_space_lvl->array; 1170 1171 current_space->array += nzk; 1172 current_space->local_used += nzk; 1173 current_space->local_remaining -= nzk; 1174 1175 current_space_lvl->array += nzk; 1176 current_space_lvl->local_used += nzk; 1177 current_space_lvl->local_remaining -= nzk; 1178 1179 ui[k + 1] = ui[k] + nzk; 1180 } 1181 1182 PetscCall(ISRestoreIndices(perm, &rip)); 1183 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 1184 PetscCall(PetscFree(cols_lvl)); 1185 1186 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1187 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 1188 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1189 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 1190 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 1191 1192 /* put together the new matrix in MATSEQSBAIJ format */ 1193 B = fact; 1194 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 1195 1196 b = (Mat_SeqSBAIJ *)B->data; 1197 b->free_a = PETSC_TRUE; 1198 b->free_ij = PETSC_TRUE; 1199 1200 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 1201 1202 b->j = uj; 1203 b->i = ui; 1204 b->diag = NULL; 1205 b->ilen = NULL; 1206 b->imax = NULL; 1207 b->row = perm; 1208 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1209 1210 PetscCall(PetscObjectReference((PetscObject)perm)); 1211 1212 b->icol = perm; 1213 1214 PetscCall(PetscObjectReference((PetscObject)perm)); 1215 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 1216 1217 b->maxnz = b->nz = ui[am]; 1218 1219 B->info.factor_mallocs = reallocs; 1220 B->info.fill_ratio_given = fill; 1221 if (ai[am] != 0.) { 1222 /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */ 1223 B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am); 1224 } else { 1225 B->info.fill_ratio_needed = 0.0; 1226 } 1227 #if defined(PETSC_USE_INFO) 1228 if (ai[am] != 0) { 1229 PetscReal af = B->info.fill_ratio_needed; 1230 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1231 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1232 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1233 } else { 1234 PetscCall(PetscInfo(A, "Empty matrix\n")); 1235 } 1236 #endif 1237 if (perm_identity) { 1238 B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1239 B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1240 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1241 } else { 1242 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1243 } 1244 PetscFunctionReturn(PETSC_SUCCESS); 1245 } 1246 1247 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1248 { 1249 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1250 Mat_SeqSBAIJ *b; 1251 Mat B; 1252 PetscBool perm_identity; 1253 PetscReal fill = info->fill; 1254 const PetscInt *rip; 1255 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow; 1256 PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow; 1257 PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr; 1258 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1259 PetscBT lnkbt; 1260 PetscBool diagDense; 1261 1262 PetscFunctionBegin; 1263 if (bs > 1) { /* convert to seqsbaij */ 1264 if (!a->sbaijMat) { 1265 A->symmetric = PETSC_BOOL3_TRUE; 1266 if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE; 1267 PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1268 } 1269 fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1270 1271 PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info)); 1272 PetscFunctionReturn(PETSC_SUCCESS); 1273 } 1274 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, NULL, &diagDense)); 1275 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry"); 1276 1277 /* check whether perm is the identity mapping */ 1278 PetscCall(ISIdentity(perm, &perm_identity)); 1279 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported"); 1280 PetscCall(ISGetIndices(perm, &rip)); 1281 1282 /* initialization */ 1283 PetscCall(PetscMalloc1(mbs + 1, &ui)); 1284 ui[0] = 0; 1285 1286 /* jl: linked list for storing indices of the pivot rows 1287 il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 1288 PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols)); 1289 for (i = 0; i < mbs; i++) { 1290 jl[i] = mbs; 1291 il[i] = 0; 1292 } 1293 1294 /* create and initialize a linked list for storing column indices of the active row k */ 1295 nlnk = mbs + 1; 1296 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1297 1298 /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */ 1299 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space)); 1300 1301 current_space = free_space; 1302 1303 for (k = 0; k < mbs; k++) { /* for each active row k */ 1304 /* initialize lnk by the column indices of row rip[k] of A */ 1305 nzk = 0; 1306 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1307 ncols_upper = 0; 1308 for (j = 0; j < ncols; j++) { 1309 i = rip[*(aj + ai[rip[k]] + j)]; 1310 if (i >= k) { /* only take upper triangular entry */ 1311 cols[ncols_upper] = i; 1312 ncols_upper++; 1313 } 1314 } 1315 PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt)); 1316 nzk += nlnk; 1317 1318 /* update lnk by computing fill-in for each pivot row to be merged in */ 1319 prow = jl[k]; /* 1st pivot row */ 1320 1321 while (prow < k) { 1322 nextprow = jl[prow]; 1323 /* merge prow into k-th row */ 1324 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 1325 jmax = ui[prow + 1]; 1326 ncols = jmax - jmin; 1327 uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 1328 PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt)); 1329 nzk += nlnk; 1330 1331 /* update il and jl for prow */ 1332 if (jmin < jmax) { 1333 il[prow] = jmin; 1334 j = *uj_ptr; 1335 jl[prow] = jl[j]; 1336 jl[j] = prow; 1337 } 1338 prow = nextprow; 1339 } 1340 1341 /* if free space is not available, make more free space */ 1342 if (current_space->local_remaining < nzk) { 1343 i = mbs - k + 1; /* num of unfactored rows */ 1344 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1345 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1346 reallocs++; 1347 } 1348 1349 /* copy data into free space, then initialize lnk */ 1350 PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt)); 1351 1352 /* add the k-th row into il and jl */ 1353 if (nzk - 1 > 0) { 1354 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 1355 jl[k] = jl[i]; 1356 jl[i] = k; 1357 il[k] = ui[k] + 1; 1358 } 1359 ui_ptr[k] = current_space->array; 1360 current_space->array += nzk; 1361 current_space->local_used += nzk; 1362 current_space->local_remaining -= nzk; 1363 1364 ui[k + 1] = ui[k] + nzk; 1365 } 1366 1367 PetscCall(ISRestoreIndices(perm, &rip)); 1368 PetscCall(PetscFree4(ui_ptr, il, jl, cols)); 1369 1370 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1371 PetscCall(PetscMalloc1(ui[mbs] + 1, &uj)); 1372 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1373 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1374 1375 /* put together the new matrix in MATSEQSBAIJ format */ 1376 B = fact; 1377 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1378 1379 b = (Mat_SeqSBAIJ *)B->data; 1380 b->free_a = PETSC_TRUE; 1381 b->free_ij = PETSC_TRUE; 1382 1383 PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a)); 1384 1385 b->j = uj; 1386 b->i = ui; 1387 b->diag = NULL; 1388 b->ilen = NULL; 1389 b->imax = NULL; 1390 b->row = perm; 1391 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1392 1393 PetscCall(PetscObjectReference((PetscObject)perm)); 1394 b->icol = perm; 1395 PetscCall(PetscObjectReference((PetscObject)perm)); 1396 PetscCall(PetscMalloc1(mbs + 1, &b->solve_work)); 1397 b->maxnz = b->nz = ui[mbs]; 1398 1399 B->info.factor_mallocs = reallocs; 1400 B->info.fill_ratio_given = fill; 1401 if (ai[mbs] != 0.) { 1402 /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */ 1403 B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs); 1404 } else { 1405 B->info.fill_ratio_needed = 0.0; 1406 } 1407 #if defined(PETSC_USE_INFO) 1408 if (ai[mbs] != 0.) { 1409 PetscReal af = B->info.fill_ratio_needed; 1410 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1411 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1412 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1413 } else { 1414 PetscCall(PetscInfo(A, "Empty matrix\n")); 1415 } 1416 #endif 1417 if (perm_identity) { 1418 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1419 } else { 1420 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1421 } 1422 PetscFunctionReturn(PETSC_SUCCESS); 1423 } 1424 1425 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) 1426 { 1427 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1428 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1429 PetscInt i, k, n = a->mbs; 1430 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1431 const MatScalar *aa = a->a, *v; 1432 PetscScalar *x, *s, *t, *ls; 1433 const PetscScalar *b; 1434 1435 PetscFunctionBegin; 1436 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 1437 PetscCall(VecGetArrayRead(bb, &b)); 1438 PetscCall(VecGetArray(xx, &x)); 1439 t = a->solve_work; 1440 1441 /* forward solve the lower triangular */ 1442 PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */ 1443 1444 for (i = 1; i < n; i++) { 1445 v = aa + bs2 * ai[i]; 1446 vi = aj + ai[i]; 1447 nz = ai[i + 1] - ai[i]; 1448 s = t + bs * i; 1449 PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */ 1450 for (k = 0; k < nz; k++) { 1451 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]); 1452 v += bs2; 1453 } 1454 } 1455 1456 /* backward solve the upper triangular */ 1457 ls = a->solve_work + A->cmap->n; 1458 for (i = n - 1; i >= 0; i--) { 1459 v = aa + bs2 * (adiag[i + 1] + 1); 1460 vi = aj + adiag[i + 1] + 1; 1461 nz = adiag[i] - adiag[i + 1] - 1; 1462 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1463 for (k = 0; k < nz; k++) { 1464 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]); 1465 v += bs2; 1466 } 1467 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */ 1468 PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs)); 1469 } 1470 1471 PetscCall(VecRestoreArrayRead(bb, &b)); 1472 PetscCall(VecRestoreArray(xx, &x)); 1473 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1474 PetscFunctionReturn(PETSC_SUCCESS); 1475 } 1476 1477 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) 1478 { 1479 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1480 IS iscol = a->col, isrow = a->row; 1481 const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1482 PetscInt i, m, n = a->mbs; 1483 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1484 const MatScalar *aa = a->a, *v; 1485 PetscScalar *x, *s, *t, *ls; 1486 const PetscScalar *b; 1487 1488 PetscFunctionBegin; 1489 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 1490 PetscCall(VecGetArrayRead(bb, &b)); 1491 PetscCall(VecGetArray(xx, &x)); 1492 t = a->solve_work; 1493 1494 PetscCall(ISGetIndices(isrow, &rout)); 1495 r = rout; 1496 PetscCall(ISGetIndices(iscol, &cout)); 1497 c = cout; 1498 1499 /* forward solve the lower triangular */ 1500 PetscCall(PetscArraycpy(t, b + bs * r[0], bs)); 1501 for (i = 1; i < n; i++) { 1502 v = aa + bs2 * ai[i]; 1503 vi = aj + ai[i]; 1504 nz = ai[i + 1] - ai[i]; 1505 s = t + bs * i; 1506 PetscCall(PetscArraycpy(s, b + bs * r[i], bs)); 1507 for (m = 0; m < nz; m++) { 1508 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]); 1509 v += bs2; 1510 } 1511 } 1512 1513 /* backward solve the upper triangular */ 1514 ls = a->solve_work + A->cmap->n; 1515 for (i = n - 1; i >= 0; i--) { 1516 v = aa + bs2 * (adiag[i + 1] + 1); 1517 vi = aj + adiag[i + 1] + 1; 1518 nz = adiag[i] - adiag[i + 1] - 1; 1519 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1520 for (m = 0; m < nz; m++) { 1521 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]); 1522 v += bs2; 1523 } 1524 PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */ 1525 PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs)); 1526 } 1527 PetscCall(ISRestoreIndices(isrow, &rout)); 1528 PetscCall(ISRestoreIndices(iscol, &cout)); 1529 PetscCall(VecRestoreArrayRead(bb, &b)); 1530 PetscCall(VecRestoreArray(xx, &x)); 1531 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1532 PetscFunctionReturn(PETSC_SUCCESS); 1533 } 1534