1 /* 2 Factorization code for BAIJ format. 3 */ 4 #include <../src/mat/impls/baij/seq/baij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 7 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) 8 { 9 Mat C = B; 10 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 11 IS isrow = b->row, isicol = b->icol; 12 const PetscInt *r, *ic; 13 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 14 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 15 MatScalar *rtmp, *pc, *mwork, *pv; 16 MatScalar *aa = a->a, *v; 17 PetscReal shift = info->shiftamount; 18 const PetscBool allowzeropivot = PetscNot(A->erroriffailure); 19 20 PetscFunctionBegin; 21 PetscCall(ISGetIndices(isrow, &r)); 22 PetscCall(ISGetIndices(isicol, &ic)); 23 24 /* generate work space needed by the factorization */ 25 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 26 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 27 28 for (PetscInt i = 0; i < n; i++) { 29 /* zero rtmp */ 30 /* L part */ 31 PetscInt *pj; 32 PetscInt nzL, nz = bi[i + 1] - bi[i]; 33 bjtmp = bj + bi[i]; 34 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 35 36 /* U part */ 37 nz = bdiag[i] - bdiag[i + 1]; 38 bjtmp = bj + bdiag[i + 1] + 1; 39 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 40 41 /* load in initial (unfactored row) */ 42 nz = ai[r[i] + 1] - ai[r[i]]; 43 ajtmp = aj + ai[r[i]]; 44 v = aa + bs2 * ai[r[i]]; 45 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 46 47 /* elimination */ 48 bjtmp = bj + bi[i]; 49 nzL = bi[i + 1] - bi[i]; 50 for (PetscInt k = 0; k < nzL; k++) { 51 PetscBool flg = PETSC_FALSE; 52 PetscInt row = bjtmp[k]; 53 54 pc = rtmp + bs2 * row; 55 for (PetscInt j = 0; j < bs2; j++) { 56 if (pc[j] != (PetscScalar)0.0) { 57 flg = PETSC_TRUE; 58 break; 59 } 60 } 61 if (flg) { 62 pv = b->a + bs2 * bdiag[row]; 63 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 64 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 65 66 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 67 pv = b->a + bs2 * (bdiag[row + 1] + 1); 68 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 69 for (PetscInt j = 0; j < nz; j++) { 70 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 71 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 72 v = rtmp + 4 * pj[j]; 73 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 74 pv += 4; 75 } 76 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 77 } 78 } 79 80 /* finished row so stick it into b->a */ 81 /* L part */ 82 pv = b->a + bs2 * bi[i]; 83 pj = b->j + bi[i]; 84 nz = bi[i + 1] - bi[i]; 85 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 86 87 /* Mark diagonal and invert diagonal for simpler triangular solves */ 88 pv = b->a + bs2 * bdiag[i]; 89 pj = b->j + bdiag[i]; 90 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 91 { 92 PetscBool zeropivotdetected; 93 94 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 95 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 96 } 97 98 /* U part */ 99 pv = b->a + bs2 * (bdiag[i + 1] + 1); 100 pj = b->j + bdiag[i + 1] + 1; 101 nz = bdiag[i] - bdiag[i + 1] - 1; 102 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 103 } 104 105 PetscCall(PetscFree2(rtmp, mwork)); 106 PetscCall(ISRestoreIndices(isicol, &ic)); 107 PetscCall(ISRestoreIndices(isrow, &r)); 108 109 C->ops->solve = MatSolve_SeqBAIJ_2; 110 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 111 C->assembled = PETSC_TRUE; 112 113 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 114 PetscFunctionReturn(PETSC_SUCCESS); 115 } 116 117 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 118 { 119 Mat C = B; 120 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 121 PetscInt i, j, k, nz, nzL, row, *pj; 122 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 123 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 124 MatScalar *rtmp, *pc, *mwork, *pv; 125 MatScalar *aa = a->a, *v; 126 PetscInt flg; 127 PetscReal shift = info->shiftamount; 128 PetscBool allowzeropivot, zeropivotdetected; 129 130 PetscFunctionBegin; 131 allowzeropivot = PetscNot(A->erroriffailure); 132 133 /* generate work space needed by the factorization */ 134 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 135 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 136 137 for (i = 0; i < n; i++) { 138 /* zero rtmp */ 139 /* L part */ 140 nz = bi[i + 1] - bi[i]; 141 bjtmp = bj + bi[i]; 142 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 143 144 /* U part */ 145 nz = bdiag[i] - bdiag[i + 1]; 146 bjtmp = bj + bdiag[i + 1] + 1; 147 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 148 149 /* load in initial (unfactored row) */ 150 nz = ai[i + 1] - ai[i]; 151 ajtmp = aj + ai[i]; 152 v = aa + bs2 * ai[i]; 153 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 154 155 /* elimination */ 156 bjtmp = bj + bi[i]; 157 nzL = bi[i + 1] - bi[i]; 158 for (k = 0; k < nzL; k++) { 159 row = bjtmp[k]; 160 pc = rtmp + bs2 * row; 161 for (flg = 0, j = 0; j < bs2; j++) { 162 if (pc[j] != (PetscScalar)0.0) { 163 flg = 1; 164 break; 165 } 166 } 167 if (flg) { 168 pv = b->a + bs2 * bdiag[row]; 169 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 170 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 171 172 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 173 pv = b->a + bs2 * (bdiag[row + 1] + 1); 174 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 175 for (j = 0; j < nz; j++) { 176 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 177 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 178 v = rtmp + 4 * pj[j]; 179 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 180 pv += 4; 181 } 182 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 183 } 184 } 185 186 /* finished row so stick it into b->a */ 187 /* L part */ 188 pv = b->a + bs2 * bi[i]; 189 pj = b->j + bi[i]; 190 nz = bi[i + 1] - bi[i]; 191 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 192 193 /* Mark diagonal and invert diagonal for simpler triangular solves */ 194 pv = b->a + bs2 * bdiag[i]; 195 pj = b->j + bdiag[i]; 196 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 197 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 198 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 199 200 /* U part */ 201 /* 202 pv = b->a + bs2*bi[2*n-i]; 203 pj = b->j + bi[2*n-i]; 204 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 205 */ 206 pv = b->a + bs2 * (bdiag[i + 1] + 1); 207 pj = b->j + bdiag[i + 1] + 1; 208 nz = bdiag[i] - bdiag[i + 1] - 1; 209 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 210 } 211 PetscCall(PetscFree2(rtmp, mwork)); 212 213 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 214 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering; 215 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering; 216 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 217 C->assembled = PETSC_TRUE; 218 219 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 220 PetscFunctionReturn(PETSC_SUCCESS); 221 } 222 223 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) 224 { 225 Mat C = B; 226 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 227 IS isrow = b->row, isicol = b->icol; 228 const PetscInt *r, *ic; 229 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 230 PetscInt *ajtmpold, *ajtmp, nz, row; 231 PetscInt idx, *ai = a->i, *aj = a->j, *pj; 232 const PetscInt *diag_offset; 233 MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4; 234 MatScalar p1, p2, p3, p4; 235 MatScalar *ba = b->a, *aa = a->a; 236 PetscReal shift = info->shiftamount; 237 PetscBool allowzeropivot, zeropivotdetected; 238 239 PetscFunctionBegin; 240 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */ 241 A->factortype = MAT_FACTOR_NONE; 242 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL)); 243 A->factortype = MAT_FACTOR_ILU; 244 allowzeropivot = PetscNot(A->erroriffailure); 245 PetscCall(ISGetIndices(isrow, &r)); 246 PetscCall(ISGetIndices(isicol, &ic)); 247 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 248 249 for (i = 0; i < n; i++) { 250 nz = bi[i + 1] - bi[i]; 251 ajtmp = bj + bi[i]; 252 for (j = 0; j < nz; j++) { 253 x = rtmp + 4 * ajtmp[j]; 254 x[0] = x[1] = x[2] = x[3] = 0.0; 255 } 256 /* load in initial (unfactored row) */ 257 idx = r[i]; 258 nz = ai[idx + 1] - ai[idx]; 259 ajtmpold = aj + ai[idx]; 260 v = aa + 4 * ai[idx]; 261 for (j = 0; j < nz; j++) { 262 x = rtmp + 4 * ic[ajtmpold[j]]; 263 x[0] = v[0]; 264 x[1] = v[1]; 265 x[2] = v[2]; 266 x[3] = v[3]; 267 v += 4; 268 } 269 row = *ajtmp++; 270 while (row < i) { 271 pc = rtmp + 4 * row; 272 p1 = pc[0]; 273 p2 = pc[1]; 274 p3 = pc[2]; 275 p4 = pc[3]; 276 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 277 pv = ba + 4 * diag_offset[row]; 278 pj = bj + diag_offset[row] + 1; 279 x1 = pv[0]; 280 x2 = pv[1]; 281 x3 = pv[2]; 282 x4 = pv[3]; 283 pc[0] = m1 = p1 * x1 + p3 * x2; 284 pc[1] = m2 = p2 * x1 + p4 * x2; 285 pc[2] = m3 = p1 * x3 + p3 * x4; 286 pc[3] = m4 = p2 * x3 + p4 * x4; 287 nz = bi[row + 1] - diag_offset[row] - 1; 288 pv += 4; 289 for (j = 0; j < nz; j++) { 290 x1 = pv[0]; 291 x2 = pv[1]; 292 x3 = pv[2]; 293 x4 = pv[3]; 294 x = rtmp + 4 * pj[j]; 295 x[0] -= m1 * x1 + m3 * x2; 296 x[1] -= m2 * x1 + m4 * x2; 297 x[2] -= m1 * x3 + m3 * x4; 298 x[3] -= m2 * x3 + m4 * x4; 299 pv += 4; 300 } 301 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 302 } 303 row = *ajtmp++; 304 } 305 /* finished row so stick it into b->a */ 306 pv = ba + 4 * bi[i]; 307 pj = bj + bi[i]; 308 nz = bi[i + 1] - bi[i]; 309 for (j = 0; j < nz; j++) { 310 x = rtmp + 4 * pj[j]; 311 pv[0] = x[0]; 312 pv[1] = x[1]; 313 pv[2] = x[2]; 314 pv[3] = x[3]; 315 pv += 4; 316 } 317 /* invert diagonal block */ 318 w = ba + 4 * diag_offset[i]; 319 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 320 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 321 } 322 323 PetscCall(PetscFree(rtmp)); 324 PetscCall(ISRestoreIndices(isicol, &ic)); 325 PetscCall(ISRestoreIndices(isrow, &r)); 326 327 C->ops->solve = MatSolve_SeqBAIJ_2_inplace; 328 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace; 329 C->assembled = PETSC_TRUE; 330 331 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 332 PetscFunctionReturn(PETSC_SUCCESS); 333 } 334 /* 335 Version for when blocks are 2 by 2 Using natural ordering 336 */ 337 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 338 { 339 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 340 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 341 PetscInt *ajtmpold, *ajtmp, nz, row; 342 PetscInt *ai = a->i, *aj = a->j, *pj; 343 const PetscInt *diag_offset; 344 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 345 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4; 346 MatScalar *ba = b->a, *aa = a->a; 347 PetscReal shift = info->shiftamount; 348 PetscBool allowzeropivot, zeropivotdetected; 349 350 PetscFunctionBegin; 351 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */ 352 A->factortype = MAT_FACTOR_NONE; 353 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL)); 354 A->factortype = MAT_FACTOR_ILU; 355 allowzeropivot = PetscNot(A->erroriffailure); 356 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 357 for (i = 0; i < n; i++) { 358 nz = bi[i + 1] - bi[i]; 359 ajtmp = bj + bi[i]; 360 for (j = 0; j < nz; j++) { 361 x = rtmp + 4 * ajtmp[j]; 362 x[0] = x[1] = x[2] = x[3] = 0.0; 363 } 364 /* load in initial (unfactored row) */ 365 nz = ai[i + 1] - ai[i]; 366 ajtmpold = aj + ai[i]; 367 v = aa + 4 * ai[i]; 368 for (j = 0; j < nz; j++) { 369 x = rtmp + 4 * ajtmpold[j]; 370 x[0] = v[0]; 371 x[1] = v[1]; 372 x[2] = v[2]; 373 x[3] = v[3]; 374 v += 4; 375 } 376 row = *ajtmp++; 377 while (row < i) { 378 pc = rtmp + 4 * row; 379 p1 = pc[0]; 380 p2 = pc[1]; 381 p3 = pc[2]; 382 p4 = pc[3]; 383 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 384 pv = ba + 4 * diag_offset[row]; 385 pj = bj + diag_offset[row] + 1; 386 x1 = pv[0]; 387 x2 = pv[1]; 388 x3 = pv[2]; 389 x4 = pv[3]; 390 pc[0] = m1 = p1 * x1 + p3 * x2; 391 pc[1] = m2 = p2 * x1 + p4 * x2; 392 pc[2] = m3 = p1 * x3 + p3 * x4; 393 pc[3] = m4 = p2 * x3 + p4 * x4; 394 nz = bi[row + 1] - diag_offset[row] - 1; 395 pv += 4; 396 for (j = 0; j < nz; j++) { 397 x1 = pv[0]; 398 x2 = pv[1]; 399 x3 = pv[2]; 400 x4 = pv[3]; 401 x = rtmp + 4 * pj[j]; 402 x[0] -= m1 * x1 + m3 * x2; 403 x[1] -= m2 * x1 + m4 * x2; 404 x[2] -= m1 * x3 + m3 * x4; 405 x[3] -= m2 * x3 + m4 * x4; 406 pv += 4; 407 } 408 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 409 } 410 row = *ajtmp++; 411 } 412 /* finished row so stick it into b->a */ 413 pv = ba + 4 * bi[i]; 414 pj = bj + bi[i]; 415 nz = bi[i + 1] - bi[i]; 416 for (j = 0; j < nz; j++) { 417 x = rtmp + 4 * pj[j]; 418 pv[0] = x[0]; 419 pv[1] = x[1]; 420 pv[2] = x[2]; 421 pv[3] = x[3]; 422 /* 423 printf(" col %d:",pj[j]); 424 PetscInt j1; 425 for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1)); 426 printf("\n"); 427 */ 428 pv += 4; 429 } 430 /* invert diagonal block */ 431 w = ba + 4 * diag_offset[i]; 432 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 433 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 434 } 435 436 PetscCall(PetscFree(rtmp)); 437 438 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace; 439 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace; 440 C->assembled = PETSC_TRUE; 441 442 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 443 PetscFunctionReturn(PETSC_SUCCESS); 444 } 445 446 /* 447 Version for when blocks are 1 by 1. 448 */ 449 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) 450 { 451 Mat C = B; 452 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 453 IS isrow = b->row, isicol = b->icol; 454 const PetscInt *r, *ic, *ics; 455 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 456 PetscInt i, j, k, nz, nzL, row, *pj; 457 const PetscInt *ajtmp, *bjtmp; 458 MatScalar *rtmp, *pc, multiplier, *pv; 459 const MatScalar *aa = a->a, *v; 460 PetscBool row_identity, col_identity; 461 FactorShiftCtx sctx; 462 const PetscInt *ddiag; 463 PetscReal rs; 464 MatScalar d; 465 466 PetscFunctionBegin; 467 /* MatPivotSetUp(): initialize shift context sctx */ 468 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 469 470 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 471 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &ddiag, NULL)); 472 sctx.shift_top = info->zeropivot; 473 for (i = 0; i < n; i++) { 474 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 475 d = (aa)[ddiag[i]]; 476 rs = -PetscAbsScalar(d) - PetscRealPart(d); 477 v = aa + ai[i]; 478 nz = ai[i + 1] - ai[i]; 479 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 480 if (rs > sctx.shift_top) sctx.shift_top = rs; 481 } 482 sctx.shift_top *= 1.1; 483 sctx.nshift_max = 5; 484 sctx.shift_lo = 0.; 485 sctx.shift_hi = 1.; 486 } 487 488 PetscCall(ISGetIndices(isrow, &r)); 489 PetscCall(ISGetIndices(isicol, &ic)); 490 PetscCall(PetscMalloc1(n + 1, &rtmp)); 491 ics = ic; 492 493 do { 494 sctx.newshift = PETSC_FALSE; 495 for (i = 0; i < n; i++) { 496 /* zero rtmp */ 497 /* L part */ 498 nz = bi[i + 1] - bi[i]; 499 bjtmp = bj + bi[i]; 500 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 501 502 /* U part */ 503 nz = bdiag[i] - bdiag[i + 1]; 504 bjtmp = bj + bdiag[i + 1] + 1; 505 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 506 507 /* load in initial (unfactored row) */ 508 nz = ai[r[i] + 1] - ai[r[i]]; 509 ajtmp = aj + ai[r[i]]; 510 v = aa + ai[r[i]]; 511 for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j]; 512 513 /* ZeropivotApply() */ 514 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 515 516 /* elimination */ 517 bjtmp = bj + bi[i]; 518 row = *bjtmp++; 519 nzL = bi[i + 1] - bi[i]; 520 for (k = 0; k < nzL; k++) { 521 pc = rtmp + row; 522 if (*pc != (PetscScalar)0.0) { 523 pv = b->a + bdiag[row]; 524 multiplier = *pc * (*pv); 525 *pc = multiplier; 526 527 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 528 pv = b->a + bdiag[row + 1] + 1; 529 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 530 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 531 PetscCall(PetscLogFlops(2.0 * nz)); 532 } 533 row = *bjtmp++; 534 } 535 536 /* finished row so stick it into b->a */ 537 rs = 0.0; 538 /* L part */ 539 pv = b->a + bi[i]; 540 pj = b->j + bi[i]; 541 nz = bi[i + 1] - bi[i]; 542 for (j = 0; j < nz; j++) { 543 pv[j] = rtmp[pj[j]]; 544 rs += PetscAbsScalar(pv[j]); 545 } 546 547 /* U part */ 548 pv = b->a + bdiag[i + 1] + 1; 549 pj = b->j + bdiag[i + 1] + 1; 550 nz = bdiag[i] - bdiag[i + 1] - 1; 551 for (j = 0; j < nz; j++) { 552 pv[j] = rtmp[pj[j]]; 553 rs += PetscAbsScalar(pv[j]); 554 } 555 556 sctx.rs = rs; 557 sctx.pv = rtmp[i]; 558 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 559 if (sctx.newshift) break; /* break for-loop */ 560 rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */ 561 562 /* Mark diagonal and invert diagonal for simpler triangular solves */ 563 pv = b->a + bdiag[i]; 564 *pv = (PetscScalar)1.0 / rtmp[i]; 565 566 } /* endof for (i=0; i<n; i++) { */ 567 568 /* MatPivotRefine() */ 569 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 570 /* 571 * if no shift in this attempt & shifting & started shifting & can refine, 572 * then try lower shift 573 */ 574 sctx.shift_hi = sctx.shift_fraction; 575 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 576 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 577 sctx.newshift = PETSC_TRUE; 578 sctx.nshift++; 579 } 580 } while (sctx.newshift); 581 582 PetscCall(PetscFree(rtmp)); 583 PetscCall(ISRestoreIndices(isicol, &ic)); 584 PetscCall(ISRestoreIndices(isrow, &r)); 585 586 PetscCall(ISIdentity(isrow, &row_identity)); 587 PetscCall(ISIdentity(isicol, &col_identity)); 588 if (row_identity && col_identity) { 589 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 590 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering; 591 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering; 592 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 593 } else { 594 C->ops->solve = MatSolve_SeqBAIJ_1; 595 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 596 } 597 C->assembled = PETSC_TRUE; 598 PetscCall(PetscLogFlops(C->cmap->n)); 599 600 /* MatShiftView(A,info,&sctx) */ 601 if (sctx.nshift) { 602 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 603 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 604 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 605 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 606 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 607 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 608 } 609 } 610 PetscFunctionReturn(PETSC_SUCCESS); 611 } 612 613 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) 614 { 615 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 616 IS isrow = b->row, isicol = b->icol; 617 const PetscInt *r, *ic; 618 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 619 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 620 PetscInt diag, *pj; 621 const PetscInt *diag_offset; 622 MatScalar *pv, *v, *rtmp, multiplier, *pc; 623 MatScalar *ba = b->a, *aa = a->a; 624 PetscBool row_identity, col_identity; 625 626 PetscFunctionBegin; 627 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */ 628 A->factortype = MAT_FACTOR_NONE; 629 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL)); 630 A->factortype = MAT_FACTOR_ILU; 631 PetscCall(ISGetIndices(isrow, &r)); 632 PetscCall(ISGetIndices(isicol, &ic)); 633 PetscCall(PetscMalloc1(n + 1, &rtmp)); 634 635 for (i = 0; i < n; i++) { 636 nz = bi[i + 1] - bi[i]; 637 ajtmp = bj + bi[i]; 638 for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0; 639 640 /* load in initial (unfactored row) */ 641 nz = ai[r[i] + 1] - ai[r[i]]; 642 ajtmpold = aj + ai[r[i]]; 643 v = aa + ai[r[i]]; 644 for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 645 646 row = *ajtmp++; 647 while (row < i) { 648 pc = rtmp + row; 649 if (*pc != 0.0) { 650 pv = ba + diag_offset[row]; 651 pj = bj + diag_offset[row] + 1; 652 multiplier = *pc * *pv++; 653 *pc = multiplier; 654 nz = bi[row + 1] - diag_offset[row] - 1; 655 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 656 PetscCall(PetscLogFlops(1.0 + 2.0 * nz)); 657 } 658 row = *ajtmp++; 659 } 660 /* finished row so stick it into b->a */ 661 pv = ba + bi[i]; 662 pj = bj + bi[i]; 663 nz = bi[i + 1] - bi[i]; 664 for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]]; 665 diag = diag_offset[i] - bi[i]; 666 /* check pivot entry for current row */ 667 PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 668 pv[diag] = 1.0 / pv[diag]; 669 } 670 671 PetscCall(PetscFree(rtmp)); 672 PetscCall(ISRestoreIndices(isicol, &ic)); 673 PetscCall(ISRestoreIndices(isrow, &r)); 674 PetscCall(ISIdentity(isrow, &row_identity)); 675 PetscCall(ISIdentity(isicol, &col_identity)); 676 if (row_identity && col_identity) { 677 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace; 678 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace; 679 } else { 680 C->ops->solve = MatSolve_SeqBAIJ_1_inplace; 681 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace; 682 } 683 C->assembled = PETSC_TRUE; 684 PetscCall(PetscLogFlops(C->cmap->n)); 685 PetscFunctionReturn(PETSC_SUCCESS); 686 } 687 688 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) 689 { 690 PetscFunctionBegin; 691 *type = MATSOLVERPETSC; 692 PetscFunctionReturn(PETSC_SUCCESS); 693 } 694 695 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) 696 { 697 PetscInt n = A->rmap->n; 698 699 PetscFunctionBegin; 700 PetscCheck(!PetscDefined(USE_COMPLEX) || A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported"); 701 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 702 PetscCall(MatSetSizes(*B, n, n, n, n)); 703 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 704 PetscCall(MatSetType(*B, MATSEQBAIJ)); 705 706 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ; 707 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ; 708 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 709 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 710 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 711 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 712 PetscCall(MatSetType(*B, MATSEQSBAIJ)); 713 PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL)); 714 715 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ; 716 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ; 717 /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */ 718 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 719 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 720 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported"); 721 (*B)->factortype = ftype; 722 (*B)->canuseordering = PETSC_TRUE; 723 724 PetscCall(PetscFree((*B)->solvertype)); 725 PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype)); 726 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc)); 727 PetscFunctionReturn(PETSC_SUCCESS); 728 } 729 730 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) 731 { 732 Mat C; 733 734 PetscFunctionBegin; 735 PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C)); 736 PetscCall(MatLUFactorSymbolic(C, A, row, col, info)); 737 PetscCall(MatLUFactorNumeric(C, A, info)); 738 739 A->ops->solve = C->ops->solve; 740 A->ops->solvetranspose = C->ops->solvetranspose; 741 742 PetscCall(MatHeaderMerge(A, &C)); 743 PetscFunctionReturn(PETSC_SUCCESS); 744 } 745 746 #include <../src/mat/impls/sbaij/seq/sbaij.h> 747 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) 748 { 749 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 750 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 751 IS ip = b->row; 752 const PetscInt *rip; 753 PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol; 754 PetscInt *ai = a->i, *aj = a->j; 755 PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz; 756 MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi; 757 PetscReal rs; 758 FactorShiftCtx sctx; 759 760 PetscFunctionBegin; 761 if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */ 762 if (!a->sbaijMat) { 763 A->symmetric = PETSC_BOOL3_TRUE; 764 if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE; 765 PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 766 } 767 PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info)); 768 PetscCall(MatDestroy(&a->sbaijMat)); 769 PetscFunctionReturn(PETSC_SUCCESS); 770 } 771 772 /* MatPivotSetUp(): initialize shift context sctx */ 773 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 774 775 PetscCall(ISGetIndices(ip, &rip)); 776 PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl)); 777 778 sctx.shift_amount = 0.; 779 sctx.nshift = 0; 780 do { 781 sctx.newshift = PETSC_FALSE; 782 for (i = 0; i < mbs; i++) { 783 rtmp[i] = 0.0; 784 jl[i] = mbs; 785 il[0] = 0; 786 } 787 788 for (k = 0; k < mbs; k++) { 789 bval = ba + bi[k]; 790 /* initialize k-th row by the perm[k]-th row of A */ 791 jmin = ai[rip[k]]; 792 jmax = ai[rip[k] + 1]; 793 for (j = jmin; j < jmax; j++) { 794 col = rip[aj[j]]; 795 if (col >= k) { /* only take upper triangular entry */ 796 rtmp[col] = aa[j]; 797 *bval++ = 0.0; /* for in-place factorization */ 798 } 799 } 800 801 /* shift the diagonal of the matrix */ 802 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 803 804 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 805 dk = rtmp[k]; 806 i = jl[k]; /* first row to be added to k_th row */ 807 808 while (i < k) { 809 nexti = jl[i]; /* next row to be added to k_th row */ 810 811 /* compute multiplier, update diag(k) and U(i,k) */ 812 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 813 uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */ 814 dk += uikdi * ba[ili]; 815 ba[ili] = uikdi; /* -U(i,k) */ 816 817 /* add multiple of row i to k-th row */ 818 jmin = ili + 1; 819 jmax = bi[i + 1]; 820 if (jmin < jmax) { 821 for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j]; 822 /* update il and jl for row i */ 823 il[i] = jmin; 824 j = bj[jmin]; 825 jl[i] = jl[j]; 826 jl[j] = i; 827 } 828 i = nexti; 829 } 830 831 /* shift the diagonals when zero pivot is detected */ 832 /* compute rs=sum of abs(off-diagonal) */ 833 rs = 0.0; 834 jmin = bi[k] + 1; 835 nz = bi[k + 1] - jmin; 836 if (nz) { 837 bcol = bj + jmin; 838 while (nz--) { 839 rs += PetscAbsScalar(rtmp[*bcol]); 840 bcol++; 841 } 842 } 843 844 sctx.rs = rs; 845 sctx.pv = dk; 846 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 847 if (sctx.newshift) break; 848 dk = sctx.pv; 849 850 /* copy data into U(k,:) */ 851 ba[bi[k]] = 1.0 / dk; /* U(k,k) */ 852 jmin = bi[k] + 1; 853 jmax = bi[k + 1]; 854 if (jmin < jmax) { 855 for (j = jmin; j < jmax; j++) { 856 col = bj[j]; 857 ba[j] = rtmp[col]; 858 rtmp[col] = 0.0; 859 } 860 /* add the k-th row into il and jl */ 861 il[k] = jmin; 862 i = bj[jmin]; 863 jl[k] = jl[i]; 864 jl[i] = k; 865 } 866 } 867 } while (sctx.newshift); 868 PetscCall(PetscFree3(rtmp, il, jl)); 869 870 PetscCall(ISRestoreIndices(ip, &rip)); 871 872 C->assembled = PETSC_TRUE; 873 C->preallocated = PETSC_TRUE; 874 875 PetscCall(PetscLogFlops(C->rmap->N)); 876 if (sctx.nshift) { 877 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 878 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 879 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 880 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 881 } 882 } 883 PetscFunctionReturn(PETSC_SUCCESS); 884 } 885 886 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) 887 { 888 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 889 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 890 PetscInt i, j, am = a->mbs; 891 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 892 PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz; 893 MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval; 894 PetscReal rs; 895 FactorShiftCtx sctx; 896 897 PetscFunctionBegin; 898 /* MatPivotSetUp(): initialize shift context sctx */ 899 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 900 901 PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl)); 902 903 do { 904 sctx.newshift = PETSC_FALSE; 905 for (i = 0; i < am; i++) { 906 rtmp[i] = 0.0; 907 jl[i] = am; 908 il[0] = 0; 909 } 910 911 for (k = 0; k < am; k++) { 912 /* initialize k-th row with elements nonzero in row perm(k) of A */ 913 nz = ai[k + 1] - ai[k]; 914 acol = aj + ai[k]; 915 aval = aa + ai[k]; 916 bval = ba + bi[k]; 917 while (nz--) { 918 if (*acol < k) { /* skip lower triangular entries */ 919 acol++; 920 aval++; 921 } else { 922 rtmp[*acol++] = *aval++; 923 *bval++ = 0.0; /* for in-place factorization */ 924 } 925 } 926 927 /* shift the diagonal of the matrix */ 928 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 929 930 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 931 dk = rtmp[k]; 932 i = jl[k]; /* first row to be added to k_th row */ 933 934 while (i < k) { 935 nexti = jl[i]; /* next row to be added to k_th row */ 936 /* compute multiplier, update D(k) and U(i,k) */ 937 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 938 uikdi = -ba[ili] * ba[bi[i]]; 939 dk += uikdi * ba[ili]; 940 ba[ili] = uikdi; /* -U(i,k) */ 941 942 /* add multiple of row i to k-th row ... */ 943 jmin = ili + 1; 944 nz = bi[i + 1] - jmin; 945 if (nz > 0) { 946 bcol = bj + jmin; 947 bval = ba + jmin; 948 while (nz--) rtmp[*bcol++] += uikdi * (*bval++); 949 /* update il and jl for i-th row */ 950 il[i] = jmin; 951 j = bj[jmin]; 952 jl[i] = jl[j]; 953 jl[j] = i; 954 } 955 i = nexti; 956 } 957 958 /* shift the diagonals when zero pivot is detected */ 959 /* compute rs=sum of abs(off-diagonal) */ 960 rs = 0.0; 961 jmin = bi[k] + 1; 962 nz = bi[k + 1] - jmin; 963 if (nz) { 964 bcol = bj + jmin; 965 while (nz--) { 966 rs += PetscAbsScalar(rtmp[*bcol]); 967 bcol++; 968 } 969 } 970 971 sctx.rs = rs; 972 sctx.pv = dk; 973 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 974 if (sctx.newshift) break; /* sctx.shift_amount is updated */ 975 dk = sctx.pv; 976 977 /* copy data into U(k,:) */ 978 ba[bi[k]] = 1.0 / dk; 979 jmin = bi[k] + 1; 980 nz = bi[k + 1] - jmin; 981 if (nz) { 982 bcol = bj + jmin; 983 bval = ba + jmin; 984 while (nz--) { 985 *bval++ = rtmp[*bcol]; 986 rtmp[*bcol++] = 0.0; 987 } 988 /* add k-th row into il and jl */ 989 il[k] = jmin; 990 i = bj[jmin]; 991 jl[k] = jl[i]; 992 jl[i] = k; 993 } 994 } 995 } while (sctx.newshift); 996 PetscCall(PetscFree3(rtmp, il, jl)); 997 998 C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 999 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1000 C->assembled = PETSC_TRUE; 1001 C->preallocated = PETSC_TRUE; 1002 1003 PetscCall(PetscLogFlops(C->rmap->N)); 1004 if (sctx.nshift) { 1005 if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1006 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1007 } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1008 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1009 } 1010 } 1011 PetscFunctionReturn(PETSC_SUCCESS); 1012 } 1013 1014 #include <petscbt.h> 1015 #include <../src/mat/utils/freespace.h> 1016 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1017 { 1018 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1019 Mat_SeqSBAIJ *b; 1020 Mat B; 1021 PetscBool perm_identity; 1022 PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui; 1023 const PetscInt *rip; 1024 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 1025 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 1026 PetscReal fill = info->fill, levels = info->levels; 1027 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1028 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 1029 PetscBT lnkbt; 1030 PetscBool diagDense; 1031 const PetscInt *adiag; 1032 1033 PetscFunctionBegin; 1034 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense)); 1035 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry"); 1036 1037 if (bs > 1) { 1038 if (!a->sbaijMat) { 1039 A->symmetric = PETSC_BOOL3_TRUE; 1040 if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE; 1041 PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1042 } 1043 fact->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1044 1045 PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info)); 1046 PetscFunctionReturn(PETSC_SUCCESS); 1047 } 1048 1049 PetscCall(ISIdentity(perm, &perm_identity)); 1050 PetscCall(ISGetIndices(perm, &rip)); 1051 1052 /* special case that simply copies fill pattern */ 1053 if (!levels && perm_identity) { 1054 PetscCall(PetscMalloc1(am + 1, &ui)); 1055 for (i = 0; i < am; i++) ui[i] = ai[i + 1] - adiag[i]; /* ui: rowlengths - changes when !perm_identity */ 1056 B = fact; 1057 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui)); 1058 1059 b = (Mat_SeqSBAIJ *)B->data; 1060 uj = b->j; 1061 for (i = 0; i < am; i++) { 1062 aj = a->j + adiag[i]; 1063 for (j = 0; j < ui[i]; j++) *uj++ = *aj++; 1064 b->ilen[i] = ui[i]; 1065 } 1066 PetscCall(PetscFree(ui)); 1067 1068 B->factortype = MAT_FACTOR_NONE; 1069 1070 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1071 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1072 B->factortype = MAT_FACTOR_ICC; 1073 1074 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1075 PetscFunctionReturn(PETSC_SUCCESS); 1076 } 1077 1078 /* initialization */ 1079 PetscCall(PetscMalloc1(am + 1, &ui)); 1080 ui[0] = 0; 1081 PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl)); 1082 1083 /* jl: linked list for storing indices of the pivot rows 1084 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 1085 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 1086 for (i = 0; i < am; i++) { 1087 jl[i] = am; 1088 il[i] = 0; 1089 } 1090 1091 /* create and initialize a linked list for storing column indices of the active row k */ 1092 nlnk = am + 1; 1093 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 1094 1095 /* initial FreeSpace size is fill*(ai[am]+am)/2 */ 1096 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space)); 1097 1098 current_space = free_space; 1099 1100 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl)); 1101 current_space_lvl = free_space_lvl; 1102 1103 for (k = 0; k < am; k++) { /* for each active row k */ 1104 /* initialize lnk by the column indices of row rip[k] of A */ 1105 nzk = 0; 1106 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1107 ncols_upper = 0; 1108 cols = cols_lvl + am; 1109 for (j = 0; j < ncols; j++) { 1110 i = rip[*(aj + ai[rip[k]] + j)]; 1111 if (i >= k) { /* only take upper triangular entry */ 1112 cols[ncols_upper] = i; 1113 cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */ 1114 ncols_upper++; 1115 } 1116 } 1117 PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1118 nzk += nlnk; 1119 1120 /* update lnk by computing fill-in for each pivot row to be merged in */ 1121 prow = jl[k]; /* 1st pivot row */ 1122 1123 while (prow < k) { 1124 nextprow = jl[prow]; 1125 1126 /* merge prow into k-th row */ 1127 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 1128 jmax = ui[prow + 1]; 1129 ncols = jmax - jmin; 1130 i = jmin - ui[prow]; 1131 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 1132 for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j); 1133 PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1134 nzk += nlnk; 1135 1136 /* update il and jl for prow */ 1137 if (jmin < jmax) { 1138 il[prow] = jmin; 1139 1140 j = *cols; 1141 jl[prow] = jl[j]; 1142 jl[j] = prow; 1143 } 1144 prow = nextprow; 1145 } 1146 1147 /* if free space is not available, make more free space */ 1148 if (current_space->local_remaining < nzk) { 1149 i = am - k + 1; /* num of unfactored rows */ 1150 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1151 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1152 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 1153 reallocs++; 1154 } 1155 1156 /* copy data into free_space and free_space_lvl, then initialize lnk */ 1157 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 1158 1159 /* add the k-th row into il and jl */ 1160 if (nzk - 1 > 0) { 1161 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 1162 jl[k] = jl[i]; 1163 jl[i] = k; 1164 il[k] = ui[k] + 1; 1165 } 1166 uj_ptr[k] = current_space->array; 1167 uj_lvl_ptr[k] = current_space_lvl->array; 1168 1169 current_space->array += nzk; 1170 current_space->local_used += nzk; 1171 current_space->local_remaining -= nzk; 1172 1173 current_space_lvl->array += nzk; 1174 current_space_lvl->local_used += nzk; 1175 current_space_lvl->local_remaining -= nzk; 1176 1177 ui[k + 1] = ui[k] + nzk; 1178 } 1179 1180 PetscCall(ISRestoreIndices(perm, &rip)); 1181 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 1182 PetscCall(PetscFree(cols_lvl)); 1183 1184 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1185 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 1186 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1187 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 1188 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 1189 1190 /* put together the new matrix in MATSEQSBAIJ format */ 1191 B = fact; 1192 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 1193 1194 b = (Mat_SeqSBAIJ *)B->data; 1195 b->free_a = PETSC_TRUE; 1196 b->free_ij = PETSC_TRUE; 1197 1198 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 1199 1200 b->j = uj; 1201 b->i = ui; 1202 b->diag = NULL; 1203 b->ilen = NULL; 1204 b->imax = NULL; 1205 b->row = perm; 1206 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1207 1208 PetscCall(PetscObjectReference((PetscObject)perm)); 1209 1210 b->icol = perm; 1211 1212 PetscCall(PetscObjectReference((PetscObject)perm)); 1213 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 1214 1215 b->maxnz = b->nz = ui[am]; 1216 1217 B->info.factor_mallocs = reallocs; 1218 B->info.fill_ratio_given = fill; 1219 if (ai[am] != 0.) { 1220 /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */ 1221 B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am); 1222 } else { 1223 B->info.fill_ratio_needed = 0.0; 1224 } 1225 #if defined(PETSC_USE_INFO) 1226 if (ai[am] != 0) { 1227 PetscReal af = B->info.fill_ratio_needed; 1228 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1229 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1230 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1231 } else { 1232 PetscCall(PetscInfo(A, "Empty matrix\n")); 1233 } 1234 #endif 1235 if (perm_identity) { 1236 B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1237 B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1238 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1239 } else { 1240 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1241 } 1242 PetscFunctionReturn(PETSC_SUCCESS); 1243 } 1244 1245 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1246 { 1247 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1248 Mat_SeqSBAIJ *b; 1249 Mat B; 1250 PetscBool perm_identity; 1251 PetscReal fill = info->fill; 1252 const PetscInt *rip; 1253 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow; 1254 PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow; 1255 PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr; 1256 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1257 PetscBT lnkbt; 1258 PetscBool diagDense; 1259 1260 PetscFunctionBegin; 1261 if (bs > 1) { /* convert to seqsbaij */ 1262 if (!a->sbaijMat) { 1263 A->symmetric = PETSC_BOOL3_TRUE; 1264 if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE; 1265 PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1266 } 1267 fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1268 1269 PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info)); 1270 PetscFunctionReturn(PETSC_SUCCESS); 1271 } 1272 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, NULL, &diagDense)); 1273 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry"); 1274 1275 /* check whether perm is the identity mapping */ 1276 PetscCall(ISIdentity(perm, &perm_identity)); 1277 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported"); 1278 PetscCall(ISGetIndices(perm, &rip)); 1279 1280 /* initialization */ 1281 PetscCall(PetscMalloc1(mbs + 1, &ui)); 1282 ui[0] = 0; 1283 1284 /* jl: linked list for storing indices of the pivot rows 1285 il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 1286 PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols)); 1287 for (i = 0; i < mbs; i++) { 1288 jl[i] = mbs; 1289 il[i] = 0; 1290 } 1291 1292 /* create and initialize a linked list for storing column indices of the active row k */ 1293 nlnk = mbs + 1; 1294 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1295 1296 /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */ 1297 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space)); 1298 1299 current_space = free_space; 1300 1301 for (k = 0; k < mbs; k++) { /* for each active row k */ 1302 /* initialize lnk by the column indices of row rip[k] of A */ 1303 nzk = 0; 1304 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1305 ncols_upper = 0; 1306 for (j = 0; j < ncols; j++) { 1307 i = rip[*(aj + ai[rip[k]] + j)]; 1308 if (i >= k) { /* only take upper triangular entry */ 1309 cols[ncols_upper] = i; 1310 ncols_upper++; 1311 } 1312 } 1313 PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt)); 1314 nzk += nlnk; 1315 1316 /* update lnk by computing fill-in for each pivot row to be merged in */ 1317 prow = jl[k]; /* 1st pivot row */ 1318 1319 while (prow < k) { 1320 nextprow = jl[prow]; 1321 /* merge prow into k-th row */ 1322 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 1323 jmax = ui[prow + 1]; 1324 ncols = jmax - jmin; 1325 uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 1326 PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt)); 1327 nzk += nlnk; 1328 1329 /* update il and jl for prow */ 1330 if (jmin < jmax) { 1331 il[prow] = jmin; 1332 j = *uj_ptr; 1333 jl[prow] = jl[j]; 1334 jl[j] = prow; 1335 } 1336 prow = nextprow; 1337 } 1338 1339 /* if free space is not available, make more free space */ 1340 if (current_space->local_remaining < nzk) { 1341 i = mbs - k + 1; /* num of unfactored rows */ 1342 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1343 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1344 reallocs++; 1345 } 1346 1347 /* copy data into free space, then initialize lnk */ 1348 PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt)); 1349 1350 /* add the k-th row into il and jl */ 1351 if (nzk - 1 > 0) { 1352 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 1353 jl[k] = jl[i]; 1354 jl[i] = k; 1355 il[k] = ui[k] + 1; 1356 } 1357 ui_ptr[k] = current_space->array; 1358 current_space->array += nzk; 1359 current_space->local_used += nzk; 1360 current_space->local_remaining -= nzk; 1361 1362 ui[k + 1] = ui[k] + nzk; 1363 } 1364 1365 PetscCall(ISRestoreIndices(perm, &rip)); 1366 PetscCall(PetscFree4(ui_ptr, il, jl, cols)); 1367 1368 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1369 PetscCall(PetscMalloc1(ui[mbs] + 1, &uj)); 1370 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1371 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1372 1373 /* put together the new matrix in MATSEQSBAIJ format */ 1374 B = fact; 1375 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1376 1377 b = (Mat_SeqSBAIJ *)B->data; 1378 b->free_a = PETSC_TRUE; 1379 b->free_ij = PETSC_TRUE; 1380 1381 PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a)); 1382 1383 b->j = uj; 1384 b->i = ui; 1385 b->diag = NULL; 1386 b->ilen = NULL; 1387 b->imax = NULL; 1388 b->row = perm; 1389 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1390 1391 PetscCall(PetscObjectReference((PetscObject)perm)); 1392 b->icol = perm; 1393 PetscCall(PetscObjectReference((PetscObject)perm)); 1394 PetscCall(PetscMalloc1(mbs + 1, &b->solve_work)); 1395 b->maxnz = b->nz = ui[mbs]; 1396 1397 B->info.factor_mallocs = reallocs; 1398 B->info.fill_ratio_given = fill; 1399 if (ai[mbs] != 0.) { 1400 /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */ 1401 B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs); 1402 } else { 1403 B->info.fill_ratio_needed = 0.0; 1404 } 1405 #if defined(PETSC_USE_INFO) 1406 if (ai[mbs] != 0.) { 1407 PetscReal af = B->info.fill_ratio_needed; 1408 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1409 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1410 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1411 } else { 1412 PetscCall(PetscInfo(A, "Empty matrix\n")); 1413 } 1414 #endif 1415 if (perm_identity) { 1416 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1417 } else { 1418 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1419 } 1420 PetscFunctionReturn(PETSC_SUCCESS); 1421 } 1422 1423 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) 1424 { 1425 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1426 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1427 PetscInt i, k, n = a->mbs; 1428 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1429 const MatScalar *aa = a->a, *v; 1430 PetscScalar *x, *s, *t, *ls; 1431 const PetscScalar *b; 1432 1433 PetscFunctionBegin; 1434 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 1435 PetscCall(VecGetArrayRead(bb, &b)); 1436 PetscCall(VecGetArray(xx, &x)); 1437 t = a->solve_work; 1438 1439 /* forward solve the lower triangular */ 1440 PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */ 1441 1442 for (i = 1; i < n; i++) { 1443 v = aa + bs2 * ai[i]; 1444 vi = aj + ai[i]; 1445 nz = ai[i + 1] - ai[i]; 1446 s = t + bs * i; 1447 PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */ 1448 for (k = 0; k < nz; k++) { 1449 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]); 1450 v += bs2; 1451 } 1452 } 1453 1454 /* backward solve the upper triangular */ 1455 ls = a->solve_work + A->cmap->n; 1456 for (i = n - 1; i >= 0; i--) { 1457 v = aa + bs2 * (adiag[i + 1] + 1); 1458 vi = aj + adiag[i + 1] + 1; 1459 nz = adiag[i] - adiag[i + 1] - 1; 1460 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1461 for (k = 0; k < nz; k++) { 1462 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]); 1463 v += bs2; 1464 } 1465 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */ 1466 PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs)); 1467 } 1468 1469 PetscCall(VecRestoreArrayRead(bb, &b)); 1470 PetscCall(VecRestoreArray(xx, &x)); 1471 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1472 PetscFunctionReturn(PETSC_SUCCESS); 1473 } 1474 1475 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) 1476 { 1477 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1478 IS iscol = a->col, isrow = a->row; 1479 const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1480 PetscInt i, m, n = a->mbs; 1481 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1482 const MatScalar *aa = a->a, *v; 1483 PetscScalar *x, *s, *t, *ls; 1484 const PetscScalar *b; 1485 1486 PetscFunctionBegin; 1487 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 1488 PetscCall(VecGetArrayRead(bb, &b)); 1489 PetscCall(VecGetArray(xx, &x)); 1490 t = a->solve_work; 1491 1492 PetscCall(ISGetIndices(isrow, &rout)); 1493 r = rout; 1494 PetscCall(ISGetIndices(iscol, &cout)); 1495 c = cout; 1496 1497 /* forward solve the lower triangular */ 1498 PetscCall(PetscArraycpy(t, b + bs * r[0], bs)); 1499 for (i = 1; i < n; i++) { 1500 v = aa + bs2 * ai[i]; 1501 vi = aj + ai[i]; 1502 nz = ai[i + 1] - ai[i]; 1503 s = t + bs * i; 1504 PetscCall(PetscArraycpy(s, b + bs * r[i], bs)); 1505 for (m = 0; m < nz; m++) { 1506 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]); 1507 v += bs2; 1508 } 1509 } 1510 1511 /* backward solve the upper triangular */ 1512 ls = a->solve_work + A->cmap->n; 1513 for (i = n - 1; i >= 0; i--) { 1514 v = aa + bs2 * (adiag[i + 1] + 1); 1515 vi = aj + adiag[i + 1] + 1; 1516 nz = adiag[i] - adiag[i + 1] - 1; 1517 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1518 for (m = 0; m < nz; m++) { 1519 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]); 1520 v += bs2; 1521 } 1522 PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */ 1523 PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs)); 1524 } 1525 PetscCall(ISRestoreIndices(isrow, &rout)); 1526 PetscCall(ISRestoreIndices(iscol, &cout)); 1527 PetscCall(VecRestoreArrayRead(bb, &b)); 1528 PetscCall(VecRestoreArray(xx, &x)); 1529 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1530 PetscFunctionReturn(PETSC_SUCCESS); 1531 } 1532