1 /* 2 Factorization code for BAIJ format. 3 */ 4 #include <../src/mat/impls/baij/seq/baij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 7 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) 8 { 9 Mat C = B; 10 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 11 IS isrow = b->row, isicol = b->icol; 12 const PetscInt *r, *ic; 13 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 14 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 15 MatScalar *rtmp, *pc, *mwork, *pv; 16 MatScalar *aa = a->a, *v; 17 PetscReal shift = info->shiftamount; 18 const PetscBool allowzeropivot = PetscNot(A->erroriffailure); 19 20 PetscFunctionBegin; 21 PetscCall(ISGetIndices(isrow, &r)); 22 PetscCall(ISGetIndices(isicol, &ic)); 23 24 /* generate work space needed by the factorization */ 25 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 26 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 27 28 for (PetscInt i = 0; i < n; i++) { 29 /* zero rtmp */ 30 /* L part */ 31 PetscInt *pj; 32 PetscInt nzL, nz = bi[i + 1] - bi[i]; 33 bjtmp = bj + bi[i]; 34 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 35 36 /* U part */ 37 nz = bdiag[i] - bdiag[i + 1]; 38 bjtmp = bj + bdiag[i + 1] + 1; 39 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 40 41 /* load in initial (unfactored row) */ 42 nz = ai[r[i] + 1] - ai[r[i]]; 43 ajtmp = aj + ai[r[i]]; 44 v = aa + bs2 * ai[r[i]]; 45 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 46 47 /* elimination */ 48 bjtmp = bj + bi[i]; 49 nzL = bi[i + 1] - bi[i]; 50 for (PetscInt k = 0; k < nzL; k++) { 51 PetscBool flg = PETSC_FALSE; 52 PetscInt row = bjtmp[k]; 53 54 pc = rtmp + bs2 * row; 55 for (PetscInt j = 0; j < bs2; j++) { 56 if (pc[j] != (PetscScalar)0.0) { 57 flg = PETSC_TRUE; 58 break; 59 } 60 } 61 if (flg) { 62 pv = b->a + bs2 * bdiag[row]; 63 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 64 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 65 66 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 67 pv = b->a + bs2 * (bdiag[row + 1] + 1); 68 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 69 for (PetscInt j = 0; j < nz; j++) { 70 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 71 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 72 v = rtmp + 4 * pj[j]; 73 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 74 pv += 4; 75 } 76 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 77 } 78 } 79 80 /* finished row so stick it into b->a */ 81 /* L part */ 82 pv = b->a + bs2 * bi[i]; 83 pj = b->j + bi[i]; 84 nz = bi[i + 1] - bi[i]; 85 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 86 87 /* Mark diagonal and invert diagonal for simpler triangular solves */ 88 pv = b->a + bs2 * bdiag[i]; 89 pj = b->j + bdiag[i]; 90 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 91 { 92 PetscBool zeropivotdetected; 93 94 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 95 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 96 } 97 98 /* U part */ 99 pv = b->a + bs2 * (bdiag[i + 1] + 1); 100 pj = b->j + bdiag[i + 1] + 1; 101 nz = bdiag[i] - bdiag[i + 1] - 1; 102 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 103 } 104 105 PetscCall(PetscFree2(rtmp, mwork)); 106 PetscCall(ISRestoreIndices(isicol, &ic)); 107 PetscCall(ISRestoreIndices(isrow, &r)); 108 109 C->ops->solve = MatSolve_SeqBAIJ_2; 110 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 111 C->assembled = PETSC_TRUE; 112 113 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 114 PetscFunctionReturn(PETSC_SUCCESS); 115 } 116 117 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 118 { 119 Mat C = B; 120 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 121 PetscInt i, j, k, nz, nzL, row, *pj; 122 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 123 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 124 MatScalar *rtmp, *pc, *mwork, *pv; 125 MatScalar *aa = a->a, *v; 126 PetscInt flg; 127 PetscReal shift = info->shiftamount; 128 PetscBool allowzeropivot, zeropivotdetected; 129 130 PetscFunctionBegin; 131 allowzeropivot = PetscNot(A->erroriffailure); 132 133 /* generate work space needed by the factorization */ 134 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 135 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 136 137 for (i = 0; i < n; i++) { 138 /* zero rtmp */ 139 /* L part */ 140 nz = bi[i + 1] - bi[i]; 141 bjtmp = bj + bi[i]; 142 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 143 144 /* U part */ 145 nz = bdiag[i] - bdiag[i + 1]; 146 bjtmp = bj + bdiag[i + 1] + 1; 147 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 148 149 /* load in initial (unfactored row) */ 150 nz = ai[i + 1] - ai[i]; 151 ajtmp = aj + ai[i]; 152 v = aa + bs2 * ai[i]; 153 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 154 155 /* elimination */ 156 bjtmp = bj + bi[i]; 157 nzL = bi[i + 1] - bi[i]; 158 for (k = 0; k < nzL; k++) { 159 row = bjtmp[k]; 160 pc = rtmp + bs2 * row; 161 for (flg = 0, j = 0; j < bs2; j++) { 162 if (pc[j] != (PetscScalar)0.0) { 163 flg = 1; 164 break; 165 } 166 } 167 if (flg) { 168 pv = b->a + bs2 * bdiag[row]; 169 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 170 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 171 172 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 173 pv = b->a + bs2 * (bdiag[row + 1] + 1); 174 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 175 for (j = 0; j < nz; j++) { 176 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 177 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 178 v = rtmp + 4 * pj[j]; 179 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 180 pv += 4; 181 } 182 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 183 } 184 } 185 186 /* finished row so stick it into b->a */ 187 /* L part */ 188 pv = b->a + bs2 * bi[i]; 189 pj = b->j + bi[i]; 190 nz = bi[i + 1] - bi[i]; 191 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 192 193 /* Mark diagonal and invert diagonal for simpler triangular solves */ 194 pv = b->a + bs2 * bdiag[i]; 195 pj = b->j + bdiag[i]; 196 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 197 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 198 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 199 200 /* U part */ 201 /* 202 pv = b->a + bs2*bi[2*n-i]; 203 pj = b->j + bi[2*n-i]; 204 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 205 */ 206 pv = b->a + bs2 * (bdiag[i + 1] + 1); 207 pj = b->j + bdiag[i + 1] + 1; 208 nz = bdiag[i] - bdiag[i + 1] - 1; 209 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 210 } 211 PetscCall(PetscFree2(rtmp, mwork)); 212 213 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 214 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering; 215 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering; 216 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 217 C->assembled = PETSC_TRUE; 218 219 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 220 PetscFunctionReturn(PETSC_SUCCESS); 221 } 222 223 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) 224 { 225 Mat C = B; 226 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 227 IS isrow = b->row, isicol = b->icol; 228 const PetscInt *r, *ic; 229 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 230 PetscInt *ajtmpold, *ajtmp, nz, row; 231 PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 232 MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4; 233 MatScalar p1, p2, p3, p4; 234 MatScalar *ba = b->a, *aa = a->a; 235 PetscReal shift = info->shiftamount; 236 PetscBool allowzeropivot, zeropivotdetected; 237 238 PetscFunctionBegin; 239 allowzeropivot = PetscNot(A->erroriffailure); 240 PetscCall(ISGetIndices(isrow, &r)); 241 PetscCall(ISGetIndices(isicol, &ic)); 242 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 243 244 for (i = 0; i < n; i++) { 245 nz = bi[i + 1] - bi[i]; 246 ajtmp = bj + bi[i]; 247 for (j = 0; j < nz; j++) { 248 x = rtmp + 4 * ajtmp[j]; 249 x[0] = x[1] = x[2] = x[3] = 0.0; 250 } 251 /* load in initial (unfactored row) */ 252 idx = r[i]; 253 nz = ai[idx + 1] - ai[idx]; 254 ajtmpold = aj + ai[idx]; 255 v = aa + 4 * ai[idx]; 256 for (j = 0; j < nz; j++) { 257 x = rtmp + 4 * ic[ajtmpold[j]]; 258 x[0] = v[0]; 259 x[1] = v[1]; 260 x[2] = v[2]; 261 x[3] = v[3]; 262 v += 4; 263 } 264 row = *ajtmp++; 265 while (row < i) { 266 pc = rtmp + 4 * row; 267 p1 = pc[0]; 268 p2 = pc[1]; 269 p3 = pc[2]; 270 p4 = pc[3]; 271 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 272 pv = ba + 4 * diag_offset[row]; 273 pj = bj + diag_offset[row] + 1; 274 x1 = pv[0]; 275 x2 = pv[1]; 276 x3 = pv[2]; 277 x4 = pv[3]; 278 pc[0] = m1 = p1 * x1 + p3 * x2; 279 pc[1] = m2 = p2 * x1 + p4 * x2; 280 pc[2] = m3 = p1 * x3 + p3 * x4; 281 pc[3] = m4 = p2 * x3 + p4 * x4; 282 nz = bi[row + 1] - diag_offset[row] - 1; 283 pv += 4; 284 for (j = 0; j < nz; j++) { 285 x1 = pv[0]; 286 x2 = pv[1]; 287 x3 = pv[2]; 288 x4 = pv[3]; 289 x = rtmp + 4 * pj[j]; 290 x[0] -= m1 * x1 + m3 * x2; 291 x[1] -= m2 * x1 + m4 * x2; 292 x[2] -= m1 * x3 + m3 * x4; 293 x[3] -= m2 * x3 + m4 * x4; 294 pv += 4; 295 } 296 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 297 } 298 row = *ajtmp++; 299 } 300 /* finished row so stick it into b->a */ 301 pv = ba + 4 * bi[i]; 302 pj = bj + bi[i]; 303 nz = bi[i + 1] - bi[i]; 304 for (j = 0; j < nz; j++) { 305 x = rtmp + 4 * pj[j]; 306 pv[0] = x[0]; 307 pv[1] = x[1]; 308 pv[2] = x[2]; 309 pv[3] = x[3]; 310 pv += 4; 311 } 312 /* invert diagonal block */ 313 w = ba + 4 * diag_offset[i]; 314 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 315 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 316 } 317 318 PetscCall(PetscFree(rtmp)); 319 PetscCall(ISRestoreIndices(isicol, &ic)); 320 PetscCall(ISRestoreIndices(isrow, &r)); 321 322 C->ops->solve = MatSolve_SeqBAIJ_2_inplace; 323 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace; 324 C->assembled = PETSC_TRUE; 325 326 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 327 PetscFunctionReturn(PETSC_SUCCESS); 328 } 329 /* 330 Version for when blocks are 2 by 2 Using natural ordering 331 */ 332 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 333 { 334 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 335 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 336 PetscInt *ajtmpold, *ajtmp, nz, row; 337 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 338 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 339 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4; 340 MatScalar *ba = b->a, *aa = a->a; 341 PetscReal shift = info->shiftamount; 342 PetscBool allowzeropivot, zeropivotdetected; 343 344 PetscFunctionBegin; 345 allowzeropivot = PetscNot(A->erroriffailure); 346 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 347 for (i = 0; i < n; i++) { 348 nz = bi[i + 1] - bi[i]; 349 ajtmp = bj + bi[i]; 350 for (j = 0; j < nz; j++) { 351 x = rtmp + 4 * ajtmp[j]; 352 x[0] = x[1] = x[2] = x[3] = 0.0; 353 } 354 /* load in initial (unfactored row) */ 355 nz = ai[i + 1] - ai[i]; 356 ajtmpold = aj + ai[i]; 357 v = aa + 4 * ai[i]; 358 for (j = 0; j < nz; j++) { 359 x = rtmp + 4 * ajtmpold[j]; 360 x[0] = v[0]; 361 x[1] = v[1]; 362 x[2] = v[2]; 363 x[3] = v[3]; 364 v += 4; 365 } 366 row = *ajtmp++; 367 while (row < i) { 368 pc = rtmp + 4 * row; 369 p1 = pc[0]; 370 p2 = pc[1]; 371 p3 = pc[2]; 372 p4 = pc[3]; 373 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 374 pv = ba + 4 * diag_offset[row]; 375 pj = bj + diag_offset[row] + 1; 376 x1 = pv[0]; 377 x2 = pv[1]; 378 x3 = pv[2]; 379 x4 = pv[3]; 380 pc[0] = m1 = p1 * x1 + p3 * x2; 381 pc[1] = m2 = p2 * x1 + p4 * x2; 382 pc[2] = m3 = p1 * x3 + p3 * x4; 383 pc[3] = m4 = p2 * x3 + p4 * x4; 384 nz = bi[row + 1] - diag_offset[row] - 1; 385 pv += 4; 386 for (j = 0; j < nz; j++) { 387 x1 = pv[0]; 388 x2 = pv[1]; 389 x3 = pv[2]; 390 x4 = pv[3]; 391 x = rtmp + 4 * pj[j]; 392 x[0] -= m1 * x1 + m3 * x2; 393 x[1] -= m2 * x1 + m4 * x2; 394 x[2] -= m1 * x3 + m3 * x4; 395 x[3] -= m2 * x3 + m4 * x4; 396 pv += 4; 397 } 398 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 399 } 400 row = *ajtmp++; 401 } 402 /* finished row so stick it into b->a */ 403 pv = ba + 4 * bi[i]; 404 pj = bj + bi[i]; 405 nz = bi[i + 1] - bi[i]; 406 for (j = 0; j < nz; j++) { 407 x = rtmp + 4 * pj[j]; 408 pv[0] = x[0]; 409 pv[1] = x[1]; 410 pv[2] = x[2]; 411 pv[3] = x[3]; 412 /* 413 printf(" col %d:",pj[j]); 414 PetscInt j1; 415 for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1)); 416 printf("\n"); 417 */ 418 pv += 4; 419 } 420 /* invert diagonal block */ 421 w = ba + 4 * diag_offset[i]; 422 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 423 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 424 } 425 426 PetscCall(PetscFree(rtmp)); 427 428 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace; 429 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace; 430 C->assembled = PETSC_TRUE; 431 432 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 433 PetscFunctionReturn(PETSC_SUCCESS); 434 } 435 436 /* 437 Version for when blocks are 1 by 1. 438 */ 439 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) 440 { 441 Mat C = B; 442 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 443 IS isrow = b->row, isicol = b->icol; 444 const PetscInt *r, *ic, *ics; 445 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 446 PetscInt i, j, k, nz, nzL, row, *pj; 447 const PetscInt *ajtmp, *bjtmp; 448 MatScalar *rtmp, *pc, multiplier, *pv; 449 const MatScalar *aa = a->a, *v; 450 PetscBool row_identity, col_identity; 451 FactorShiftCtx sctx; 452 const PetscInt *ddiag; 453 PetscReal rs; 454 MatScalar d; 455 456 PetscFunctionBegin; 457 /* MatPivotSetUp(): initialize shift context sctx */ 458 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 459 460 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 461 ddiag = a->diag; 462 sctx.shift_top = info->zeropivot; 463 for (i = 0; i < n; i++) { 464 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 465 d = (aa)[ddiag[i]]; 466 rs = -PetscAbsScalar(d) - PetscRealPart(d); 467 v = aa + ai[i]; 468 nz = ai[i + 1] - ai[i]; 469 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 470 if (rs > sctx.shift_top) sctx.shift_top = rs; 471 } 472 sctx.shift_top *= 1.1; 473 sctx.nshift_max = 5; 474 sctx.shift_lo = 0.; 475 sctx.shift_hi = 1.; 476 } 477 478 PetscCall(ISGetIndices(isrow, &r)); 479 PetscCall(ISGetIndices(isicol, &ic)); 480 PetscCall(PetscMalloc1(n + 1, &rtmp)); 481 ics = ic; 482 483 do { 484 sctx.newshift = PETSC_FALSE; 485 for (i = 0; i < n; i++) { 486 /* zero rtmp */ 487 /* L part */ 488 nz = bi[i + 1] - bi[i]; 489 bjtmp = bj + bi[i]; 490 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 491 492 /* U part */ 493 nz = bdiag[i] - bdiag[i + 1]; 494 bjtmp = bj + bdiag[i + 1] + 1; 495 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 496 497 /* load in initial (unfactored row) */ 498 nz = ai[r[i] + 1] - ai[r[i]]; 499 ajtmp = aj + ai[r[i]]; 500 v = aa + ai[r[i]]; 501 for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j]; 502 503 /* ZeropivotApply() */ 504 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 505 506 /* elimination */ 507 bjtmp = bj + bi[i]; 508 row = *bjtmp++; 509 nzL = bi[i + 1] - bi[i]; 510 for (k = 0; k < nzL; k++) { 511 pc = rtmp + row; 512 if (*pc != (PetscScalar)0.0) { 513 pv = b->a + bdiag[row]; 514 multiplier = *pc * (*pv); 515 *pc = multiplier; 516 517 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 518 pv = b->a + bdiag[row + 1] + 1; 519 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 520 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 521 PetscCall(PetscLogFlops(2.0 * nz)); 522 } 523 row = *bjtmp++; 524 } 525 526 /* finished row so stick it into b->a */ 527 rs = 0.0; 528 /* L part */ 529 pv = b->a + bi[i]; 530 pj = b->j + bi[i]; 531 nz = bi[i + 1] - bi[i]; 532 for (j = 0; j < nz; j++) { 533 pv[j] = rtmp[pj[j]]; 534 rs += PetscAbsScalar(pv[j]); 535 } 536 537 /* U part */ 538 pv = b->a + bdiag[i + 1] + 1; 539 pj = b->j + bdiag[i + 1] + 1; 540 nz = bdiag[i] - bdiag[i + 1] - 1; 541 for (j = 0; j < nz; j++) { 542 pv[j] = rtmp[pj[j]]; 543 rs += PetscAbsScalar(pv[j]); 544 } 545 546 sctx.rs = rs; 547 sctx.pv = rtmp[i]; 548 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 549 if (sctx.newshift) break; /* break for-loop */ 550 rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */ 551 552 /* Mark diagonal and invert diagonal for simpler triangular solves */ 553 pv = b->a + bdiag[i]; 554 *pv = (PetscScalar)1.0 / rtmp[i]; 555 556 } /* endof for (i=0; i<n; i++) { */ 557 558 /* MatPivotRefine() */ 559 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 560 /* 561 * if no shift in this attempt & shifting & started shifting & can refine, 562 * then try lower shift 563 */ 564 sctx.shift_hi = sctx.shift_fraction; 565 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 566 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 567 sctx.newshift = PETSC_TRUE; 568 sctx.nshift++; 569 } 570 } while (sctx.newshift); 571 572 PetscCall(PetscFree(rtmp)); 573 PetscCall(ISRestoreIndices(isicol, &ic)); 574 PetscCall(ISRestoreIndices(isrow, &r)); 575 576 PetscCall(ISIdentity(isrow, &row_identity)); 577 PetscCall(ISIdentity(isicol, &col_identity)); 578 if (row_identity && col_identity) { 579 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 580 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering; 581 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering; 582 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 583 } else { 584 C->ops->solve = MatSolve_SeqBAIJ_1; 585 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 586 } 587 C->assembled = PETSC_TRUE; 588 PetscCall(PetscLogFlops(C->cmap->n)); 589 590 /* MatShiftView(A,info,&sctx) */ 591 if (sctx.nshift) { 592 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 593 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 594 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 595 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 596 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 597 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 598 } 599 } 600 PetscFunctionReturn(PETSC_SUCCESS); 601 } 602 603 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) 604 { 605 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 606 IS isrow = b->row, isicol = b->icol; 607 const PetscInt *r, *ic; 608 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 609 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 610 PetscInt *diag_offset = b->diag, diag, *pj; 611 MatScalar *pv, *v, *rtmp, multiplier, *pc; 612 MatScalar *ba = b->a, *aa = a->a; 613 PetscBool row_identity, col_identity; 614 615 PetscFunctionBegin; 616 PetscCall(ISGetIndices(isrow, &r)); 617 PetscCall(ISGetIndices(isicol, &ic)); 618 PetscCall(PetscMalloc1(n + 1, &rtmp)); 619 620 for (i = 0; i < n; i++) { 621 nz = bi[i + 1] - bi[i]; 622 ajtmp = bj + bi[i]; 623 for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0; 624 625 /* load in initial (unfactored row) */ 626 nz = ai[r[i] + 1] - ai[r[i]]; 627 ajtmpold = aj + ai[r[i]]; 628 v = aa + ai[r[i]]; 629 for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 630 631 row = *ajtmp++; 632 while (row < i) { 633 pc = rtmp + row; 634 if (*pc != 0.0) { 635 pv = ba + diag_offset[row]; 636 pj = bj + diag_offset[row] + 1; 637 multiplier = *pc * *pv++; 638 *pc = multiplier; 639 nz = bi[row + 1] - diag_offset[row] - 1; 640 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 641 PetscCall(PetscLogFlops(1.0 + 2.0 * nz)); 642 } 643 row = *ajtmp++; 644 } 645 /* finished row so stick it into b->a */ 646 pv = ba + bi[i]; 647 pj = bj + bi[i]; 648 nz = bi[i + 1] - bi[i]; 649 for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]]; 650 diag = diag_offset[i] - bi[i]; 651 /* check pivot entry for current row */ 652 PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 653 pv[diag] = 1.0 / pv[diag]; 654 } 655 656 PetscCall(PetscFree(rtmp)); 657 PetscCall(ISRestoreIndices(isicol, &ic)); 658 PetscCall(ISRestoreIndices(isrow, &r)); 659 PetscCall(ISIdentity(isrow, &row_identity)); 660 PetscCall(ISIdentity(isicol, &col_identity)); 661 if (row_identity && col_identity) { 662 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace; 663 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace; 664 } else { 665 C->ops->solve = MatSolve_SeqBAIJ_1_inplace; 666 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace; 667 } 668 C->assembled = PETSC_TRUE; 669 PetscCall(PetscLogFlops(C->cmap->n)); 670 PetscFunctionReturn(PETSC_SUCCESS); 671 } 672 673 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) 674 { 675 PetscFunctionBegin; 676 *type = MATSOLVERPETSC; 677 PetscFunctionReturn(PETSC_SUCCESS); 678 } 679 680 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) 681 { 682 PetscInt n = A->rmap->n; 683 684 PetscFunctionBegin; 685 #if defined(PETSC_USE_COMPLEX) 686 PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported"); 687 #endif 688 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 689 PetscCall(MatSetSizes(*B, n, n, n, n)); 690 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 691 PetscCall(MatSetType(*B, MATSEQBAIJ)); 692 693 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ; 694 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ; 695 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 696 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 697 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 698 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 699 PetscCall(MatSetType(*B, MATSEQSBAIJ)); 700 PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL)); 701 702 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ; 703 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ; 704 /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */ 705 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 706 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 707 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported"); 708 (*B)->factortype = ftype; 709 (*B)->canuseordering = PETSC_TRUE; 710 711 PetscCall(PetscFree((*B)->solvertype)); 712 PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype)); 713 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc)); 714 PetscFunctionReturn(PETSC_SUCCESS); 715 } 716 717 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) 718 { 719 Mat C; 720 721 PetscFunctionBegin; 722 PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C)); 723 PetscCall(MatLUFactorSymbolic(C, A, row, col, info)); 724 PetscCall(MatLUFactorNumeric(C, A, info)); 725 726 A->ops->solve = C->ops->solve; 727 A->ops->solvetranspose = C->ops->solvetranspose; 728 729 PetscCall(MatHeaderMerge(A, &C)); 730 PetscFunctionReturn(PETSC_SUCCESS); 731 } 732 733 #include <../src/mat/impls/sbaij/seq/sbaij.h> 734 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) 735 { 736 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 737 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 738 IS ip = b->row; 739 const PetscInt *rip; 740 PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol; 741 PetscInt *ai = a->i, *aj = a->j; 742 PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz; 743 MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi; 744 PetscReal rs; 745 FactorShiftCtx sctx; 746 747 PetscFunctionBegin; 748 if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */ 749 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 750 PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info)); 751 PetscCall(MatDestroy(&a->sbaijMat)); 752 PetscFunctionReturn(PETSC_SUCCESS); 753 } 754 755 /* MatPivotSetUp(): initialize shift context sctx */ 756 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 757 758 PetscCall(ISGetIndices(ip, &rip)); 759 PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl)); 760 761 sctx.shift_amount = 0.; 762 sctx.nshift = 0; 763 do { 764 sctx.newshift = PETSC_FALSE; 765 for (i = 0; i < mbs; i++) { 766 rtmp[i] = 0.0; 767 jl[i] = mbs; 768 il[0] = 0; 769 } 770 771 for (k = 0; k < mbs; k++) { 772 bval = ba + bi[k]; 773 /* initialize k-th row by the perm[k]-th row of A */ 774 jmin = ai[rip[k]]; 775 jmax = ai[rip[k] + 1]; 776 for (j = jmin; j < jmax; j++) { 777 col = rip[aj[j]]; 778 if (col >= k) { /* only take upper triangular entry */ 779 rtmp[col] = aa[j]; 780 *bval++ = 0.0; /* for in-place factorization */ 781 } 782 } 783 784 /* shift the diagonal of the matrix */ 785 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 786 787 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 788 dk = rtmp[k]; 789 i = jl[k]; /* first row to be added to k_th row */ 790 791 while (i < k) { 792 nexti = jl[i]; /* next row to be added to k_th row */ 793 794 /* compute multiplier, update diag(k) and U(i,k) */ 795 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 796 uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */ 797 dk += uikdi * ba[ili]; 798 ba[ili] = uikdi; /* -U(i,k) */ 799 800 /* add multiple of row i to k-th row */ 801 jmin = ili + 1; 802 jmax = bi[i + 1]; 803 if (jmin < jmax) { 804 for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j]; 805 /* update il and jl for row i */ 806 il[i] = jmin; 807 j = bj[jmin]; 808 jl[i] = jl[j]; 809 jl[j] = i; 810 } 811 i = nexti; 812 } 813 814 /* shift the diagonals when zero pivot is detected */ 815 /* compute rs=sum of abs(off-diagonal) */ 816 rs = 0.0; 817 jmin = bi[k] + 1; 818 nz = bi[k + 1] - jmin; 819 if (nz) { 820 bcol = bj + jmin; 821 while (nz--) { 822 rs += PetscAbsScalar(rtmp[*bcol]); 823 bcol++; 824 } 825 } 826 827 sctx.rs = rs; 828 sctx.pv = dk; 829 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 830 if (sctx.newshift) break; 831 dk = sctx.pv; 832 833 /* copy data into U(k,:) */ 834 ba[bi[k]] = 1.0 / dk; /* U(k,k) */ 835 jmin = bi[k] + 1; 836 jmax = bi[k + 1]; 837 if (jmin < jmax) { 838 for (j = jmin; j < jmax; j++) { 839 col = bj[j]; 840 ba[j] = rtmp[col]; 841 rtmp[col] = 0.0; 842 } 843 /* add the k-th row into il and jl */ 844 il[k] = jmin; 845 i = bj[jmin]; 846 jl[k] = jl[i]; 847 jl[i] = k; 848 } 849 } 850 } while (sctx.newshift); 851 PetscCall(PetscFree3(rtmp, il, jl)); 852 853 PetscCall(ISRestoreIndices(ip, &rip)); 854 855 C->assembled = PETSC_TRUE; 856 C->preallocated = PETSC_TRUE; 857 858 PetscCall(PetscLogFlops(C->rmap->N)); 859 if (sctx.nshift) { 860 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 861 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 862 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 863 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 864 } 865 } 866 PetscFunctionReturn(PETSC_SUCCESS); 867 } 868 869 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) 870 { 871 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 872 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 873 PetscInt i, j, am = a->mbs; 874 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 875 PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz; 876 MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval; 877 PetscReal rs; 878 FactorShiftCtx sctx; 879 880 PetscFunctionBegin; 881 /* MatPivotSetUp(): initialize shift context sctx */ 882 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 883 884 PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl)); 885 886 do { 887 sctx.newshift = PETSC_FALSE; 888 for (i = 0; i < am; i++) { 889 rtmp[i] = 0.0; 890 jl[i] = am; 891 il[0] = 0; 892 } 893 894 for (k = 0; k < am; k++) { 895 /* initialize k-th row with elements nonzero in row perm(k) of A */ 896 nz = ai[k + 1] - ai[k]; 897 acol = aj + ai[k]; 898 aval = aa + ai[k]; 899 bval = ba + bi[k]; 900 while (nz--) { 901 if (*acol < k) { /* skip lower triangular entries */ 902 acol++; 903 aval++; 904 } else { 905 rtmp[*acol++] = *aval++; 906 *bval++ = 0.0; /* for in-place factorization */ 907 } 908 } 909 910 /* shift the diagonal of the matrix */ 911 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 912 913 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 914 dk = rtmp[k]; 915 i = jl[k]; /* first row to be added to k_th row */ 916 917 while (i < k) { 918 nexti = jl[i]; /* next row to be added to k_th row */ 919 /* compute multiplier, update D(k) and U(i,k) */ 920 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 921 uikdi = -ba[ili] * ba[bi[i]]; 922 dk += uikdi * ba[ili]; 923 ba[ili] = uikdi; /* -U(i,k) */ 924 925 /* add multiple of row i to k-th row ... */ 926 jmin = ili + 1; 927 nz = bi[i + 1] - jmin; 928 if (nz > 0) { 929 bcol = bj + jmin; 930 bval = ba + jmin; 931 while (nz--) rtmp[*bcol++] += uikdi * (*bval++); 932 /* update il and jl for i-th row */ 933 il[i] = jmin; 934 j = bj[jmin]; 935 jl[i] = jl[j]; 936 jl[j] = i; 937 } 938 i = nexti; 939 } 940 941 /* shift the diagonals when zero pivot is detected */ 942 /* compute rs=sum of abs(off-diagonal) */ 943 rs = 0.0; 944 jmin = bi[k] + 1; 945 nz = bi[k + 1] - jmin; 946 if (nz) { 947 bcol = bj + jmin; 948 while (nz--) { 949 rs += PetscAbsScalar(rtmp[*bcol]); 950 bcol++; 951 } 952 } 953 954 sctx.rs = rs; 955 sctx.pv = dk; 956 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 957 if (sctx.newshift) break; /* sctx.shift_amount is updated */ 958 dk = sctx.pv; 959 960 /* copy data into U(k,:) */ 961 ba[bi[k]] = 1.0 / dk; 962 jmin = bi[k] + 1; 963 nz = bi[k + 1] - jmin; 964 if (nz) { 965 bcol = bj + jmin; 966 bval = ba + jmin; 967 while (nz--) { 968 *bval++ = rtmp[*bcol]; 969 rtmp[*bcol++] = 0.0; 970 } 971 /* add k-th row into il and jl */ 972 il[k] = jmin; 973 i = bj[jmin]; 974 jl[k] = jl[i]; 975 jl[i] = k; 976 } 977 } 978 } while (sctx.newshift); 979 PetscCall(PetscFree3(rtmp, il, jl)); 980 981 C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 982 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 983 C->assembled = PETSC_TRUE; 984 C->preallocated = PETSC_TRUE; 985 986 PetscCall(PetscLogFlops(C->rmap->N)); 987 if (sctx.nshift) { 988 if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 989 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 990 } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 991 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 992 } 993 } 994 PetscFunctionReturn(PETSC_SUCCESS); 995 } 996 997 #include <petscbt.h> 998 #include <../src/mat/utils/freespace.h> 999 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1000 { 1001 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1002 Mat_SeqSBAIJ *b; 1003 Mat B; 1004 PetscBool perm_identity, missing; 1005 PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui; 1006 const PetscInt *rip; 1007 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 1008 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 1009 PetscReal fill = info->fill, levels = info->levels; 1010 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1011 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 1012 PetscBT lnkbt; 1013 1014 PetscFunctionBegin; 1015 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1016 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1017 1018 if (bs > 1) { 1019 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1020 fact->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1021 1022 PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info)); 1023 PetscFunctionReturn(PETSC_SUCCESS); 1024 } 1025 1026 PetscCall(ISIdentity(perm, &perm_identity)); 1027 PetscCall(ISGetIndices(perm, &rip)); 1028 1029 /* special case that simply copies fill pattern */ 1030 if (!levels && perm_identity) { 1031 PetscCall(PetscMalloc1(am + 1, &ui)); 1032 for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */ 1033 B = fact; 1034 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui)); 1035 1036 b = (Mat_SeqSBAIJ *)B->data; 1037 uj = b->j; 1038 for (i = 0; i < am; i++) { 1039 aj = a->j + a->diag[i]; 1040 for (j = 0; j < ui[i]; j++) *uj++ = *aj++; 1041 b->ilen[i] = ui[i]; 1042 } 1043 PetscCall(PetscFree(ui)); 1044 1045 B->factortype = MAT_FACTOR_NONE; 1046 1047 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1048 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1049 B->factortype = MAT_FACTOR_ICC; 1050 1051 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1052 PetscFunctionReturn(PETSC_SUCCESS); 1053 } 1054 1055 /* initialization */ 1056 PetscCall(PetscMalloc1(am + 1, &ui)); 1057 ui[0] = 0; 1058 PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl)); 1059 1060 /* jl: linked list for storing indices of the pivot rows 1061 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 1062 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 1063 for (i = 0; i < am; i++) { 1064 jl[i] = am; 1065 il[i] = 0; 1066 } 1067 1068 /* create and initialize a linked list for storing column indices of the active row k */ 1069 nlnk = am + 1; 1070 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 1071 1072 /* initial FreeSpace size is fill*(ai[am]+am)/2 */ 1073 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space)); 1074 1075 current_space = free_space; 1076 1077 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl)); 1078 current_space_lvl = free_space_lvl; 1079 1080 for (k = 0; k < am; k++) { /* for each active row k */ 1081 /* initialize lnk by the column indices of row rip[k] of A */ 1082 nzk = 0; 1083 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1084 ncols_upper = 0; 1085 cols = cols_lvl + am; 1086 for (j = 0; j < ncols; j++) { 1087 i = rip[*(aj + ai[rip[k]] + j)]; 1088 if (i >= k) { /* only take upper triangular entry */ 1089 cols[ncols_upper] = i; 1090 cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */ 1091 ncols_upper++; 1092 } 1093 } 1094 PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1095 nzk += nlnk; 1096 1097 /* update lnk by computing fill-in for each pivot row to be merged in */ 1098 prow = jl[k]; /* 1st pivot row */ 1099 1100 while (prow < k) { 1101 nextprow = jl[prow]; 1102 1103 /* merge prow into k-th row */ 1104 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 1105 jmax = ui[prow + 1]; 1106 ncols = jmax - jmin; 1107 i = jmin - ui[prow]; 1108 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 1109 for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j); 1110 PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1111 nzk += nlnk; 1112 1113 /* update il and jl for prow */ 1114 if (jmin < jmax) { 1115 il[prow] = jmin; 1116 1117 j = *cols; 1118 jl[prow] = jl[j]; 1119 jl[j] = prow; 1120 } 1121 prow = nextprow; 1122 } 1123 1124 /* if free space is not available, make more free space */ 1125 if (current_space->local_remaining < nzk) { 1126 i = am - k + 1; /* num of unfactored rows */ 1127 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1128 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1129 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 1130 reallocs++; 1131 } 1132 1133 /* copy data into free_space and free_space_lvl, then initialize lnk */ 1134 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 1135 1136 /* add the k-th row into il and jl */ 1137 if (nzk - 1 > 0) { 1138 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 1139 jl[k] = jl[i]; 1140 jl[i] = k; 1141 il[k] = ui[k] + 1; 1142 } 1143 uj_ptr[k] = current_space->array; 1144 uj_lvl_ptr[k] = current_space_lvl->array; 1145 1146 current_space->array += nzk; 1147 current_space->local_used += nzk; 1148 current_space->local_remaining -= nzk; 1149 1150 current_space_lvl->array += nzk; 1151 current_space_lvl->local_used += nzk; 1152 current_space_lvl->local_remaining -= nzk; 1153 1154 ui[k + 1] = ui[k] + nzk; 1155 } 1156 1157 PetscCall(ISRestoreIndices(perm, &rip)); 1158 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 1159 PetscCall(PetscFree(cols_lvl)); 1160 1161 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1162 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 1163 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1164 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 1165 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 1166 1167 /* put together the new matrix in MATSEQSBAIJ format */ 1168 B = fact; 1169 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 1170 1171 b = (Mat_SeqSBAIJ *)B->data; 1172 b->free_a = PETSC_TRUE; 1173 b->free_ij = PETSC_TRUE; 1174 1175 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 1176 1177 b->j = uj; 1178 b->i = ui; 1179 b->diag = NULL; 1180 b->ilen = NULL; 1181 b->imax = NULL; 1182 b->row = perm; 1183 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1184 1185 PetscCall(PetscObjectReference((PetscObject)perm)); 1186 1187 b->icol = perm; 1188 1189 PetscCall(PetscObjectReference((PetscObject)perm)); 1190 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 1191 1192 b->maxnz = b->nz = ui[am]; 1193 1194 B->info.factor_mallocs = reallocs; 1195 B->info.fill_ratio_given = fill; 1196 if (ai[am] != 0.) { 1197 /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */ 1198 B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am); 1199 } else { 1200 B->info.fill_ratio_needed = 0.0; 1201 } 1202 #if defined(PETSC_USE_INFO) 1203 if (ai[am] != 0) { 1204 PetscReal af = B->info.fill_ratio_needed; 1205 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1206 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1207 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1208 } else { 1209 PetscCall(PetscInfo(A, "Empty matrix\n")); 1210 } 1211 #endif 1212 if (perm_identity) { 1213 B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1214 B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1215 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1216 } else { 1217 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1218 } 1219 PetscFunctionReturn(PETSC_SUCCESS); 1220 } 1221 1222 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1223 { 1224 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1225 Mat_SeqSBAIJ *b; 1226 Mat B; 1227 PetscBool perm_identity, missing; 1228 PetscReal fill = info->fill; 1229 const PetscInt *rip; 1230 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow; 1231 PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow; 1232 PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr; 1233 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1234 PetscBT lnkbt; 1235 1236 PetscFunctionBegin; 1237 if (bs > 1) { /* convert to seqsbaij */ 1238 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1239 fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1240 1241 PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info)); 1242 PetscFunctionReturn(PETSC_SUCCESS); 1243 } 1244 1245 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1246 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1247 1248 /* check whether perm is the identity mapping */ 1249 PetscCall(ISIdentity(perm, &perm_identity)); 1250 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported"); 1251 PetscCall(ISGetIndices(perm, &rip)); 1252 1253 /* initialization */ 1254 PetscCall(PetscMalloc1(mbs + 1, &ui)); 1255 ui[0] = 0; 1256 1257 /* jl: linked list for storing indices of the pivot rows 1258 il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 1259 PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols)); 1260 for (i = 0; i < mbs; i++) { 1261 jl[i] = mbs; 1262 il[i] = 0; 1263 } 1264 1265 /* create and initialize a linked list for storing column indices of the active row k */ 1266 nlnk = mbs + 1; 1267 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1268 1269 /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */ 1270 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space)); 1271 1272 current_space = free_space; 1273 1274 for (k = 0; k < mbs; k++) { /* for each active row k */ 1275 /* initialize lnk by the column indices of row rip[k] of A */ 1276 nzk = 0; 1277 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1278 ncols_upper = 0; 1279 for (j = 0; j < ncols; j++) { 1280 i = rip[*(aj + ai[rip[k]] + j)]; 1281 if (i >= k) { /* only take upper triangular entry */ 1282 cols[ncols_upper] = i; 1283 ncols_upper++; 1284 } 1285 } 1286 PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt)); 1287 nzk += nlnk; 1288 1289 /* update lnk by computing fill-in for each pivot row to be merged in */ 1290 prow = jl[k]; /* 1st pivot row */ 1291 1292 while (prow < k) { 1293 nextprow = jl[prow]; 1294 /* merge prow into k-th row */ 1295 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 1296 jmax = ui[prow + 1]; 1297 ncols = jmax - jmin; 1298 uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 1299 PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt)); 1300 nzk += nlnk; 1301 1302 /* update il and jl for prow */ 1303 if (jmin < jmax) { 1304 il[prow] = jmin; 1305 j = *uj_ptr; 1306 jl[prow] = jl[j]; 1307 jl[j] = prow; 1308 } 1309 prow = nextprow; 1310 } 1311 1312 /* if free space is not available, make more free space */ 1313 if (current_space->local_remaining < nzk) { 1314 i = mbs - k + 1; /* num of unfactored rows */ 1315 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1316 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1317 reallocs++; 1318 } 1319 1320 /* copy data into free space, then initialize lnk */ 1321 PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt)); 1322 1323 /* add the k-th row into il and jl */ 1324 if (nzk - 1 > 0) { 1325 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 1326 jl[k] = jl[i]; 1327 jl[i] = k; 1328 il[k] = ui[k] + 1; 1329 } 1330 ui_ptr[k] = current_space->array; 1331 current_space->array += nzk; 1332 current_space->local_used += nzk; 1333 current_space->local_remaining -= nzk; 1334 1335 ui[k + 1] = ui[k] + nzk; 1336 } 1337 1338 PetscCall(ISRestoreIndices(perm, &rip)); 1339 PetscCall(PetscFree4(ui_ptr, il, jl, cols)); 1340 1341 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1342 PetscCall(PetscMalloc1(ui[mbs] + 1, &uj)); 1343 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1344 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1345 1346 /* put together the new matrix in MATSEQSBAIJ format */ 1347 B = fact; 1348 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1349 1350 b = (Mat_SeqSBAIJ *)B->data; 1351 b->free_a = PETSC_TRUE; 1352 b->free_ij = PETSC_TRUE; 1353 1354 PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a)); 1355 1356 b->j = uj; 1357 b->i = ui; 1358 b->diag = NULL; 1359 b->ilen = NULL; 1360 b->imax = NULL; 1361 b->row = perm; 1362 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1363 1364 PetscCall(PetscObjectReference((PetscObject)perm)); 1365 b->icol = perm; 1366 PetscCall(PetscObjectReference((PetscObject)perm)); 1367 PetscCall(PetscMalloc1(mbs + 1, &b->solve_work)); 1368 b->maxnz = b->nz = ui[mbs]; 1369 1370 B->info.factor_mallocs = reallocs; 1371 B->info.fill_ratio_given = fill; 1372 if (ai[mbs] != 0.) { 1373 /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */ 1374 B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs); 1375 } else { 1376 B->info.fill_ratio_needed = 0.0; 1377 } 1378 #if defined(PETSC_USE_INFO) 1379 if (ai[mbs] != 0.) { 1380 PetscReal af = B->info.fill_ratio_needed; 1381 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1382 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1383 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1384 } else { 1385 PetscCall(PetscInfo(A, "Empty matrix\n")); 1386 } 1387 #endif 1388 if (perm_identity) { 1389 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1390 } else { 1391 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1392 } 1393 PetscFunctionReturn(PETSC_SUCCESS); 1394 } 1395 1396 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) 1397 { 1398 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1399 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1400 PetscInt i, k, n = a->mbs; 1401 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1402 const MatScalar *aa = a->a, *v; 1403 PetscScalar *x, *s, *t, *ls; 1404 const PetscScalar *b; 1405 1406 PetscFunctionBegin; 1407 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 1408 PetscCall(VecGetArrayRead(bb, &b)); 1409 PetscCall(VecGetArray(xx, &x)); 1410 t = a->solve_work; 1411 1412 /* forward solve the lower triangular */ 1413 PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */ 1414 1415 for (i = 1; i < n; i++) { 1416 v = aa + bs2 * ai[i]; 1417 vi = aj + ai[i]; 1418 nz = ai[i + 1] - ai[i]; 1419 s = t + bs * i; 1420 PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */ 1421 for (k = 0; k < nz; k++) { 1422 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]); 1423 v += bs2; 1424 } 1425 } 1426 1427 /* backward solve the upper triangular */ 1428 ls = a->solve_work + A->cmap->n; 1429 for (i = n - 1; i >= 0; i--) { 1430 v = aa + bs2 * (adiag[i + 1] + 1); 1431 vi = aj + adiag[i + 1] + 1; 1432 nz = adiag[i] - adiag[i + 1] - 1; 1433 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1434 for (k = 0; k < nz; k++) { 1435 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]); 1436 v += bs2; 1437 } 1438 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */ 1439 PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs)); 1440 } 1441 1442 PetscCall(VecRestoreArrayRead(bb, &b)); 1443 PetscCall(VecRestoreArray(xx, &x)); 1444 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1445 PetscFunctionReturn(PETSC_SUCCESS); 1446 } 1447 1448 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) 1449 { 1450 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1451 IS iscol = a->col, isrow = a->row; 1452 const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1453 PetscInt i, m, n = a->mbs; 1454 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1455 const MatScalar *aa = a->a, *v; 1456 PetscScalar *x, *s, *t, *ls; 1457 const PetscScalar *b; 1458 1459 PetscFunctionBegin; 1460 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 1461 PetscCall(VecGetArrayRead(bb, &b)); 1462 PetscCall(VecGetArray(xx, &x)); 1463 t = a->solve_work; 1464 1465 PetscCall(ISGetIndices(isrow, &rout)); 1466 r = rout; 1467 PetscCall(ISGetIndices(iscol, &cout)); 1468 c = cout; 1469 1470 /* forward solve the lower triangular */ 1471 PetscCall(PetscArraycpy(t, b + bs * r[0], bs)); 1472 for (i = 1; i < n; i++) { 1473 v = aa + bs2 * ai[i]; 1474 vi = aj + ai[i]; 1475 nz = ai[i + 1] - ai[i]; 1476 s = t + bs * i; 1477 PetscCall(PetscArraycpy(s, b + bs * r[i], bs)); 1478 for (m = 0; m < nz; m++) { 1479 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]); 1480 v += bs2; 1481 } 1482 } 1483 1484 /* backward solve the upper triangular */ 1485 ls = a->solve_work + A->cmap->n; 1486 for (i = n - 1; i >= 0; i--) { 1487 v = aa + bs2 * (adiag[i + 1] + 1); 1488 vi = aj + adiag[i + 1] + 1; 1489 nz = adiag[i] - adiag[i + 1] - 1; 1490 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1491 for (m = 0; m < nz; m++) { 1492 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]); 1493 v += bs2; 1494 } 1495 PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */ 1496 PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs)); 1497 } 1498 PetscCall(ISRestoreIndices(isrow, &rout)); 1499 PetscCall(ISRestoreIndices(iscol, &cout)); 1500 PetscCall(VecRestoreArrayRead(bb, &b)); 1501 PetscCall(VecRestoreArray(xx, &x)); 1502 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1503 PetscFunctionReturn(PETSC_SUCCESS); 1504 } 1505 1506 /* 1507 For each block in an block array saves the largest absolute value in the block into another array 1508 */ 1509 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray) 1510 { 1511 PetscInt i, j; 1512 1513 PetscFunctionBegin; 1514 PetscCall(PetscArrayzero(absarray, nbs + 1)); 1515 for (i = 0; i < nbs; i++) { 1516 for (j = 0; j < bs2; j++) { 1517 if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]); 1518 } 1519 } 1520 PetscFunctionReturn(PETSC_SUCCESS); 1521 } 1522 1523 /* 1524 This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used 1525 */ 1526 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact) 1527 { 1528 Mat B = *fact; 1529 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 1530 IS isicol; 1531 const PetscInt *r, *ic; 1532 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag; 1533 PetscInt *bi, *bj, *bdiag; 1534 1535 PetscInt row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au; 1536 PetscInt nlnk, *lnk; 1537 PetscBT lnkbt; 1538 PetscBool row_identity, icol_identity; 1539 MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp; 1540 PetscInt j, nz, *pj, *bjtmp, k, ncut, *jtmp; 1541 1542 PetscReal dt = info->dt; /* shift=info->shiftamount; */ 1543 PetscInt nnz_max; 1544 PetscBool missing; 1545 PetscReal *vtmp_abs; 1546 MatScalar *v_work; 1547 PetscInt *v_pivots; 1548 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1549 1550 PetscFunctionBegin; 1551 /* symbolic factorization, can be reused */ 1552 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1553 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1554 adiag = a->diag; 1555 1556 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 1557 1558 /* bdiag is location of diagonal in factor */ 1559 PetscCall(PetscMalloc1(mbs + 1, &bdiag)); 1560 1561 /* allocate row pointers bi */ 1562 PetscCall(PetscMalloc1(2 * mbs + 2, &bi)); 1563 1564 /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */ 1565 dtcount = (PetscInt)info->dtcount; 1566 if (dtcount > mbs - 1) dtcount = mbs - 1; 1567 nnz_max = ai[mbs] + 2 * mbs * dtcount + 2; 1568 /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */ 1569 PetscCall(PetscMalloc1(nnz_max, &bj)); 1570 nnz_max = nnz_max * bs2; 1571 PetscCall(PetscMalloc1(nnz_max, &ba)); 1572 1573 /* put together the new matrix */ 1574 PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1575 1576 b = (Mat_SeqBAIJ *)B->data; 1577 b->free_a = PETSC_TRUE; 1578 b->free_ij = PETSC_TRUE; 1579 1580 b->a = ba; 1581 b->j = bj; 1582 b->i = bi; 1583 b->diag = bdiag; 1584 b->ilen = NULL; 1585 b->imax = NULL; 1586 b->row = isrow; 1587 b->col = iscol; 1588 1589 PetscCall(PetscObjectReference((PetscObject)isrow)); 1590 PetscCall(PetscObjectReference((PetscObject)iscol)); 1591 1592 b->icol = isicol; 1593 PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work)); 1594 b->maxnz = nnz_max / bs2; 1595 1596 B->factortype = MAT_FACTOR_ILUDT; 1597 B->info.factor_mallocs = 0; 1598 B->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2)); 1599 /* end of symbolic factorization */ 1600 PetscCall(ISGetIndices(isrow, &r)); 1601 PetscCall(ISGetIndices(isicol, &ic)); 1602 1603 /* linked list for storing column indices of the active row */ 1604 nlnk = mbs + 1; 1605 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1606 1607 /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */ 1608 PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp)); 1609 /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */ 1610 PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp)); 1611 PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs)); 1612 PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots)); 1613 1614 allowzeropivot = PetscNot(A->erroriffailure); 1615 bi[0] = 0; 1616 bdiag[0] = (nnz_max / bs2) - 1; /* location of diagonal in factor B */ 1617 bi[2 * mbs + 1] = bdiag[0] + 1; /* endof bj and ba array */ 1618 for (i = 0; i < mbs; i++) { 1619 /* copy initial fill into linked list */ 1620 nzi = ai[r[i] + 1] - ai[r[i]]; 1621 PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 1622 nzi_al = adiag[r[i]] - ai[r[i]]; 1623 nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1; 1624 1625 /* load in initial unfactored row */ 1626 ajtmp = aj + ai[r[i]]; 1627 PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt)); 1628 PetscCall(PetscArrayzero(rtmp, mbs * bs2)); 1629 aatmp = a->a + bs2 * ai[r[i]]; 1630 for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2)); 1631 1632 /* add pivot rows into linked list */ 1633 row = lnk[mbs]; 1634 while (row < i) { 1635 nzi_bl = bi[row + 1] - bi[row] + 1; 1636 bjtmp = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */ 1637 PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im)); 1638 nzi += nlnk; 1639 row = lnk[row]; 1640 } 1641 1642 /* copy data from lnk into jtmp, then initialize lnk */ 1643 PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt)); 1644 1645 /* numerical factorization */ 1646 bjtmp = jtmp; 1647 row = *bjtmp++; /* 1st pivot row */ 1648 1649 while (row < i) { 1650 pc = rtmp + bs2 * row; 1651 pv = ba + bs2 * bdiag[row]; /* inv(diag) of the pivot row */ 1652 PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */ 1653 PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs)); 1654 if (vtmp_abs[0] > dt) { /* apply tolerance dropping rule */ 1655 pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */ 1656 pv = ba + bs2 * (bdiag[row + 1] + 1); 1657 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */ 1658 for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); 1659 /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */ 1660 } 1661 row = *bjtmp++; 1662 } 1663 1664 /* copy sparse rtmp into contiguous vtmp; separate L and U part */ 1665 nzi_bl = 0; 1666 j = 0; 1667 while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */ 1668 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1669 nzi_bl++; 1670 j++; 1671 } 1672 nzi_bu = nzi - nzi_bl - 1; 1673 1674 while (j < nzi) { /* U-part */ 1675 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1676 j++; 1677 } 1678 1679 PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs)); 1680 1681 bjtmp = bj + bi[i]; 1682 batmp = ba + bs2 * bi[i]; 1683 /* apply level dropping rule to L part */ 1684 ncut = nzi_al + dtcount; 1685 if (ncut < nzi_bl) { 1686 PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp)); 1687 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp)); 1688 } else { 1689 ncut = nzi_bl; 1690 } 1691 for (j = 0; j < ncut; j++) { 1692 bjtmp[j] = jtmp[j]; 1693 PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2)); 1694 } 1695 bi[i + 1] = bi[i] + ncut; 1696 nzi = ncut + 1; 1697 1698 /* apply level dropping rule to U part */ 1699 ncut = nzi_au + dtcount; 1700 if (ncut < nzi_bu) { 1701 PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1)); 1702 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1)); 1703 } else { 1704 ncut = nzi_bu; 1705 } 1706 nzi += ncut; 1707 1708 /* mark bdiagonal */ 1709 bdiag[i + 1] = bdiag[i] - (ncut + 1); 1710 bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1); 1711 1712 bjtmp = bj + bdiag[i]; 1713 batmp = ba + bs2 * bdiag[i]; 1714 PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2)); 1715 *bjtmp = i; 1716 1717 bjtmp = bj + bdiag[i + 1] + 1; 1718 batmp = ba + (bdiag[i + 1] + 1) * bs2; 1719 1720 for (k = 0; k < ncut; k++) { 1721 bjtmp[k] = jtmp[nzi_bl + 1 + k]; 1722 PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2)); 1723 } 1724 1725 im[i] = nzi; /* used by PetscLLAddSortedLU() */ 1726 1727 /* invert diagonal block for simpler triangular solves - add shift??? */ 1728 batmp = ba + bs2 * bdiag[i]; 1729 1730 PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 1731 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1732 } /* for (i=0; i<mbs; i++) */ 1733 PetscCall(PetscFree3(v_work, multiplier, v_pivots)); 1734 1735 /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */ 1736 PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]); 1737 1738 PetscCall(ISRestoreIndices(isrow, &r)); 1739 PetscCall(ISRestoreIndices(isicol, &ic)); 1740 1741 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1742 1743 PetscCall(PetscFree2(im, jtmp)); 1744 PetscCall(PetscFree2(rtmp, vtmp)); 1745 1746 PetscCall(PetscLogFlops(bs2 * B->cmap->n)); 1747 b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs]; 1748 1749 PetscCall(ISIdentity(isrow, &row_identity)); 1750 PetscCall(ISIdentity(isicol, &icol_identity)); 1751 if (row_identity && icol_identity) { 1752 B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 1753 } else { 1754 B->ops->solve = MatSolve_SeqBAIJ_N; 1755 } 1756 1757 B->ops->solveadd = NULL; 1758 B->ops->solvetranspose = NULL; 1759 B->ops->solvetransposeadd = NULL; 1760 B->ops->matsolve = NULL; 1761 B->assembled = PETSC_TRUE; 1762 B->preallocated = PETSC_TRUE; 1763 PetscFunctionReturn(PETSC_SUCCESS); 1764 } 1765