1 /* 2 Factorization code for BAIJ format. 3 */ 4 #include <../src/mat/impls/baij/seq/baij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 7 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) 8 { 9 Mat C = B; 10 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 11 IS isrow = b->row, isicol = b->icol; 12 const PetscInt *r, *ic; 13 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 14 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 15 MatScalar *rtmp, *pc, *mwork, *pv; 16 MatScalar *aa = a->a, *v; 17 PetscReal shift = info->shiftamount; 18 const PetscBool allowzeropivot = PetscNot(A->erroriffailure); 19 20 PetscFunctionBegin; 21 PetscCall(ISGetIndices(isrow, &r)); 22 PetscCall(ISGetIndices(isicol, &ic)); 23 24 /* generate work space needed by the factorization */ 25 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 26 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 27 28 for (PetscInt i = 0; i < n; i++) { 29 /* zero rtmp */ 30 /* L part */ 31 PetscInt *pj; 32 PetscInt nzL, nz = bi[i + 1] - bi[i]; 33 bjtmp = bj + bi[i]; 34 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 35 36 /* U part */ 37 nz = bdiag[i] - bdiag[i + 1]; 38 bjtmp = bj + bdiag[i + 1] + 1; 39 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 40 41 /* load in initial (unfactored row) */ 42 nz = ai[r[i] + 1] - ai[r[i]]; 43 ajtmp = aj + ai[r[i]]; 44 v = aa + bs2 * ai[r[i]]; 45 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 46 47 /* elimination */ 48 bjtmp = bj + bi[i]; 49 nzL = bi[i + 1] - bi[i]; 50 for (PetscInt k = 0; k < nzL; k++) { 51 PetscBool flg = PETSC_FALSE; 52 PetscInt row = bjtmp[k]; 53 54 pc = rtmp + bs2 * row; 55 for (PetscInt j = 0; j < bs2; j++) { 56 if (pc[j] != (PetscScalar)0.0) { 57 flg = PETSC_TRUE; 58 break; 59 } 60 } 61 if (flg) { 62 pv = b->a + bs2 * bdiag[row]; 63 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 64 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 65 66 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 67 pv = b->a + bs2 * (bdiag[row + 1] + 1); 68 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 69 for (PetscInt j = 0; j < nz; j++) { 70 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 71 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 72 v = rtmp + 4 * pj[j]; 73 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 74 pv += 4; 75 } 76 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 77 } 78 } 79 80 /* finished row so stick it into b->a */ 81 /* L part */ 82 pv = b->a + bs2 * bi[i]; 83 pj = b->j + bi[i]; 84 nz = bi[i + 1] - bi[i]; 85 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 86 87 /* Mark diagonal and invert diagonal for simpler triangular solves */ 88 pv = b->a + bs2 * bdiag[i]; 89 pj = b->j + bdiag[i]; 90 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 91 { 92 PetscBool zeropivotdetected; 93 94 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 95 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 96 } 97 98 /* U part */ 99 pv = b->a + bs2 * (bdiag[i + 1] + 1); 100 pj = b->j + bdiag[i + 1] + 1; 101 nz = bdiag[i] - bdiag[i + 1] - 1; 102 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 103 } 104 105 PetscCall(PetscFree2(rtmp, mwork)); 106 PetscCall(ISRestoreIndices(isicol, &ic)); 107 PetscCall(ISRestoreIndices(isrow, &r)); 108 109 C->ops->solve = MatSolve_SeqBAIJ_2; 110 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 111 C->assembled = PETSC_TRUE; 112 113 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 114 PetscFunctionReturn(PETSC_SUCCESS); 115 } 116 117 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 118 { 119 Mat C = B; 120 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 121 PetscInt i, j, k, nz, nzL, row, *pj; 122 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 123 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 124 MatScalar *rtmp, *pc, *mwork, *pv; 125 MatScalar *aa = a->a, *v; 126 PetscInt flg; 127 PetscReal shift = info->shiftamount; 128 PetscBool allowzeropivot, zeropivotdetected; 129 130 PetscFunctionBegin; 131 allowzeropivot = PetscNot(A->erroriffailure); 132 133 /* generate work space needed by the factorization */ 134 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 135 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 136 137 for (i = 0; i < n; i++) { 138 /* zero rtmp */ 139 /* L part */ 140 nz = bi[i + 1] - bi[i]; 141 bjtmp = bj + bi[i]; 142 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 143 144 /* U part */ 145 nz = bdiag[i] - bdiag[i + 1]; 146 bjtmp = bj + bdiag[i + 1] + 1; 147 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 148 149 /* load in initial (unfactored row) */ 150 nz = ai[i + 1] - ai[i]; 151 ajtmp = aj + ai[i]; 152 v = aa + bs2 * ai[i]; 153 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 154 155 /* elimination */ 156 bjtmp = bj + bi[i]; 157 nzL = bi[i + 1] - bi[i]; 158 for (k = 0; k < nzL; k++) { 159 row = bjtmp[k]; 160 pc = rtmp + bs2 * row; 161 for (flg = 0, j = 0; j < bs2; j++) { 162 if (pc[j] != (PetscScalar)0.0) { 163 flg = 1; 164 break; 165 } 166 } 167 if (flg) { 168 pv = b->a + bs2 * bdiag[row]; 169 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 170 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 171 172 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 173 pv = b->a + bs2 * (bdiag[row + 1] + 1); 174 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 175 for (j = 0; j < nz; j++) { 176 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 177 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 178 v = rtmp + 4 * pj[j]; 179 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 180 pv += 4; 181 } 182 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 183 } 184 } 185 186 /* finished row so stick it into b->a */ 187 /* L part */ 188 pv = b->a + bs2 * bi[i]; 189 pj = b->j + bi[i]; 190 nz = bi[i + 1] - bi[i]; 191 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 192 193 /* Mark diagonal and invert diagonal for simpler triangular solves */ 194 pv = b->a + bs2 * bdiag[i]; 195 pj = b->j + bdiag[i]; 196 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 197 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 198 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 199 200 /* U part */ 201 /* 202 pv = b->a + bs2*bi[2*n-i]; 203 pj = b->j + bi[2*n-i]; 204 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 205 */ 206 pv = b->a + bs2 * (bdiag[i + 1] + 1); 207 pj = b->j + bdiag[i + 1] + 1; 208 nz = bdiag[i] - bdiag[i + 1] - 1; 209 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 210 } 211 PetscCall(PetscFree2(rtmp, mwork)); 212 213 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 214 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering; 215 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering; 216 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 217 C->assembled = PETSC_TRUE; 218 219 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 220 PetscFunctionReturn(PETSC_SUCCESS); 221 } 222 223 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) 224 { 225 Mat C = B; 226 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 227 IS isrow = b->row, isicol = b->icol; 228 const PetscInt *r, *ic; 229 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 230 PetscInt *ajtmpold, *ajtmp, nz, row; 231 PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 232 MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4; 233 MatScalar p1, p2, p3, p4; 234 MatScalar *ba = b->a, *aa = a->a; 235 PetscReal shift = info->shiftamount; 236 PetscBool allowzeropivot, zeropivotdetected; 237 238 PetscFunctionBegin; 239 allowzeropivot = PetscNot(A->erroriffailure); 240 PetscCall(ISGetIndices(isrow, &r)); 241 PetscCall(ISGetIndices(isicol, &ic)); 242 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 243 244 for (i = 0; i < n; i++) { 245 nz = bi[i + 1] - bi[i]; 246 ajtmp = bj + bi[i]; 247 for (j = 0; j < nz; j++) { 248 x = rtmp + 4 * ajtmp[j]; 249 x[0] = x[1] = x[2] = x[3] = 0.0; 250 } 251 /* load in initial (unfactored row) */ 252 idx = r[i]; 253 nz = ai[idx + 1] - ai[idx]; 254 ajtmpold = aj + ai[idx]; 255 v = aa + 4 * ai[idx]; 256 for (j = 0; j < nz; j++) { 257 x = rtmp + 4 * ic[ajtmpold[j]]; 258 x[0] = v[0]; 259 x[1] = v[1]; 260 x[2] = v[2]; 261 x[3] = v[3]; 262 v += 4; 263 } 264 row = *ajtmp++; 265 while (row < i) { 266 pc = rtmp + 4 * row; 267 p1 = pc[0]; 268 p2 = pc[1]; 269 p3 = pc[2]; 270 p4 = pc[3]; 271 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 272 pv = ba + 4 * diag_offset[row]; 273 pj = bj + diag_offset[row] + 1; 274 x1 = pv[0]; 275 x2 = pv[1]; 276 x3 = pv[2]; 277 x4 = pv[3]; 278 pc[0] = m1 = p1 * x1 + p3 * x2; 279 pc[1] = m2 = p2 * x1 + p4 * x2; 280 pc[2] = m3 = p1 * x3 + p3 * x4; 281 pc[3] = m4 = p2 * x3 + p4 * x4; 282 nz = bi[row + 1] - diag_offset[row] - 1; 283 pv += 4; 284 for (j = 0; j < nz; j++) { 285 x1 = pv[0]; 286 x2 = pv[1]; 287 x3 = pv[2]; 288 x4 = pv[3]; 289 x = rtmp + 4 * pj[j]; 290 x[0] -= m1 * x1 + m3 * x2; 291 x[1] -= m2 * x1 + m4 * x2; 292 x[2] -= m1 * x3 + m3 * x4; 293 x[3] -= m2 * x3 + m4 * x4; 294 pv += 4; 295 } 296 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 297 } 298 row = *ajtmp++; 299 } 300 /* finished row so stick it into b->a */ 301 pv = ba + 4 * bi[i]; 302 pj = bj + bi[i]; 303 nz = bi[i + 1] - bi[i]; 304 for (j = 0; j < nz; j++) { 305 x = rtmp + 4 * pj[j]; 306 pv[0] = x[0]; 307 pv[1] = x[1]; 308 pv[2] = x[2]; 309 pv[3] = x[3]; 310 pv += 4; 311 } 312 /* invert diagonal block */ 313 w = ba + 4 * diag_offset[i]; 314 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 315 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 316 } 317 318 PetscCall(PetscFree(rtmp)); 319 PetscCall(ISRestoreIndices(isicol, &ic)); 320 PetscCall(ISRestoreIndices(isrow, &r)); 321 322 C->ops->solve = MatSolve_SeqBAIJ_2_inplace; 323 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace; 324 C->assembled = PETSC_TRUE; 325 326 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 327 PetscFunctionReturn(PETSC_SUCCESS); 328 } 329 /* 330 Version for when blocks are 2 by 2 Using natural ordering 331 */ 332 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 333 { 334 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 335 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 336 PetscInt *ajtmpold, *ajtmp, nz, row; 337 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 338 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 339 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4; 340 MatScalar *ba = b->a, *aa = a->a; 341 PetscReal shift = info->shiftamount; 342 PetscBool allowzeropivot, zeropivotdetected; 343 344 PetscFunctionBegin; 345 allowzeropivot = PetscNot(A->erroriffailure); 346 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 347 for (i = 0; i < n; i++) { 348 nz = bi[i + 1] - bi[i]; 349 ajtmp = bj + bi[i]; 350 for (j = 0; j < nz; j++) { 351 x = rtmp + 4 * ajtmp[j]; 352 x[0] = x[1] = x[2] = x[3] = 0.0; 353 } 354 /* load in initial (unfactored row) */ 355 nz = ai[i + 1] - ai[i]; 356 ajtmpold = aj + ai[i]; 357 v = aa + 4 * ai[i]; 358 for (j = 0; j < nz; j++) { 359 x = rtmp + 4 * ajtmpold[j]; 360 x[0] = v[0]; 361 x[1] = v[1]; 362 x[2] = v[2]; 363 x[3] = v[3]; 364 v += 4; 365 } 366 row = *ajtmp++; 367 while (row < i) { 368 pc = rtmp + 4 * row; 369 p1 = pc[0]; 370 p2 = pc[1]; 371 p3 = pc[2]; 372 p4 = pc[3]; 373 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 374 pv = ba + 4 * diag_offset[row]; 375 pj = bj + diag_offset[row] + 1; 376 x1 = pv[0]; 377 x2 = pv[1]; 378 x3 = pv[2]; 379 x4 = pv[3]; 380 pc[0] = m1 = p1 * x1 + p3 * x2; 381 pc[1] = m2 = p2 * x1 + p4 * x2; 382 pc[2] = m3 = p1 * x3 + p3 * x4; 383 pc[3] = m4 = p2 * x3 + p4 * x4; 384 nz = bi[row + 1] - diag_offset[row] - 1; 385 pv += 4; 386 for (j = 0; j < nz; j++) { 387 x1 = pv[0]; 388 x2 = pv[1]; 389 x3 = pv[2]; 390 x4 = pv[3]; 391 x = rtmp + 4 * pj[j]; 392 x[0] -= m1 * x1 + m3 * x2; 393 x[1] -= m2 * x1 + m4 * x2; 394 x[2] -= m1 * x3 + m3 * x4; 395 x[3] -= m2 * x3 + m4 * x4; 396 pv += 4; 397 } 398 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 399 } 400 row = *ajtmp++; 401 } 402 /* finished row so stick it into b->a */ 403 pv = ba + 4 * bi[i]; 404 pj = bj + bi[i]; 405 nz = bi[i + 1] - bi[i]; 406 for (j = 0; j < nz; j++) { 407 x = rtmp + 4 * pj[j]; 408 pv[0] = x[0]; 409 pv[1] = x[1]; 410 pv[2] = x[2]; 411 pv[3] = x[3]; 412 /* 413 printf(" col %d:",pj[j]); 414 PetscInt j1; 415 for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1)); 416 printf("\n"); 417 */ 418 pv += 4; 419 } 420 /* invert diagonal block */ 421 w = ba + 4 * diag_offset[i]; 422 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 423 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 424 } 425 426 PetscCall(PetscFree(rtmp)); 427 428 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace; 429 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace; 430 C->assembled = PETSC_TRUE; 431 432 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 433 PetscFunctionReturn(PETSC_SUCCESS); 434 } 435 436 /* 437 Version for when blocks are 1 by 1. 438 */ 439 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) 440 { 441 Mat C = B; 442 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 443 IS isrow = b->row, isicol = b->icol; 444 const PetscInt *r, *ic, *ics; 445 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 446 PetscInt i, j, k, nz, nzL, row, *pj; 447 const PetscInt *ajtmp, *bjtmp; 448 MatScalar *rtmp, *pc, multiplier, *pv; 449 const MatScalar *aa = a->a, *v; 450 PetscBool row_identity, col_identity; 451 FactorShiftCtx sctx; 452 const PetscInt *ddiag; 453 PetscReal rs; 454 MatScalar d; 455 456 PetscFunctionBegin; 457 /* MatPivotSetUp(): initialize shift context sctx */ 458 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 459 460 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 461 ddiag = a->diag; 462 sctx.shift_top = info->zeropivot; 463 for (i = 0; i < n; i++) { 464 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 465 d = (aa)[ddiag[i]]; 466 rs = -PetscAbsScalar(d) - PetscRealPart(d); 467 v = aa + ai[i]; 468 nz = ai[i + 1] - ai[i]; 469 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 470 if (rs > sctx.shift_top) sctx.shift_top = rs; 471 } 472 sctx.shift_top *= 1.1; 473 sctx.nshift_max = 5; 474 sctx.shift_lo = 0.; 475 sctx.shift_hi = 1.; 476 } 477 478 PetscCall(ISGetIndices(isrow, &r)); 479 PetscCall(ISGetIndices(isicol, &ic)); 480 PetscCall(PetscMalloc1(n + 1, &rtmp)); 481 ics = ic; 482 483 do { 484 sctx.newshift = PETSC_FALSE; 485 for (i = 0; i < n; i++) { 486 /* zero rtmp */ 487 /* L part */ 488 nz = bi[i + 1] - bi[i]; 489 bjtmp = bj + bi[i]; 490 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 491 492 /* U part */ 493 nz = bdiag[i] - bdiag[i + 1]; 494 bjtmp = bj + bdiag[i + 1] + 1; 495 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 496 497 /* load in initial (unfactored row) */ 498 nz = ai[r[i] + 1] - ai[r[i]]; 499 ajtmp = aj + ai[r[i]]; 500 v = aa + ai[r[i]]; 501 for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j]; 502 503 /* ZeropivotApply() */ 504 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 505 506 /* elimination */ 507 bjtmp = bj + bi[i]; 508 row = *bjtmp++; 509 nzL = bi[i + 1] - bi[i]; 510 for (k = 0; k < nzL; k++) { 511 pc = rtmp + row; 512 if (*pc != (PetscScalar)0.0) { 513 pv = b->a + bdiag[row]; 514 multiplier = *pc * (*pv); 515 *pc = multiplier; 516 517 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 518 pv = b->a + bdiag[row + 1] + 1; 519 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 520 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 521 PetscCall(PetscLogFlops(2.0 * nz)); 522 } 523 row = *bjtmp++; 524 } 525 526 /* finished row so stick it into b->a */ 527 rs = 0.0; 528 /* L part */ 529 pv = b->a + bi[i]; 530 pj = b->j + bi[i]; 531 nz = bi[i + 1] - bi[i]; 532 for (j = 0; j < nz; j++) { 533 pv[j] = rtmp[pj[j]]; 534 rs += PetscAbsScalar(pv[j]); 535 } 536 537 /* U part */ 538 pv = b->a + bdiag[i + 1] + 1; 539 pj = b->j + bdiag[i + 1] + 1; 540 nz = bdiag[i] - bdiag[i + 1] - 1; 541 for (j = 0; j < nz; j++) { 542 pv[j] = rtmp[pj[j]]; 543 rs += PetscAbsScalar(pv[j]); 544 } 545 546 sctx.rs = rs; 547 sctx.pv = rtmp[i]; 548 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 549 if (sctx.newshift) break; /* break for-loop */ 550 rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */ 551 552 /* Mark diagonal and invert diagonal for simpler triangular solves */ 553 pv = b->a + bdiag[i]; 554 *pv = (PetscScalar)1.0 / rtmp[i]; 555 556 } /* endof for (i=0; i<n; i++) { */ 557 558 /* MatPivotRefine() */ 559 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 560 /* 561 * if no shift in this attempt & shifting & started shifting & can refine, 562 * then try lower shift 563 */ 564 sctx.shift_hi = sctx.shift_fraction; 565 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 566 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 567 sctx.newshift = PETSC_TRUE; 568 sctx.nshift++; 569 } 570 } while (sctx.newshift); 571 572 PetscCall(PetscFree(rtmp)); 573 PetscCall(ISRestoreIndices(isicol, &ic)); 574 PetscCall(ISRestoreIndices(isrow, &r)); 575 576 PetscCall(ISIdentity(isrow, &row_identity)); 577 PetscCall(ISIdentity(isicol, &col_identity)); 578 if (row_identity && col_identity) { 579 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 580 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering; 581 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering; 582 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 583 } else { 584 C->ops->solve = MatSolve_SeqBAIJ_1; 585 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 586 } 587 C->assembled = PETSC_TRUE; 588 PetscCall(PetscLogFlops(C->cmap->n)); 589 590 /* MatShiftView(A,info,&sctx) */ 591 if (sctx.nshift) { 592 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 593 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 594 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 595 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 596 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 597 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 598 } 599 } 600 PetscFunctionReturn(PETSC_SUCCESS); 601 } 602 603 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) 604 { 605 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 606 IS isrow = b->row, isicol = b->icol; 607 const PetscInt *r, *ic; 608 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 609 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 610 PetscInt *diag_offset = b->diag, diag, *pj; 611 MatScalar *pv, *v, *rtmp, multiplier, *pc; 612 MatScalar *ba = b->a, *aa = a->a; 613 PetscBool row_identity, col_identity; 614 615 PetscFunctionBegin; 616 PetscCall(ISGetIndices(isrow, &r)); 617 PetscCall(ISGetIndices(isicol, &ic)); 618 PetscCall(PetscMalloc1(n + 1, &rtmp)); 619 620 for (i = 0; i < n; i++) { 621 nz = bi[i + 1] - bi[i]; 622 ajtmp = bj + bi[i]; 623 for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0; 624 625 /* load in initial (unfactored row) */ 626 nz = ai[r[i] + 1] - ai[r[i]]; 627 ajtmpold = aj + ai[r[i]]; 628 v = aa + ai[r[i]]; 629 for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 630 631 row = *ajtmp++; 632 while (row < i) { 633 pc = rtmp + row; 634 if (*pc != 0.0) { 635 pv = ba + diag_offset[row]; 636 pj = bj + diag_offset[row] + 1; 637 multiplier = *pc * *pv++; 638 *pc = multiplier; 639 nz = bi[row + 1] - diag_offset[row] - 1; 640 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 641 PetscCall(PetscLogFlops(1.0 + 2.0 * nz)); 642 } 643 row = *ajtmp++; 644 } 645 /* finished row so stick it into b->a */ 646 pv = ba + bi[i]; 647 pj = bj + bi[i]; 648 nz = bi[i + 1] - bi[i]; 649 for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]]; 650 diag = diag_offset[i] - bi[i]; 651 /* check pivot entry for current row */ 652 PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 653 pv[diag] = 1.0 / pv[diag]; 654 } 655 656 PetscCall(PetscFree(rtmp)); 657 PetscCall(ISRestoreIndices(isicol, &ic)); 658 PetscCall(ISRestoreIndices(isrow, &r)); 659 PetscCall(ISIdentity(isrow, &row_identity)); 660 PetscCall(ISIdentity(isicol, &col_identity)); 661 if (row_identity && col_identity) { 662 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace; 663 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace; 664 } else { 665 C->ops->solve = MatSolve_SeqBAIJ_1_inplace; 666 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace; 667 } 668 C->assembled = PETSC_TRUE; 669 PetscCall(PetscLogFlops(C->cmap->n)); 670 PetscFunctionReturn(PETSC_SUCCESS); 671 } 672 673 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) 674 { 675 PetscFunctionBegin; 676 *type = MATSOLVERPETSC; 677 PetscFunctionReturn(PETSC_SUCCESS); 678 } 679 680 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) 681 { 682 PetscInt n = A->rmap->n; 683 684 PetscFunctionBegin; 685 #if defined(PETSC_USE_COMPLEX) 686 PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported"); 687 #endif 688 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 689 PetscCall(MatSetSizes(*B, n, n, n, n)); 690 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 691 PetscCall(MatSetType(*B, MATSEQBAIJ)); 692 693 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ; 694 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ; 695 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 696 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 697 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 698 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 699 PetscCall(MatSetType(*B, MATSEQSBAIJ)); 700 PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL)); 701 702 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ; 703 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ; 704 /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */ 705 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 706 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 707 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported"); 708 (*B)->factortype = ftype; 709 (*B)->canuseordering = PETSC_TRUE; 710 711 PetscCall(PetscFree((*B)->solvertype)); 712 PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype)); 713 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc)); 714 PetscFunctionReturn(PETSC_SUCCESS); 715 } 716 717 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) 718 { 719 Mat C; 720 721 PetscFunctionBegin; 722 PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C)); 723 PetscCall(MatLUFactorSymbolic(C, A, row, col, info)); 724 PetscCall(MatLUFactorNumeric(C, A, info)); 725 726 A->ops->solve = C->ops->solve; 727 A->ops->solvetranspose = C->ops->solvetranspose; 728 729 PetscCall(MatHeaderMerge(A, &C)); 730 PetscFunctionReturn(PETSC_SUCCESS); 731 } 732 733 #include <../src/mat/impls/sbaij/seq/sbaij.h> 734 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) 735 { 736 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 737 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 738 IS ip = b->row; 739 const PetscInt *rip; 740 PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol; 741 PetscInt *ai = a->i, *aj = a->j; 742 PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz; 743 MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi; 744 PetscReal rs; 745 FactorShiftCtx sctx; 746 747 PetscFunctionBegin; 748 if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */ 749 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 750 PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info)); 751 PetscCall(MatDestroy(&a->sbaijMat)); 752 PetscFunctionReturn(PETSC_SUCCESS); 753 } 754 755 /* MatPivotSetUp(): initialize shift context sctx */ 756 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 757 758 PetscCall(ISGetIndices(ip, &rip)); 759 PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl)); 760 761 sctx.shift_amount = 0.; 762 sctx.nshift = 0; 763 do { 764 sctx.newshift = PETSC_FALSE; 765 for (i = 0; i < mbs; i++) { 766 rtmp[i] = 0.0; 767 jl[i] = mbs; 768 il[0] = 0; 769 } 770 771 for (k = 0; k < mbs; k++) { 772 bval = ba + bi[k]; 773 /* initialize k-th row by the perm[k]-th row of A */ 774 jmin = ai[rip[k]]; 775 jmax = ai[rip[k] + 1]; 776 for (j = jmin; j < jmax; j++) { 777 col = rip[aj[j]]; 778 if (col >= k) { /* only take upper triangular entry */ 779 rtmp[col] = aa[j]; 780 *bval++ = 0.0; /* for in-place factorization */ 781 } 782 } 783 784 /* shift the diagonal of the matrix */ 785 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 786 787 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 788 dk = rtmp[k]; 789 i = jl[k]; /* first row to be added to k_th row */ 790 791 while (i < k) { 792 nexti = jl[i]; /* next row to be added to k_th row */ 793 794 /* compute multiplier, update diag(k) and U(i,k) */ 795 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 796 uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */ 797 dk += uikdi * ba[ili]; 798 ba[ili] = uikdi; /* -U(i,k) */ 799 800 /* add multiple of row i to k-th row */ 801 jmin = ili + 1; 802 jmax = bi[i + 1]; 803 if (jmin < jmax) { 804 for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j]; 805 /* update il and jl for row i */ 806 il[i] = jmin; 807 j = bj[jmin]; 808 jl[i] = jl[j]; 809 jl[j] = i; 810 } 811 i = nexti; 812 } 813 814 /* shift the diagonals when zero pivot is detected */ 815 /* compute rs=sum of abs(off-diagonal) */ 816 rs = 0.0; 817 jmin = bi[k] + 1; 818 nz = bi[k + 1] - jmin; 819 if (nz) { 820 bcol = bj + jmin; 821 while (nz--) { 822 rs += PetscAbsScalar(rtmp[*bcol]); 823 bcol++; 824 } 825 } 826 827 sctx.rs = rs; 828 sctx.pv = dk; 829 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 830 if (sctx.newshift) break; 831 dk = sctx.pv; 832 833 /* copy data into U(k,:) */ 834 ba[bi[k]] = 1.0 / dk; /* U(k,k) */ 835 jmin = bi[k] + 1; 836 jmax = bi[k + 1]; 837 if (jmin < jmax) { 838 for (j = jmin; j < jmax; j++) { 839 col = bj[j]; 840 ba[j] = rtmp[col]; 841 rtmp[col] = 0.0; 842 } 843 /* add the k-th row into il and jl */ 844 il[k] = jmin; 845 i = bj[jmin]; 846 jl[k] = jl[i]; 847 jl[i] = k; 848 } 849 } 850 } while (sctx.newshift); 851 PetscCall(PetscFree3(rtmp, il, jl)); 852 853 PetscCall(ISRestoreIndices(ip, &rip)); 854 855 C->assembled = PETSC_TRUE; 856 C->preallocated = PETSC_TRUE; 857 858 PetscCall(PetscLogFlops(C->rmap->N)); 859 if (sctx.nshift) { 860 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 861 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 862 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 863 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 864 } 865 } 866 PetscFunctionReturn(PETSC_SUCCESS); 867 } 868 869 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) 870 { 871 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 872 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 873 PetscInt i, j, am = a->mbs; 874 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 875 PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz; 876 MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval; 877 PetscReal rs; 878 FactorShiftCtx sctx; 879 880 PetscFunctionBegin; 881 /* MatPivotSetUp(): initialize shift context sctx */ 882 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 883 884 PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl)); 885 886 do { 887 sctx.newshift = PETSC_FALSE; 888 for (i = 0; i < am; i++) { 889 rtmp[i] = 0.0; 890 jl[i] = am; 891 il[0] = 0; 892 } 893 894 for (k = 0; k < am; k++) { 895 /* initialize k-th row with elements nonzero in row perm(k) of A */ 896 nz = ai[k + 1] - ai[k]; 897 acol = aj + ai[k]; 898 aval = aa + ai[k]; 899 bval = ba + bi[k]; 900 while (nz--) { 901 if (*acol < k) { /* skip lower triangular entries */ 902 acol++; 903 aval++; 904 } else { 905 rtmp[*acol++] = *aval++; 906 *bval++ = 0.0; /* for in-place factorization */ 907 } 908 } 909 910 /* shift the diagonal of the matrix */ 911 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 912 913 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 914 dk = rtmp[k]; 915 i = jl[k]; /* first row to be added to k_th row */ 916 917 while (i < k) { 918 nexti = jl[i]; /* next row to be added to k_th row */ 919 /* compute multiplier, update D(k) and U(i,k) */ 920 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 921 uikdi = -ba[ili] * ba[bi[i]]; 922 dk += uikdi * ba[ili]; 923 ba[ili] = uikdi; /* -U(i,k) */ 924 925 /* add multiple of row i to k-th row ... */ 926 jmin = ili + 1; 927 nz = bi[i + 1] - jmin; 928 if (nz > 0) { 929 bcol = bj + jmin; 930 bval = ba + jmin; 931 while (nz--) rtmp[*bcol++] += uikdi * (*bval++); 932 /* update il and jl for i-th row */ 933 il[i] = jmin; 934 j = bj[jmin]; 935 jl[i] = jl[j]; 936 jl[j] = i; 937 } 938 i = nexti; 939 } 940 941 /* shift the diagonals when zero pivot is detected */ 942 /* compute rs=sum of abs(off-diagonal) */ 943 rs = 0.0; 944 jmin = bi[k] + 1; 945 nz = bi[k + 1] - jmin; 946 if (nz) { 947 bcol = bj + jmin; 948 while (nz--) { 949 rs += PetscAbsScalar(rtmp[*bcol]); 950 bcol++; 951 } 952 } 953 954 sctx.rs = rs; 955 sctx.pv = dk; 956 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 957 if (sctx.newshift) break; /* sctx.shift_amount is updated */ 958 dk = sctx.pv; 959 960 /* copy data into U(k,:) */ 961 ba[bi[k]] = 1.0 / dk; 962 jmin = bi[k] + 1; 963 nz = bi[k + 1] - jmin; 964 if (nz) { 965 bcol = bj + jmin; 966 bval = ba + jmin; 967 while (nz--) { 968 *bval++ = rtmp[*bcol]; 969 rtmp[*bcol++] = 0.0; 970 } 971 /* add k-th row into il and jl */ 972 il[k] = jmin; 973 i = bj[jmin]; 974 jl[k] = jl[i]; 975 jl[i] = k; 976 } 977 } 978 } while (sctx.newshift); 979 PetscCall(PetscFree3(rtmp, il, jl)); 980 981 C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 982 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 983 C->assembled = PETSC_TRUE; 984 C->preallocated = PETSC_TRUE; 985 986 PetscCall(PetscLogFlops(C->rmap->N)); 987 if (sctx.nshift) { 988 if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 989 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 990 } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 991 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 992 } 993 } 994 PetscFunctionReturn(PETSC_SUCCESS); 995 } 996 997 #include <petscbt.h> 998 #include <../src/mat/utils/freespace.h> 999 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1000 { 1001 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1002 Mat_SeqSBAIJ *b; 1003 Mat B; 1004 PetscBool perm_identity, missing; 1005 PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui; 1006 const PetscInt *rip; 1007 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 1008 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 1009 PetscReal fill = info->fill, levels = info->levels; 1010 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1011 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 1012 PetscBT lnkbt; 1013 1014 PetscFunctionBegin; 1015 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1016 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1017 1018 if (bs > 1) { 1019 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1020 (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1021 1022 PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info)); 1023 PetscFunctionReturn(PETSC_SUCCESS); 1024 } 1025 1026 PetscCall(ISIdentity(perm, &perm_identity)); 1027 PetscCall(ISGetIndices(perm, &rip)); 1028 1029 /* special case that simply copies fill pattern */ 1030 if (!levels && perm_identity) { 1031 PetscCall(PetscMalloc1(am + 1, &ui)); 1032 for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */ 1033 B = fact; 1034 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui)); 1035 1036 b = (Mat_SeqSBAIJ *)B->data; 1037 uj = b->j; 1038 for (i = 0; i < am; i++) { 1039 aj = a->j + a->diag[i]; 1040 for (j = 0; j < ui[i]; j++) *uj++ = *aj++; 1041 b->ilen[i] = ui[i]; 1042 } 1043 PetscCall(PetscFree(ui)); 1044 1045 B->factortype = MAT_FACTOR_NONE; 1046 1047 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1048 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1049 B->factortype = MAT_FACTOR_ICC; 1050 1051 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1052 PetscFunctionReturn(PETSC_SUCCESS); 1053 } 1054 1055 /* initialization */ 1056 PetscCall(PetscMalloc1(am + 1, &ui)); 1057 ui[0] = 0; 1058 PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl)); 1059 1060 /* jl: linked list for storing indices of the pivot rows 1061 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 1062 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 1063 for (i = 0; i < am; i++) { 1064 jl[i] = am; 1065 il[i] = 0; 1066 } 1067 1068 /* create and initialize a linked list for storing column indices of the active row k */ 1069 nlnk = am + 1; 1070 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 1071 1072 /* initial FreeSpace size is fill*(ai[am]+am)/2 */ 1073 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space)); 1074 1075 current_space = free_space; 1076 1077 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl)); 1078 current_space_lvl = free_space_lvl; 1079 1080 for (k = 0; k < am; k++) { /* for each active row k */ 1081 /* initialize lnk by the column indices of row rip[k] of A */ 1082 nzk = 0; 1083 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1084 ncols_upper = 0; 1085 cols = cols_lvl + am; 1086 for (j = 0; j < ncols; j++) { 1087 i = rip[*(aj + ai[rip[k]] + j)]; 1088 if (i >= k) { /* only take upper triangular entry */ 1089 cols[ncols_upper] = i; 1090 cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */ 1091 ncols_upper++; 1092 } 1093 } 1094 PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1095 nzk += nlnk; 1096 1097 /* update lnk by computing fill-in for each pivot row to be merged in */ 1098 prow = jl[k]; /* 1st pivot row */ 1099 1100 while (prow < k) { 1101 nextprow = jl[prow]; 1102 1103 /* merge prow into k-th row */ 1104 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 1105 jmax = ui[prow + 1]; 1106 ncols = jmax - jmin; 1107 i = jmin - ui[prow]; 1108 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 1109 for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j); 1110 PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1111 nzk += nlnk; 1112 1113 /* update il and jl for prow */ 1114 if (jmin < jmax) { 1115 il[prow] = jmin; 1116 1117 j = *cols; 1118 jl[prow] = jl[j]; 1119 jl[j] = prow; 1120 } 1121 prow = nextprow; 1122 } 1123 1124 /* if free space is not available, make more free space */ 1125 if (current_space->local_remaining < nzk) { 1126 i = am - k + 1; /* num of unfactored rows */ 1127 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1128 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1129 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 1130 reallocs++; 1131 } 1132 1133 /* copy data into free_space and free_space_lvl, then initialize lnk */ 1134 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 1135 1136 /* add the k-th row into il and jl */ 1137 if (nzk - 1 > 0) { 1138 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 1139 jl[k] = jl[i]; 1140 jl[i] = k; 1141 il[k] = ui[k] + 1; 1142 } 1143 uj_ptr[k] = current_space->array; 1144 uj_lvl_ptr[k] = current_space_lvl->array; 1145 1146 current_space->array += nzk; 1147 current_space->local_used += nzk; 1148 current_space->local_remaining -= nzk; 1149 1150 current_space_lvl->array += nzk; 1151 current_space_lvl->local_used += nzk; 1152 current_space_lvl->local_remaining -= nzk; 1153 1154 ui[k + 1] = ui[k] + nzk; 1155 } 1156 1157 PetscCall(ISRestoreIndices(perm, &rip)); 1158 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 1159 PetscCall(PetscFree(cols_lvl)); 1160 1161 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1162 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 1163 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1164 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 1165 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 1166 1167 /* put together the new matrix in MATSEQSBAIJ format */ 1168 B = fact; 1169 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 1170 1171 b = (Mat_SeqSBAIJ *)B->data; 1172 b->singlemalloc = PETSC_FALSE; 1173 b->free_a = PETSC_TRUE; 1174 b->free_ij = PETSC_TRUE; 1175 1176 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 1177 1178 b->j = uj; 1179 b->i = ui; 1180 b->diag = NULL; 1181 b->ilen = NULL; 1182 b->imax = NULL; 1183 b->row = perm; 1184 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1185 1186 PetscCall(PetscObjectReference((PetscObject)perm)); 1187 1188 b->icol = perm; 1189 1190 PetscCall(PetscObjectReference((PetscObject)perm)); 1191 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 1192 1193 b->maxnz = b->nz = ui[am]; 1194 1195 B->info.factor_mallocs = reallocs; 1196 B->info.fill_ratio_given = fill; 1197 if (ai[am] != 0.) { 1198 /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */ 1199 B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am); 1200 } else { 1201 B->info.fill_ratio_needed = 0.0; 1202 } 1203 #if defined(PETSC_USE_INFO) 1204 if (ai[am] != 0) { 1205 PetscReal af = B->info.fill_ratio_needed; 1206 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1207 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1208 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1209 } else { 1210 PetscCall(PetscInfo(A, "Empty matrix\n")); 1211 } 1212 #endif 1213 if (perm_identity) { 1214 B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1215 B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1216 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1217 } else { 1218 (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1219 } 1220 PetscFunctionReturn(PETSC_SUCCESS); 1221 } 1222 1223 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1224 { 1225 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1226 Mat_SeqSBAIJ *b; 1227 Mat B; 1228 PetscBool perm_identity, missing; 1229 PetscReal fill = info->fill; 1230 const PetscInt *rip; 1231 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow; 1232 PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow; 1233 PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr; 1234 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1235 PetscBT lnkbt; 1236 1237 PetscFunctionBegin; 1238 if (bs > 1) { /* convert to seqsbaij */ 1239 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1240 (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1241 1242 PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info)); 1243 PetscFunctionReturn(PETSC_SUCCESS); 1244 } 1245 1246 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1247 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1248 1249 /* check whether perm is the identity mapping */ 1250 PetscCall(ISIdentity(perm, &perm_identity)); 1251 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported"); 1252 PetscCall(ISGetIndices(perm, &rip)); 1253 1254 /* initialization */ 1255 PetscCall(PetscMalloc1(mbs + 1, &ui)); 1256 ui[0] = 0; 1257 1258 /* jl: linked list for storing indices of the pivot rows 1259 il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 1260 PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols)); 1261 for (i = 0; i < mbs; i++) { 1262 jl[i] = mbs; 1263 il[i] = 0; 1264 } 1265 1266 /* create and initialize a linked list for storing column indices of the active row k */ 1267 nlnk = mbs + 1; 1268 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1269 1270 /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */ 1271 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space)); 1272 1273 current_space = free_space; 1274 1275 for (k = 0; k < mbs; k++) { /* for each active row k */ 1276 /* initialize lnk by the column indices of row rip[k] of A */ 1277 nzk = 0; 1278 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1279 ncols_upper = 0; 1280 for (j = 0; j < ncols; j++) { 1281 i = rip[*(aj + ai[rip[k]] + j)]; 1282 if (i >= k) { /* only take upper triangular entry */ 1283 cols[ncols_upper] = i; 1284 ncols_upper++; 1285 } 1286 } 1287 PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt)); 1288 nzk += nlnk; 1289 1290 /* update lnk by computing fill-in for each pivot row to be merged in */ 1291 prow = jl[k]; /* 1st pivot row */ 1292 1293 while (prow < k) { 1294 nextprow = jl[prow]; 1295 /* merge prow into k-th row */ 1296 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 1297 jmax = ui[prow + 1]; 1298 ncols = jmax - jmin; 1299 uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 1300 PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt)); 1301 nzk += nlnk; 1302 1303 /* update il and jl for prow */ 1304 if (jmin < jmax) { 1305 il[prow] = jmin; 1306 j = *uj_ptr; 1307 jl[prow] = jl[j]; 1308 jl[j] = prow; 1309 } 1310 prow = nextprow; 1311 } 1312 1313 /* if free space is not available, make more free space */ 1314 if (current_space->local_remaining < nzk) { 1315 i = mbs - k + 1; /* num of unfactored rows */ 1316 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1317 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1318 reallocs++; 1319 } 1320 1321 /* copy data into free space, then initialize lnk */ 1322 PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt)); 1323 1324 /* add the k-th row into il and jl */ 1325 if (nzk - 1 > 0) { 1326 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 1327 jl[k] = jl[i]; 1328 jl[i] = k; 1329 il[k] = ui[k] + 1; 1330 } 1331 ui_ptr[k] = current_space->array; 1332 current_space->array += nzk; 1333 current_space->local_used += nzk; 1334 current_space->local_remaining -= nzk; 1335 1336 ui[k + 1] = ui[k] + nzk; 1337 } 1338 1339 PetscCall(ISRestoreIndices(perm, &rip)); 1340 PetscCall(PetscFree4(ui_ptr, il, jl, cols)); 1341 1342 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1343 PetscCall(PetscMalloc1(ui[mbs] + 1, &uj)); 1344 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1345 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1346 1347 /* put together the new matrix in MATSEQSBAIJ format */ 1348 B = fact; 1349 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1350 1351 b = (Mat_SeqSBAIJ *)B->data; 1352 b->singlemalloc = PETSC_FALSE; 1353 b->free_a = PETSC_TRUE; 1354 b->free_ij = PETSC_TRUE; 1355 1356 PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a)); 1357 1358 b->j = uj; 1359 b->i = ui; 1360 b->diag = NULL; 1361 b->ilen = NULL; 1362 b->imax = NULL; 1363 b->row = perm; 1364 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1365 1366 PetscCall(PetscObjectReference((PetscObject)perm)); 1367 b->icol = perm; 1368 PetscCall(PetscObjectReference((PetscObject)perm)); 1369 PetscCall(PetscMalloc1(mbs + 1, &b->solve_work)); 1370 b->maxnz = b->nz = ui[mbs]; 1371 1372 B->info.factor_mallocs = reallocs; 1373 B->info.fill_ratio_given = fill; 1374 if (ai[mbs] != 0.) { 1375 /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */ 1376 B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs); 1377 } else { 1378 B->info.fill_ratio_needed = 0.0; 1379 } 1380 #if defined(PETSC_USE_INFO) 1381 if (ai[mbs] != 0.) { 1382 PetscReal af = B->info.fill_ratio_needed; 1383 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1384 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1385 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1386 } else { 1387 PetscCall(PetscInfo(A, "Empty matrix\n")); 1388 } 1389 #endif 1390 if (perm_identity) { 1391 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1392 } else { 1393 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1394 } 1395 PetscFunctionReturn(PETSC_SUCCESS); 1396 } 1397 1398 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) 1399 { 1400 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1401 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1402 PetscInt i, k, n = a->mbs; 1403 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1404 const MatScalar *aa = a->a, *v; 1405 PetscScalar *x, *s, *t, *ls; 1406 const PetscScalar *b; 1407 1408 PetscFunctionBegin; 1409 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 1410 PetscCall(VecGetArrayRead(bb, &b)); 1411 PetscCall(VecGetArray(xx, &x)); 1412 t = a->solve_work; 1413 1414 /* forward solve the lower triangular */ 1415 PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */ 1416 1417 for (i = 1; i < n; i++) { 1418 v = aa + bs2 * ai[i]; 1419 vi = aj + ai[i]; 1420 nz = ai[i + 1] - ai[i]; 1421 s = t + bs * i; 1422 PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */ 1423 for (k = 0; k < nz; k++) { 1424 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]); 1425 v += bs2; 1426 } 1427 } 1428 1429 /* backward solve the upper triangular */ 1430 ls = a->solve_work + A->cmap->n; 1431 for (i = n - 1; i >= 0; i--) { 1432 v = aa + bs2 * (adiag[i + 1] + 1); 1433 vi = aj + adiag[i + 1] + 1; 1434 nz = adiag[i] - adiag[i + 1] - 1; 1435 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1436 for (k = 0; k < nz; k++) { 1437 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]); 1438 v += bs2; 1439 } 1440 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */ 1441 PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs)); 1442 } 1443 1444 PetscCall(VecRestoreArrayRead(bb, &b)); 1445 PetscCall(VecRestoreArray(xx, &x)); 1446 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1447 PetscFunctionReturn(PETSC_SUCCESS); 1448 } 1449 1450 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) 1451 { 1452 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1453 IS iscol = a->col, isrow = a->row; 1454 const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1455 PetscInt i, m, n = a->mbs; 1456 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1457 const MatScalar *aa = a->a, *v; 1458 PetscScalar *x, *s, *t, *ls; 1459 const PetscScalar *b; 1460 1461 PetscFunctionBegin; 1462 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 1463 PetscCall(VecGetArrayRead(bb, &b)); 1464 PetscCall(VecGetArray(xx, &x)); 1465 t = a->solve_work; 1466 1467 PetscCall(ISGetIndices(isrow, &rout)); 1468 r = rout; 1469 PetscCall(ISGetIndices(iscol, &cout)); 1470 c = cout; 1471 1472 /* forward solve the lower triangular */ 1473 PetscCall(PetscArraycpy(t, b + bs * r[0], bs)); 1474 for (i = 1; i < n; i++) { 1475 v = aa + bs2 * ai[i]; 1476 vi = aj + ai[i]; 1477 nz = ai[i + 1] - ai[i]; 1478 s = t + bs * i; 1479 PetscCall(PetscArraycpy(s, b + bs * r[i], bs)); 1480 for (m = 0; m < nz; m++) { 1481 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]); 1482 v += bs2; 1483 } 1484 } 1485 1486 /* backward solve the upper triangular */ 1487 ls = a->solve_work + A->cmap->n; 1488 for (i = n - 1; i >= 0; i--) { 1489 v = aa + bs2 * (adiag[i + 1] + 1); 1490 vi = aj + adiag[i + 1] + 1; 1491 nz = adiag[i] - adiag[i + 1] - 1; 1492 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1493 for (m = 0; m < nz; m++) { 1494 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]); 1495 v += bs2; 1496 } 1497 PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */ 1498 PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs)); 1499 } 1500 PetscCall(ISRestoreIndices(isrow, &rout)); 1501 PetscCall(ISRestoreIndices(iscol, &cout)); 1502 PetscCall(VecRestoreArrayRead(bb, &b)); 1503 PetscCall(VecRestoreArray(xx, &x)); 1504 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1505 PetscFunctionReturn(PETSC_SUCCESS); 1506 } 1507 1508 /* 1509 For each block in an block array saves the largest absolute value in the block into another array 1510 */ 1511 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray) 1512 { 1513 PetscInt i, j; 1514 1515 PetscFunctionBegin; 1516 PetscCall(PetscArrayzero(absarray, nbs + 1)); 1517 for (i = 0; i < nbs; i++) { 1518 for (j = 0; j < bs2; j++) { 1519 if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]); 1520 } 1521 } 1522 PetscFunctionReturn(PETSC_SUCCESS); 1523 } 1524 1525 /* 1526 This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used 1527 */ 1528 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact) 1529 { 1530 Mat B = *fact; 1531 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 1532 IS isicol; 1533 const PetscInt *r, *ic; 1534 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag; 1535 PetscInt *bi, *bj, *bdiag; 1536 1537 PetscInt row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au; 1538 PetscInt nlnk, *lnk; 1539 PetscBT lnkbt; 1540 PetscBool row_identity, icol_identity; 1541 MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp; 1542 PetscInt j, nz, *pj, *bjtmp, k, ncut, *jtmp; 1543 1544 PetscReal dt = info->dt; /* shift=info->shiftamount; */ 1545 PetscInt nnz_max; 1546 PetscBool missing; 1547 PetscReal *vtmp_abs; 1548 MatScalar *v_work; 1549 PetscInt *v_pivots; 1550 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1551 1552 PetscFunctionBegin; 1553 /* symbolic factorization, can be reused */ 1554 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1555 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1556 adiag = a->diag; 1557 1558 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 1559 1560 /* bdiag is location of diagonal in factor */ 1561 PetscCall(PetscMalloc1(mbs + 1, &bdiag)); 1562 1563 /* allocate row pointers bi */ 1564 PetscCall(PetscMalloc1(2 * mbs + 2, &bi)); 1565 1566 /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */ 1567 dtcount = (PetscInt)info->dtcount; 1568 if (dtcount > mbs - 1) dtcount = mbs - 1; 1569 nnz_max = ai[mbs] + 2 * mbs * dtcount + 2; 1570 /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */ 1571 PetscCall(PetscMalloc1(nnz_max, &bj)); 1572 nnz_max = nnz_max * bs2; 1573 PetscCall(PetscMalloc1(nnz_max, &ba)); 1574 1575 /* put together the new matrix */ 1576 PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1577 1578 b = (Mat_SeqBAIJ *)(B)->data; 1579 b->free_a = PETSC_TRUE; 1580 b->free_ij = PETSC_TRUE; 1581 b->singlemalloc = PETSC_FALSE; 1582 1583 b->a = ba; 1584 b->j = bj; 1585 b->i = bi; 1586 b->diag = bdiag; 1587 b->ilen = NULL; 1588 b->imax = NULL; 1589 b->row = isrow; 1590 b->col = iscol; 1591 1592 PetscCall(PetscObjectReference((PetscObject)isrow)); 1593 PetscCall(PetscObjectReference((PetscObject)iscol)); 1594 1595 b->icol = isicol; 1596 PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work)); 1597 b->maxnz = nnz_max / bs2; 1598 1599 (B)->factortype = MAT_FACTOR_ILUDT; 1600 (B)->info.factor_mallocs = 0; 1601 (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2)); 1602 /* end of symbolic factorization */ 1603 PetscCall(ISGetIndices(isrow, &r)); 1604 PetscCall(ISGetIndices(isicol, &ic)); 1605 1606 /* linked list for storing column indices of the active row */ 1607 nlnk = mbs + 1; 1608 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1609 1610 /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */ 1611 PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp)); 1612 /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */ 1613 PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp)); 1614 PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs)); 1615 PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots)); 1616 1617 allowzeropivot = PetscNot(A->erroriffailure); 1618 bi[0] = 0; 1619 bdiag[0] = (nnz_max / bs2) - 1; /* location of diagonal in factor B */ 1620 bi[2 * mbs + 1] = bdiag[0] + 1; /* endof bj and ba array */ 1621 for (i = 0; i < mbs; i++) { 1622 /* copy initial fill into linked list */ 1623 nzi = ai[r[i] + 1] - ai[r[i]]; 1624 PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 1625 nzi_al = adiag[r[i]] - ai[r[i]]; 1626 nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1; 1627 1628 /* load in initial unfactored row */ 1629 ajtmp = aj + ai[r[i]]; 1630 PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt)); 1631 PetscCall(PetscArrayzero(rtmp, mbs * bs2)); 1632 aatmp = a->a + bs2 * ai[r[i]]; 1633 for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2)); 1634 1635 /* add pivot rows into linked list */ 1636 row = lnk[mbs]; 1637 while (row < i) { 1638 nzi_bl = bi[row + 1] - bi[row] + 1; 1639 bjtmp = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */ 1640 PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im)); 1641 nzi += nlnk; 1642 row = lnk[row]; 1643 } 1644 1645 /* copy data from lnk into jtmp, then initialize lnk */ 1646 PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt)); 1647 1648 /* numerical factorization */ 1649 bjtmp = jtmp; 1650 row = *bjtmp++; /* 1st pivot row */ 1651 1652 while (row < i) { 1653 pc = rtmp + bs2 * row; 1654 pv = ba + bs2 * bdiag[row]; /* inv(diag) of the pivot row */ 1655 PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */ 1656 PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs)); 1657 if (vtmp_abs[0] > dt) { /* apply tolerance dropping rule */ 1658 pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */ 1659 pv = ba + bs2 * (bdiag[row + 1] + 1); 1660 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */ 1661 for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); 1662 /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */ 1663 } 1664 row = *bjtmp++; 1665 } 1666 1667 /* copy sparse rtmp into contiguous vtmp; separate L and U part */ 1668 nzi_bl = 0; 1669 j = 0; 1670 while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */ 1671 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1672 nzi_bl++; 1673 j++; 1674 } 1675 nzi_bu = nzi - nzi_bl - 1; 1676 1677 while (j < nzi) { /* U-part */ 1678 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1679 j++; 1680 } 1681 1682 PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs)); 1683 1684 bjtmp = bj + bi[i]; 1685 batmp = ba + bs2 * bi[i]; 1686 /* apply level dropping rule to L part */ 1687 ncut = nzi_al + dtcount; 1688 if (ncut < nzi_bl) { 1689 PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp)); 1690 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp)); 1691 } else { 1692 ncut = nzi_bl; 1693 } 1694 for (j = 0; j < ncut; j++) { 1695 bjtmp[j] = jtmp[j]; 1696 PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2)); 1697 } 1698 bi[i + 1] = bi[i] + ncut; 1699 nzi = ncut + 1; 1700 1701 /* apply level dropping rule to U part */ 1702 ncut = nzi_au + dtcount; 1703 if (ncut < nzi_bu) { 1704 PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1)); 1705 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1)); 1706 } else { 1707 ncut = nzi_bu; 1708 } 1709 nzi += ncut; 1710 1711 /* mark bdiagonal */ 1712 bdiag[i + 1] = bdiag[i] - (ncut + 1); 1713 bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1); 1714 1715 bjtmp = bj + bdiag[i]; 1716 batmp = ba + bs2 * bdiag[i]; 1717 PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2)); 1718 *bjtmp = i; 1719 1720 bjtmp = bj + bdiag[i + 1] + 1; 1721 batmp = ba + (bdiag[i + 1] + 1) * bs2; 1722 1723 for (k = 0; k < ncut; k++) { 1724 bjtmp[k] = jtmp[nzi_bl + 1 + k]; 1725 PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2)); 1726 } 1727 1728 im[i] = nzi; /* used by PetscLLAddSortedLU() */ 1729 1730 /* invert diagonal block for simpler triangular solves - add shift??? */ 1731 batmp = ba + bs2 * bdiag[i]; 1732 1733 PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 1734 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1735 } /* for (i=0; i<mbs; i++) */ 1736 PetscCall(PetscFree3(v_work, multiplier, v_pivots)); 1737 1738 /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */ 1739 PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]); 1740 1741 PetscCall(ISRestoreIndices(isrow, &r)); 1742 PetscCall(ISRestoreIndices(isicol, &ic)); 1743 1744 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1745 1746 PetscCall(PetscFree2(im, jtmp)); 1747 PetscCall(PetscFree2(rtmp, vtmp)); 1748 1749 PetscCall(PetscLogFlops(bs2 * B->cmap->n)); 1750 b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs]; 1751 1752 PetscCall(ISIdentity(isrow, &row_identity)); 1753 PetscCall(ISIdentity(isicol, &icol_identity)); 1754 if (row_identity && icol_identity) { 1755 B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 1756 } else { 1757 B->ops->solve = MatSolve_SeqBAIJ_N; 1758 } 1759 1760 B->ops->solveadd = NULL; 1761 B->ops->solvetranspose = NULL; 1762 B->ops->solvetransposeadd = NULL; 1763 B->ops->matsolve = NULL; 1764 B->assembled = PETSC_TRUE; 1765 B->preallocated = PETSC_TRUE; 1766 PetscFunctionReturn(PETSC_SUCCESS); 1767 } 1768