1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 8 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) { 9 Mat C = B; 10 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 11 IS isrow = b->row, isicol = b->icol; 12 const PetscInt *r, *ic; 13 PetscInt i, j, k, nz, nzL, row, *pj; 14 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 15 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 16 MatScalar *rtmp, *pc, *mwork, *pv; 17 MatScalar *aa = a->a, *v; 18 PetscInt flg; 19 PetscReal shift = info->shiftamount; 20 PetscBool allowzeropivot, zeropivotdetected; 21 22 PetscFunctionBegin; 23 PetscCall(ISGetIndices(isrow, &r)); 24 PetscCall(ISGetIndices(isicol, &ic)); 25 allowzeropivot = PetscNot(A->erroriffailure); 26 27 /* generate work space needed by the factorization */ 28 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 29 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 30 31 for (i = 0; i < n; i++) { 32 /* zero rtmp */ 33 /* L part */ 34 nz = bi[i + 1] - bi[i]; 35 bjtmp = bj + bi[i]; 36 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 37 38 /* U part */ 39 nz = bdiag[i] - bdiag[i + 1]; 40 bjtmp = bj + bdiag[i + 1] + 1; 41 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 42 43 /* load in initial (unfactored row) */ 44 nz = ai[r[i] + 1] - ai[r[i]]; 45 ajtmp = aj + ai[r[i]]; 46 v = aa + bs2 * ai[r[i]]; 47 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); } 48 49 /* elimination */ 50 bjtmp = bj + bi[i]; 51 nzL = bi[i + 1] - bi[i]; 52 for (k = 0; k < nzL; k++) { 53 row = bjtmp[k]; 54 pc = rtmp + bs2 * row; 55 for (flg = 0, j = 0; j < bs2; j++) { 56 if (pc[j] != (PetscScalar)0.0) { 57 flg = 1; 58 break; 59 } 60 } 61 if (flg) { 62 pv = b->a + bs2 * bdiag[row]; 63 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 64 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 65 66 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 67 pv = b->a + bs2 * (bdiag[row + 1] + 1); 68 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 69 for (j = 0; j < nz; j++) { 70 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 71 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 72 v = rtmp + 4 * pj[j]; 73 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 74 pv += 4; 75 } 76 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 77 } 78 } 79 80 /* finished row so stick it into b->a */ 81 /* L part */ 82 pv = b->a + bs2 * bi[i]; 83 pj = b->j + bi[i]; 84 nz = bi[i + 1] - bi[i]; 85 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 86 87 /* Mark diagonal and invert diagonal for simpler triangular solves */ 88 pv = b->a + bs2 * bdiag[i]; 89 pj = b->j + bdiag[i]; 90 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 91 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 92 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 93 94 /* U part */ 95 pv = b->a + bs2 * (bdiag[i + 1] + 1); 96 pj = b->j + bdiag[i + 1] + 1; 97 nz = bdiag[i] - bdiag[i + 1] - 1; 98 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 99 } 100 101 PetscCall(PetscFree2(rtmp, mwork)); 102 PetscCall(ISRestoreIndices(isicol, &ic)); 103 PetscCall(ISRestoreIndices(isrow, &r)); 104 105 C->ops->solve = MatSolve_SeqBAIJ_2; 106 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 107 C->assembled = PETSC_TRUE; 108 109 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 110 PetscFunctionReturn(0); 111 } 112 113 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) { 114 Mat C = B; 115 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 116 PetscInt i, j, k, nz, nzL, row, *pj; 117 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 118 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 119 MatScalar *rtmp, *pc, *mwork, *pv; 120 MatScalar *aa = a->a, *v; 121 PetscInt flg; 122 PetscReal shift = info->shiftamount; 123 PetscBool allowzeropivot, zeropivotdetected; 124 125 PetscFunctionBegin; 126 allowzeropivot = PetscNot(A->erroriffailure); 127 128 /* generate work space needed by the factorization */ 129 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 130 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 131 132 for (i = 0; i < n; i++) { 133 /* zero rtmp */ 134 /* L part */ 135 nz = bi[i + 1] - bi[i]; 136 bjtmp = bj + bi[i]; 137 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 138 139 /* U part */ 140 nz = bdiag[i] - bdiag[i + 1]; 141 bjtmp = bj + bdiag[i + 1] + 1; 142 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 143 144 /* load in initial (unfactored row) */ 145 nz = ai[i + 1] - ai[i]; 146 ajtmp = aj + ai[i]; 147 v = aa + bs2 * ai[i]; 148 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); } 149 150 /* elimination */ 151 bjtmp = bj + bi[i]; 152 nzL = bi[i + 1] - bi[i]; 153 for (k = 0; k < nzL; k++) { 154 row = bjtmp[k]; 155 pc = rtmp + bs2 * row; 156 for (flg = 0, j = 0; j < bs2; j++) { 157 if (pc[j] != (PetscScalar)0.0) { 158 flg = 1; 159 break; 160 } 161 } 162 if (flg) { 163 pv = b->a + bs2 * bdiag[row]; 164 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 165 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 166 167 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 168 pv = b->a + bs2 * (bdiag[row + 1] + 1); 169 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 170 for (j = 0; j < nz; j++) { 171 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 172 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 173 v = rtmp + 4 * pj[j]; 174 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 175 pv += 4; 176 } 177 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 178 } 179 } 180 181 /* finished row so stick it into b->a */ 182 /* L part */ 183 pv = b->a + bs2 * bi[i]; 184 pj = b->j + bi[i]; 185 nz = bi[i + 1] - bi[i]; 186 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 187 188 /* Mark diagonal and invert diagonal for simpler triangular solves */ 189 pv = b->a + bs2 * bdiag[i]; 190 pj = b->j + bdiag[i]; 191 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 192 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 193 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 194 195 /* U part */ 196 /* 197 pv = b->a + bs2*bi[2*n-i]; 198 pj = b->j + bi[2*n-i]; 199 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 200 */ 201 pv = b->a + bs2 * (bdiag[i + 1] + 1); 202 pj = b->j + bdiag[i + 1] + 1; 203 nz = bdiag[i] - bdiag[i + 1] - 1; 204 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 205 } 206 PetscCall(PetscFree2(rtmp, mwork)); 207 208 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 209 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering; 210 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering; 211 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 212 C->assembled = PETSC_TRUE; 213 214 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 215 PetscFunctionReturn(0); 216 } 217 218 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) { 219 Mat C = B; 220 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 221 IS isrow = b->row, isicol = b->icol; 222 const PetscInt *r, *ic; 223 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 224 PetscInt *ajtmpold, *ajtmp, nz, row; 225 PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 226 MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4; 227 MatScalar p1, p2, p3, p4; 228 MatScalar *ba = b->a, *aa = a->a; 229 PetscReal shift = info->shiftamount; 230 PetscBool allowzeropivot, zeropivotdetected; 231 232 PetscFunctionBegin; 233 allowzeropivot = PetscNot(A->erroriffailure); 234 PetscCall(ISGetIndices(isrow, &r)); 235 PetscCall(ISGetIndices(isicol, &ic)); 236 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 237 238 for (i = 0; i < n; i++) { 239 nz = bi[i + 1] - bi[i]; 240 ajtmp = bj + bi[i]; 241 for (j = 0; j < nz; j++) { 242 x = rtmp + 4 * ajtmp[j]; 243 x[0] = x[1] = x[2] = x[3] = 0.0; 244 } 245 /* load in initial (unfactored row) */ 246 idx = r[i]; 247 nz = ai[idx + 1] - ai[idx]; 248 ajtmpold = aj + ai[idx]; 249 v = aa + 4 * ai[idx]; 250 for (j = 0; j < nz; j++) { 251 x = rtmp + 4 * ic[ajtmpold[j]]; 252 x[0] = v[0]; 253 x[1] = v[1]; 254 x[2] = v[2]; 255 x[3] = v[3]; 256 v += 4; 257 } 258 row = *ajtmp++; 259 while (row < i) { 260 pc = rtmp + 4 * row; 261 p1 = pc[0]; 262 p2 = pc[1]; 263 p3 = pc[2]; 264 p4 = pc[3]; 265 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 266 pv = ba + 4 * diag_offset[row]; 267 pj = bj + diag_offset[row] + 1; 268 x1 = pv[0]; 269 x2 = pv[1]; 270 x3 = pv[2]; 271 x4 = pv[3]; 272 pc[0] = m1 = p1 * x1 + p3 * x2; 273 pc[1] = m2 = p2 * x1 + p4 * x2; 274 pc[2] = m3 = p1 * x3 + p3 * x4; 275 pc[3] = m4 = p2 * x3 + p4 * x4; 276 nz = bi[row + 1] - diag_offset[row] - 1; 277 pv += 4; 278 for (j = 0; j < nz; j++) { 279 x1 = pv[0]; 280 x2 = pv[1]; 281 x3 = pv[2]; 282 x4 = pv[3]; 283 x = rtmp + 4 * pj[j]; 284 x[0] -= m1 * x1 + m3 * x2; 285 x[1] -= m2 * x1 + m4 * x2; 286 x[2] -= m1 * x3 + m3 * x4; 287 x[3] -= m2 * x3 + m4 * x4; 288 pv += 4; 289 } 290 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 291 } 292 row = *ajtmp++; 293 } 294 /* finished row so stick it into b->a */ 295 pv = ba + 4 * bi[i]; 296 pj = bj + bi[i]; 297 nz = bi[i + 1] - bi[i]; 298 for (j = 0; j < nz; j++) { 299 x = rtmp + 4 * pj[j]; 300 pv[0] = x[0]; 301 pv[1] = x[1]; 302 pv[2] = x[2]; 303 pv[3] = x[3]; 304 pv += 4; 305 } 306 /* invert diagonal block */ 307 w = ba + 4 * diag_offset[i]; 308 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 309 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 310 } 311 312 PetscCall(PetscFree(rtmp)); 313 PetscCall(ISRestoreIndices(isicol, &ic)); 314 PetscCall(ISRestoreIndices(isrow, &r)); 315 316 C->ops->solve = MatSolve_SeqBAIJ_2_inplace; 317 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace; 318 C->assembled = PETSC_TRUE; 319 320 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 321 PetscFunctionReturn(0); 322 } 323 /* 324 Version for when blocks are 2 by 2 Using natural ordering 325 */ 326 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) { 327 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 328 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 329 PetscInt *ajtmpold, *ajtmp, nz, row; 330 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 331 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 332 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4; 333 MatScalar *ba = b->a, *aa = a->a; 334 PetscReal shift = info->shiftamount; 335 PetscBool allowzeropivot, zeropivotdetected; 336 337 PetscFunctionBegin; 338 allowzeropivot = PetscNot(A->erroriffailure); 339 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 340 for (i = 0; i < n; i++) { 341 nz = bi[i + 1] - bi[i]; 342 ajtmp = bj + bi[i]; 343 for (j = 0; j < nz; j++) { 344 x = rtmp + 4 * ajtmp[j]; 345 x[0] = x[1] = x[2] = x[3] = 0.0; 346 } 347 /* load in initial (unfactored row) */ 348 nz = ai[i + 1] - ai[i]; 349 ajtmpold = aj + ai[i]; 350 v = aa + 4 * ai[i]; 351 for (j = 0; j < nz; j++) { 352 x = rtmp + 4 * ajtmpold[j]; 353 x[0] = v[0]; 354 x[1] = v[1]; 355 x[2] = v[2]; 356 x[3] = v[3]; 357 v += 4; 358 } 359 row = *ajtmp++; 360 while (row < i) { 361 pc = rtmp + 4 * row; 362 p1 = pc[0]; 363 p2 = pc[1]; 364 p3 = pc[2]; 365 p4 = pc[3]; 366 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 367 pv = ba + 4 * diag_offset[row]; 368 pj = bj + diag_offset[row] + 1; 369 x1 = pv[0]; 370 x2 = pv[1]; 371 x3 = pv[2]; 372 x4 = pv[3]; 373 pc[0] = m1 = p1 * x1 + p3 * x2; 374 pc[1] = m2 = p2 * x1 + p4 * x2; 375 pc[2] = m3 = p1 * x3 + p3 * x4; 376 pc[3] = m4 = p2 * x3 + p4 * x4; 377 nz = bi[row + 1] - diag_offset[row] - 1; 378 pv += 4; 379 for (j = 0; j < nz; j++) { 380 x1 = pv[0]; 381 x2 = pv[1]; 382 x3 = pv[2]; 383 x4 = pv[3]; 384 x = rtmp + 4 * pj[j]; 385 x[0] -= m1 * x1 + m3 * x2; 386 x[1] -= m2 * x1 + m4 * x2; 387 x[2] -= m1 * x3 + m3 * x4; 388 x[3] -= m2 * x3 + m4 * x4; 389 pv += 4; 390 } 391 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 392 } 393 row = *ajtmp++; 394 } 395 /* finished row so stick it into b->a */ 396 pv = ba + 4 * bi[i]; 397 pj = bj + bi[i]; 398 nz = bi[i + 1] - bi[i]; 399 for (j = 0; j < nz; j++) { 400 x = rtmp + 4 * pj[j]; 401 pv[0] = x[0]; 402 pv[1] = x[1]; 403 pv[2] = x[2]; 404 pv[3] = x[3]; 405 /* 406 printf(" col %d:",pj[j]); 407 PetscInt j1; 408 for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1)); 409 printf("\n"); 410 */ 411 pv += 4; 412 } 413 /* invert diagonal block */ 414 w = ba + 4 * diag_offset[i]; 415 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 416 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 417 } 418 419 PetscCall(PetscFree(rtmp)); 420 421 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace; 422 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace; 423 C->assembled = PETSC_TRUE; 424 425 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 426 PetscFunctionReturn(0); 427 } 428 429 /* ----------------------------------------------------------- */ 430 /* 431 Version for when blocks are 1 by 1. 432 */ 433 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) { 434 Mat C = B; 435 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 436 IS isrow = b->row, isicol = b->icol; 437 const PetscInt *r, *ic, *ics; 438 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 439 PetscInt i, j, k, nz, nzL, row, *pj; 440 const PetscInt *ajtmp, *bjtmp; 441 MatScalar *rtmp, *pc, multiplier, *pv; 442 const MatScalar *aa = a->a, *v; 443 PetscBool row_identity, col_identity; 444 FactorShiftCtx sctx; 445 const PetscInt *ddiag; 446 PetscReal rs; 447 MatScalar d; 448 449 PetscFunctionBegin; 450 /* MatPivotSetUp(): initialize shift context sctx */ 451 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 452 453 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 454 ddiag = a->diag; 455 sctx.shift_top = info->zeropivot; 456 for (i = 0; i < n; i++) { 457 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 458 d = (aa)[ddiag[i]]; 459 rs = -PetscAbsScalar(d) - PetscRealPart(d); 460 v = aa + ai[i]; 461 nz = ai[i + 1] - ai[i]; 462 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 463 if (rs > sctx.shift_top) sctx.shift_top = rs; 464 } 465 sctx.shift_top *= 1.1; 466 sctx.nshift_max = 5; 467 sctx.shift_lo = 0.; 468 sctx.shift_hi = 1.; 469 } 470 471 PetscCall(ISGetIndices(isrow, &r)); 472 PetscCall(ISGetIndices(isicol, &ic)); 473 PetscCall(PetscMalloc1(n + 1, &rtmp)); 474 ics = ic; 475 476 do { 477 sctx.newshift = PETSC_FALSE; 478 for (i = 0; i < n; i++) { 479 /* zero rtmp */ 480 /* L part */ 481 nz = bi[i + 1] - bi[i]; 482 bjtmp = bj + bi[i]; 483 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 484 485 /* U part */ 486 nz = bdiag[i] - bdiag[i + 1]; 487 bjtmp = bj + bdiag[i + 1] + 1; 488 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 489 490 /* load in initial (unfactored row) */ 491 nz = ai[r[i] + 1] - ai[r[i]]; 492 ajtmp = aj + ai[r[i]]; 493 v = aa + ai[r[i]]; 494 for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j]; 495 496 /* ZeropivotApply() */ 497 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 498 499 /* elimination */ 500 bjtmp = bj + bi[i]; 501 row = *bjtmp++; 502 nzL = bi[i + 1] - bi[i]; 503 for (k = 0; k < nzL; k++) { 504 pc = rtmp + row; 505 if (*pc != (PetscScalar)0.0) { 506 pv = b->a + bdiag[row]; 507 multiplier = *pc * (*pv); 508 *pc = multiplier; 509 510 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 511 pv = b->a + bdiag[row + 1] + 1; 512 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 513 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 514 PetscCall(PetscLogFlops(2.0 * nz)); 515 } 516 row = *bjtmp++; 517 } 518 519 /* finished row so stick it into b->a */ 520 rs = 0.0; 521 /* L part */ 522 pv = b->a + bi[i]; 523 pj = b->j + bi[i]; 524 nz = bi[i + 1] - bi[i]; 525 for (j = 0; j < nz; j++) { 526 pv[j] = rtmp[pj[j]]; 527 rs += PetscAbsScalar(pv[j]); 528 } 529 530 /* U part */ 531 pv = b->a + bdiag[i + 1] + 1; 532 pj = b->j + bdiag[i + 1] + 1; 533 nz = bdiag[i] - bdiag[i + 1] - 1; 534 for (j = 0; j < nz; j++) { 535 pv[j] = rtmp[pj[j]]; 536 rs += PetscAbsScalar(pv[j]); 537 } 538 539 sctx.rs = rs; 540 sctx.pv = rtmp[i]; 541 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 542 if (sctx.newshift) break; /* break for-loop */ 543 rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */ 544 545 /* Mark diagonal and invert diagonal for simpler triangular solves */ 546 pv = b->a + bdiag[i]; 547 *pv = (PetscScalar)1.0 / rtmp[i]; 548 549 } /* endof for (i=0; i<n; i++) { */ 550 551 /* MatPivotRefine() */ 552 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 553 /* 554 * if no shift in this attempt & shifting & started shifting & can refine, 555 * then try lower shift 556 */ 557 sctx.shift_hi = sctx.shift_fraction; 558 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 559 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 560 sctx.newshift = PETSC_TRUE; 561 sctx.nshift++; 562 } 563 } while (sctx.newshift); 564 565 PetscCall(PetscFree(rtmp)); 566 PetscCall(ISRestoreIndices(isicol, &ic)); 567 PetscCall(ISRestoreIndices(isrow, &r)); 568 569 PetscCall(ISIdentity(isrow, &row_identity)); 570 PetscCall(ISIdentity(isicol, &col_identity)); 571 if (row_identity && col_identity) { 572 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 573 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering; 574 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering; 575 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 576 } else { 577 C->ops->solve = MatSolve_SeqBAIJ_1; 578 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 579 } 580 C->assembled = PETSC_TRUE; 581 PetscCall(PetscLogFlops(C->cmap->n)); 582 583 /* MatShiftView(A,info,&sctx) */ 584 if (sctx.nshift) { 585 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 586 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 587 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 588 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 589 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 590 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 591 } 592 } 593 PetscFunctionReturn(0); 594 } 595 596 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) { 597 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 598 IS isrow = b->row, isicol = b->icol; 599 const PetscInt *r, *ic; 600 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 601 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 602 PetscInt *diag_offset = b->diag, diag, *pj; 603 MatScalar *pv, *v, *rtmp, multiplier, *pc; 604 MatScalar *ba = b->a, *aa = a->a; 605 PetscBool row_identity, col_identity; 606 607 PetscFunctionBegin; 608 PetscCall(ISGetIndices(isrow, &r)); 609 PetscCall(ISGetIndices(isicol, &ic)); 610 PetscCall(PetscMalloc1(n + 1, &rtmp)); 611 612 for (i = 0; i < n; i++) { 613 nz = bi[i + 1] - bi[i]; 614 ajtmp = bj + bi[i]; 615 for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0; 616 617 /* load in initial (unfactored row) */ 618 nz = ai[r[i] + 1] - ai[r[i]]; 619 ajtmpold = aj + ai[r[i]]; 620 v = aa + ai[r[i]]; 621 for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 622 623 row = *ajtmp++; 624 while (row < i) { 625 pc = rtmp + row; 626 if (*pc != 0.0) { 627 pv = ba + diag_offset[row]; 628 pj = bj + diag_offset[row] + 1; 629 multiplier = *pc * *pv++; 630 *pc = multiplier; 631 nz = bi[row + 1] - diag_offset[row] - 1; 632 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 633 PetscCall(PetscLogFlops(1.0 + 2.0 * nz)); 634 } 635 row = *ajtmp++; 636 } 637 /* finished row so stick it into b->a */ 638 pv = ba + bi[i]; 639 pj = bj + bi[i]; 640 nz = bi[i + 1] - bi[i]; 641 for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]]; 642 diag = diag_offset[i] - bi[i]; 643 /* check pivot entry for current row */ 644 PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 645 pv[diag] = 1.0 / pv[diag]; 646 } 647 648 PetscCall(PetscFree(rtmp)); 649 PetscCall(ISRestoreIndices(isicol, &ic)); 650 PetscCall(ISRestoreIndices(isrow, &r)); 651 PetscCall(ISIdentity(isrow, &row_identity)); 652 PetscCall(ISIdentity(isicol, &col_identity)); 653 if (row_identity && col_identity) { 654 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace; 655 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace; 656 } else { 657 C->ops->solve = MatSolve_SeqBAIJ_1_inplace; 658 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace; 659 } 660 C->assembled = PETSC_TRUE; 661 PetscCall(PetscLogFlops(C->cmap->n)); 662 PetscFunctionReturn(0); 663 } 664 665 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) { 666 PetscFunctionBegin; 667 *type = MATSOLVERPETSC; 668 PetscFunctionReturn(0); 669 } 670 671 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) { 672 PetscInt n = A->rmap->n; 673 674 PetscFunctionBegin; 675 #if defined(PETSC_USE_COMPLEX) 676 PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported"); 677 #endif 678 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 679 PetscCall(MatSetSizes(*B, n, n, n, n)); 680 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 681 PetscCall(MatSetType(*B, MATSEQBAIJ)); 682 683 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ; 684 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ; 685 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 686 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 687 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 688 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 689 PetscCall(MatSetType(*B, MATSEQSBAIJ)); 690 PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL)); 691 692 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ; 693 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ; 694 /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */ 695 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 696 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 697 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported"); 698 (*B)->factortype = ftype; 699 (*B)->canuseordering = PETSC_TRUE; 700 701 PetscCall(PetscFree((*B)->solvertype)); 702 PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype)); 703 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc)); 704 PetscFunctionReturn(0); 705 } 706 707 /* ----------------------------------------------------------- */ 708 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) { 709 Mat C; 710 711 PetscFunctionBegin; 712 PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C)); 713 PetscCall(MatLUFactorSymbolic(C, A, row, col, info)); 714 PetscCall(MatLUFactorNumeric(C, A, info)); 715 716 A->ops->solve = C->ops->solve; 717 A->ops->solvetranspose = C->ops->solvetranspose; 718 719 PetscCall(MatHeaderMerge(A, &C)); 720 PetscCall(PetscLogObjectParent((PetscObject)A, (PetscObject)((Mat_SeqBAIJ *)(A->data))->icol)); 721 PetscFunctionReturn(0); 722 } 723 724 #include <../src/mat/impls/sbaij/seq/sbaij.h> 725 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) { 726 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 727 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 728 IS ip = b->row; 729 const PetscInt *rip; 730 PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol; 731 PetscInt *ai = a->i, *aj = a->j; 732 PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz; 733 MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi; 734 PetscReal rs; 735 FactorShiftCtx sctx; 736 737 PetscFunctionBegin; 738 if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */ 739 if (!a->sbaijMat) { PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); } 740 PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info)); 741 PetscCall(MatDestroy(&a->sbaijMat)); 742 PetscFunctionReturn(0); 743 } 744 745 /* MatPivotSetUp(): initialize shift context sctx */ 746 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 747 748 PetscCall(ISGetIndices(ip, &rip)); 749 PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl)); 750 751 sctx.shift_amount = 0.; 752 sctx.nshift = 0; 753 do { 754 sctx.newshift = PETSC_FALSE; 755 for (i = 0; i < mbs; i++) { 756 rtmp[i] = 0.0; 757 jl[i] = mbs; 758 il[0] = 0; 759 } 760 761 for (k = 0; k < mbs; k++) { 762 bval = ba + bi[k]; 763 /* initialize k-th row by the perm[k]-th row of A */ 764 jmin = ai[rip[k]]; 765 jmax = ai[rip[k] + 1]; 766 for (j = jmin; j < jmax; j++) { 767 col = rip[aj[j]]; 768 if (col >= k) { /* only take upper triangular entry */ 769 rtmp[col] = aa[j]; 770 *bval++ = 0.0; /* for in-place factorization */ 771 } 772 } 773 774 /* shift the diagonal of the matrix */ 775 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 776 777 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 778 dk = rtmp[k]; 779 i = jl[k]; /* first row to be added to k_th row */ 780 781 while (i < k) { 782 nexti = jl[i]; /* next row to be added to k_th row */ 783 784 /* compute multiplier, update diag(k) and U(i,k) */ 785 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 786 uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */ 787 dk += uikdi * ba[ili]; 788 ba[ili] = uikdi; /* -U(i,k) */ 789 790 /* add multiple of row i to k-th row */ 791 jmin = ili + 1; 792 jmax = bi[i + 1]; 793 if (jmin < jmax) { 794 for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j]; 795 /* update il and jl for row i */ 796 il[i] = jmin; 797 j = bj[jmin]; 798 jl[i] = jl[j]; 799 jl[j] = i; 800 } 801 i = nexti; 802 } 803 804 /* shift the diagonals when zero pivot is detected */ 805 /* compute rs=sum of abs(off-diagonal) */ 806 rs = 0.0; 807 jmin = bi[k] + 1; 808 nz = bi[k + 1] - jmin; 809 if (nz) { 810 bcol = bj + jmin; 811 while (nz--) { 812 rs += PetscAbsScalar(rtmp[*bcol]); 813 bcol++; 814 } 815 } 816 817 sctx.rs = rs; 818 sctx.pv = dk; 819 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 820 if (sctx.newshift) break; 821 dk = sctx.pv; 822 823 /* copy data into U(k,:) */ 824 ba[bi[k]] = 1.0 / dk; /* U(k,k) */ 825 jmin = bi[k] + 1; 826 jmax = bi[k + 1]; 827 if (jmin < jmax) { 828 for (j = jmin; j < jmax; j++) { 829 col = bj[j]; 830 ba[j] = rtmp[col]; 831 rtmp[col] = 0.0; 832 } 833 /* add the k-th row into il and jl */ 834 il[k] = jmin; 835 i = bj[jmin]; 836 jl[k] = jl[i]; 837 jl[i] = k; 838 } 839 } 840 } while (sctx.newshift); 841 PetscCall(PetscFree3(rtmp, il, jl)); 842 843 PetscCall(ISRestoreIndices(ip, &rip)); 844 845 C->assembled = PETSC_TRUE; 846 C->preallocated = PETSC_TRUE; 847 848 PetscCall(PetscLogFlops(C->rmap->N)); 849 if (sctx.nshift) { 850 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 851 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 852 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 853 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 854 } 855 } 856 PetscFunctionReturn(0); 857 } 858 859 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) { 860 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 861 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 862 PetscInt i, j, am = a->mbs; 863 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 864 PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz; 865 MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval; 866 PetscReal rs; 867 FactorShiftCtx sctx; 868 869 PetscFunctionBegin; 870 /* MatPivotSetUp(): initialize shift context sctx */ 871 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 872 873 PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl)); 874 875 do { 876 sctx.newshift = PETSC_FALSE; 877 for (i = 0; i < am; i++) { 878 rtmp[i] = 0.0; 879 jl[i] = am; 880 il[0] = 0; 881 } 882 883 for (k = 0; k < am; k++) { 884 /* initialize k-th row with elements nonzero in row perm(k) of A */ 885 nz = ai[k + 1] - ai[k]; 886 acol = aj + ai[k]; 887 aval = aa + ai[k]; 888 bval = ba + bi[k]; 889 while (nz--) { 890 if (*acol < k) { /* skip lower triangular entries */ 891 acol++; 892 aval++; 893 } else { 894 rtmp[*acol++] = *aval++; 895 *bval++ = 0.0; /* for in-place factorization */ 896 } 897 } 898 899 /* shift the diagonal of the matrix */ 900 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 901 902 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 903 dk = rtmp[k]; 904 i = jl[k]; /* first row to be added to k_th row */ 905 906 while (i < k) { 907 nexti = jl[i]; /* next row to be added to k_th row */ 908 /* compute multiplier, update D(k) and U(i,k) */ 909 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 910 uikdi = -ba[ili] * ba[bi[i]]; 911 dk += uikdi * ba[ili]; 912 ba[ili] = uikdi; /* -U(i,k) */ 913 914 /* add multiple of row i to k-th row ... */ 915 jmin = ili + 1; 916 nz = bi[i + 1] - jmin; 917 if (nz > 0) { 918 bcol = bj + jmin; 919 bval = ba + jmin; 920 while (nz--) rtmp[*bcol++] += uikdi * (*bval++); 921 /* update il and jl for i-th row */ 922 il[i] = jmin; 923 j = bj[jmin]; 924 jl[i] = jl[j]; 925 jl[j] = i; 926 } 927 i = nexti; 928 } 929 930 /* shift the diagonals when zero pivot is detected */ 931 /* compute rs=sum of abs(off-diagonal) */ 932 rs = 0.0; 933 jmin = bi[k] + 1; 934 nz = bi[k + 1] - jmin; 935 if (nz) { 936 bcol = bj + jmin; 937 while (nz--) { 938 rs += PetscAbsScalar(rtmp[*bcol]); 939 bcol++; 940 } 941 } 942 943 sctx.rs = rs; 944 sctx.pv = dk; 945 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 946 if (sctx.newshift) break; /* sctx.shift_amount is updated */ 947 dk = sctx.pv; 948 949 /* copy data into U(k,:) */ 950 ba[bi[k]] = 1.0 / dk; 951 jmin = bi[k] + 1; 952 nz = bi[k + 1] - jmin; 953 if (nz) { 954 bcol = bj + jmin; 955 bval = ba + jmin; 956 while (nz--) { 957 *bval++ = rtmp[*bcol]; 958 rtmp[*bcol++] = 0.0; 959 } 960 /* add k-th row into il and jl */ 961 il[k] = jmin; 962 i = bj[jmin]; 963 jl[k] = jl[i]; 964 jl[i] = k; 965 } 966 } 967 } while (sctx.newshift); 968 PetscCall(PetscFree3(rtmp, il, jl)); 969 970 C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 971 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 972 C->assembled = PETSC_TRUE; 973 C->preallocated = PETSC_TRUE; 974 975 PetscCall(PetscLogFlops(C->rmap->N)); 976 if (sctx.nshift) { 977 if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 978 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 979 } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 980 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 981 } 982 } 983 PetscFunctionReturn(0); 984 } 985 986 #include <petscbt.h> 987 #include <../src/mat/utils/freespace.h> 988 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) { 989 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 990 Mat_SeqSBAIJ *b; 991 Mat B; 992 PetscBool perm_identity, missing; 993 PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui; 994 const PetscInt *rip; 995 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 996 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 997 PetscReal fill = info->fill, levels = info->levels; 998 PetscFreeSpaceList free_space = NULL, current_space = NULL; 999 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 1000 PetscBT lnkbt; 1001 1002 PetscFunctionBegin; 1003 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1004 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1005 1006 if (bs > 1) { 1007 if (!a->sbaijMat) { PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); } 1008 (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1009 1010 PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info)); 1011 PetscFunctionReturn(0); 1012 } 1013 1014 PetscCall(ISIdentity(perm, &perm_identity)); 1015 PetscCall(ISGetIndices(perm, &rip)); 1016 1017 /* special case that simply copies fill pattern */ 1018 if (!levels && perm_identity) { 1019 PetscCall(PetscMalloc1(am + 1, &ui)); 1020 for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */ 1021 B = fact; 1022 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui)); 1023 1024 b = (Mat_SeqSBAIJ *)B->data; 1025 uj = b->j; 1026 for (i = 0; i < am; i++) { 1027 aj = a->j + a->diag[i]; 1028 for (j = 0; j < ui[i]; j++) *uj++ = *aj++; 1029 b->ilen[i] = ui[i]; 1030 } 1031 PetscCall(PetscFree(ui)); 1032 1033 B->factortype = MAT_FACTOR_NONE; 1034 1035 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1036 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1037 B->factortype = MAT_FACTOR_ICC; 1038 1039 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1040 PetscFunctionReturn(0); 1041 } 1042 1043 /* initialization */ 1044 PetscCall(PetscMalloc1(am + 1, &ui)); 1045 ui[0] = 0; 1046 PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl)); 1047 1048 /* jl: linked list for storing indices of the pivot rows 1049 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 1050 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 1051 for (i = 0; i < am; i++) { 1052 jl[i] = am; 1053 il[i] = 0; 1054 } 1055 1056 /* create and initialize a linked list for storing column indices of the active row k */ 1057 nlnk = am + 1; 1058 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 1059 1060 /* initial FreeSpace size is fill*(ai[am]+am)/2 */ 1061 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space)); 1062 1063 current_space = free_space; 1064 1065 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl)); 1066 current_space_lvl = free_space_lvl; 1067 1068 for (k = 0; k < am; k++) { /* for each active row k */ 1069 /* initialize lnk by the column indices of row rip[k] of A */ 1070 nzk = 0; 1071 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1072 ncols_upper = 0; 1073 cols = cols_lvl + am; 1074 for (j = 0; j < ncols; j++) { 1075 i = rip[*(aj + ai[rip[k]] + j)]; 1076 if (i >= k) { /* only take upper triangular entry */ 1077 cols[ncols_upper] = i; 1078 cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */ 1079 ncols_upper++; 1080 } 1081 } 1082 PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1083 nzk += nlnk; 1084 1085 /* update lnk by computing fill-in for each pivot row to be merged in */ 1086 prow = jl[k]; /* 1st pivot row */ 1087 1088 while (prow < k) { 1089 nextprow = jl[prow]; 1090 1091 /* merge prow into k-th row */ 1092 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 1093 jmax = ui[prow + 1]; 1094 ncols = jmax - jmin; 1095 i = jmin - ui[prow]; 1096 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 1097 for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j); 1098 PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1099 nzk += nlnk; 1100 1101 /* update il and jl for prow */ 1102 if (jmin < jmax) { 1103 il[prow] = jmin; 1104 1105 j = *cols; 1106 jl[prow] = jl[j]; 1107 jl[j] = prow; 1108 } 1109 prow = nextprow; 1110 } 1111 1112 /* if free space is not available, make more free space */ 1113 if (current_space->local_remaining < nzk) { 1114 i = am - k + 1; /* num of unfactored rows */ 1115 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1116 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1117 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 1118 reallocs++; 1119 } 1120 1121 /* copy data into free_space and free_space_lvl, then initialize lnk */ 1122 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 1123 1124 /* add the k-th row into il and jl */ 1125 if (nzk - 1 > 0) { 1126 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 1127 jl[k] = jl[i]; 1128 jl[i] = k; 1129 il[k] = ui[k] + 1; 1130 } 1131 uj_ptr[k] = current_space->array; 1132 uj_lvl_ptr[k] = current_space_lvl->array; 1133 1134 current_space->array += nzk; 1135 current_space->local_used += nzk; 1136 current_space->local_remaining -= nzk; 1137 1138 current_space_lvl->array += nzk; 1139 current_space_lvl->local_used += nzk; 1140 current_space_lvl->local_remaining -= nzk; 1141 1142 ui[k + 1] = ui[k] + nzk; 1143 } 1144 1145 PetscCall(ISRestoreIndices(perm, &rip)); 1146 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 1147 PetscCall(PetscFree(cols_lvl)); 1148 1149 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1150 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 1151 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1152 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 1153 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 1154 1155 /* put together the new matrix in MATSEQSBAIJ format */ 1156 B = fact; 1157 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 1158 1159 b = (Mat_SeqSBAIJ *)B->data; 1160 b->singlemalloc = PETSC_FALSE; 1161 b->free_a = PETSC_TRUE; 1162 b->free_ij = PETSC_TRUE; 1163 1164 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 1165 1166 b->j = uj; 1167 b->i = ui; 1168 b->diag = NULL; 1169 b->ilen = NULL; 1170 b->imax = NULL; 1171 b->row = perm; 1172 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1173 1174 PetscCall(PetscObjectReference((PetscObject)perm)); 1175 1176 b->icol = perm; 1177 1178 PetscCall(PetscObjectReference((PetscObject)perm)); 1179 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 1180 PetscCall(PetscLogObjectMemory((PetscObject)B, (ui[am] - am) * (sizeof(PetscInt) + sizeof(MatScalar)))); 1181 1182 b->maxnz = b->nz = ui[am]; 1183 1184 B->info.factor_mallocs = reallocs; 1185 B->info.fill_ratio_given = fill; 1186 if (ai[am] != 0.) { 1187 /* nonzeros in lower triangular part of A (includign diagonals)= (ai[am]+am)/2 */ 1188 B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am); 1189 } else { 1190 B->info.fill_ratio_needed = 0.0; 1191 } 1192 #if defined(PETSC_USE_INFO) 1193 if (ai[am] != 0) { 1194 PetscReal af = B->info.fill_ratio_needed; 1195 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1196 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1197 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1198 } else { 1199 PetscCall(PetscInfo(A, "Empty matrix\n")); 1200 } 1201 #endif 1202 if (perm_identity) { 1203 B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1204 B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1205 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1206 } else { 1207 (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1208 } 1209 PetscFunctionReturn(0); 1210 } 1211 1212 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) { 1213 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1214 Mat_SeqSBAIJ *b; 1215 Mat B; 1216 PetscBool perm_identity, missing; 1217 PetscReal fill = info->fill; 1218 const PetscInt *rip; 1219 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow; 1220 PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow; 1221 PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr; 1222 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1223 PetscBT lnkbt; 1224 1225 PetscFunctionBegin; 1226 if (bs > 1) { /* convert to seqsbaij */ 1227 if (!a->sbaijMat) { PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); } 1228 (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1229 1230 PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info)); 1231 PetscFunctionReturn(0); 1232 } 1233 1234 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1235 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1236 1237 /* check whether perm is the identity mapping */ 1238 PetscCall(ISIdentity(perm, &perm_identity)); 1239 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported"); 1240 PetscCall(ISGetIndices(perm, &rip)); 1241 1242 /* initialization */ 1243 PetscCall(PetscMalloc1(mbs + 1, &ui)); 1244 ui[0] = 0; 1245 1246 /* jl: linked list for storing indices of the pivot rows 1247 il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 1248 PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols)); 1249 for (i = 0; i < mbs; i++) { 1250 jl[i] = mbs; 1251 il[i] = 0; 1252 } 1253 1254 /* create and initialize a linked list for storing column indices of the active row k */ 1255 nlnk = mbs + 1; 1256 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1257 1258 /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */ 1259 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space)); 1260 1261 current_space = free_space; 1262 1263 for (k = 0; k < mbs; k++) { /* for each active row k */ 1264 /* initialize lnk by the column indices of row rip[k] of A */ 1265 nzk = 0; 1266 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1267 ncols_upper = 0; 1268 for (j = 0; j < ncols; j++) { 1269 i = rip[*(aj + ai[rip[k]] + j)]; 1270 if (i >= k) { /* only take upper triangular entry */ 1271 cols[ncols_upper] = i; 1272 ncols_upper++; 1273 } 1274 } 1275 PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt)); 1276 nzk += nlnk; 1277 1278 /* update lnk by computing fill-in for each pivot row to be merged in */ 1279 prow = jl[k]; /* 1st pivot row */ 1280 1281 while (prow < k) { 1282 nextprow = jl[prow]; 1283 /* merge prow into k-th row */ 1284 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 1285 jmax = ui[prow + 1]; 1286 ncols = jmax - jmin; 1287 uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 1288 PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt)); 1289 nzk += nlnk; 1290 1291 /* update il and jl for prow */ 1292 if (jmin < jmax) { 1293 il[prow] = jmin; 1294 j = *uj_ptr; 1295 jl[prow] = jl[j]; 1296 jl[j] = prow; 1297 } 1298 prow = nextprow; 1299 } 1300 1301 /* if free space is not available, make more free space */ 1302 if (current_space->local_remaining < nzk) { 1303 i = mbs - k + 1; /* num of unfactored rows */ 1304 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1305 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1306 reallocs++; 1307 } 1308 1309 /* copy data into free space, then initialize lnk */ 1310 PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt)); 1311 1312 /* add the k-th row into il and jl */ 1313 if (nzk - 1 > 0) { 1314 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 1315 jl[k] = jl[i]; 1316 jl[i] = k; 1317 il[k] = ui[k] + 1; 1318 } 1319 ui_ptr[k] = current_space->array; 1320 current_space->array += nzk; 1321 current_space->local_used += nzk; 1322 current_space->local_remaining -= nzk; 1323 1324 ui[k + 1] = ui[k] + nzk; 1325 } 1326 1327 PetscCall(ISRestoreIndices(perm, &rip)); 1328 PetscCall(PetscFree4(ui_ptr, il, jl, cols)); 1329 1330 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1331 PetscCall(PetscMalloc1(ui[mbs] + 1, &uj)); 1332 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1333 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1334 1335 /* put together the new matrix in MATSEQSBAIJ format */ 1336 B = fact; 1337 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1338 1339 b = (Mat_SeqSBAIJ *)B->data; 1340 b->singlemalloc = PETSC_FALSE; 1341 b->free_a = PETSC_TRUE; 1342 b->free_ij = PETSC_TRUE; 1343 1344 PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a)); 1345 1346 b->j = uj; 1347 b->i = ui; 1348 b->diag = NULL; 1349 b->ilen = NULL; 1350 b->imax = NULL; 1351 b->row = perm; 1352 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1353 1354 PetscCall(PetscObjectReference((PetscObject)perm)); 1355 b->icol = perm; 1356 PetscCall(PetscObjectReference((PetscObject)perm)); 1357 PetscCall(PetscMalloc1(mbs + 1, &b->solve_work)); 1358 PetscCall(PetscLogObjectMemory((PetscObject)B, (ui[mbs] - mbs) * (sizeof(PetscInt) + sizeof(MatScalar)))); 1359 b->maxnz = b->nz = ui[mbs]; 1360 1361 B->info.factor_mallocs = reallocs; 1362 B->info.fill_ratio_given = fill; 1363 if (ai[mbs] != 0.) { 1364 /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */ 1365 B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs); 1366 } else { 1367 B->info.fill_ratio_needed = 0.0; 1368 } 1369 #if defined(PETSC_USE_INFO) 1370 if (ai[mbs] != 0.) { 1371 PetscReal af = B->info.fill_ratio_needed; 1372 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1373 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1374 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1375 } else { 1376 PetscCall(PetscInfo(A, "Empty matrix\n")); 1377 } 1378 #endif 1379 if (perm_identity) { 1380 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1381 } else { 1382 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1383 } 1384 PetscFunctionReturn(0); 1385 } 1386 1387 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) { 1388 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1389 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1390 PetscInt i, k, n = a->mbs; 1391 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1392 const MatScalar *aa = a->a, *v; 1393 PetscScalar *x, *s, *t, *ls; 1394 const PetscScalar *b; 1395 1396 PetscFunctionBegin; 1397 PetscCall(VecGetArrayRead(bb, &b)); 1398 PetscCall(VecGetArray(xx, &x)); 1399 t = a->solve_work; 1400 1401 /* forward solve the lower triangular */ 1402 PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */ 1403 1404 for (i = 1; i < n; i++) { 1405 v = aa + bs2 * ai[i]; 1406 vi = aj + ai[i]; 1407 nz = ai[i + 1] - ai[i]; 1408 s = t + bs * i; 1409 PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */ 1410 for (k = 0; k < nz; k++) { 1411 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]); 1412 v += bs2; 1413 } 1414 } 1415 1416 /* backward solve the upper triangular */ 1417 ls = a->solve_work + A->cmap->n; 1418 for (i = n - 1; i >= 0; i--) { 1419 v = aa + bs2 * (adiag[i + 1] + 1); 1420 vi = aj + adiag[i + 1] + 1; 1421 nz = adiag[i] - adiag[i + 1] - 1; 1422 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1423 for (k = 0; k < nz; k++) { 1424 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]); 1425 v += bs2; 1426 } 1427 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */ 1428 PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs)); 1429 } 1430 1431 PetscCall(VecRestoreArrayRead(bb, &b)); 1432 PetscCall(VecRestoreArray(xx, &x)); 1433 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1434 PetscFunctionReturn(0); 1435 } 1436 1437 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) { 1438 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1439 IS iscol = a->col, isrow = a->row; 1440 const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1441 PetscInt i, m, n = a->mbs; 1442 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1443 const MatScalar *aa = a->a, *v; 1444 PetscScalar *x, *s, *t, *ls; 1445 const PetscScalar *b; 1446 1447 PetscFunctionBegin; 1448 PetscCall(VecGetArrayRead(bb, &b)); 1449 PetscCall(VecGetArray(xx, &x)); 1450 t = a->solve_work; 1451 1452 PetscCall(ISGetIndices(isrow, &rout)); 1453 r = rout; 1454 PetscCall(ISGetIndices(iscol, &cout)); 1455 c = cout; 1456 1457 /* forward solve the lower triangular */ 1458 PetscCall(PetscArraycpy(t, b + bs * r[0], bs)); 1459 for (i = 1; i < n; i++) { 1460 v = aa + bs2 * ai[i]; 1461 vi = aj + ai[i]; 1462 nz = ai[i + 1] - ai[i]; 1463 s = t + bs * i; 1464 PetscCall(PetscArraycpy(s, b + bs * r[i], bs)); 1465 for (m = 0; m < nz; m++) { 1466 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]); 1467 v += bs2; 1468 } 1469 } 1470 1471 /* backward solve the upper triangular */ 1472 ls = a->solve_work + A->cmap->n; 1473 for (i = n - 1; i >= 0; i--) { 1474 v = aa + bs2 * (adiag[i + 1] + 1); 1475 vi = aj + adiag[i + 1] + 1; 1476 nz = adiag[i] - adiag[i + 1] - 1; 1477 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1478 for (m = 0; m < nz; m++) { 1479 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]); 1480 v += bs2; 1481 } 1482 PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */ 1483 PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs)); 1484 } 1485 PetscCall(ISRestoreIndices(isrow, &rout)); 1486 PetscCall(ISRestoreIndices(iscol, &cout)); 1487 PetscCall(VecRestoreArrayRead(bb, &b)); 1488 PetscCall(VecRestoreArray(xx, &x)); 1489 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1490 PetscFunctionReturn(0); 1491 } 1492 1493 /* 1494 For each block in an block array saves the largest absolute value in the block into another array 1495 */ 1496 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray) { 1497 PetscInt i, j; 1498 1499 PetscFunctionBegin; 1500 PetscCall(PetscArrayzero(absarray, nbs + 1)); 1501 for (i = 0; i < nbs; i++) { 1502 for (j = 0; j < bs2; j++) { 1503 if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]); 1504 } 1505 } 1506 PetscFunctionReturn(0); 1507 } 1508 1509 /* 1510 This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used 1511 */ 1512 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact) { 1513 Mat B = *fact; 1514 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 1515 IS isicol; 1516 const PetscInt *r, *ic; 1517 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag; 1518 PetscInt *bi, *bj, *bdiag; 1519 1520 PetscInt row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au; 1521 PetscInt nlnk, *lnk; 1522 PetscBT lnkbt; 1523 PetscBool row_identity, icol_identity; 1524 MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp; 1525 PetscInt j, nz, *pj, *bjtmp, k, ncut, *jtmp; 1526 1527 PetscReal dt = info->dt; /* shift=info->shiftamount; */ 1528 PetscInt nnz_max; 1529 PetscBool missing; 1530 PetscReal *vtmp_abs; 1531 MatScalar *v_work; 1532 PetscInt *v_pivots; 1533 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1534 1535 PetscFunctionBegin; 1536 /* ------- symbolic factorization, can be reused ---------*/ 1537 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1538 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1539 adiag = a->diag; 1540 1541 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 1542 1543 /* bdiag is location of diagonal in factor */ 1544 PetscCall(PetscMalloc1(mbs + 1, &bdiag)); 1545 1546 /* allocate row pointers bi */ 1547 PetscCall(PetscMalloc1(2 * mbs + 2, &bi)); 1548 1549 /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */ 1550 dtcount = (PetscInt)info->dtcount; 1551 if (dtcount > mbs - 1) dtcount = mbs - 1; 1552 nnz_max = ai[mbs] + 2 * mbs * dtcount + 2; 1553 /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */ 1554 PetscCall(PetscMalloc1(nnz_max, &bj)); 1555 nnz_max = nnz_max * bs2; 1556 PetscCall(PetscMalloc1(nnz_max, &ba)); 1557 1558 /* put together the new matrix */ 1559 PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1560 PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)isicol)); 1561 1562 b = (Mat_SeqBAIJ *)(B)->data; 1563 b->free_a = PETSC_TRUE; 1564 b->free_ij = PETSC_TRUE; 1565 b->singlemalloc = PETSC_FALSE; 1566 1567 b->a = ba; 1568 b->j = bj; 1569 b->i = bi; 1570 b->diag = bdiag; 1571 b->ilen = NULL; 1572 b->imax = NULL; 1573 b->row = isrow; 1574 b->col = iscol; 1575 1576 PetscCall(PetscObjectReference((PetscObject)isrow)); 1577 PetscCall(PetscObjectReference((PetscObject)iscol)); 1578 1579 b->icol = isicol; 1580 PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work)); 1581 PetscCall(PetscLogObjectMemory((PetscObject)B, nnz_max * (sizeof(PetscInt) + sizeof(MatScalar)))); 1582 b->maxnz = nnz_max / bs2; 1583 1584 (B)->factortype = MAT_FACTOR_ILUDT; 1585 (B)->info.factor_mallocs = 0; 1586 (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2)); 1587 /* ------- end of symbolic factorization ---------*/ 1588 PetscCall(ISGetIndices(isrow, &r)); 1589 PetscCall(ISGetIndices(isicol, &ic)); 1590 1591 /* linked list for storing column indices of the active row */ 1592 nlnk = mbs + 1; 1593 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1594 1595 /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */ 1596 PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp)); 1597 /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */ 1598 PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp)); 1599 PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs)); 1600 PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots)); 1601 1602 allowzeropivot = PetscNot(A->erroriffailure); 1603 bi[0] = 0; 1604 bdiag[0] = (nnz_max / bs2) - 1; /* location of diagonal in factor B */ 1605 bi[2 * mbs + 1] = bdiag[0] + 1; /* endof bj and ba array */ 1606 for (i = 0; i < mbs; i++) { 1607 /* copy initial fill into linked list */ 1608 nzi = ai[r[i] + 1] - ai[r[i]]; 1609 PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 1610 nzi_al = adiag[r[i]] - ai[r[i]]; 1611 nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1; 1612 1613 /* load in initial unfactored row */ 1614 ajtmp = aj + ai[r[i]]; 1615 PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt)); 1616 PetscCall(PetscArrayzero(rtmp, mbs * bs2)); 1617 aatmp = a->a + bs2 * ai[r[i]]; 1618 for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2)); 1619 1620 /* add pivot rows into linked list */ 1621 row = lnk[mbs]; 1622 while (row < i) { 1623 nzi_bl = bi[row + 1] - bi[row] + 1; 1624 bjtmp = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */ 1625 PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im)); 1626 nzi += nlnk; 1627 row = lnk[row]; 1628 } 1629 1630 /* copy data from lnk into jtmp, then initialize lnk */ 1631 PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt)); 1632 1633 /* numerical factorization */ 1634 bjtmp = jtmp; 1635 row = *bjtmp++; /* 1st pivot row */ 1636 1637 while (row < i) { 1638 pc = rtmp + bs2 * row; 1639 pv = ba + bs2 * bdiag[row]; /* inv(diag) of the pivot row */ 1640 PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */ 1641 PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs)); 1642 if (vtmp_abs[0] > dt) { /* apply tolerance dropping rule */ 1643 pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */ 1644 pv = ba + bs2 * (bdiag[row + 1] + 1); 1645 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */ 1646 for (j = 0; j < nz; j++) { PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); } 1647 /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */ 1648 } 1649 row = *bjtmp++; 1650 } 1651 1652 /* copy sparse rtmp into contiguous vtmp; separate L and U part */ 1653 nzi_bl = 0; 1654 j = 0; 1655 while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */ 1656 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1657 nzi_bl++; 1658 j++; 1659 } 1660 nzi_bu = nzi - nzi_bl - 1; 1661 1662 while (j < nzi) { /* U-part */ 1663 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1664 j++; 1665 } 1666 1667 PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs)); 1668 1669 bjtmp = bj + bi[i]; 1670 batmp = ba + bs2 * bi[i]; 1671 /* apply level dropping rule to L part */ 1672 ncut = nzi_al + dtcount; 1673 if (ncut < nzi_bl) { 1674 PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp)); 1675 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp)); 1676 } else { 1677 ncut = nzi_bl; 1678 } 1679 for (j = 0; j < ncut; j++) { 1680 bjtmp[j] = jtmp[j]; 1681 PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2)); 1682 } 1683 bi[i + 1] = bi[i] + ncut; 1684 nzi = ncut + 1; 1685 1686 /* apply level dropping rule to U part */ 1687 ncut = nzi_au + dtcount; 1688 if (ncut < nzi_bu) { 1689 PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1)); 1690 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1)); 1691 } else { 1692 ncut = nzi_bu; 1693 } 1694 nzi += ncut; 1695 1696 /* mark bdiagonal */ 1697 bdiag[i + 1] = bdiag[i] - (ncut + 1); 1698 bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1); 1699 1700 bjtmp = bj + bdiag[i]; 1701 batmp = ba + bs2 * bdiag[i]; 1702 PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2)); 1703 *bjtmp = i; 1704 1705 bjtmp = bj + bdiag[i + 1] + 1; 1706 batmp = ba + (bdiag[i + 1] + 1) * bs2; 1707 1708 for (k = 0; k < ncut; k++) { 1709 bjtmp[k] = jtmp[nzi_bl + 1 + k]; 1710 PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2)); 1711 } 1712 1713 im[i] = nzi; /* used by PetscLLAddSortedLU() */ 1714 1715 /* invert diagonal block for simpler triangular solves - add shift??? */ 1716 batmp = ba + bs2 * bdiag[i]; 1717 1718 PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 1719 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1720 } /* for (i=0; i<mbs; i++) */ 1721 PetscCall(PetscFree3(v_work, multiplier, v_pivots)); 1722 1723 /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */ 1724 PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]); 1725 1726 PetscCall(ISRestoreIndices(isrow, &r)); 1727 PetscCall(ISRestoreIndices(isicol, &ic)); 1728 1729 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1730 1731 PetscCall(PetscFree2(im, jtmp)); 1732 PetscCall(PetscFree2(rtmp, vtmp)); 1733 1734 PetscCall(PetscLogFlops(bs2 * B->cmap->n)); 1735 b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs]; 1736 1737 PetscCall(ISIdentity(isrow, &row_identity)); 1738 PetscCall(ISIdentity(isicol, &icol_identity)); 1739 if (row_identity && icol_identity) { 1740 B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 1741 } else { 1742 B->ops->solve = MatSolve_SeqBAIJ_N; 1743 } 1744 1745 B->ops->solveadd = NULL; 1746 B->ops->solvetranspose = NULL; 1747 B->ops->solvetransposeadd = NULL; 1748 B->ops->matsolve = NULL; 1749 B->assembled = PETSC_TRUE; 1750 B->preallocated = PETSC_TRUE; 1751 PetscFunctionReturn(0); 1752 } 1753