1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 8 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) 9 { 10 Mat C = B; 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 12 IS isrow = b->row, isicol = b->icol; 13 const PetscInt *r, *ic; 14 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 15 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 16 MatScalar *rtmp, *pc, *mwork, *pv; 17 MatScalar *aa = a->a, *v; 18 PetscReal shift = info->shiftamount; 19 const PetscBool allowzeropivot = PetscNot(A->erroriffailure); 20 21 PetscFunctionBegin; 22 PetscCall(ISGetIndices(isrow, &r)); 23 PetscCall(ISGetIndices(isicol, &ic)); 24 25 /* generate work space needed by the factorization */ 26 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 27 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 28 29 for (PetscInt i = 0; i < n; i++) { 30 /* zero rtmp */ 31 /* L part */ 32 PetscInt *pj; 33 PetscInt nzL, nz = bi[i + 1] - bi[i]; 34 bjtmp = bj + bi[i]; 35 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 36 37 /* U part */ 38 nz = bdiag[i] - bdiag[i + 1]; 39 bjtmp = bj + bdiag[i + 1] + 1; 40 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 41 42 /* load in initial (unfactored row) */ 43 nz = ai[r[i] + 1] - ai[r[i]]; 44 ajtmp = aj + ai[r[i]]; 45 v = aa + bs2 * ai[r[i]]; 46 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 47 48 /* elimination */ 49 bjtmp = bj + bi[i]; 50 nzL = bi[i + 1] - bi[i]; 51 for (PetscInt k = 0; k < nzL; k++) { 52 PetscBool flg = PETSC_FALSE; 53 PetscInt row = bjtmp[k]; 54 55 pc = rtmp + bs2 * row; 56 for (PetscInt j = 0; j < bs2; j++) { 57 if (pc[j] != (PetscScalar)0.0) { 58 flg = PETSC_TRUE; 59 break; 60 } 61 } 62 if (flg) { 63 pv = b->a + bs2 * bdiag[row]; 64 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 65 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 66 67 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 68 pv = b->a + bs2 * (bdiag[row + 1] + 1); 69 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 70 for (PetscInt j = 0; j < nz; j++) { 71 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 72 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 73 v = rtmp + 4 * pj[j]; 74 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 75 pv += 4; 76 } 77 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 78 } 79 } 80 81 /* finished row so stick it into b->a */ 82 /* L part */ 83 pv = b->a + bs2 * bi[i]; 84 pj = b->j + bi[i]; 85 nz = bi[i + 1] - bi[i]; 86 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 87 88 /* Mark diagonal and invert diagonal for simpler triangular solves */ 89 pv = b->a + bs2 * bdiag[i]; 90 pj = b->j + bdiag[i]; 91 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 92 { 93 PetscBool zeropivotdetected; 94 95 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 96 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 97 } 98 99 /* U part */ 100 pv = b->a + bs2 * (bdiag[i + 1] + 1); 101 pj = b->j + bdiag[i + 1] + 1; 102 nz = bdiag[i] - bdiag[i + 1] - 1; 103 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 104 } 105 106 PetscCall(PetscFree2(rtmp, mwork)); 107 PetscCall(ISRestoreIndices(isicol, &ic)); 108 PetscCall(ISRestoreIndices(isrow, &r)); 109 110 C->ops->solve = MatSolve_SeqBAIJ_2; 111 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 112 C->assembled = PETSC_TRUE; 113 114 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 115 PetscFunctionReturn(PETSC_SUCCESS); 116 } 117 118 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 119 { 120 Mat C = B; 121 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 122 PetscInt i, j, k, nz, nzL, row, *pj; 123 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 124 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 125 MatScalar *rtmp, *pc, *mwork, *pv; 126 MatScalar *aa = a->a, *v; 127 PetscInt flg; 128 PetscReal shift = info->shiftamount; 129 PetscBool allowzeropivot, zeropivotdetected; 130 131 PetscFunctionBegin; 132 allowzeropivot = PetscNot(A->erroriffailure); 133 134 /* generate work space needed by the factorization */ 135 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 136 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 137 138 for (i = 0; i < n; i++) { 139 /* zero rtmp */ 140 /* L part */ 141 nz = bi[i + 1] - bi[i]; 142 bjtmp = bj + bi[i]; 143 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 144 145 /* U part */ 146 nz = bdiag[i] - bdiag[i + 1]; 147 bjtmp = bj + bdiag[i + 1] + 1; 148 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 149 150 /* load in initial (unfactored row) */ 151 nz = ai[i + 1] - ai[i]; 152 ajtmp = aj + ai[i]; 153 v = aa + bs2 * ai[i]; 154 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 155 156 /* elimination */ 157 bjtmp = bj + bi[i]; 158 nzL = bi[i + 1] - bi[i]; 159 for (k = 0; k < nzL; k++) { 160 row = bjtmp[k]; 161 pc = rtmp + bs2 * row; 162 for (flg = 0, j = 0; j < bs2; j++) { 163 if (pc[j] != (PetscScalar)0.0) { 164 flg = 1; 165 break; 166 } 167 } 168 if (flg) { 169 pv = b->a + bs2 * bdiag[row]; 170 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 171 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 172 173 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 174 pv = b->a + bs2 * (bdiag[row + 1] + 1); 175 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 176 for (j = 0; j < nz; j++) { 177 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 178 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 179 v = rtmp + 4 * pj[j]; 180 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 181 pv += 4; 182 } 183 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 184 } 185 } 186 187 /* finished row so stick it into b->a */ 188 /* L part */ 189 pv = b->a + bs2 * bi[i]; 190 pj = b->j + bi[i]; 191 nz = bi[i + 1] - bi[i]; 192 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 193 194 /* Mark diagonal and invert diagonal for simpler triangular solves */ 195 pv = b->a + bs2 * bdiag[i]; 196 pj = b->j + bdiag[i]; 197 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 198 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 199 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 200 201 /* U part */ 202 /* 203 pv = b->a + bs2*bi[2*n-i]; 204 pj = b->j + bi[2*n-i]; 205 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 206 */ 207 pv = b->a + bs2 * (bdiag[i + 1] + 1); 208 pj = b->j + bdiag[i + 1] + 1; 209 nz = bdiag[i] - bdiag[i + 1] - 1; 210 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 211 } 212 PetscCall(PetscFree2(rtmp, mwork)); 213 214 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 215 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering; 216 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering; 217 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 218 C->assembled = PETSC_TRUE; 219 220 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 221 PetscFunctionReturn(PETSC_SUCCESS); 222 } 223 224 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) 225 { 226 Mat C = B; 227 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 228 IS isrow = b->row, isicol = b->icol; 229 const PetscInt *r, *ic; 230 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 231 PetscInt *ajtmpold, *ajtmp, nz, row; 232 PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 233 MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4; 234 MatScalar p1, p2, p3, p4; 235 MatScalar *ba = b->a, *aa = a->a; 236 PetscReal shift = info->shiftamount; 237 PetscBool allowzeropivot, zeropivotdetected; 238 239 PetscFunctionBegin; 240 allowzeropivot = PetscNot(A->erroriffailure); 241 PetscCall(ISGetIndices(isrow, &r)); 242 PetscCall(ISGetIndices(isicol, &ic)); 243 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 244 245 for (i = 0; i < n; i++) { 246 nz = bi[i + 1] - bi[i]; 247 ajtmp = bj + bi[i]; 248 for (j = 0; j < nz; j++) { 249 x = rtmp + 4 * ajtmp[j]; 250 x[0] = x[1] = x[2] = x[3] = 0.0; 251 } 252 /* load in initial (unfactored row) */ 253 idx = r[i]; 254 nz = ai[idx + 1] - ai[idx]; 255 ajtmpold = aj + ai[idx]; 256 v = aa + 4 * ai[idx]; 257 for (j = 0; j < nz; j++) { 258 x = rtmp + 4 * ic[ajtmpold[j]]; 259 x[0] = v[0]; 260 x[1] = v[1]; 261 x[2] = v[2]; 262 x[3] = v[3]; 263 v += 4; 264 } 265 row = *ajtmp++; 266 while (row < i) { 267 pc = rtmp + 4 * row; 268 p1 = pc[0]; 269 p2 = pc[1]; 270 p3 = pc[2]; 271 p4 = pc[3]; 272 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 273 pv = ba + 4 * diag_offset[row]; 274 pj = bj + diag_offset[row] + 1; 275 x1 = pv[0]; 276 x2 = pv[1]; 277 x3 = pv[2]; 278 x4 = pv[3]; 279 pc[0] = m1 = p1 * x1 + p3 * x2; 280 pc[1] = m2 = p2 * x1 + p4 * x2; 281 pc[2] = m3 = p1 * x3 + p3 * x4; 282 pc[3] = m4 = p2 * x3 + p4 * x4; 283 nz = bi[row + 1] - diag_offset[row] - 1; 284 pv += 4; 285 for (j = 0; j < nz; j++) { 286 x1 = pv[0]; 287 x2 = pv[1]; 288 x3 = pv[2]; 289 x4 = pv[3]; 290 x = rtmp + 4 * pj[j]; 291 x[0] -= m1 * x1 + m3 * x2; 292 x[1] -= m2 * x1 + m4 * x2; 293 x[2] -= m1 * x3 + m3 * x4; 294 x[3] -= m2 * x3 + m4 * x4; 295 pv += 4; 296 } 297 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 298 } 299 row = *ajtmp++; 300 } 301 /* finished row so stick it into b->a */ 302 pv = ba + 4 * bi[i]; 303 pj = bj + bi[i]; 304 nz = bi[i + 1] - bi[i]; 305 for (j = 0; j < nz; j++) { 306 x = rtmp + 4 * pj[j]; 307 pv[0] = x[0]; 308 pv[1] = x[1]; 309 pv[2] = x[2]; 310 pv[3] = x[3]; 311 pv += 4; 312 } 313 /* invert diagonal block */ 314 w = ba + 4 * diag_offset[i]; 315 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 316 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 317 } 318 319 PetscCall(PetscFree(rtmp)); 320 PetscCall(ISRestoreIndices(isicol, &ic)); 321 PetscCall(ISRestoreIndices(isrow, &r)); 322 323 C->ops->solve = MatSolve_SeqBAIJ_2_inplace; 324 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace; 325 C->assembled = PETSC_TRUE; 326 327 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 328 PetscFunctionReturn(PETSC_SUCCESS); 329 } 330 /* 331 Version for when blocks are 2 by 2 Using natural ordering 332 */ 333 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 334 { 335 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 336 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 337 PetscInt *ajtmpold, *ajtmp, nz, row; 338 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 339 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 340 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4; 341 MatScalar *ba = b->a, *aa = a->a; 342 PetscReal shift = info->shiftamount; 343 PetscBool allowzeropivot, zeropivotdetected; 344 345 PetscFunctionBegin; 346 allowzeropivot = PetscNot(A->erroriffailure); 347 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 348 for (i = 0; i < n; i++) { 349 nz = bi[i + 1] - bi[i]; 350 ajtmp = bj + bi[i]; 351 for (j = 0; j < nz; j++) { 352 x = rtmp + 4 * ajtmp[j]; 353 x[0] = x[1] = x[2] = x[3] = 0.0; 354 } 355 /* load in initial (unfactored row) */ 356 nz = ai[i + 1] - ai[i]; 357 ajtmpold = aj + ai[i]; 358 v = aa + 4 * ai[i]; 359 for (j = 0; j < nz; j++) { 360 x = rtmp + 4 * ajtmpold[j]; 361 x[0] = v[0]; 362 x[1] = v[1]; 363 x[2] = v[2]; 364 x[3] = v[3]; 365 v += 4; 366 } 367 row = *ajtmp++; 368 while (row < i) { 369 pc = rtmp + 4 * row; 370 p1 = pc[0]; 371 p2 = pc[1]; 372 p3 = pc[2]; 373 p4 = pc[3]; 374 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 375 pv = ba + 4 * diag_offset[row]; 376 pj = bj + diag_offset[row] + 1; 377 x1 = pv[0]; 378 x2 = pv[1]; 379 x3 = pv[2]; 380 x4 = pv[3]; 381 pc[0] = m1 = p1 * x1 + p3 * x2; 382 pc[1] = m2 = p2 * x1 + p4 * x2; 383 pc[2] = m3 = p1 * x3 + p3 * x4; 384 pc[3] = m4 = p2 * x3 + p4 * x4; 385 nz = bi[row + 1] - diag_offset[row] - 1; 386 pv += 4; 387 for (j = 0; j < nz; j++) { 388 x1 = pv[0]; 389 x2 = pv[1]; 390 x3 = pv[2]; 391 x4 = pv[3]; 392 x = rtmp + 4 * pj[j]; 393 x[0] -= m1 * x1 + m3 * x2; 394 x[1] -= m2 * x1 + m4 * x2; 395 x[2] -= m1 * x3 + m3 * x4; 396 x[3] -= m2 * x3 + m4 * x4; 397 pv += 4; 398 } 399 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 400 } 401 row = *ajtmp++; 402 } 403 /* finished row so stick it into b->a */ 404 pv = ba + 4 * bi[i]; 405 pj = bj + bi[i]; 406 nz = bi[i + 1] - bi[i]; 407 for (j = 0; j < nz; j++) { 408 x = rtmp + 4 * pj[j]; 409 pv[0] = x[0]; 410 pv[1] = x[1]; 411 pv[2] = x[2]; 412 pv[3] = x[3]; 413 /* 414 printf(" col %d:",pj[j]); 415 PetscInt j1; 416 for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1)); 417 printf("\n"); 418 */ 419 pv += 4; 420 } 421 /* invert diagonal block */ 422 w = ba + 4 * diag_offset[i]; 423 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 424 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 425 } 426 427 PetscCall(PetscFree(rtmp)); 428 429 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace; 430 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace; 431 C->assembled = PETSC_TRUE; 432 433 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 434 PetscFunctionReturn(PETSC_SUCCESS); 435 } 436 437 /* 438 Version for when blocks are 1 by 1. 439 */ 440 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) 441 { 442 Mat C = B; 443 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 444 IS isrow = b->row, isicol = b->icol; 445 const PetscInt *r, *ic, *ics; 446 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 447 PetscInt i, j, k, nz, nzL, row, *pj; 448 const PetscInt *ajtmp, *bjtmp; 449 MatScalar *rtmp, *pc, multiplier, *pv; 450 const MatScalar *aa = a->a, *v; 451 PetscBool row_identity, col_identity; 452 FactorShiftCtx sctx; 453 const PetscInt *ddiag; 454 PetscReal rs; 455 MatScalar d; 456 457 PetscFunctionBegin; 458 /* MatPivotSetUp(): initialize shift context sctx */ 459 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 460 461 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 462 ddiag = a->diag; 463 sctx.shift_top = info->zeropivot; 464 for (i = 0; i < n; i++) { 465 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 466 d = (aa)[ddiag[i]]; 467 rs = -PetscAbsScalar(d) - PetscRealPart(d); 468 v = aa + ai[i]; 469 nz = ai[i + 1] - ai[i]; 470 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 471 if (rs > sctx.shift_top) sctx.shift_top = rs; 472 } 473 sctx.shift_top *= 1.1; 474 sctx.nshift_max = 5; 475 sctx.shift_lo = 0.; 476 sctx.shift_hi = 1.; 477 } 478 479 PetscCall(ISGetIndices(isrow, &r)); 480 PetscCall(ISGetIndices(isicol, &ic)); 481 PetscCall(PetscMalloc1(n + 1, &rtmp)); 482 ics = ic; 483 484 do { 485 sctx.newshift = PETSC_FALSE; 486 for (i = 0; i < n; i++) { 487 /* zero rtmp */ 488 /* L part */ 489 nz = bi[i + 1] - bi[i]; 490 bjtmp = bj + bi[i]; 491 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 492 493 /* U part */ 494 nz = bdiag[i] - bdiag[i + 1]; 495 bjtmp = bj + bdiag[i + 1] + 1; 496 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 497 498 /* load in initial (unfactored row) */ 499 nz = ai[r[i] + 1] - ai[r[i]]; 500 ajtmp = aj + ai[r[i]]; 501 v = aa + ai[r[i]]; 502 for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j]; 503 504 /* ZeropivotApply() */ 505 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 506 507 /* elimination */ 508 bjtmp = bj + bi[i]; 509 row = *bjtmp++; 510 nzL = bi[i + 1] - bi[i]; 511 for (k = 0; k < nzL; k++) { 512 pc = rtmp + row; 513 if (*pc != (PetscScalar)0.0) { 514 pv = b->a + bdiag[row]; 515 multiplier = *pc * (*pv); 516 *pc = multiplier; 517 518 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 519 pv = b->a + bdiag[row + 1] + 1; 520 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 521 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 522 PetscCall(PetscLogFlops(2.0 * nz)); 523 } 524 row = *bjtmp++; 525 } 526 527 /* finished row so stick it into b->a */ 528 rs = 0.0; 529 /* L part */ 530 pv = b->a + bi[i]; 531 pj = b->j + bi[i]; 532 nz = bi[i + 1] - bi[i]; 533 for (j = 0; j < nz; j++) { 534 pv[j] = rtmp[pj[j]]; 535 rs += PetscAbsScalar(pv[j]); 536 } 537 538 /* U part */ 539 pv = b->a + bdiag[i + 1] + 1; 540 pj = b->j + bdiag[i + 1] + 1; 541 nz = bdiag[i] - bdiag[i + 1] - 1; 542 for (j = 0; j < nz; j++) { 543 pv[j] = rtmp[pj[j]]; 544 rs += PetscAbsScalar(pv[j]); 545 } 546 547 sctx.rs = rs; 548 sctx.pv = rtmp[i]; 549 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 550 if (sctx.newshift) break; /* break for-loop */ 551 rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */ 552 553 /* Mark diagonal and invert diagonal for simpler triangular solves */ 554 pv = b->a + bdiag[i]; 555 *pv = (PetscScalar)1.0 / rtmp[i]; 556 557 } /* endof for (i=0; i<n; i++) { */ 558 559 /* MatPivotRefine() */ 560 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 561 /* 562 * if no shift in this attempt & shifting & started shifting & can refine, 563 * then try lower shift 564 */ 565 sctx.shift_hi = sctx.shift_fraction; 566 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 567 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 568 sctx.newshift = PETSC_TRUE; 569 sctx.nshift++; 570 } 571 } while (sctx.newshift); 572 573 PetscCall(PetscFree(rtmp)); 574 PetscCall(ISRestoreIndices(isicol, &ic)); 575 PetscCall(ISRestoreIndices(isrow, &r)); 576 577 PetscCall(ISIdentity(isrow, &row_identity)); 578 PetscCall(ISIdentity(isicol, &col_identity)); 579 if (row_identity && col_identity) { 580 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 581 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering; 582 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering; 583 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 584 } else { 585 C->ops->solve = MatSolve_SeqBAIJ_1; 586 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 587 } 588 C->assembled = PETSC_TRUE; 589 PetscCall(PetscLogFlops(C->cmap->n)); 590 591 /* MatShiftView(A,info,&sctx) */ 592 if (sctx.nshift) { 593 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 594 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 595 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 596 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 597 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 598 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 599 } 600 } 601 PetscFunctionReturn(PETSC_SUCCESS); 602 } 603 604 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) 605 { 606 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 607 IS isrow = b->row, isicol = b->icol; 608 const PetscInt *r, *ic; 609 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 610 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 611 PetscInt *diag_offset = b->diag, diag, *pj; 612 MatScalar *pv, *v, *rtmp, multiplier, *pc; 613 MatScalar *ba = b->a, *aa = a->a; 614 PetscBool row_identity, col_identity; 615 616 PetscFunctionBegin; 617 PetscCall(ISGetIndices(isrow, &r)); 618 PetscCall(ISGetIndices(isicol, &ic)); 619 PetscCall(PetscMalloc1(n + 1, &rtmp)); 620 621 for (i = 0; i < n; i++) { 622 nz = bi[i + 1] - bi[i]; 623 ajtmp = bj + bi[i]; 624 for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0; 625 626 /* load in initial (unfactored row) */ 627 nz = ai[r[i] + 1] - ai[r[i]]; 628 ajtmpold = aj + ai[r[i]]; 629 v = aa + ai[r[i]]; 630 for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 631 632 row = *ajtmp++; 633 while (row < i) { 634 pc = rtmp + row; 635 if (*pc != 0.0) { 636 pv = ba + diag_offset[row]; 637 pj = bj + diag_offset[row] + 1; 638 multiplier = *pc * *pv++; 639 *pc = multiplier; 640 nz = bi[row + 1] - diag_offset[row] - 1; 641 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 642 PetscCall(PetscLogFlops(1.0 + 2.0 * nz)); 643 } 644 row = *ajtmp++; 645 } 646 /* finished row so stick it into b->a */ 647 pv = ba + bi[i]; 648 pj = bj + bi[i]; 649 nz = bi[i + 1] - bi[i]; 650 for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]]; 651 diag = diag_offset[i] - bi[i]; 652 /* check pivot entry for current row */ 653 PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 654 pv[diag] = 1.0 / pv[diag]; 655 } 656 657 PetscCall(PetscFree(rtmp)); 658 PetscCall(ISRestoreIndices(isicol, &ic)); 659 PetscCall(ISRestoreIndices(isrow, &r)); 660 PetscCall(ISIdentity(isrow, &row_identity)); 661 PetscCall(ISIdentity(isicol, &col_identity)); 662 if (row_identity && col_identity) { 663 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace; 664 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace; 665 } else { 666 C->ops->solve = MatSolve_SeqBAIJ_1_inplace; 667 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace; 668 } 669 C->assembled = PETSC_TRUE; 670 PetscCall(PetscLogFlops(C->cmap->n)); 671 PetscFunctionReturn(PETSC_SUCCESS); 672 } 673 674 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) 675 { 676 PetscFunctionBegin; 677 *type = MATSOLVERPETSC; 678 PetscFunctionReturn(PETSC_SUCCESS); 679 } 680 681 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) 682 { 683 PetscInt n = A->rmap->n; 684 685 PetscFunctionBegin; 686 #if defined(PETSC_USE_COMPLEX) 687 PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported"); 688 #endif 689 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 690 PetscCall(MatSetSizes(*B, n, n, n, n)); 691 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 692 PetscCall(MatSetType(*B, MATSEQBAIJ)); 693 694 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ; 695 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ; 696 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 697 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 698 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 699 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 700 PetscCall(MatSetType(*B, MATSEQSBAIJ)); 701 PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL)); 702 703 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ; 704 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ; 705 /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */ 706 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 707 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 708 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported"); 709 (*B)->factortype = ftype; 710 (*B)->canuseordering = PETSC_TRUE; 711 712 PetscCall(PetscFree((*B)->solvertype)); 713 PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype)); 714 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc)); 715 PetscFunctionReturn(PETSC_SUCCESS); 716 } 717 718 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) 719 { 720 Mat C; 721 722 PetscFunctionBegin; 723 PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C)); 724 PetscCall(MatLUFactorSymbolic(C, A, row, col, info)); 725 PetscCall(MatLUFactorNumeric(C, A, info)); 726 727 A->ops->solve = C->ops->solve; 728 A->ops->solvetranspose = C->ops->solvetranspose; 729 730 PetscCall(MatHeaderMerge(A, &C)); 731 PetscFunctionReturn(PETSC_SUCCESS); 732 } 733 734 #include <../src/mat/impls/sbaij/seq/sbaij.h> 735 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) 736 { 737 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 738 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 739 IS ip = b->row; 740 const PetscInt *rip; 741 PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol; 742 PetscInt *ai = a->i, *aj = a->j; 743 PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz; 744 MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi; 745 PetscReal rs; 746 FactorShiftCtx sctx; 747 748 PetscFunctionBegin; 749 if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */ 750 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 751 PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info)); 752 PetscCall(MatDestroy(&a->sbaijMat)); 753 PetscFunctionReturn(PETSC_SUCCESS); 754 } 755 756 /* MatPivotSetUp(): initialize shift context sctx */ 757 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 758 759 PetscCall(ISGetIndices(ip, &rip)); 760 PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl)); 761 762 sctx.shift_amount = 0.; 763 sctx.nshift = 0; 764 do { 765 sctx.newshift = PETSC_FALSE; 766 for (i = 0; i < mbs; i++) { 767 rtmp[i] = 0.0; 768 jl[i] = mbs; 769 il[0] = 0; 770 } 771 772 for (k = 0; k < mbs; k++) { 773 bval = ba + bi[k]; 774 /* initialize k-th row by the perm[k]-th row of A */ 775 jmin = ai[rip[k]]; 776 jmax = ai[rip[k] + 1]; 777 for (j = jmin; j < jmax; j++) { 778 col = rip[aj[j]]; 779 if (col >= k) { /* only take upper triangular entry */ 780 rtmp[col] = aa[j]; 781 *bval++ = 0.0; /* for in-place factorization */ 782 } 783 } 784 785 /* shift the diagonal of the matrix */ 786 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 787 788 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 789 dk = rtmp[k]; 790 i = jl[k]; /* first row to be added to k_th row */ 791 792 while (i < k) { 793 nexti = jl[i]; /* next row to be added to k_th row */ 794 795 /* compute multiplier, update diag(k) and U(i,k) */ 796 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 797 uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */ 798 dk += uikdi * ba[ili]; 799 ba[ili] = uikdi; /* -U(i,k) */ 800 801 /* add multiple of row i to k-th row */ 802 jmin = ili + 1; 803 jmax = bi[i + 1]; 804 if (jmin < jmax) { 805 for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j]; 806 /* update il and jl for row i */ 807 il[i] = jmin; 808 j = bj[jmin]; 809 jl[i] = jl[j]; 810 jl[j] = i; 811 } 812 i = nexti; 813 } 814 815 /* shift the diagonals when zero pivot is detected */ 816 /* compute rs=sum of abs(off-diagonal) */ 817 rs = 0.0; 818 jmin = bi[k] + 1; 819 nz = bi[k + 1] - jmin; 820 if (nz) { 821 bcol = bj + jmin; 822 while (nz--) { 823 rs += PetscAbsScalar(rtmp[*bcol]); 824 bcol++; 825 } 826 } 827 828 sctx.rs = rs; 829 sctx.pv = dk; 830 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 831 if (sctx.newshift) break; 832 dk = sctx.pv; 833 834 /* copy data into U(k,:) */ 835 ba[bi[k]] = 1.0 / dk; /* U(k,k) */ 836 jmin = bi[k] + 1; 837 jmax = bi[k + 1]; 838 if (jmin < jmax) { 839 for (j = jmin; j < jmax; j++) { 840 col = bj[j]; 841 ba[j] = rtmp[col]; 842 rtmp[col] = 0.0; 843 } 844 /* add the k-th row into il and jl */ 845 il[k] = jmin; 846 i = bj[jmin]; 847 jl[k] = jl[i]; 848 jl[i] = k; 849 } 850 } 851 } while (sctx.newshift); 852 PetscCall(PetscFree3(rtmp, il, jl)); 853 854 PetscCall(ISRestoreIndices(ip, &rip)); 855 856 C->assembled = PETSC_TRUE; 857 C->preallocated = PETSC_TRUE; 858 859 PetscCall(PetscLogFlops(C->rmap->N)); 860 if (sctx.nshift) { 861 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 862 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 863 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 864 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 865 } 866 } 867 PetscFunctionReturn(PETSC_SUCCESS); 868 } 869 870 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) 871 { 872 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 873 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 874 PetscInt i, j, am = a->mbs; 875 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 876 PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz; 877 MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval; 878 PetscReal rs; 879 FactorShiftCtx sctx; 880 881 PetscFunctionBegin; 882 /* MatPivotSetUp(): initialize shift context sctx */ 883 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 884 885 PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl)); 886 887 do { 888 sctx.newshift = PETSC_FALSE; 889 for (i = 0; i < am; i++) { 890 rtmp[i] = 0.0; 891 jl[i] = am; 892 il[0] = 0; 893 } 894 895 for (k = 0; k < am; k++) { 896 /* initialize k-th row with elements nonzero in row perm(k) of A */ 897 nz = ai[k + 1] - ai[k]; 898 acol = aj + ai[k]; 899 aval = aa + ai[k]; 900 bval = ba + bi[k]; 901 while (nz--) { 902 if (*acol < k) { /* skip lower triangular entries */ 903 acol++; 904 aval++; 905 } else { 906 rtmp[*acol++] = *aval++; 907 *bval++ = 0.0; /* for in-place factorization */ 908 } 909 } 910 911 /* shift the diagonal of the matrix */ 912 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 913 914 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 915 dk = rtmp[k]; 916 i = jl[k]; /* first row to be added to k_th row */ 917 918 while (i < k) { 919 nexti = jl[i]; /* next row to be added to k_th row */ 920 /* compute multiplier, update D(k) and U(i,k) */ 921 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 922 uikdi = -ba[ili] * ba[bi[i]]; 923 dk += uikdi * ba[ili]; 924 ba[ili] = uikdi; /* -U(i,k) */ 925 926 /* add multiple of row i to k-th row ... */ 927 jmin = ili + 1; 928 nz = bi[i + 1] - jmin; 929 if (nz > 0) { 930 bcol = bj + jmin; 931 bval = ba + jmin; 932 while (nz--) rtmp[*bcol++] += uikdi * (*bval++); 933 /* update il and jl for i-th row */ 934 il[i] = jmin; 935 j = bj[jmin]; 936 jl[i] = jl[j]; 937 jl[j] = i; 938 } 939 i = nexti; 940 } 941 942 /* shift the diagonals when zero pivot is detected */ 943 /* compute rs=sum of abs(off-diagonal) */ 944 rs = 0.0; 945 jmin = bi[k] + 1; 946 nz = bi[k + 1] - jmin; 947 if (nz) { 948 bcol = bj + jmin; 949 while (nz--) { 950 rs += PetscAbsScalar(rtmp[*bcol]); 951 bcol++; 952 } 953 } 954 955 sctx.rs = rs; 956 sctx.pv = dk; 957 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 958 if (sctx.newshift) break; /* sctx.shift_amount is updated */ 959 dk = sctx.pv; 960 961 /* copy data into U(k,:) */ 962 ba[bi[k]] = 1.0 / dk; 963 jmin = bi[k] + 1; 964 nz = bi[k + 1] - jmin; 965 if (nz) { 966 bcol = bj + jmin; 967 bval = ba + jmin; 968 while (nz--) { 969 *bval++ = rtmp[*bcol]; 970 rtmp[*bcol++] = 0.0; 971 } 972 /* add k-th row into il and jl */ 973 il[k] = jmin; 974 i = bj[jmin]; 975 jl[k] = jl[i]; 976 jl[i] = k; 977 } 978 } 979 } while (sctx.newshift); 980 PetscCall(PetscFree3(rtmp, il, jl)); 981 982 C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 983 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 984 C->assembled = PETSC_TRUE; 985 C->preallocated = PETSC_TRUE; 986 987 PetscCall(PetscLogFlops(C->rmap->N)); 988 if (sctx.nshift) { 989 if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 990 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 991 } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 992 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 993 } 994 } 995 PetscFunctionReturn(PETSC_SUCCESS); 996 } 997 998 #include <petscbt.h> 999 #include <../src/mat/utils/freespace.h> 1000 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1001 { 1002 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1003 Mat_SeqSBAIJ *b; 1004 Mat B; 1005 PetscBool perm_identity, missing; 1006 PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui; 1007 const PetscInt *rip; 1008 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 1009 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 1010 PetscReal fill = info->fill, levels = info->levels; 1011 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1012 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 1013 PetscBT lnkbt; 1014 1015 PetscFunctionBegin; 1016 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1017 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1018 1019 if (bs > 1) { 1020 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1021 (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1022 1023 PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info)); 1024 PetscFunctionReturn(PETSC_SUCCESS); 1025 } 1026 1027 PetscCall(ISIdentity(perm, &perm_identity)); 1028 PetscCall(ISGetIndices(perm, &rip)); 1029 1030 /* special case that simply copies fill pattern */ 1031 if (!levels && perm_identity) { 1032 PetscCall(PetscMalloc1(am + 1, &ui)); 1033 for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */ 1034 B = fact; 1035 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui)); 1036 1037 b = (Mat_SeqSBAIJ *)B->data; 1038 uj = b->j; 1039 for (i = 0; i < am; i++) { 1040 aj = a->j + a->diag[i]; 1041 for (j = 0; j < ui[i]; j++) *uj++ = *aj++; 1042 b->ilen[i] = ui[i]; 1043 } 1044 PetscCall(PetscFree(ui)); 1045 1046 B->factortype = MAT_FACTOR_NONE; 1047 1048 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1049 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1050 B->factortype = MAT_FACTOR_ICC; 1051 1052 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1053 PetscFunctionReturn(PETSC_SUCCESS); 1054 } 1055 1056 /* initialization */ 1057 PetscCall(PetscMalloc1(am + 1, &ui)); 1058 ui[0] = 0; 1059 PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl)); 1060 1061 /* jl: linked list for storing indices of the pivot rows 1062 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 1063 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 1064 for (i = 0; i < am; i++) { 1065 jl[i] = am; 1066 il[i] = 0; 1067 } 1068 1069 /* create and initialize a linked list for storing column indices of the active row k */ 1070 nlnk = am + 1; 1071 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 1072 1073 /* initial FreeSpace size is fill*(ai[am]+am)/2 */ 1074 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space)); 1075 1076 current_space = free_space; 1077 1078 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl)); 1079 current_space_lvl = free_space_lvl; 1080 1081 for (k = 0; k < am; k++) { /* for each active row k */ 1082 /* initialize lnk by the column indices of row rip[k] of A */ 1083 nzk = 0; 1084 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1085 ncols_upper = 0; 1086 cols = cols_lvl + am; 1087 for (j = 0; j < ncols; j++) { 1088 i = rip[*(aj + ai[rip[k]] + j)]; 1089 if (i >= k) { /* only take upper triangular entry */ 1090 cols[ncols_upper] = i; 1091 cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */ 1092 ncols_upper++; 1093 } 1094 } 1095 PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1096 nzk += nlnk; 1097 1098 /* update lnk by computing fill-in for each pivot row to be merged in */ 1099 prow = jl[k]; /* 1st pivot row */ 1100 1101 while (prow < k) { 1102 nextprow = jl[prow]; 1103 1104 /* merge prow into k-th row */ 1105 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 1106 jmax = ui[prow + 1]; 1107 ncols = jmax - jmin; 1108 i = jmin - ui[prow]; 1109 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 1110 for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j); 1111 PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1112 nzk += nlnk; 1113 1114 /* update il and jl for prow */ 1115 if (jmin < jmax) { 1116 il[prow] = jmin; 1117 1118 j = *cols; 1119 jl[prow] = jl[j]; 1120 jl[j] = prow; 1121 } 1122 prow = nextprow; 1123 } 1124 1125 /* if free space is not available, make more free space */ 1126 if (current_space->local_remaining < nzk) { 1127 i = am - k + 1; /* num of unfactored rows */ 1128 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1129 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1130 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 1131 reallocs++; 1132 } 1133 1134 /* copy data into free_space and free_space_lvl, then initialize lnk */ 1135 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 1136 1137 /* add the k-th row into il and jl */ 1138 if (nzk - 1 > 0) { 1139 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 1140 jl[k] = jl[i]; 1141 jl[i] = k; 1142 il[k] = ui[k] + 1; 1143 } 1144 uj_ptr[k] = current_space->array; 1145 uj_lvl_ptr[k] = current_space_lvl->array; 1146 1147 current_space->array += nzk; 1148 current_space->local_used += nzk; 1149 current_space->local_remaining -= nzk; 1150 1151 current_space_lvl->array += nzk; 1152 current_space_lvl->local_used += nzk; 1153 current_space_lvl->local_remaining -= nzk; 1154 1155 ui[k + 1] = ui[k] + nzk; 1156 } 1157 1158 PetscCall(ISRestoreIndices(perm, &rip)); 1159 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 1160 PetscCall(PetscFree(cols_lvl)); 1161 1162 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1163 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 1164 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1165 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 1166 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 1167 1168 /* put together the new matrix in MATSEQSBAIJ format */ 1169 B = fact; 1170 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 1171 1172 b = (Mat_SeqSBAIJ *)B->data; 1173 b->singlemalloc = PETSC_FALSE; 1174 b->free_a = PETSC_TRUE; 1175 b->free_ij = PETSC_TRUE; 1176 1177 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 1178 1179 b->j = uj; 1180 b->i = ui; 1181 b->diag = NULL; 1182 b->ilen = NULL; 1183 b->imax = NULL; 1184 b->row = perm; 1185 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1186 1187 PetscCall(PetscObjectReference((PetscObject)perm)); 1188 1189 b->icol = perm; 1190 1191 PetscCall(PetscObjectReference((PetscObject)perm)); 1192 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 1193 1194 b->maxnz = b->nz = ui[am]; 1195 1196 B->info.factor_mallocs = reallocs; 1197 B->info.fill_ratio_given = fill; 1198 if (ai[am] != 0.) { 1199 /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */ 1200 B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am); 1201 } else { 1202 B->info.fill_ratio_needed = 0.0; 1203 } 1204 #if defined(PETSC_USE_INFO) 1205 if (ai[am] != 0) { 1206 PetscReal af = B->info.fill_ratio_needed; 1207 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1208 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1209 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1210 } else { 1211 PetscCall(PetscInfo(A, "Empty matrix\n")); 1212 } 1213 #endif 1214 if (perm_identity) { 1215 B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1216 B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1217 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1218 } else { 1219 (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1220 } 1221 PetscFunctionReturn(PETSC_SUCCESS); 1222 } 1223 1224 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1225 { 1226 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1227 Mat_SeqSBAIJ *b; 1228 Mat B; 1229 PetscBool perm_identity, missing; 1230 PetscReal fill = info->fill; 1231 const PetscInt *rip; 1232 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow; 1233 PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow; 1234 PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr; 1235 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1236 PetscBT lnkbt; 1237 1238 PetscFunctionBegin; 1239 if (bs > 1) { /* convert to seqsbaij */ 1240 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1241 (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1242 1243 PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info)); 1244 PetscFunctionReturn(PETSC_SUCCESS); 1245 } 1246 1247 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1248 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1249 1250 /* check whether perm is the identity mapping */ 1251 PetscCall(ISIdentity(perm, &perm_identity)); 1252 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported"); 1253 PetscCall(ISGetIndices(perm, &rip)); 1254 1255 /* initialization */ 1256 PetscCall(PetscMalloc1(mbs + 1, &ui)); 1257 ui[0] = 0; 1258 1259 /* jl: linked list for storing indices of the pivot rows 1260 il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 1261 PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols)); 1262 for (i = 0; i < mbs; i++) { 1263 jl[i] = mbs; 1264 il[i] = 0; 1265 } 1266 1267 /* create and initialize a linked list for storing column indices of the active row k */ 1268 nlnk = mbs + 1; 1269 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1270 1271 /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */ 1272 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space)); 1273 1274 current_space = free_space; 1275 1276 for (k = 0; k < mbs; k++) { /* for each active row k */ 1277 /* initialize lnk by the column indices of row rip[k] of A */ 1278 nzk = 0; 1279 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1280 ncols_upper = 0; 1281 for (j = 0; j < ncols; j++) { 1282 i = rip[*(aj + ai[rip[k]] + j)]; 1283 if (i >= k) { /* only take upper triangular entry */ 1284 cols[ncols_upper] = i; 1285 ncols_upper++; 1286 } 1287 } 1288 PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt)); 1289 nzk += nlnk; 1290 1291 /* update lnk by computing fill-in for each pivot row to be merged in */ 1292 prow = jl[k]; /* 1st pivot row */ 1293 1294 while (prow < k) { 1295 nextprow = jl[prow]; 1296 /* merge prow into k-th row */ 1297 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 1298 jmax = ui[prow + 1]; 1299 ncols = jmax - jmin; 1300 uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 1301 PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt)); 1302 nzk += nlnk; 1303 1304 /* update il and jl for prow */ 1305 if (jmin < jmax) { 1306 il[prow] = jmin; 1307 j = *uj_ptr; 1308 jl[prow] = jl[j]; 1309 jl[j] = prow; 1310 } 1311 prow = nextprow; 1312 } 1313 1314 /* if free space is not available, make more free space */ 1315 if (current_space->local_remaining < nzk) { 1316 i = mbs - k + 1; /* num of unfactored rows */ 1317 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1318 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1319 reallocs++; 1320 } 1321 1322 /* copy data into free space, then initialize lnk */ 1323 PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt)); 1324 1325 /* add the k-th row into il and jl */ 1326 if (nzk - 1 > 0) { 1327 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 1328 jl[k] = jl[i]; 1329 jl[i] = k; 1330 il[k] = ui[k] + 1; 1331 } 1332 ui_ptr[k] = current_space->array; 1333 current_space->array += nzk; 1334 current_space->local_used += nzk; 1335 current_space->local_remaining -= nzk; 1336 1337 ui[k + 1] = ui[k] + nzk; 1338 } 1339 1340 PetscCall(ISRestoreIndices(perm, &rip)); 1341 PetscCall(PetscFree4(ui_ptr, il, jl, cols)); 1342 1343 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1344 PetscCall(PetscMalloc1(ui[mbs] + 1, &uj)); 1345 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1346 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1347 1348 /* put together the new matrix in MATSEQSBAIJ format */ 1349 B = fact; 1350 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1351 1352 b = (Mat_SeqSBAIJ *)B->data; 1353 b->singlemalloc = PETSC_FALSE; 1354 b->free_a = PETSC_TRUE; 1355 b->free_ij = PETSC_TRUE; 1356 1357 PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a)); 1358 1359 b->j = uj; 1360 b->i = ui; 1361 b->diag = NULL; 1362 b->ilen = NULL; 1363 b->imax = NULL; 1364 b->row = perm; 1365 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1366 1367 PetscCall(PetscObjectReference((PetscObject)perm)); 1368 b->icol = perm; 1369 PetscCall(PetscObjectReference((PetscObject)perm)); 1370 PetscCall(PetscMalloc1(mbs + 1, &b->solve_work)); 1371 b->maxnz = b->nz = ui[mbs]; 1372 1373 B->info.factor_mallocs = reallocs; 1374 B->info.fill_ratio_given = fill; 1375 if (ai[mbs] != 0.) { 1376 /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */ 1377 B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs); 1378 } else { 1379 B->info.fill_ratio_needed = 0.0; 1380 } 1381 #if defined(PETSC_USE_INFO) 1382 if (ai[mbs] != 0.) { 1383 PetscReal af = B->info.fill_ratio_needed; 1384 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1385 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1386 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1387 } else { 1388 PetscCall(PetscInfo(A, "Empty matrix\n")); 1389 } 1390 #endif 1391 if (perm_identity) { 1392 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1393 } else { 1394 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1395 } 1396 PetscFunctionReturn(PETSC_SUCCESS); 1397 } 1398 1399 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) 1400 { 1401 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1402 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1403 PetscInt i, k, n = a->mbs; 1404 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1405 const MatScalar *aa = a->a, *v; 1406 PetscScalar *x, *s, *t, *ls; 1407 const PetscScalar *b; 1408 1409 PetscFunctionBegin; 1410 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 1411 PetscCall(VecGetArrayRead(bb, &b)); 1412 PetscCall(VecGetArray(xx, &x)); 1413 t = a->solve_work; 1414 1415 /* forward solve the lower triangular */ 1416 PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */ 1417 1418 for (i = 1; i < n; i++) { 1419 v = aa + bs2 * ai[i]; 1420 vi = aj + ai[i]; 1421 nz = ai[i + 1] - ai[i]; 1422 s = t + bs * i; 1423 PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */ 1424 for (k = 0; k < nz; k++) { 1425 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]); 1426 v += bs2; 1427 } 1428 } 1429 1430 /* backward solve the upper triangular */ 1431 ls = a->solve_work + A->cmap->n; 1432 for (i = n - 1; i >= 0; i--) { 1433 v = aa + bs2 * (adiag[i + 1] + 1); 1434 vi = aj + adiag[i + 1] + 1; 1435 nz = adiag[i] - adiag[i + 1] - 1; 1436 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1437 for (k = 0; k < nz; k++) { 1438 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]); 1439 v += bs2; 1440 } 1441 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */ 1442 PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs)); 1443 } 1444 1445 PetscCall(VecRestoreArrayRead(bb, &b)); 1446 PetscCall(VecRestoreArray(xx, &x)); 1447 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1448 PetscFunctionReturn(PETSC_SUCCESS); 1449 } 1450 1451 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) 1452 { 1453 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1454 IS iscol = a->col, isrow = a->row; 1455 const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1456 PetscInt i, m, n = a->mbs; 1457 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1458 const MatScalar *aa = a->a, *v; 1459 PetscScalar *x, *s, *t, *ls; 1460 const PetscScalar *b; 1461 1462 PetscFunctionBegin; 1463 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 1464 PetscCall(VecGetArrayRead(bb, &b)); 1465 PetscCall(VecGetArray(xx, &x)); 1466 t = a->solve_work; 1467 1468 PetscCall(ISGetIndices(isrow, &rout)); 1469 r = rout; 1470 PetscCall(ISGetIndices(iscol, &cout)); 1471 c = cout; 1472 1473 /* forward solve the lower triangular */ 1474 PetscCall(PetscArraycpy(t, b + bs * r[0], bs)); 1475 for (i = 1; i < n; i++) { 1476 v = aa + bs2 * ai[i]; 1477 vi = aj + ai[i]; 1478 nz = ai[i + 1] - ai[i]; 1479 s = t + bs * i; 1480 PetscCall(PetscArraycpy(s, b + bs * r[i], bs)); 1481 for (m = 0; m < nz; m++) { 1482 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]); 1483 v += bs2; 1484 } 1485 } 1486 1487 /* backward solve the upper triangular */ 1488 ls = a->solve_work + A->cmap->n; 1489 for (i = n - 1; i >= 0; i--) { 1490 v = aa + bs2 * (adiag[i + 1] + 1); 1491 vi = aj + adiag[i + 1] + 1; 1492 nz = adiag[i] - adiag[i + 1] - 1; 1493 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1494 for (m = 0; m < nz; m++) { 1495 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]); 1496 v += bs2; 1497 } 1498 PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */ 1499 PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs)); 1500 } 1501 PetscCall(ISRestoreIndices(isrow, &rout)); 1502 PetscCall(ISRestoreIndices(iscol, &cout)); 1503 PetscCall(VecRestoreArrayRead(bb, &b)); 1504 PetscCall(VecRestoreArray(xx, &x)); 1505 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1506 PetscFunctionReturn(PETSC_SUCCESS); 1507 } 1508 1509 /* 1510 For each block in an block array saves the largest absolute value in the block into another array 1511 */ 1512 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray) 1513 { 1514 PetscInt i, j; 1515 1516 PetscFunctionBegin; 1517 PetscCall(PetscArrayzero(absarray, nbs + 1)); 1518 for (i = 0; i < nbs; i++) { 1519 for (j = 0; j < bs2; j++) { 1520 if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]); 1521 } 1522 } 1523 PetscFunctionReturn(PETSC_SUCCESS); 1524 } 1525 1526 /* 1527 This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used 1528 */ 1529 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact) 1530 { 1531 Mat B = *fact; 1532 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 1533 IS isicol; 1534 const PetscInt *r, *ic; 1535 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag; 1536 PetscInt *bi, *bj, *bdiag; 1537 1538 PetscInt row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au; 1539 PetscInt nlnk, *lnk; 1540 PetscBT lnkbt; 1541 PetscBool row_identity, icol_identity; 1542 MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp; 1543 PetscInt j, nz, *pj, *bjtmp, k, ncut, *jtmp; 1544 1545 PetscReal dt = info->dt; /* shift=info->shiftamount; */ 1546 PetscInt nnz_max; 1547 PetscBool missing; 1548 PetscReal *vtmp_abs; 1549 MatScalar *v_work; 1550 PetscInt *v_pivots; 1551 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1552 1553 PetscFunctionBegin; 1554 /* symbolic factorization, can be reused */ 1555 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1556 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1557 adiag = a->diag; 1558 1559 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 1560 1561 /* bdiag is location of diagonal in factor */ 1562 PetscCall(PetscMalloc1(mbs + 1, &bdiag)); 1563 1564 /* allocate row pointers bi */ 1565 PetscCall(PetscMalloc1(2 * mbs + 2, &bi)); 1566 1567 /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */ 1568 dtcount = (PetscInt)info->dtcount; 1569 if (dtcount > mbs - 1) dtcount = mbs - 1; 1570 nnz_max = ai[mbs] + 2 * mbs * dtcount + 2; 1571 /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */ 1572 PetscCall(PetscMalloc1(nnz_max, &bj)); 1573 nnz_max = nnz_max * bs2; 1574 PetscCall(PetscMalloc1(nnz_max, &ba)); 1575 1576 /* put together the new matrix */ 1577 PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1578 1579 b = (Mat_SeqBAIJ *)(B)->data; 1580 b->free_a = PETSC_TRUE; 1581 b->free_ij = PETSC_TRUE; 1582 b->singlemalloc = PETSC_FALSE; 1583 1584 b->a = ba; 1585 b->j = bj; 1586 b->i = bi; 1587 b->diag = bdiag; 1588 b->ilen = NULL; 1589 b->imax = NULL; 1590 b->row = isrow; 1591 b->col = iscol; 1592 1593 PetscCall(PetscObjectReference((PetscObject)isrow)); 1594 PetscCall(PetscObjectReference((PetscObject)iscol)); 1595 1596 b->icol = isicol; 1597 PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work)); 1598 b->maxnz = nnz_max / bs2; 1599 1600 (B)->factortype = MAT_FACTOR_ILUDT; 1601 (B)->info.factor_mallocs = 0; 1602 (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2)); 1603 /* end of symbolic factorization */ 1604 PetscCall(ISGetIndices(isrow, &r)); 1605 PetscCall(ISGetIndices(isicol, &ic)); 1606 1607 /* linked list for storing column indices of the active row */ 1608 nlnk = mbs + 1; 1609 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1610 1611 /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */ 1612 PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp)); 1613 /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */ 1614 PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp)); 1615 PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs)); 1616 PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots)); 1617 1618 allowzeropivot = PetscNot(A->erroriffailure); 1619 bi[0] = 0; 1620 bdiag[0] = (nnz_max / bs2) - 1; /* location of diagonal in factor B */ 1621 bi[2 * mbs + 1] = bdiag[0] + 1; /* endof bj and ba array */ 1622 for (i = 0; i < mbs; i++) { 1623 /* copy initial fill into linked list */ 1624 nzi = ai[r[i] + 1] - ai[r[i]]; 1625 PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 1626 nzi_al = adiag[r[i]] - ai[r[i]]; 1627 nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1; 1628 1629 /* load in initial unfactored row */ 1630 ajtmp = aj + ai[r[i]]; 1631 PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt)); 1632 PetscCall(PetscArrayzero(rtmp, mbs * bs2)); 1633 aatmp = a->a + bs2 * ai[r[i]]; 1634 for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2)); 1635 1636 /* add pivot rows into linked list */ 1637 row = lnk[mbs]; 1638 while (row < i) { 1639 nzi_bl = bi[row + 1] - bi[row] + 1; 1640 bjtmp = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */ 1641 PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im)); 1642 nzi += nlnk; 1643 row = lnk[row]; 1644 } 1645 1646 /* copy data from lnk into jtmp, then initialize lnk */ 1647 PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt)); 1648 1649 /* numerical factorization */ 1650 bjtmp = jtmp; 1651 row = *bjtmp++; /* 1st pivot row */ 1652 1653 while (row < i) { 1654 pc = rtmp + bs2 * row; 1655 pv = ba + bs2 * bdiag[row]; /* inv(diag) of the pivot row */ 1656 PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */ 1657 PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs)); 1658 if (vtmp_abs[0] > dt) { /* apply tolerance dropping rule */ 1659 pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */ 1660 pv = ba + bs2 * (bdiag[row + 1] + 1); 1661 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */ 1662 for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); 1663 /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */ 1664 } 1665 row = *bjtmp++; 1666 } 1667 1668 /* copy sparse rtmp into contiguous vtmp; separate L and U part */ 1669 nzi_bl = 0; 1670 j = 0; 1671 while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */ 1672 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1673 nzi_bl++; 1674 j++; 1675 } 1676 nzi_bu = nzi - nzi_bl - 1; 1677 1678 while (j < nzi) { /* U-part */ 1679 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1680 j++; 1681 } 1682 1683 PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs)); 1684 1685 bjtmp = bj + bi[i]; 1686 batmp = ba + bs2 * bi[i]; 1687 /* apply level dropping rule to L part */ 1688 ncut = nzi_al + dtcount; 1689 if (ncut < nzi_bl) { 1690 PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp)); 1691 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp)); 1692 } else { 1693 ncut = nzi_bl; 1694 } 1695 for (j = 0; j < ncut; j++) { 1696 bjtmp[j] = jtmp[j]; 1697 PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2)); 1698 } 1699 bi[i + 1] = bi[i] + ncut; 1700 nzi = ncut + 1; 1701 1702 /* apply level dropping rule to U part */ 1703 ncut = nzi_au + dtcount; 1704 if (ncut < nzi_bu) { 1705 PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1)); 1706 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1)); 1707 } else { 1708 ncut = nzi_bu; 1709 } 1710 nzi += ncut; 1711 1712 /* mark bdiagonal */ 1713 bdiag[i + 1] = bdiag[i] - (ncut + 1); 1714 bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1); 1715 1716 bjtmp = bj + bdiag[i]; 1717 batmp = ba + bs2 * bdiag[i]; 1718 PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2)); 1719 *bjtmp = i; 1720 1721 bjtmp = bj + bdiag[i + 1] + 1; 1722 batmp = ba + (bdiag[i + 1] + 1) * bs2; 1723 1724 for (k = 0; k < ncut; k++) { 1725 bjtmp[k] = jtmp[nzi_bl + 1 + k]; 1726 PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2)); 1727 } 1728 1729 im[i] = nzi; /* used by PetscLLAddSortedLU() */ 1730 1731 /* invert diagonal block for simpler triangular solves - add shift??? */ 1732 batmp = ba + bs2 * bdiag[i]; 1733 1734 PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 1735 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1736 } /* for (i=0; i<mbs; i++) */ 1737 PetscCall(PetscFree3(v_work, multiplier, v_pivots)); 1738 1739 /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */ 1740 PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]); 1741 1742 PetscCall(ISRestoreIndices(isrow, &r)); 1743 PetscCall(ISRestoreIndices(isicol, &ic)); 1744 1745 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1746 1747 PetscCall(PetscFree2(im, jtmp)); 1748 PetscCall(PetscFree2(rtmp, vtmp)); 1749 1750 PetscCall(PetscLogFlops(bs2 * B->cmap->n)); 1751 b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs]; 1752 1753 PetscCall(ISIdentity(isrow, &row_identity)); 1754 PetscCall(ISIdentity(isicol, &icol_identity)); 1755 if (row_identity && icol_identity) { 1756 B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 1757 } else { 1758 B->ops->solve = MatSolve_SeqBAIJ_N; 1759 } 1760 1761 B->ops->solveadd = NULL; 1762 B->ops->solvetranspose = NULL; 1763 B->ops->solvetransposeadd = NULL; 1764 B->ops->matsolve = NULL; 1765 B->assembled = PETSC_TRUE; 1766 B->preallocated = PETSC_TRUE; 1767 PetscFunctionReturn(PETSC_SUCCESS); 1768 } 1769