1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 8 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) 9 { 10 Mat C = B; 11 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 12 IS isrow = b->row, isicol = b->icol; 13 const PetscInt *r, *ic; 14 PetscInt i, j, k, nz, nzL, row, *pj; 15 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 16 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 17 MatScalar *rtmp, *pc, *mwork, *pv; 18 MatScalar *aa = a->a, *v; 19 PetscInt flg; 20 PetscReal shift = info->shiftamount; 21 PetscBool allowzeropivot, zeropivotdetected; 22 23 PetscFunctionBegin; 24 PetscCall(ISGetIndices(isrow, &r)); 25 PetscCall(ISGetIndices(isicol, &ic)); 26 allowzeropivot = PetscNot(A->erroriffailure); 27 28 /* generate work space needed by the factorization */ 29 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 30 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 31 32 for (i = 0; i < n; i++) { 33 /* zero rtmp */ 34 /* L part */ 35 nz = bi[i + 1] - bi[i]; 36 bjtmp = bj + bi[i]; 37 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 38 39 /* U part */ 40 nz = bdiag[i] - bdiag[i + 1]; 41 bjtmp = bj + bdiag[i + 1] + 1; 42 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 43 44 /* load in initial (unfactored row) */ 45 nz = ai[r[i] + 1] - ai[r[i]]; 46 ajtmp = aj + ai[r[i]]; 47 v = aa + bs2 * ai[r[i]]; 48 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 49 50 /* elimination */ 51 bjtmp = bj + bi[i]; 52 nzL = bi[i + 1] - bi[i]; 53 for (k = 0; k < nzL; k++) { 54 row = bjtmp[k]; 55 pc = rtmp + bs2 * row; 56 for (flg = 0, j = 0; j < bs2; j++) { 57 if (pc[j] != (PetscScalar)0.0) { 58 flg = 1; 59 break; 60 } 61 } 62 if (flg) { 63 pv = b->a + bs2 * bdiag[row]; 64 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 65 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 66 67 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 68 pv = b->a + bs2 * (bdiag[row + 1] + 1); 69 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 70 for (j = 0; j < nz; j++) { 71 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 72 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 73 v = rtmp + 4 * pj[j]; 74 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 75 pv += 4; 76 } 77 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 78 } 79 } 80 81 /* finished row so stick it into b->a */ 82 /* L part */ 83 pv = b->a + bs2 * bi[i]; 84 pj = b->j + bi[i]; 85 nz = bi[i + 1] - bi[i]; 86 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 87 88 /* Mark diagonal and invert diagonal for simpler triangular solves */ 89 pv = b->a + bs2 * bdiag[i]; 90 pj = b->j + bdiag[i]; 91 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 92 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 93 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 94 95 /* U part */ 96 pv = b->a + bs2 * (bdiag[i + 1] + 1); 97 pj = b->j + bdiag[i + 1] + 1; 98 nz = bdiag[i] - bdiag[i + 1] - 1; 99 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 100 } 101 102 PetscCall(PetscFree2(rtmp, mwork)); 103 PetscCall(ISRestoreIndices(isicol, &ic)); 104 PetscCall(ISRestoreIndices(isrow, &r)); 105 106 C->ops->solve = MatSolve_SeqBAIJ_2; 107 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 108 C->assembled = PETSC_TRUE; 109 110 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 111 PetscFunctionReturn(PETSC_SUCCESS); 112 } 113 114 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 115 { 116 Mat C = B; 117 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 118 PetscInt i, j, k, nz, nzL, row, *pj; 119 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 120 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 121 MatScalar *rtmp, *pc, *mwork, *pv; 122 MatScalar *aa = a->a, *v; 123 PetscInt flg; 124 PetscReal shift = info->shiftamount; 125 PetscBool allowzeropivot, zeropivotdetected; 126 127 PetscFunctionBegin; 128 allowzeropivot = PetscNot(A->erroriffailure); 129 130 /* generate work space needed by the factorization */ 131 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 132 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 133 134 for (i = 0; i < n; i++) { 135 /* zero rtmp */ 136 /* L part */ 137 nz = bi[i + 1] - bi[i]; 138 bjtmp = bj + bi[i]; 139 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 140 141 /* U part */ 142 nz = bdiag[i] - bdiag[i + 1]; 143 bjtmp = bj + bdiag[i + 1] + 1; 144 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 145 146 /* load in initial (unfactored row) */ 147 nz = ai[i + 1] - ai[i]; 148 ajtmp = aj + ai[i]; 149 v = aa + bs2 * ai[i]; 150 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 151 152 /* elimination */ 153 bjtmp = bj + bi[i]; 154 nzL = bi[i + 1] - bi[i]; 155 for (k = 0; k < nzL; k++) { 156 row = bjtmp[k]; 157 pc = rtmp + bs2 * row; 158 for (flg = 0, j = 0; j < bs2; j++) { 159 if (pc[j] != (PetscScalar)0.0) { 160 flg = 1; 161 break; 162 } 163 } 164 if (flg) { 165 pv = b->a + bs2 * bdiag[row]; 166 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 167 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 168 169 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 170 pv = b->a + bs2 * (bdiag[row + 1] + 1); 171 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 172 for (j = 0; j < nz; j++) { 173 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 174 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 175 v = rtmp + 4 * pj[j]; 176 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 177 pv += 4; 178 } 179 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 180 } 181 } 182 183 /* finished row so stick it into b->a */ 184 /* L part */ 185 pv = b->a + bs2 * bi[i]; 186 pj = b->j + bi[i]; 187 nz = bi[i + 1] - bi[i]; 188 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 189 190 /* Mark diagonal and invert diagonal for simpler triangular solves */ 191 pv = b->a + bs2 * bdiag[i]; 192 pj = b->j + bdiag[i]; 193 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 194 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 195 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 196 197 /* U part */ 198 /* 199 pv = b->a + bs2*bi[2*n-i]; 200 pj = b->j + bi[2*n-i]; 201 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 202 */ 203 pv = b->a + bs2 * (bdiag[i + 1] + 1); 204 pj = b->j + bdiag[i + 1] + 1; 205 nz = bdiag[i] - bdiag[i + 1] - 1; 206 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 207 } 208 PetscCall(PetscFree2(rtmp, mwork)); 209 210 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 211 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering; 212 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering; 213 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 214 C->assembled = PETSC_TRUE; 215 216 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 217 PetscFunctionReturn(PETSC_SUCCESS); 218 } 219 220 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) 221 { 222 Mat C = B; 223 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 224 IS isrow = b->row, isicol = b->icol; 225 const PetscInt *r, *ic; 226 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 227 PetscInt *ajtmpold, *ajtmp, nz, row; 228 PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 229 MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4; 230 MatScalar p1, p2, p3, p4; 231 MatScalar *ba = b->a, *aa = a->a; 232 PetscReal shift = info->shiftamount; 233 PetscBool allowzeropivot, zeropivotdetected; 234 235 PetscFunctionBegin; 236 allowzeropivot = PetscNot(A->erroriffailure); 237 PetscCall(ISGetIndices(isrow, &r)); 238 PetscCall(ISGetIndices(isicol, &ic)); 239 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 240 241 for (i = 0; i < n; i++) { 242 nz = bi[i + 1] - bi[i]; 243 ajtmp = bj + bi[i]; 244 for (j = 0; j < nz; j++) { 245 x = rtmp + 4 * ajtmp[j]; 246 x[0] = x[1] = x[2] = x[3] = 0.0; 247 } 248 /* load in initial (unfactored row) */ 249 idx = r[i]; 250 nz = ai[idx + 1] - ai[idx]; 251 ajtmpold = aj + ai[idx]; 252 v = aa + 4 * ai[idx]; 253 for (j = 0; j < nz; j++) { 254 x = rtmp + 4 * ic[ajtmpold[j]]; 255 x[0] = v[0]; 256 x[1] = v[1]; 257 x[2] = v[2]; 258 x[3] = v[3]; 259 v += 4; 260 } 261 row = *ajtmp++; 262 while (row < i) { 263 pc = rtmp + 4 * row; 264 p1 = pc[0]; 265 p2 = pc[1]; 266 p3 = pc[2]; 267 p4 = pc[3]; 268 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 269 pv = ba + 4 * diag_offset[row]; 270 pj = bj + diag_offset[row] + 1; 271 x1 = pv[0]; 272 x2 = pv[1]; 273 x3 = pv[2]; 274 x4 = pv[3]; 275 pc[0] = m1 = p1 * x1 + p3 * x2; 276 pc[1] = m2 = p2 * x1 + p4 * x2; 277 pc[2] = m3 = p1 * x3 + p3 * x4; 278 pc[3] = m4 = p2 * x3 + p4 * x4; 279 nz = bi[row + 1] - diag_offset[row] - 1; 280 pv += 4; 281 for (j = 0; j < nz; j++) { 282 x1 = pv[0]; 283 x2 = pv[1]; 284 x3 = pv[2]; 285 x4 = pv[3]; 286 x = rtmp + 4 * pj[j]; 287 x[0] -= m1 * x1 + m3 * x2; 288 x[1] -= m2 * x1 + m4 * x2; 289 x[2] -= m1 * x3 + m3 * x4; 290 x[3] -= m2 * x3 + m4 * x4; 291 pv += 4; 292 } 293 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 294 } 295 row = *ajtmp++; 296 } 297 /* finished row so stick it into b->a */ 298 pv = ba + 4 * bi[i]; 299 pj = bj + bi[i]; 300 nz = bi[i + 1] - bi[i]; 301 for (j = 0; j < nz; j++) { 302 x = rtmp + 4 * pj[j]; 303 pv[0] = x[0]; 304 pv[1] = x[1]; 305 pv[2] = x[2]; 306 pv[3] = x[3]; 307 pv += 4; 308 } 309 /* invert diagonal block */ 310 w = ba + 4 * diag_offset[i]; 311 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 312 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 313 } 314 315 PetscCall(PetscFree(rtmp)); 316 PetscCall(ISRestoreIndices(isicol, &ic)); 317 PetscCall(ISRestoreIndices(isrow, &r)); 318 319 C->ops->solve = MatSolve_SeqBAIJ_2_inplace; 320 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace; 321 C->assembled = PETSC_TRUE; 322 323 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 324 PetscFunctionReturn(PETSC_SUCCESS); 325 } 326 /* 327 Version for when blocks are 2 by 2 Using natural ordering 328 */ 329 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 330 { 331 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 332 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 333 PetscInt *ajtmpold, *ajtmp, nz, row; 334 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 335 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 336 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4; 337 MatScalar *ba = b->a, *aa = a->a; 338 PetscReal shift = info->shiftamount; 339 PetscBool allowzeropivot, zeropivotdetected; 340 341 PetscFunctionBegin; 342 allowzeropivot = PetscNot(A->erroriffailure); 343 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 344 for (i = 0; i < n; i++) { 345 nz = bi[i + 1] - bi[i]; 346 ajtmp = bj + bi[i]; 347 for (j = 0; j < nz; j++) { 348 x = rtmp + 4 * ajtmp[j]; 349 x[0] = x[1] = x[2] = x[3] = 0.0; 350 } 351 /* load in initial (unfactored row) */ 352 nz = ai[i + 1] - ai[i]; 353 ajtmpold = aj + ai[i]; 354 v = aa + 4 * ai[i]; 355 for (j = 0; j < nz; j++) { 356 x = rtmp + 4 * ajtmpold[j]; 357 x[0] = v[0]; 358 x[1] = v[1]; 359 x[2] = v[2]; 360 x[3] = v[3]; 361 v += 4; 362 } 363 row = *ajtmp++; 364 while (row < i) { 365 pc = rtmp + 4 * row; 366 p1 = pc[0]; 367 p2 = pc[1]; 368 p3 = pc[2]; 369 p4 = pc[3]; 370 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 371 pv = ba + 4 * diag_offset[row]; 372 pj = bj + diag_offset[row] + 1; 373 x1 = pv[0]; 374 x2 = pv[1]; 375 x3 = pv[2]; 376 x4 = pv[3]; 377 pc[0] = m1 = p1 * x1 + p3 * x2; 378 pc[1] = m2 = p2 * x1 + p4 * x2; 379 pc[2] = m3 = p1 * x3 + p3 * x4; 380 pc[3] = m4 = p2 * x3 + p4 * x4; 381 nz = bi[row + 1] - diag_offset[row] - 1; 382 pv += 4; 383 for (j = 0; j < nz; j++) { 384 x1 = pv[0]; 385 x2 = pv[1]; 386 x3 = pv[2]; 387 x4 = pv[3]; 388 x = rtmp + 4 * pj[j]; 389 x[0] -= m1 * x1 + m3 * x2; 390 x[1] -= m2 * x1 + m4 * x2; 391 x[2] -= m1 * x3 + m3 * x4; 392 x[3] -= m2 * x3 + m4 * x4; 393 pv += 4; 394 } 395 PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 396 } 397 row = *ajtmp++; 398 } 399 /* finished row so stick it into b->a */ 400 pv = ba + 4 * bi[i]; 401 pj = bj + bi[i]; 402 nz = bi[i + 1] - bi[i]; 403 for (j = 0; j < nz; j++) { 404 x = rtmp + 4 * pj[j]; 405 pv[0] = x[0]; 406 pv[1] = x[1]; 407 pv[2] = x[2]; 408 pv[3] = x[3]; 409 /* 410 printf(" col %d:",pj[j]); 411 PetscInt j1; 412 for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1)); 413 printf("\n"); 414 */ 415 pv += 4; 416 } 417 /* invert diagonal block */ 418 w = ba + 4 * diag_offset[i]; 419 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 420 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 421 } 422 423 PetscCall(PetscFree(rtmp)); 424 425 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace; 426 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace; 427 C->assembled = PETSC_TRUE; 428 429 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 430 PetscFunctionReturn(PETSC_SUCCESS); 431 } 432 433 /* ----------------------------------------------------------- */ 434 /* 435 Version for when blocks are 1 by 1. 436 */ 437 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) 438 { 439 Mat C = B; 440 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 441 IS isrow = b->row, isicol = b->icol; 442 const PetscInt *r, *ic, *ics; 443 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 444 PetscInt i, j, k, nz, nzL, row, *pj; 445 const PetscInt *ajtmp, *bjtmp; 446 MatScalar *rtmp, *pc, multiplier, *pv; 447 const MatScalar *aa = a->a, *v; 448 PetscBool row_identity, col_identity; 449 FactorShiftCtx sctx; 450 const PetscInt *ddiag; 451 PetscReal rs; 452 MatScalar d; 453 454 PetscFunctionBegin; 455 /* MatPivotSetUp(): initialize shift context sctx */ 456 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 457 458 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 459 ddiag = a->diag; 460 sctx.shift_top = info->zeropivot; 461 for (i = 0; i < n; i++) { 462 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 463 d = (aa)[ddiag[i]]; 464 rs = -PetscAbsScalar(d) - PetscRealPart(d); 465 v = aa + ai[i]; 466 nz = ai[i + 1] - ai[i]; 467 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 468 if (rs > sctx.shift_top) sctx.shift_top = rs; 469 } 470 sctx.shift_top *= 1.1; 471 sctx.nshift_max = 5; 472 sctx.shift_lo = 0.; 473 sctx.shift_hi = 1.; 474 } 475 476 PetscCall(ISGetIndices(isrow, &r)); 477 PetscCall(ISGetIndices(isicol, &ic)); 478 PetscCall(PetscMalloc1(n + 1, &rtmp)); 479 ics = ic; 480 481 do { 482 sctx.newshift = PETSC_FALSE; 483 for (i = 0; i < n; i++) { 484 /* zero rtmp */ 485 /* L part */ 486 nz = bi[i + 1] - bi[i]; 487 bjtmp = bj + bi[i]; 488 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 489 490 /* U part */ 491 nz = bdiag[i] - bdiag[i + 1]; 492 bjtmp = bj + bdiag[i + 1] + 1; 493 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 494 495 /* load in initial (unfactored row) */ 496 nz = ai[r[i] + 1] - ai[r[i]]; 497 ajtmp = aj + ai[r[i]]; 498 v = aa + ai[r[i]]; 499 for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j]; 500 501 /* ZeropivotApply() */ 502 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 503 504 /* elimination */ 505 bjtmp = bj + bi[i]; 506 row = *bjtmp++; 507 nzL = bi[i + 1] - bi[i]; 508 for (k = 0; k < nzL; k++) { 509 pc = rtmp + row; 510 if (*pc != (PetscScalar)0.0) { 511 pv = b->a + bdiag[row]; 512 multiplier = *pc * (*pv); 513 *pc = multiplier; 514 515 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 516 pv = b->a + bdiag[row + 1] + 1; 517 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 518 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 519 PetscCall(PetscLogFlops(2.0 * nz)); 520 } 521 row = *bjtmp++; 522 } 523 524 /* finished row so stick it into b->a */ 525 rs = 0.0; 526 /* L part */ 527 pv = b->a + bi[i]; 528 pj = b->j + bi[i]; 529 nz = bi[i + 1] - bi[i]; 530 for (j = 0; j < nz; j++) { 531 pv[j] = rtmp[pj[j]]; 532 rs += PetscAbsScalar(pv[j]); 533 } 534 535 /* U part */ 536 pv = b->a + bdiag[i + 1] + 1; 537 pj = b->j + bdiag[i + 1] + 1; 538 nz = bdiag[i] - bdiag[i + 1] - 1; 539 for (j = 0; j < nz; j++) { 540 pv[j] = rtmp[pj[j]]; 541 rs += PetscAbsScalar(pv[j]); 542 } 543 544 sctx.rs = rs; 545 sctx.pv = rtmp[i]; 546 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 547 if (sctx.newshift) break; /* break for-loop */ 548 rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */ 549 550 /* Mark diagonal and invert diagonal for simpler triangular solves */ 551 pv = b->a + bdiag[i]; 552 *pv = (PetscScalar)1.0 / rtmp[i]; 553 554 } /* endof for (i=0; i<n; i++) { */ 555 556 /* MatPivotRefine() */ 557 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 558 /* 559 * if no shift in this attempt & shifting & started shifting & can refine, 560 * then try lower shift 561 */ 562 sctx.shift_hi = sctx.shift_fraction; 563 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 564 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 565 sctx.newshift = PETSC_TRUE; 566 sctx.nshift++; 567 } 568 } while (sctx.newshift); 569 570 PetscCall(PetscFree(rtmp)); 571 PetscCall(ISRestoreIndices(isicol, &ic)); 572 PetscCall(ISRestoreIndices(isrow, &r)); 573 574 PetscCall(ISIdentity(isrow, &row_identity)); 575 PetscCall(ISIdentity(isicol, &col_identity)); 576 if (row_identity && col_identity) { 577 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 578 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering; 579 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering; 580 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 581 } else { 582 C->ops->solve = MatSolve_SeqBAIJ_1; 583 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 584 } 585 C->assembled = PETSC_TRUE; 586 PetscCall(PetscLogFlops(C->cmap->n)); 587 588 /* MatShiftView(A,info,&sctx) */ 589 if (sctx.nshift) { 590 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 591 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 592 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 593 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 594 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 595 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 596 } 597 } 598 PetscFunctionReturn(PETSC_SUCCESS); 599 } 600 601 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) 602 { 603 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 604 IS isrow = b->row, isicol = b->icol; 605 const PetscInt *r, *ic; 606 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 607 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 608 PetscInt *diag_offset = b->diag, diag, *pj; 609 MatScalar *pv, *v, *rtmp, multiplier, *pc; 610 MatScalar *ba = b->a, *aa = a->a; 611 PetscBool row_identity, col_identity; 612 613 PetscFunctionBegin; 614 PetscCall(ISGetIndices(isrow, &r)); 615 PetscCall(ISGetIndices(isicol, &ic)); 616 PetscCall(PetscMalloc1(n + 1, &rtmp)); 617 618 for (i = 0; i < n; i++) { 619 nz = bi[i + 1] - bi[i]; 620 ajtmp = bj + bi[i]; 621 for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0; 622 623 /* load in initial (unfactored row) */ 624 nz = ai[r[i] + 1] - ai[r[i]]; 625 ajtmpold = aj + ai[r[i]]; 626 v = aa + ai[r[i]]; 627 for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 628 629 row = *ajtmp++; 630 while (row < i) { 631 pc = rtmp + row; 632 if (*pc != 0.0) { 633 pv = ba + diag_offset[row]; 634 pj = bj + diag_offset[row] + 1; 635 multiplier = *pc * *pv++; 636 *pc = multiplier; 637 nz = bi[row + 1] - diag_offset[row] - 1; 638 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 639 PetscCall(PetscLogFlops(1.0 + 2.0 * nz)); 640 } 641 row = *ajtmp++; 642 } 643 /* finished row so stick it into b->a */ 644 pv = ba + bi[i]; 645 pj = bj + bi[i]; 646 nz = bi[i + 1] - bi[i]; 647 for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]]; 648 diag = diag_offset[i] - bi[i]; 649 /* check pivot entry for current row */ 650 PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 651 pv[diag] = 1.0 / pv[diag]; 652 } 653 654 PetscCall(PetscFree(rtmp)); 655 PetscCall(ISRestoreIndices(isicol, &ic)); 656 PetscCall(ISRestoreIndices(isrow, &r)); 657 PetscCall(ISIdentity(isrow, &row_identity)); 658 PetscCall(ISIdentity(isicol, &col_identity)); 659 if (row_identity && col_identity) { 660 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace; 661 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace; 662 } else { 663 C->ops->solve = MatSolve_SeqBAIJ_1_inplace; 664 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace; 665 } 666 C->assembled = PETSC_TRUE; 667 PetscCall(PetscLogFlops(C->cmap->n)); 668 PetscFunctionReturn(PETSC_SUCCESS); 669 } 670 671 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) 672 { 673 PetscFunctionBegin; 674 *type = MATSOLVERPETSC; 675 PetscFunctionReturn(PETSC_SUCCESS); 676 } 677 678 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) 679 { 680 PetscInt n = A->rmap->n; 681 682 PetscFunctionBegin; 683 #if defined(PETSC_USE_COMPLEX) 684 PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported"); 685 #endif 686 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 687 PetscCall(MatSetSizes(*B, n, n, n, n)); 688 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 689 PetscCall(MatSetType(*B, MATSEQBAIJ)); 690 691 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ; 692 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ; 693 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 694 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 695 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 696 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 697 PetscCall(MatSetType(*B, MATSEQSBAIJ)); 698 PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL)); 699 700 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ; 701 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ; 702 /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */ 703 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 704 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 705 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported"); 706 (*B)->factortype = ftype; 707 (*B)->canuseordering = PETSC_TRUE; 708 709 PetscCall(PetscFree((*B)->solvertype)); 710 PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype)); 711 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc)); 712 PetscFunctionReturn(PETSC_SUCCESS); 713 } 714 715 /* ----------------------------------------------------------- */ 716 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) 717 { 718 Mat C; 719 720 PetscFunctionBegin; 721 PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C)); 722 PetscCall(MatLUFactorSymbolic(C, A, row, col, info)); 723 PetscCall(MatLUFactorNumeric(C, A, info)); 724 725 A->ops->solve = C->ops->solve; 726 A->ops->solvetranspose = C->ops->solvetranspose; 727 728 PetscCall(MatHeaderMerge(A, &C)); 729 PetscFunctionReturn(PETSC_SUCCESS); 730 } 731 732 #include <../src/mat/impls/sbaij/seq/sbaij.h> 733 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) 734 { 735 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 736 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 737 IS ip = b->row; 738 const PetscInt *rip; 739 PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol; 740 PetscInt *ai = a->i, *aj = a->j; 741 PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz; 742 MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi; 743 PetscReal rs; 744 FactorShiftCtx sctx; 745 746 PetscFunctionBegin; 747 if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */ 748 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 749 PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info)); 750 PetscCall(MatDestroy(&a->sbaijMat)); 751 PetscFunctionReturn(PETSC_SUCCESS); 752 } 753 754 /* MatPivotSetUp(): initialize shift context sctx */ 755 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 756 757 PetscCall(ISGetIndices(ip, &rip)); 758 PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl)); 759 760 sctx.shift_amount = 0.; 761 sctx.nshift = 0; 762 do { 763 sctx.newshift = PETSC_FALSE; 764 for (i = 0; i < mbs; i++) { 765 rtmp[i] = 0.0; 766 jl[i] = mbs; 767 il[0] = 0; 768 } 769 770 for (k = 0; k < mbs; k++) { 771 bval = ba + bi[k]; 772 /* initialize k-th row by the perm[k]-th row of A */ 773 jmin = ai[rip[k]]; 774 jmax = ai[rip[k] + 1]; 775 for (j = jmin; j < jmax; j++) { 776 col = rip[aj[j]]; 777 if (col >= k) { /* only take upper triangular entry */ 778 rtmp[col] = aa[j]; 779 *bval++ = 0.0; /* for in-place factorization */ 780 } 781 } 782 783 /* shift the diagonal of the matrix */ 784 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 785 786 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 787 dk = rtmp[k]; 788 i = jl[k]; /* first row to be added to k_th row */ 789 790 while (i < k) { 791 nexti = jl[i]; /* next row to be added to k_th row */ 792 793 /* compute multiplier, update diag(k) and U(i,k) */ 794 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 795 uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */ 796 dk += uikdi * ba[ili]; 797 ba[ili] = uikdi; /* -U(i,k) */ 798 799 /* add multiple of row i to k-th row */ 800 jmin = ili + 1; 801 jmax = bi[i + 1]; 802 if (jmin < jmax) { 803 for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j]; 804 /* update il and jl for row i */ 805 il[i] = jmin; 806 j = bj[jmin]; 807 jl[i] = jl[j]; 808 jl[j] = i; 809 } 810 i = nexti; 811 } 812 813 /* shift the diagonals when zero pivot is detected */ 814 /* compute rs=sum of abs(off-diagonal) */ 815 rs = 0.0; 816 jmin = bi[k] + 1; 817 nz = bi[k + 1] - jmin; 818 if (nz) { 819 bcol = bj + jmin; 820 while (nz--) { 821 rs += PetscAbsScalar(rtmp[*bcol]); 822 bcol++; 823 } 824 } 825 826 sctx.rs = rs; 827 sctx.pv = dk; 828 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 829 if (sctx.newshift) break; 830 dk = sctx.pv; 831 832 /* copy data into U(k,:) */ 833 ba[bi[k]] = 1.0 / dk; /* U(k,k) */ 834 jmin = bi[k] + 1; 835 jmax = bi[k + 1]; 836 if (jmin < jmax) { 837 for (j = jmin; j < jmax; j++) { 838 col = bj[j]; 839 ba[j] = rtmp[col]; 840 rtmp[col] = 0.0; 841 } 842 /* add the k-th row into il and jl */ 843 il[k] = jmin; 844 i = bj[jmin]; 845 jl[k] = jl[i]; 846 jl[i] = k; 847 } 848 } 849 } while (sctx.newshift); 850 PetscCall(PetscFree3(rtmp, il, jl)); 851 852 PetscCall(ISRestoreIndices(ip, &rip)); 853 854 C->assembled = PETSC_TRUE; 855 C->preallocated = PETSC_TRUE; 856 857 PetscCall(PetscLogFlops(C->rmap->N)); 858 if (sctx.nshift) { 859 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 860 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 861 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 862 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 863 } 864 } 865 PetscFunctionReturn(PETSC_SUCCESS); 866 } 867 868 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) 869 { 870 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 871 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 872 PetscInt i, j, am = a->mbs; 873 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 874 PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz; 875 MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval; 876 PetscReal rs; 877 FactorShiftCtx sctx; 878 879 PetscFunctionBegin; 880 /* MatPivotSetUp(): initialize shift context sctx */ 881 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 882 883 PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl)); 884 885 do { 886 sctx.newshift = PETSC_FALSE; 887 for (i = 0; i < am; i++) { 888 rtmp[i] = 0.0; 889 jl[i] = am; 890 il[0] = 0; 891 } 892 893 for (k = 0; k < am; k++) { 894 /* initialize k-th row with elements nonzero in row perm(k) of A */ 895 nz = ai[k + 1] - ai[k]; 896 acol = aj + ai[k]; 897 aval = aa + ai[k]; 898 bval = ba + bi[k]; 899 while (nz--) { 900 if (*acol < k) { /* skip lower triangular entries */ 901 acol++; 902 aval++; 903 } else { 904 rtmp[*acol++] = *aval++; 905 *bval++ = 0.0; /* for in-place factorization */ 906 } 907 } 908 909 /* shift the diagonal of the matrix */ 910 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 911 912 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 913 dk = rtmp[k]; 914 i = jl[k]; /* first row to be added to k_th row */ 915 916 while (i < k) { 917 nexti = jl[i]; /* next row to be added to k_th row */ 918 /* compute multiplier, update D(k) and U(i,k) */ 919 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 920 uikdi = -ba[ili] * ba[bi[i]]; 921 dk += uikdi * ba[ili]; 922 ba[ili] = uikdi; /* -U(i,k) */ 923 924 /* add multiple of row i to k-th row ... */ 925 jmin = ili + 1; 926 nz = bi[i + 1] - jmin; 927 if (nz > 0) { 928 bcol = bj + jmin; 929 bval = ba + jmin; 930 while (nz--) rtmp[*bcol++] += uikdi * (*bval++); 931 /* update il and jl for i-th row */ 932 il[i] = jmin; 933 j = bj[jmin]; 934 jl[i] = jl[j]; 935 jl[j] = i; 936 } 937 i = nexti; 938 } 939 940 /* shift the diagonals when zero pivot is detected */ 941 /* compute rs=sum of abs(off-diagonal) */ 942 rs = 0.0; 943 jmin = bi[k] + 1; 944 nz = bi[k + 1] - jmin; 945 if (nz) { 946 bcol = bj + jmin; 947 while (nz--) { 948 rs += PetscAbsScalar(rtmp[*bcol]); 949 bcol++; 950 } 951 } 952 953 sctx.rs = rs; 954 sctx.pv = dk; 955 PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 956 if (sctx.newshift) break; /* sctx.shift_amount is updated */ 957 dk = sctx.pv; 958 959 /* copy data into U(k,:) */ 960 ba[bi[k]] = 1.0 / dk; 961 jmin = bi[k] + 1; 962 nz = bi[k + 1] - jmin; 963 if (nz) { 964 bcol = bj + jmin; 965 bval = ba + jmin; 966 while (nz--) { 967 *bval++ = rtmp[*bcol]; 968 rtmp[*bcol++] = 0.0; 969 } 970 /* add k-th row into il and jl */ 971 il[k] = jmin; 972 i = bj[jmin]; 973 jl[k] = jl[i]; 974 jl[i] = k; 975 } 976 } 977 } while (sctx.newshift); 978 PetscCall(PetscFree3(rtmp, il, jl)); 979 980 C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 981 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 982 C->assembled = PETSC_TRUE; 983 C->preallocated = PETSC_TRUE; 984 985 PetscCall(PetscLogFlops(C->rmap->N)); 986 if (sctx.nshift) { 987 if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 988 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 989 } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 990 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 991 } 992 } 993 PetscFunctionReturn(PETSC_SUCCESS); 994 } 995 996 #include <petscbt.h> 997 #include <../src/mat/utils/freespace.h> 998 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 999 { 1000 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1001 Mat_SeqSBAIJ *b; 1002 Mat B; 1003 PetscBool perm_identity, missing; 1004 PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui; 1005 const PetscInt *rip; 1006 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 1007 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 1008 PetscReal fill = info->fill, levels = info->levels; 1009 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1010 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 1011 PetscBT lnkbt; 1012 1013 PetscFunctionBegin; 1014 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1015 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1016 1017 if (bs > 1) { 1018 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1019 (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1020 1021 PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info)); 1022 PetscFunctionReturn(PETSC_SUCCESS); 1023 } 1024 1025 PetscCall(ISIdentity(perm, &perm_identity)); 1026 PetscCall(ISGetIndices(perm, &rip)); 1027 1028 /* special case that simply copies fill pattern */ 1029 if (!levels && perm_identity) { 1030 PetscCall(PetscMalloc1(am + 1, &ui)); 1031 for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */ 1032 B = fact; 1033 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui)); 1034 1035 b = (Mat_SeqSBAIJ *)B->data; 1036 uj = b->j; 1037 for (i = 0; i < am; i++) { 1038 aj = a->j + a->diag[i]; 1039 for (j = 0; j < ui[i]; j++) *uj++ = *aj++; 1040 b->ilen[i] = ui[i]; 1041 } 1042 PetscCall(PetscFree(ui)); 1043 1044 B->factortype = MAT_FACTOR_NONE; 1045 1046 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 1047 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1048 B->factortype = MAT_FACTOR_ICC; 1049 1050 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1051 PetscFunctionReturn(PETSC_SUCCESS); 1052 } 1053 1054 /* initialization */ 1055 PetscCall(PetscMalloc1(am + 1, &ui)); 1056 ui[0] = 0; 1057 PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl)); 1058 1059 /* jl: linked list for storing indices of the pivot rows 1060 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 1061 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 1062 for (i = 0; i < am; i++) { 1063 jl[i] = am; 1064 il[i] = 0; 1065 } 1066 1067 /* create and initialize a linked list for storing column indices of the active row k */ 1068 nlnk = am + 1; 1069 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 1070 1071 /* initial FreeSpace size is fill*(ai[am]+am)/2 */ 1072 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space)); 1073 1074 current_space = free_space; 1075 1076 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl)); 1077 current_space_lvl = free_space_lvl; 1078 1079 for (k = 0; k < am; k++) { /* for each active row k */ 1080 /* initialize lnk by the column indices of row rip[k] of A */ 1081 nzk = 0; 1082 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1083 ncols_upper = 0; 1084 cols = cols_lvl + am; 1085 for (j = 0; j < ncols; j++) { 1086 i = rip[*(aj + ai[rip[k]] + j)]; 1087 if (i >= k) { /* only take upper triangular entry */ 1088 cols[ncols_upper] = i; 1089 cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */ 1090 ncols_upper++; 1091 } 1092 } 1093 PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1094 nzk += nlnk; 1095 1096 /* update lnk by computing fill-in for each pivot row to be merged in */ 1097 prow = jl[k]; /* 1st pivot row */ 1098 1099 while (prow < k) { 1100 nextprow = jl[prow]; 1101 1102 /* merge prow into k-th row */ 1103 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 1104 jmax = ui[prow + 1]; 1105 ncols = jmax - jmin; 1106 i = jmin - ui[prow]; 1107 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 1108 for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j); 1109 PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 1110 nzk += nlnk; 1111 1112 /* update il and jl for prow */ 1113 if (jmin < jmax) { 1114 il[prow] = jmin; 1115 1116 j = *cols; 1117 jl[prow] = jl[j]; 1118 jl[j] = prow; 1119 } 1120 prow = nextprow; 1121 } 1122 1123 /* if free space is not available, make more free space */ 1124 if (current_space->local_remaining < nzk) { 1125 i = am - k + 1; /* num of unfactored rows */ 1126 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1127 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1128 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 1129 reallocs++; 1130 } 1131 1132 /* copy data into free_space and free_space_lvl, then initialize lnk */ 1133 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 1134 1135 /* add the k-th row into il and jl */ 1136 if (nzk - 1 > 0) { 1137 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 1138 jl[k] = jl[i]; 1139 jl[i] = k; 1140 il[k] = ui[k] + 1; 1141 } 1142 uj_ptr[k] = current_space->array; 1143 uj_lvl_ptr[k] = current_space_lvl->array; 1144 1145 current_space->array += nzk; 1146 current_space->local_used += nzk; 1147 current_space->local_remaining -= nzk; 1148 1149 current_space_lvl->array += nzk; 1150 current_space_lvl->local_used += nzk; 1151 current_space_lvl->local_remaining -= nzk; 1152 1153 ui[k + 1] = ui[k] + nzk; 1154 } 1155 1156 PetscCall(ISRestoreIndices(perm, &rip)); 1157 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 1158 PetscCall(PetscFree(cols_lvl)); 1159 1160 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1161 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 1162 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1163 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 1164 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 1165 1166 /* put together the new matrix in MATSEQSBAIJ format */ 1167 B = fact; 1168 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 1169 1170 b = (Mat_SeqSBAIJ *)B->data; 1171 b->singlemalloc = PETSC_FALSE; 1172 b->free_a = PETSC_TRUE; 1173 b->free_ij = PETSC_TRUE; 1174 1175 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 1176 1177 b->j = uj; 1178 b->i = ui; 1179 b->diag = NULL; 1180 b->ilen = NULL; 1181 b->imax = NULL; 1182 b->row = perm; 1183 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1184 1185 PetscCall(PetscObjectReference((PetscObject)perm)); 1186 1187 b->icol = perm; 1188 1189 PetscCall(PetscObjectReference((PetscObject)perm)); 1190 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 1191 1192 b->maxnz = b->nz = ui[am]; 1193 1194 B->info.factor_mallocs = reallocs; 1195 B->info.fill_ratio_given = fill; 1196 if (ai[am] != 0.) { 1197 /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */ 1198 B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am); 1199 } else { 1200 B->info.fill_ratio_needed = 0.0; 1201 } 1202 #if defined(PETSC_USE_INFO) 1203 if (ai[am] != 0) { 1204 PetscReal af = B->info.fill_ratio_needed; 1205 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1206 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1207 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1208 } else { 1209 PetscCall(PetscInfo(A, "Empty matrix\n")); 1210 } 1211 #endif 1212 if (perm_identity) { 1213 B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1214 B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 1215 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1216 } else { 1217 (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1218 } 1219 PetscFunctionReturn(PETSC_SUCCESS); 1220 } 1221 1222 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1223 { 1224 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1225 Mat_SeqSBAIJ *b; 1226 Mat B; 1227 PetscBool perm_identity, missing; 1228 PetscReal fill = info->fill; 1229 const PetscInt *rip; 1230 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow; 1231 PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow; 1232 PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr; 1233 PetscFreeSpaceList free_space = NULL, current_space = NULL; 1234 PetscBT lnkbt; 1235 1236 PetscFunctionBegin; 1237 if (bs > 1) { /* convert to seqsbaij */ 1238 if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1239 (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1240 1241 PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info)); 1242 PetscFunctionReturn(PETSC_SUCCESS); 1243 } 1244 1245 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1246 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1247 1248 /* check whether perm is the identity mapping */ 1249 PetscCall(ISIdentity(perm, &perm_identity)); 1250 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported"); 1251 PetscCall(ISGetIndices(perm, &rip)); 1252 1253 /* initialization */ 1254 PetscCall(PetscMalloc1(mbs + 1, &ui)); 1255 ui[0] = 0; 1256 1257 /* jl: linked list for storing indices of the pivot rows 1258 il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 1259 PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols)); 1260 for (i = 0; i < mbs; i++) { 1261 jl[i] = mbs; 1262 il[i] = 0; 1263 } 1264 1265 /* create and initialize a linked list for storing column indices of the active row k */ 1266 nlnk = mbs + 1; 1267 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1268 1269 /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */ 1270 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space)); 1271 1272 current_space = free_space; 1273 1274 for (k = 0; k < mbs; k++) { /* for each active row k */ 1275 /* initialize lnk by the column indices of row rip[k] of A */ 1276 nzk = 0; 1277 ncols = ai[rip[k] + 1] - ai[rip[k]]; 1278 ncols_upper = 0; 1279 for (j = 0; j < ncols; j++) { 1280 i = rip[*(aj + ai[rip[k]] + j)]; 1281 if (i >= k) { /* only take upper triangular entry */ 1282 cols[ncols_upper] = i; 1283 ncols_upper++; 1284 } 1285 } 1286 PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt)); 1287 nzk += nlnk; 1288 1289 /* update lnk by computing fill-in for each pivot row to be merged in */ 1290 prow = jl[k]; /* 1st pivot row */ 1291 1292 while (prow < k) { 1293 nextprow = jl[prow]; 1294 /* merge prow into k-th row */ 1295 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 1296 jmax = ui[prow + 1]; 1297 ncols = jmax - jmin; 1298 uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 1299 PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt)); 1300 nzk += nlnk; 1301 1302 /* update il and jl for prow */ 1303 if (jmin < jmax) { 1304 il[prow] = jmin; 1305 j = *uj_ptr; 1306 jl[prow] = jl[j]; 1307 jl[j] = prow; 1308 } 1309 prow = nextprow; 1310 } 1311 1312 /* if free space is not available, make more free space */ 1313 if (current_space->local_remaining < nzk) { 1314 i = mbs - k + 1; /* num of unfactored rows */ 1315 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1316 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 1317 reallocs++; 1318 } 1319 1320 /* copy data into free space, then initialize lnk */ 1321 PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt)); 1322 1323 /* add the k-th row into il and jl */ 1324 if (nzk - 1 > 0) { 1325 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 1326 jl[k] = jl[i]; 1327 jl[i] = k; 1328 il[k] = ui[k] + 1; 1329 } 1330 ui_ptr[k] = current_space->array; 1331 current_space->array += nzk; 1332 current_space->local_used += nzk; 1333 current_space->local_remaining -= nzk; 1334 1335 ui[k + 1] = ui[k] + nzk; 1336 } 1337 1338 PetscCall(ISRestoreIndices(perm, &rip)); 1339 PetscCall(PetscFree4(ui_ptr, il, jl, cols)); 1340 1341 /* copy free_space into uj and free free_space; set uj in new datastructure; */ 1342 PetscCall(PetscMalloc1(ui[mbs] + 1, &uj)); 1343 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 1344 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1345 1346 /* put together the new matrix in MATSEQSBAIJ format */ 1347 B = fact; 1348 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1349 1350 b = (Mat_SeqSBAIJ *)B->data; 1351 b->singlemalloc = PETSC_FALSE; 1352 b->free_a = PETSC_TRUE; 1353 b->free_ij = PETSC_TRUE; 1354 1355 PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a)); 1356 1357 b->j = uj; 1358 b->i = ui; 1359 b->diag = NULL; 1360 b->ilen = NULL; 1361 b->imax = NULL; 1362 b->row = perm; 1363 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 1364 1365 PetscCall(PetscObjectReference((PetscObject)perm)); 1366 b->icol = perm; 1367 PetscCall(PetscObjectReference((PetscObject)perm)); 1368 PetscCall(PetscMalloc1(mbs + 1, &b->solve_work)); 1369 b->maxnz = b->nz = ui[mbs]; 1370 1371 B->info.factor_mallocs = reallocs; 1372 B->info.fill_ratio_given = fill; 1373 if (ai[mbs] != 0.) { 1374 /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */ 1375 B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs); 1376 } else { 1377 B->info.fill_ratio_needed = 0.0; 1378 } 1379 #if defined(PETSC_USE_INFO) 1380 if (ai[mbs] != 0.) { 1381 PetscReal af = B->info.fill_ratio_needed; 1382 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 1383 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 1384 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 1385 } else { 1386 PetscCall(PetscInfo(A, "Empty matrix\n")); 1387 } 1388 #endif 1389 if (perm_identity) { 1390 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 1391 } else { 1392 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 1393 } 1394 PetscFunctionReturn(PETSC_SUCCESS); 1395 } 1396 1397 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) 1398 { 1399 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1400 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1401 PetscInt i, k, n = a->mbs; 1402 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1403 const MatScalar *aa = a->a, *v; 1404 PetscScalar *x, *s, *t, *ls; 1405 const PetscScalar *b; 1406 1407 PetscFunctionBegin; 1408 PetscCall(VecGetArrayRead(bb, &b)); 1409 PetscCall(VecGetArray(xx, &x)); 1410 t = a->solve_work; 1411 1412 /* forward solve the lower triangular */ 1413 PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */ 1414 1415 for (i = 1; i < n; i++) { 1416 v = aa + bs2 * ai[i]; 1417 vi = aj + ai[i]; 1418 nz = ai[i + 1] - ai[i]; 1419 s = t + bs * i; 1420 PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */ 1421 for (k = 0; k < nz; k++) { 1422 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]); 1423 v += bs2; 1424 } 1425 } 1426 1427 /* backward solve the upper triangular */ 1428 ls = a->solve_work + A->cmap->n; 1429 for (i = n - 1; i >= 0; i--) { 1430 v = aa + bs2 * (adiag[i + 1] + 1); 1431 vi = aj + adiag[i + 1] + 1; 1432 nz = adiag[i] - adiag[i + 1] - 1; 1433 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1434 for (k = 0; k < nz; k++) { 1435 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]); 1436 v += bs2; 1437 } 1438 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */ 1439 PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs)); 1440 } 1441 1442 PetscCall(VecRestoreArrayRead(bb, &b)); 1443 PetscCall(VecRestoreArray(xx, &x)); 1444 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1445 PetscFunctionReturn(PETSC_SUCCESS); 1446 } 1447 1448 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) 1449 { 1450 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1451 IS iscol = a->col, isrow = a->row; 1452 const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 1453 PetscInt i, m, n = a->mbs; 1454 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1455 const MatScalar *aa = a->a, *v; 1456 PetscScalar *x, *s, *t, *ls; 1457 const PetscScalar *b; 1458 1459 PetscFunctionBegin; 1460 PetscCall(VecGetArrayRead(bb, &b)); 1461 PetscCall(VecGetArray(xx, &x)); 1462 t = a->solve_work; 1463 1464 PetscCall(ISGetIndices(isrow, &rout)); 1465 r = rout; 1466 PetscCall(ISGetIndices(iscol, &cout)); 1467 c = cout; 1468 1469 /* forward solve the lower triangular */ 1470 PetscCall(PetscArraycpy(t, b + bs * r[0], bs)); 1471 for (i = 1; i < n; i++) { 1472 v = aa + bs2 * ai[i]; 1473 vi = aj + ai[i]; 1474 nz = ai[i + 1] - ai[i]; 1475 s = t + bs * i; 1476 PetscCall(PetscArraycpy(s, b + bs * r[i], bs)); 1477 for (m = 0; m < nz; m++) { 1478 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]); 1479 v += bs2; 1480 } 1481 } 1482 1483 /* backward solve the upper triangular */ 1484 ls = a->solve_work + A->cmap->n; 1485 for (i = n - 1; i >= 0; i--) { 1486 v = aa + bs2 * (adiag[i + 1] + 1); 1487 vi = aj + adiag[i + 1] + 1; 1488 nz = adiag[i] - adiag[i + 1] - 1; 1489 PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 1490 for (m = 0; m < nz; m++) { 1491 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]); 1492 v += bs2; 1493 } 1494 PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */ 1495 PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs)); 1496 } 1497 PetscCall(ISRestoreIndices(isrow, &rout)); 1498 PetscCall(ISRestoreIndices(iscol, &cout)); 1499 PetscCall(VecRestoreArrayRead(bb, &b)); 1500 PetscCall(VecRestoreArray(xx, &x)); 1501 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 1502 PetscFunctionReturn(PETSC_SUCCESS); 1503 } 1504 1505 /* 1506 For each block in an block array saves the largest absolute value in the block into another array 1507 */ 1508 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray) 1509 { 1510 PetscInt i, j; 1511 1512 PetscFunctionBegin; 1513 PetscCall(PetscArrayzero(absarray, nbs + 1)); 1514 for (i = 0; i < nbs; i++) { 1515 for (j = 0; j < bs2; j++) { 1516 if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]); 1517 } 1518 } 1519 PetscFunctionReturn(PETSC_SUCCESS); 1520 } 1521 1522 /* 1523 This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used 1524 */ 1525 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact) 1526 { 1527 Mat B = *fact; 1528 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 1529 IS isicol; 1530 const PetscInt *r, *ic; 1531 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag; 1532 PetscInt *bi, *bj, *bdiag; 1533 1534 PetscInt row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au; 1535 PetscInt nlnk, *lnk; 1536 PetscBT lnkbt; 1537 PetscBool row_identity, icol_identity; 1538 MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp; 1539 PetscInt j, nz, *pj, *bjtmp, k, ncut, *jtmp; 1540 1541 PetscReal dt = info->dt; /* shift=info->shiftamount; */ 1542 PetscInt nnz_max; 1543 PetscBool missing; 1544 PetscReal *vtmp_abs; 1545 MatScalar *v_work; 1546 PetscInt *v_pivots; 1547 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1548 1549 PetscFunctionBegin; 1550 /* ------- symbolic factorization, can be reused ---------*/ 1551 PetscCall(MatMissingDiagonal(A, &missing, &i)); 1552 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1553 adiag = a->diag; 1554 1555 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 1556 1557 /* bdiag is location of diagonal in factor */ 1558 PetscCall(PetscMalloc1(mbs + 1, &bdiag)); 1559 1560 /* allocate row pointers bi */ 1561 PetscCall(PetscMalloc1(2 * mbs + 2, &bi)); 1562 1563 /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */ 1564 dtcount = (PetscInt)info->dtcount; 1565 if (dtcount > mbs - 1) dtcount = mbs - 1; 1566 nnz_max = ai[mbs] + 2 * mbs * dtcount + 2; 1567 /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */ 1568 PetscCall(PetscMalloc1(nnz_max, &bj)); 1569 nnz_max = nnz_max * bs2; 1570 PetscCall(PetscMalloc1(nnz_max, &ba)); 1571 1572 /* put together the new matrix */ 1573 PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 1574 1575 b = (Mat_SeqBAIJ *)(B)->data; 1576 b->free_a = PETSC_TRUE; 1577 b->free_ij = PETSC_TRUE; 1578 b->singlemalloc = PETSC_FALSE; 1579 1580 b->a = ba; 1581 b->j = bj; 1582 b->i = bi; 1583 b->diag = bdiag; 1584 b->ilen = NULL; 1585 b->imax = NULL; 1586 b->row = isrow; 1587 b->col = iscol; 1588 1589 PetscCall(PetscObjectReference((PetscObject)isrow)); 1590 PetscCall(PetscObjectReference((PetscObject)iscol)); 1591 1592 b->icol = isicol; 1593 PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work)); 1594 b->maxnz = nnz_max / bs2; 1595 1596 (B)->factortype = MAT_FACTOR_ILUDT; 1597 (B)->info.factor_mallocs = 0; 1598 (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2)); 1599 /* ------- end of symbolic factorization ---------*/ 1600 PetscCall(ISGetIndices(isrow, &r)); 1601 PetscCall(ISGetIndices(isicol, &ic)); 1602 1603 /* linked list for storing column indices of the active row */ 1604 nlnk = mbs + 1; 1605 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 1606 1607 /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */ 1608 PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp)); 1609 /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */ 1610 PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp)); 1611 PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs)); 1612 PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots)); 1613 1614 allowzeropivot = PetscNot(A->erroriffailure); 1615 bi[0] = 0; 1616 bdiag[0] = (nnz_max / bs2) - 1; /* location of diagonal in factor B */ 1617 bi[2 * mbs + 1] = bdiag[0] + 1; /* endof bj and ba array */ 1618 for (i = 0; i < mbs; i++) { 1619 /* copy initial fill into linked list */ 1620 nzi = ai[r[i] + 1] - ai[r[i]]; 1621 PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 1622 nzi_al = adiag[r[i]] - ai[r[i]]; 1623 nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1; 1624 1625 /* load in initial unfactored row */ 1626 ajtmp = aj + ai[r[i]]; 1627 PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt)); 1628 PetscCall(PetscArrayzero(rtmp, mbs * bs2)); 1629 aatmp = a->a + bs2 * ai[r[i]]; 1630 for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2)); 1631 1632 /* add pivot rows into linked list */ 1633 row = lnk[mbs]; 1634 while (row < i) { 1635 nzi_bl = bi[row + 1] - bi[row] + 1; 1636 bjtmp = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */ 1637 PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im)); 1638 nzi += nlnk; 1639 row = lnk[row]; 1640 } 1641 1642 /* copy data from lnk into jtmp, then initialize lnk */ 1643 PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt)); 1644 1645 /* numerical factorization */ 1646 bjtmp = jtmp; 1647 row = *bjtmp++; /* 1st pivot row */ 1648 1649 while (row < i) { 1650 pc = rtmp + bs2 * row; 1651 pv = ba + bs2 * bdiag[row]; /* inv(diag) of the pivot row */ 1652 PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */ 1653 PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs)); 1654 if (vtmp_abs[0] > dt) { /* apply tolerance dropping rule */ 1655 pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */ 1656 pv = ba + bs2 * (bdiag[row + 1] + 1); 1657 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */ 1658 for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); 1659 /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */ 1660 } 1661 row = *bjtmp++; 1662 } 1663 1664 /* copy sparse rtmp into contiguous vtmp; separate L and U part */ 1665 nzi_bl = 0; 1666 j = 0; 1667 while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */ 1668 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1669 nzi_bl++; 1670 j++; 1671 } 1672 nzi_bu = nzi - nzi_bl - 1; 1673 1674 while (j < nzi) { /* U-part */ 1675 PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 1676 j++; 1677 } 1678 1679 PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs)); 1680 1681 bjtmp = bj + bi[i]; 1682 batmp = ba + bs2 * bi[i]; 1683 /* apply level dropping rule to L part */ 1684 ncut = nzi_al + dtcount; 1685 if (ncut < nzi_bl) { 1686 PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp)); 1687 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp)); 1688 } else { 1689 ncut = nzi_bl; 1690 } 1691 for (j = 0; j < ncut; j++) { 1692 bjtmp[j] = jtmp[j]; 1693 PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2)); 1694 } 1695 bi[i + 1] = bi[i] + ncut; 1696 nzi = ncut + 1; 1697 1698 /* apply level dropping rule to U part */ 1699 ncut = nzi_au + dtcount; 1700 if (ncut < nzi_bu) { 1701 PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1)); 1702 PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1)); 1703 } else { 1704 ncut = nzi_bu; 1705 } 1706 nzi += ncut; 1707 1708 /* mark bdiagonal */ 1709 bdiag[i + 1] = bdiag[i] - (ncut + 1); 1710 bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1); 1711 1712 bjtmp = bj + bdiag[i]; 1713 batmp = ba + bs2 * bdiag[i]; 1714 PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2)); 1715 *bjtmp = i; 1716 1717 bjtmp = bj + bdiag[i + 1] + 1; 1718 batmp = ba + (bdiag[i + 1] + 1) * bs2; 1719 1720 for (k = 0; k < ncut; k++) { 1721 bjtmp[k] = jtmp[nzi_bl + 1 + k]; 1722 PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2)); 1723 } 1724 1725 im[i] = nzi; /* used by PetscLLAddSortedLU() */ 1726 1727 /* invert diagonal block for simpler triangular solves - add shift??? */ 1728 batmp = ba + bs2 * bdiag[i]; 1729 1730 PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 1731 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1732 } /* for (i=0; i<mbs; i++) */ 1733 PetscCall(PetscFree3(v_work, multiplier, v_pivots)); 1734 1735 /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */ 1736 PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]); 1737 1738 PetscCall(ISRestoreIndices(isrow, &r)); 1739 PetscCall(ISRestoreIndices(isicol, &ic)); 1740 1741 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1742 1743 PetscCall(PetscFree2(im, jtmp)); 1744 PetscCall(PetscFree2(rtmp, vtmp)); 1745 1746 PetscCall(PetscLogFlops(bs2 * B->cmap->n)); 1747 b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs]; 1748 1749 PetscCall(ISIdentity(isrow, &row_identity)); 1750 PetscCall(ISIdentity(isicol, &icol_identity)); 1751 if (row_identity && icol_identity) { 1752 B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 1753 } else { 1754 B->ops->solve = MatSolve_SeqBAIJ_N; 1755 } 1756 1757 B->ops->solveadd = NULL; 1758 B->ops->solvetranspose = NULL; 1759 B->ops->solvetransposeadd = NULL; 1760 B->ops->matsolve = NULL; 1761 B->assembled = PETSC_TRUE; 1762 B->preallocated = PETSC_TRUE; 1763 PetscFunctionReturn(PETSC_SUCCESS); 1764 } 1765