1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include <../src/mat/impls/baij/seq/baij.h> 7 #include <petsc/private/kernels/blockinvert.h> 8 #include <petscbt.h> 9 #include <../src/mat/utils/freespace.h> 10 11 /* ----------------------------------------------------------------*/ 12 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat, Mat, MatDuplicateOption, PetscBool); 13 14 /* 15 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 16 */ 17 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 18 { 19 Mat C = B; 20 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 21 PetscInt i, j, k, ipvt[15]; 22 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *ajtmp, *bjtmp, *bdiag = b->diag, *pj; 23 PetscInt nz, nzL, row; 24 MatScalar *rtmp, *pc, *mwork, *pv, *vv, work[225]; 25 const MatScalar *v, *aa = a->a; 26 PetscInt bs2 = a->bs2, bs = A->rmap->bs, flg; 27 PetscInt sol_ver; 28 PetscBool allowzeropivot, zeropivotdetected; 29 30 PetscFunctionBegin; 31 allowzeropivot = PetscNot(A->erroriffailure); 32 PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)A)->prefix, "-sol_ver", &sol_ver, NULL)); 33 34 /* generate work space needed by the factorization */ 35 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 36 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 37 38 for (i = 0; i < n; i++) { 39 /* zero rtmp */ 40 /* L part */ 41 nz = bi[i + 1] - bi[i]; 42 bjtmp = bj + bi[i]; 43 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 44 45 /* U part */ 46 nz = bdiag[i] - bdiag[i + 1]; 47 bjtmp = bj + bdiag[i + 1] + 1; 48 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 49 50 /* load in initial (unfactored row) */ 51 nz = ai[i + 1] - ai[i]; 52 ajtmp = aj + ai[i]; 53 v = aa + bs2 * ai[i]; 54 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 55 56 /* elimination */ 57 bjtmp = bj + bi[i]; 58 nzL = bi[i + 1] - bi[i]; 59 for (k = 0; k < nzL; k++) { 60 row = bjtmp[k]; 61 pc = rtmp + bs2 * row; 62 for (flg = 0, j = 0; j < bs2; j++) { 63 if (pc[j] != 0.0) { 64 flg = 1; 65 break; 66 } 67 } 68 if (flg) { 69 pv = b->a + bs2 * bdiag[row]; 70 PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); 71 /* PetscCall(PetscKernel_A_gets_A_times_B_15(pc,pv,mwork)); */ 72 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 73 pv = b->a + bs2 * (bdiag[row + 1] + 1); 74 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 75 for (j = 0; j < nz; j++) { 76 vv = rtmp + bs2 * pj[j]; 77 PetscKernel_A_gets_A_minus_B_times_C(bs, vv, pc, pv); 78 /* PetscCall(PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv)); */ 79 pv += bs2; 80 } 81 PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 82 } 83 } 84 85 /* finished row so stick it into b->a */ 86 /* L part */ 87 pv = b->a + bs2 * bi[i]; 88 pj = b->j + bi[i]; 89 nz = bi[i + 1] - bi[i]; 90 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 91 92 /* Mark diagonal and invert diagonal for simpler triangular solves */ 93 pv = b->a + bs2 * bdiag[i]; 94 pj = b->j + bdiag[i]; 95 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 96 PetscCall(PetscKernel_A_gets_inverse_A_15(pv, ipvt, work, info->shiftamount, allowzeropivot, &zeropivotdetected)); 97 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 98 99 /* U part */ 100 pv = b->a + bs2 * (bdiag[i + 1] + 1); 101 pj = b->j + bdiag[i + 1] + 1; 102 nz = bdiag[i] - bdiag[i + 1] - 1; 103 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 104 } 105 106 PetscCall(PetscFree2(rtmp, mwork)); 107 108 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 109 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 110 C->assembled = PETSC_TRUE; 111 112 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */ 113 PetscFunctionReturn(PETSC_SUCCESS); 114 } 115 116 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B, Mat A, const MatFactorInfo *info) 117 { 118 Mat C = B; 119 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 120 IS isrow = b->row, isicol = b->icol; 121 const PetscInt *r, *ic; 122 PetscInt i, j, k, n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 123 PetscInt *ajtmp, *bjtmp, nz, nzL, row, *bdiag = b->diag, *pj; 124 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 125 PetscInt bs = A->rmap->bs, bs2 = a->bs2, *v_pivots, flg; 126 MatScalar *v_work; 127 PetscBool col_identity, row_identity, both_identity; 128 PetscBool allowzeropivot, zeropivotdetected; 129 130 PetscFunctionBegin; 131 PetscCall(ISGetIndices(isrow, &r)); 132 PetscCall(ISGetIndices(isicol, &ic)); 133 allowzeropivot = PetscNot(A->erroriffailure); 134 135 PetscCall(PetscCalloc1(bs2 * n, &rtmp)); 136 137 /* generate work space needed by dense LU factorization */ 138 PetscCall(PetscMalloc3(bs, &v_work, bs2, &mwork, bs, &v_pivots)); 139 140 for (i = 0; i < n; i++) { 141 /* zero rtmp */ 142 /* L part */ 143 nz = bi[i + 1] - bi[i]; 144 bjtmp = bj + bi[i]; 145 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 146 147 /* U part */ 148 nz = bdiag[i] - bdiag[i + 1]; 149 bjtmp = bj + bdiag[i + 1] + 1; 150 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 151 152 /* load in initial (unfactored row) */ 153 nz = ai[r[i] + 1] - ai[r[i]]; 154 ajtmp = aj + ai[r[i]]; 155 v = aa + bs2 * ai[r[i]]; 156 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 157 158 /* elimination */ 159 bjtmp = bj + bi[i]; 160 nzL = bi[i + 1] - bi[i]; 161 for (k = 0; k < nzL; k++) { 162 row = bjtmp[k]; 163 pc = rtmp + bs2 * row; 164 for (flg = 0, j = 0; j < bs2; j++) { 165 if (pc[j] != 0.0) { 166 flg = 1; 167 break; 168 } 169 } 170 if (flg) { 171 pv = b->a + bs2 * bdiag[row]; 172 PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); /* *pc = *pc * (*pv); */ 173 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 174 pv = b->a + bs2 * (bdiag[row + 1] + 1); 175 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 176 for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); 177 PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 178 } 179 } 180 181 /* finished row so stick it into b->a */ 182 /* L part */ 183 pv = b->a + bs2 * bi[i]; 184 pj = b->j + bi[i]; 185 nz = bi[i + 1] - bi[i]; 186 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 187 188 /* Mark diagonal and invert diagonal for simpler triangular solves */ 189 pv = b->a + bs2 * bdiag[i]; 190 pj = b->j + bdiag[i]; 191 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 192 193 PetscCall(PetscKernel_A_gets_inverse_A(bs, pv, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 194 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 195 196 /* U part */ 197 pv = b->a + bs2 * (bdiag[i + 1] + 1); 198 pj = b->j + bdiag[i + 1] + 1; 199 nz = bdiag[i] - bdiag[i + 1] - 1; 200 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 201 } 202 203 PetscCall(PetscFree(rtmp)); 204 PetscCall(PetscFree3(v_work, mwork, v_pivots)); 205 PetscCall(ISRestoreIndices(isicol, &ic)); 206 PetscCall(ISRestoreIndices(isrow, &r)); 207 208 PetscCall(ISIdentity(isrow, &row_identity)); 209 PetscCall(ISIdentity(isicol, &col_identity)); 210 211 both_identity = (PetscBool)(row_identity && col_identity); 212 if (both_identity) { 213 switch (bs) { 214 case 9: 215 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 216 C->ops->solve = MatSolve_SeqBAIJ_9_NaturalOrdering; 217 #else 218 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 219 #endif 220 break; 221 case 11: 222 C->ops->solve = MatSolve_SeqBAIJ_11_NaturalOrdering; 223 break; 224 case 12: 225 C->ops->solve = MatSolve_SeqBAIJ_12_NaturalOrdering; 226 break; 227 case 13: 228 C->ops->solve = MatSolve_SeqBAIJ_13_NaturalOrdering; 229 break; 230 case 14: 231 C->ops->solve = MatSolve_SeqBAIJ_14_NaturalOrdering; 232 break; 233 default: 234 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 235 break; 236 } 237 } else { 238 C->ops->solve = MatSolve_SeqBAIJ_N; 239 } 240 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 241 242 C->assembled = PETSC_TRUE; 243 244 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */ 245 PetscFunctionReturn(PETSC_SUCCESS); 246 } 247 248 /* 249 ilu(0) with natural ordering under new data structure. 250 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 251 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 252 */ 253 254 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 255 { 256 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 257 PetscInt n = a->mbs, *ai = a->i, *aj, *adiag = a->diag, bs2 = a->bs2; 258 PetscInt i, j, nz, *bi, *bj, *bdiag, bi_temp; 259 260 PetscFunctionBegin; 261 PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_FALSE)); 262 b = (Mat_SeqBAIJ *)(fact)->data; 263 264 /* allocate matrix arrays for new data structure */ 265 PetscCall(PetscMalloc3(bs2 * ai[n] + 1, &b->a, ai[n] + 1, &b->j, n + 1, &b->i)); 266 267 b->singlemalloc = PETSC_TRUE; 268 b->free_a = PETSC_TRUE; 269 b->free_ij = PETSC_TRUE; 270 fact->preallocated = PETSC_TRUE; 271 fact->assembled = PETSC_TRUE; 272 if (!b->diag) { PetscCall(PetscMalloc1(n + 1, &b->diag)); } 273 bdiag = b->diag; 274 275 if (n > 0) PetscCall(PetscArrayzero(b->a, bs2 * ai[n])); 276 277 /* set bi and bj with new data structure */ 278 bi = b->i; 279 bj = b->j; 280 281 /* L part */ 282 bi[0] = 0; 283 for (i = 0; i < n; i++) { 284 nz = adiag[i] - ai[i]; 285 bi[i + 1] = bi[i] + nz; 286 aj = a->j + ai[i]; 287 for (j = 0; j < nz; j++) { 288 *bj = aj[j]; 289 bj++; 290 } 291 } 292 293 /* U part */ 294 bi_temp = bi[n]; 295 bdiag[n] = bi[n] - 1; 296 for (i = n - 1; i >= 0; i--) { 297 nz = ai[i + 1] - adiag[i] - 1; 298 bi_temp = bi_temp + nz + 1; 299 aj = a->j + adiag[i] + 1; 300 for (j = 0; j < nz; j++) { 301 *bj = aj[j]; 302 bj++; 303 } 304 /* diag[i] */ 305 *bj = i; 306 bj++; 307 bdiag[i] = bi_temp - 1; 308 } 309 PetscFunctionReturn(PETSC_SUCCESS); 310 } 311 312 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 313 { 314 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 315 IS isicol; 316 const PetscInt *r, *ic; 317 PetscInt n = a->mbs, *ai = a->i, *aj = a->j, d; 318 PetscInt *bi, *cols, nnz, *cols_lvl; 319 PetscInt *bdiag, prow, fm, nzbd, reallocs = 0, dcount = 0; 320 PetscInt i, levels, diagonal_fill; 321 PetscBool col_identity, row_identity, both_identity; 322 PetscReal f; 323 PetscInt nlnk, *lnk, *lnk_lvl = NULL; 324 PetscBT lnkbt; 325 PetscInt nzi, *bj, **bj_ptr, **bjlvl_ptr; 326 PetscFreeSpaceList free_space = NULL, current_space = NULL; 327 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 328 PetscBool missing; 329 PetscInt bs = A->rmap->bs, bs2 = a->bs2; 330 331 PetscFunctionBegin; 332 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 333 if (bs > 1) { /* check shifttype */ 334 PetscCheck(info->shifttype != MAT_SHIFT_NONZERO && info->shifttype != MAT_SHIFT_POSITIVE_DEFINITE, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 335 } 336 337 PetscCall(MatMissingDiagonal(A, &missing, &d)); 338 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d); 339 340 f = info->fill; 341 levels = (PetscInt)info->levels; 342 diagonal_fill = (PetscInt)info->diagonal_fill; 343 344 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 345 346 PetscCall(ISIdentity(isrow, &row_identity)); 347 PetscCall(ISIdentity(iscol, &col_identity)); 348 349 both_identity = (PetscBool)(row_identity && col_identity); 350 351 if (!levels && both_identity) { 352 /* special case: ilu(0) with natural ordering */ 353 PetscCall(MatILUFactorSymbolic_SeqBAIJ_ilu0(fact, A, isrow, iscol, info)); 354 PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity)); 355 356 fact->factortype = MAT_FACTOR_ILU; 357 (fact)->info.factor_mallocs = 0; 358 (fact)->info.fill_ratio_given = info->fill; 359 (fact)->info.fill_ratio_needed = 1.0; 360 361 b = (Mat_SeqBAIJ *)(fact)->data; 362 b->row = isrow; 363 b->col = iscol; 364 b->icol = isicol; 365 PetscCall(PetscObjectReference((PetscObject)isrow)); 366 PetscCall(PetscObjectReference((PetscObject)iscol)); 367 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 368 369 PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work)); 370 PetscFunctionReturn(PETSC_SUCCESS); 371 } 372 373 PetscCall(ISGetIndices(isrow, &r)); 374 PetscCall(ISGetIndices(isicol, &ic)); 375 376 /* get new row pointers */ 377 PetscCall(PetscMalloc1(n + 1, &bi)); 378 bi[0] = 0; 379 /* bdiag is location of diagonal in factor */ 380 PetscCall(PetscMalloc1(n + 1, &bdiag)); 381 bdiag[0] = 0; 382 383 PetscCall(PetscMalloc2(n, &bj_ptr, n, &bjlvl_ptr)); 384 385 /* create a linked list for storing column indices of the active row */ 386 nlnk = n + 1; 387 PetscCall(PetscIncompleteLLCreate(n, n, nlnk, lnk, lnk_lvl, lnkbt)); 388 389 /* initial FreeSpace size is f*(ai[n]+1) */ 390 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space)); 391 current_space = free_space; 392 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space_lvl)); 393 current_space_lvl = free_space_lvl; 394 395 for (i = 0; i < n; i++) { 396 nzi = 0; 397 /* copy current row into linked list */ 398 nnz = ai[r[i] + 1] - ai[r[i]]; 399 PetscCheck(nnz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 400 cols = aj + ai[r[i]]; 401 lnk[i] = -1; /* marker to indicate if diagonal exists */ 402 PetscCall(PetscIncompleteLLInit(nnz, cols, n, ic, &nlnk, lnk, lnk_lvl, lnkbt)); 403 nzi += nlnk; 404 405 /* make sure diagonal entry is included */ 406 if (diagonal_fill && lnk[i] == -1) { 407 fm = n; 408 while (lnk[fm] < i) fm = lnk[fm]; 409 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 410 lnk[fm] = i; 411 lnk_lvl[i] = 0; 412 nzi++; 413 dcount++; 414 } 415 416 /* add pivot rows into the active row */ 417 nzbd = 0; 418 prow = lnk[n]; 419 while (prow < i) { 420 nnz = bdiag[prow]; 421 cols = bj_ptr[prow] + nnz + 1; 422 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 423 nnz = bi[prow + 1] - bi[prow] - nnz - 1; 424 425 PetscCall(PetscILULLAddSorted(nnz, cols, levels, cols_lvl, prow, &nlnk, lnk, lnk_lvl, lnkbt, prow)); 426 nzi += nlnk; 427 prow = lnk[prow]; 428 nzbd++; 429 } 430 bdiag[i] = nzbd; 431 bi[i + 1] = bi[i] + nzi; 432 433 /* if free space is not available, make more free space */ 434 if (current_space->local_remaining < nzi) { 435 nnz = PetscIntMultTruncate(2, PetscIntMultTruncate(nzi, (n - i))); /* estimated and max additional space needed */ 436 PetscCall(PetscFreeSpaceGet(nnz, ¤t_space)); 437 PetscCall(PetscFreeSpaceGet(nnz, ¤t_space_lvl)); 438 reallocs++; 439 } 440 441 /* copy data into free_space and free_space_lvl, then initialize lnk */ 442 PetscCall(PetscIncompleteLLClean(n, n, nzi, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 443 444 bj_ptr[i] = current_space->array; 445 bjlvl_ptr[i] = current_space_lvl->array; 446 447 /* make sure the active row i has diagonal entry */ 448 PetscCheck(*(bj_ptr[i] + bdiag[i]) == i, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", i); 449 450 current_space->array += nzi; 451 current_space->local_used += nzi; 452 current_space->local_remaining -= nzi; 453 454 current_space_lvl->array += nzi; 455 current_space_lvl->local_used += nzi; 456 current_space_lvl->local_remaining -= nzi; 457 } 458 459 PetscCall(ISRestoreIndices(isrow, &r)); 460 PetscCall(ISRestoreIndices(isicol, &ic)); 461 462 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 463 PetscCall(PetscMalloc1(bi[n] + 1, &bj)); 464 PetscCall(PetscFreeSpaceContiguous_LU(&free_space, bj, n, bi, bdiag)); 465 466 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 467 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 468 PetscCall(PetscFree2(bj_ptr, bjlvl_ptr)); 469 470 #if defined(PETSC_USE_INFO) 471 { 472 PetscReal af = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]); 473 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af)); 474 PetscCall(PetscInfo(A, "Run with -[sub_]pc_factor_fill %g or use \n", (double)af)); 475 PetscCall(PetscInfo(A, "PCFactorSetFill([sub]pc,%g);\n", (double)af)); 476 PetscCall(PetscInfo(A, "for best performance.\n")); 477 if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount)); 478 } 479 #endif 480 481 /* put together the new matrix */ 482 PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 483 484 b = (Mat_SeqBAIJ *)(fact)->data; 485 b->free_a = PETSC_TRUE; 486 b->free_ij = PETSC_TRUE; 487 b->singlemalloc = PETSC_FALSE; 488 489 PetscCall(PetscMalloc1(bs2 * (bdiag[0] + 1), &b->a)); 490 491 b->j = bj; 492 b->i = bi; 493 b->diag = bdiag; 494 b->free_diag = PETSC_TRUE; 495 b->ilen = NULL; 496 b->imax = NULL; 497 b->row = isrow; 498 b->col = iscol; 499 PetscCall(PetscObjectReference((PetscObject)isrow)); 500 PetscCall(PetscObjectReference((PetscObject)iscol)); 501 b->icol = isicol; 502 503 PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work)); 504 /* In b structure: Free imax, ilen, old a, old j. 505 Allocate bdiag, solve_work, new a, new j */ 506 b->maxnz = b->nz = bdiag[0] + 1; 507 508 fact->info.factor_mallocs = reallocs; 509 fact->info.fill_ratio_given = f; 510 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]); 511 512 PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity)); 513 PetscFunctionReturn(PETSC_SUCCESS); 514 } 515 516 #if 0 517 // unused 518 /* 519 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 520 except that the data structure of Mat_SeqAIJ is slightly different. 521 Not a good example of code reuse. 522 */ 523 static PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 524 { 525 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 526 IS isicol; 527 const PetscInt *r, *ic, *ai = a->i, *aj = a->j, *xi; 528 PetscInt prow, n = a->mbs, *ainew, *ajnew, jmax, *fill, nz, *im, *ajfill, *flev, *xitmp; 529 PetscInt *dloc, idx, row, m, fm, nzf, nzi, reallocate = 0, dcount = 0; 530 PetscInt incrlev, nnz, i, bs = A->rmap->bs, bs2 = a->bs2, levels, diagonal_fill, dd; 531 PetscBool col_identity, row_identity, both_identity, flg; 532 PetscReal f; 533 534 PetscFunctionBegin; 535 PetscCall(MatMissingDiagonal_SeqBAIJ(A, &flg, &dd)); 536 PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix A is missing diagonal entry in row %" PetscInt_FMT, dd); 537 538 f = info->fill; 539 levels = (PetscInt)info->levels; 540 diagonal_fill = (PetscInt)info->diagonal_fill; 541 542 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 543 544 PetscCall(ISIdentity(isrow, &row_identity)); 545 PetscCall(ISIdentity(iscol, &col_identity)); 546 both_identity = (PetscBool)(row_identity && col_identity); 547 548 if (!levels && both_identity) { /* special case copy the nonzero structure */ 549 PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE)); 550 PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity)); 551 552 fact->factortype = MAT_FACTOR_ILU; 553 b = (Mat_SeqBAIJ *)fact->data; 554 b->row = isrow; 555 b->col = iscol; 556 PetscCall(PetscObjectReference((PetscObject)isrow)); 557 PetscCall(PetscObjectReference((PetscObject)iscol)); 558 b->icol = isicol; 559 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 560 561 PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work)); 562 PetscFunctionReturn(PETSC_SUCCESS); 563 } 564 565 /* general case perform the symbolic factorization */ 566 PetscCall(ISGetIndices(isrow, &r)); 567 PetscCall(ISGetIndices(isicol, &ic)); 568 569 /* get new row pointers */ 570 PetscCall(PetscMalloc1(n + 1, &ainew)); 571 ainew[0] = 0; 572 /* don't know how many column pointers are needed so estimate */ 573 jmax = (PetscInt)(f * ai[n] + 1); 574 PetscCall(PetscMalloc1(jmax, &ajnew)); 575 /* ajfill is level of fill for each fill entry */ 576 PetscCall(PetscMalloc1(jmax, &ajfill)); 577 /* fill is a linked list of nonzeros in active row */ 578 PetscCall(PetscMalloc1(n + 1, &fill)); 579 /* im is level for each filled value */ 580 PetscCall(PetscMalloc1(n + 1, &im)); 581 /* dloc is location of diagonal in factor */ 582 PetscCall(PetscMalloc1(n + 1, &dloc)); 583 dloc[0] = 0; 584 for (prow = 0; prow < n; prow++) { 585 /* copy prow into linked list */ 586 nzf = nz = ai[r[prow] + 1] - ai[r[prow]]; 587 PetscCheck(nz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[prow], prow); 588 xi = aj + ai[r[prow]]; 589 fill[n] = n; 590 fill[prow] = -1; /* marker for diagonal entry */ 591 while (nz--) { 592 fm = n; 593 idx = ic[*xi++]; 594 do { 595 m = fm; 596 fm = fill[m]; 597 } while (fm < idx); 598 fill[m] = idx; 599 fill[idx] = fm; 600 im[idx] = 0; 601 } 602 603 /* make sure diagonal entry is included */ 604 if (diagonal_fill && fill[prow] == -1) { 605 fm = n; 606 while (fill[fm] < prow) fm = fill[fm]; 607 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 608 fill[fm] = prow; 609 im[prow] = 0; 610 nzf++; 611 dcount++; 612 } 613 614 nzi = 0; 615 row = fill[n]; 616 while (row < prow) { 617 incrlev = im[row] + 1; 618 nz = dloc[row]; 619 xi = ajnew + ainew[row] + nz + 1; 620 flev = ajfill + ainew[row] + nz + 1; 621 nnz = ainew[row + 1] - ainew[row] - nz - 1; 622 fm = row; 623 while (nnz-- > 0) { 624 idx = *xi++; 625 if (*flev + incrlev > levels) { 626 flev++; 627 continue; 628 } 629 do { 630 m = fm; 631 fm = fill[m]; 632 } while (fm < idx); 633 if (fm != idx) { 634 im[idx] = *flev + incrlev; 635 fill[m] = idx; 636 fill[idx] = fm; 637 fm = idx; 638 nzf++; 639 } else if (im[idx] > *flev + incrlev) im[idx] = *flev + incrlev; 640 flev++; 641 } 642 row = fill[row]; 643 nzi++; 644 } 645 /* copy new filled row into permanent storage */ 646 ainew[prow + 1] = ainew[prow] + nzf; 647 if (ainew[prow + 1] > jmax) { 648 /* estimate how much additional space we will need */ 649 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 650 /* just double the memory each time */ 651 PetscInt maxadd = jmax; 652 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 653 if (maxadd < nzf) maxadd = (n - prow) * (nzf + 1); 654 jmax += maxadd; 655 656 /* allocate a longer ajnew and ajfill */ 657 PetscCall(PetscMalloc1(jmax, &xitmp)); 658 PetscCall(PetscArraycpy(xitmp, ajnew, ainew[prow])); 659 PetscCall(PetscFree(ajnew)); 660 ajnew = xitmp; 661 PetscCall(PetscMalloc1(jmax, &xitmp)); 662 PetscCall(PetscArraycpy(xitmp, ajfill, ainew[prow])); 663 PetscCall(PetscFree(ajfill)); 664 ajfill = xitmp; 665 reallocate++; /* count how many reallocations are needed */ 666 } 667 xitmp = ajnew + ainew[prow]; 668 flev = ajfill + ainew[prow]; 669 dloc[prow] = nzi; 670 fm = fill[n]; 671 while (nzf--) { 672 *xitmp++ = fm; 673 *flev++ = im[fm]; 674 fm = fill[fm]; 675 } 676 /* make sure row has diagonal entry */ 677 PetscCheck(ajnew[ainew[prow] + dloc[prow]] == prow, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\n\ 678 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", 679 prow); 680 } 681 PetscCall(PetscFree(ajfill)); 682 PetscCall(ISRestoreIndices(isrow, &r)); 683 PetscCall(ISRestoreIndices(isicol, &ic)); 684 PetscCall(PetscFree(fill)); 685 PetscCall(PetscFree(im)); 686 687 #if defined(PETSC_USE_INFO) 688 { 689 PetscReal af = ((PetscReal)ainew[n]) / ((PetscReal)ai[n]); 690 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocate, (double)f, (double)af)); 691 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 692 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af)); 693 PetscCall(PetscInfo(A, "for best performance.\n")); 694 if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount)); 695 } 696 #endif 697 698 /* put together the new matrix */ 699 PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 700 b = (Mat_SeqBAIJ *)fact->data; 701 702 b->free_a = PETSC_TRUE; 703 b->free_ij = PETSC_TRUE; 704 b->singlemalloc = PETSC_FALSE; 705 706 PetscCall(PetscMalloc1(bs2 * ainew[n], &b->a)); 707 708 b->j = ajnew; 709 b->i = ainew; 710 for (i = 0; i < n; i++) dloc[i] += ainew[i]; 711 b->diag = dloc; 712 b->free_diag = PETSC_TRUE; 713 b->ilen = NULL; 714 b->imax = NULL; 715 b->row = isrow; 716 b->col = iscol; 717 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 718 719 PetscCall(PetscObjectReference((PetscObject)isrow)); 720 PetscCall(PetscObjectReference((PetscObject)iscol)); 721 b->icol = isicol; 722 PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work)); 723 /* In b structure: Free imax, ilen, old a, old j. 724 Allocate dloc, solve_work, new a, new j */ 725 b->maxnz = b->nz = ainew[n]; 726 727 fact->info.factor_mallocs = reallocate; 728 fact->info.fill_ratio_given = f; 729 fact->info.fill_ratio_needed = ((PetscReal)ainew[n]) / ((PetscReal)ai[prow]); 730 731 PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity)); 732 PetscFunctionReturn(PETSC_SUCCESS); 733 } 734 #endif 735 736 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 737 { 738 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */ 739 /* int i,*AJ=a->j,nz=a->nz; */ 740 741 PetscFunctionBegin; 742 /* Undo Column scaling */ 743 /* while (nz--) { */ 744 /* AJ[i] = AJ[i]/4; */ 745 /* } */ 746 /* This should really invoke a push/pop logic, but we don't have that yet. */ 747 A->ops->setunfactored = NULL; 748 PetscFunctionReturn(PETSC_SUCCESS); 749 } 750 751 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 752 { 753 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 754 PetscInt *AJ = a->j, nz = a->nz; 755 unsigned short *aj = (unsigned short *)AJ; 756 757 PetscFunctionBegin; 758 /* Is this really necessary? */ 759 while (nz--) { AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ } 760 A->ops->setunfactored = NULL; 761 PetscFunctionReturn(PETSC_SUCCESS); 762 } 763