1 /* 2 Factorization code for BAIJ format. 3 */ 4 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 #include <petscbt.h> 8 #include <../src/mat/utils/freespace.h> 9 10 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat, Mat, MatDuplicateOption, PetscBool); 11 12 /* 13 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 14 */ 15 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 16 { 17 Mat C = B; 18 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 19 PetscInt i, j, k, ipvt[15]; 20 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *ajtmp, *bjtmp, *bdiag = b->diag, *pj; 21 PetscInt nz, nzL, row; 22 MatScalar *rtmp, *pc, *mwork, *pv, *vv, work[225]; 23 const MatScalar *v, *aa = a->a; 24 PetscInt bs2 = a->bs2, bs = A->rmap->bs, flg; 25 PetscInt sol_ver; 26 PetscBool allowzeropivot, zeropivotdetected; 27 28 PetscFunctionBegin; 29 allowzeropivot = PetscNot(A->erroriffailure); 30 PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)A)->prefix, "-sol_ver", &sol_ver, NULL)); 31 32 /* generate work space needed by the factorization */ 33 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 34 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 35 36 for (i = 0; i < n; i++) { 37 /* zero rtmp */ 38 /* L part */ 39 nz = bi[i + 1] - bi[i]; 40 bjtmp = bj + bi[i]; 41 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 42 43 /* U part */ 44 nz = bdiag[i] - bdiag[i + 1]; 45 bjtmp = bj + bdiag[i + 1] + 1; 46 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 47 48 /* load in initial (unfactored row) */ 49 nz = ai[i + 1] - ai[i]; 50 ajtmp = aj + ai[i]; 51 v = aa + bs2 * ai[i]; 52 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 53 54 /* elimination */ 55 bjtmp = bj + bi[i]; 56 nzL = bi[i + 1] - bi[i]; 57 for (k = 0; k < nzL; k++) { 58 row = bjtmp[k]; 59 pc = rtmp + bs2 * row; 60 for (flg = 0, j = 0; j < bs2; j++) { 61 if (pc[j] != 0.0) { 62 flg = 1; 63 break; 64 } 65 } 66 if (flg) { 67 pv = b->a + bs2 * bdiag[row]; 68 PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); 69 /* PetscCall(PetscKernel_A_gets_A_times_B_15(pc,pv,mwork)); */ 70 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 71 pv = b->a + bs2 * (bdiag[row + 1] + 1); 72 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 73 for (j = 0; j < nz; j++) { 74 vv = rtmp + bs2 * pj[j]; 75 PetscKernel_A_gets_A_minus_B_times_C(bs, vv, pc, pv); 76 /* PetscCall(PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv)); */ 77 pv += bs2; 78 } 79 PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 80 } 81 } 82 83 /* finished row so stick it into b->a */ 84 /* L part */ 85 pv = b->a + bs2 * bi[i]; 86 pj = b->j + bi[i]; 87 nz = bi[i + 1] - bi[i]; 88 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 89 90 /* Mark diagonal and invert diagonal for simpler triangular solves */ 91 pv = b->a + bs2 * bdiag[i]; 92 pj = b->j + bdiag[i]; 93 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 94 PetscCall(PetscKernel_A_gets_inverse_A_15(pv, ipvt, work, info->shiftamount, allowzeropivot, &zeropivotdetected)); 95 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 96 97 /* U part */ 98 pv = b->a + bs2 * (bdiag[i + 1] + 1); 99 pj = b->j + bdiag[i + 1] + 1; 100 nz = bdiag[i] - bdiag[i + 1] - 1; 101 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 102 } 103 104 PetscCall(PetscFree2(rtmp, mwork)); 105 106 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 107 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 108 C->assembled = PETSC_TRUE; 109 110 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */ 111 PetscFunctionReturn(PETSC_SUCCESS); 112 } 113 114 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B, Mat A, const MatFactorInfo *info) 115 { 116 Mat C = B; 117 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 118 IS isrow = b->row, isicol = b->icol; 119 const PetscInt *r, *ic; 120 PetscInt i, j, k, n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 121 PetscInt *ajtmp, *bjtmp, nz, nzL, row, *bdiag = b->diag, *pj; 122 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 123 PetscInt bs = A->rmap->bs, bs2 = a->bs2, *v_pivots, flg; 124 MatScalar *v_work; 125 PetscBool col_identity, row_identity, both_identity; 126 PetscBool allowzeropivot, zeropivotdetected; 127 128 PetscFunctionBegin; 129 PetscCall(ISGetIndices(isrow, &r)); 130 PetscCall(ISGetIndices(isicol, &ic)); 131 allowzeropivot = PetscNot(A->erroriffailure); 132 133 PetscCall(PetscCalloc1(bs2 * n, &rtmp)); 134 135 /* generate work space needed by dense LU factorization */ 136 PetscCall(PetscMalloc3(bs, &v_work, bs2, &mwork, bs, &v_pivots)); 137 138 for (i = 0; i < n; i++) { 139 /* zero rtmp */ 140 /* L part */ 141 nz = bi[i + 1] - bi[i]; 142 bjtmp = bj + bi[i]; 143 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 144 145 /* U part */ 146 nz = bdiag[i] - bdiag[i + 1]; 147 bjtmp = bj + bdiag[i + 1] + 1; 148 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 149 150 /* load in initial (unfactored row) */ 151 nz = ai[r[i] + 1] - ai[r[i]]; 152 ajtmp = aj + ai[r[i]]; 153 v = aa + bs2 * ai[r[i]]; 154 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 155 156 /* elimination */ 157 bjtmp = bj + bi[i]; 158 nzL = bi[i + 1] - bi[i]; 159 for (k = 0; k < nzL; k++) { 160 row = bjtmp[k]; 161 pc = rtmp + bs2 * row; 162 for (flg = 0, j = 0; j < bs2; j++) { 163 if (pc[j] != 0.0) { 164 flg = 1; 165 break; 166 } 167 } 168 if (flg) { 169 pv = b->a + bs2 * bdiag[row]; 170 PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); /* *pc = *pc * (*pv); */ 171 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 172 pv = b->a + bs2 * (bdiag[row + 1] + 1); 173 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 174 for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); 175 PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 176 } 177 } 178 179 /* finished row so stick it into b->a */ 180 /* L part */ 181 pv = b->a + bs2 * bi[i]; 182 pj = b->j + bi[i]; 183 nz = bi[i + 1] - bi[i]; 184 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 185 186 /* Mark diagonal and invert diagonal for simpler triangular solves */ 187 pv = b->a + bs2 * bdiag[i]; 188 pj = b->j + bdiag[i]; 189 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 190 191 PetscCall(PetscKernel_A_gets_inverse_A(bs, pv, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 192 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 193 194 /* U part */ 195 pv = b->a + bs2 * (bdiag[i + 1] + 1); 196 pj = b->j + bdiag[i + 1] + 1; 197 nz = bdiag[i] - bdiag[i + 1] - 1; 198 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 199 } 200 201 PetscCall(PetscFree(rtmp)); 202 PetscCall(PetscFree3(v_work, mwork, v_pivots)); 203 PetscCall(ISRestoreIndices(isicol, &ic)); 204 PetscCall(ISRestoreIndices(isrow, &r)); 205 206 PetscCall(ISIdentity(isrow, &row_identity)); 207 PetscCall(ISIdentity(isicol, &col_identity)); 208 209 both_identity = (PetscBool)(row_identity && col_identity); 210 if (both_identity) { 211 switch (bs) { 212 case 9: 213 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 214 C->ops->solve = MatSolve_SeqBAIJ_9_NaturalOrdering; 215 #else 216 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 217 #endif 218 break; 219 case 11: 220 C->ops->solve = MatSolve_SeqBAIJ_11_NaturalOrdering; 221 break; 222 case 12: 223 C->ops->solve = MatSolve_SeqBAIJ_12_NaturalOrdering; 224 break; 225 case 13: 226 C->ops->solve = MatSolve_SeqBAIJ_13_NaturalOrdering; 227 break; 228 case 14: 229 C->ops->solve = MatSolve_SeqBAIJ_14_NaturalOrdering; 230 break; 231 default: 232 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 233 break; 234 } 235 } else { 236 C->ops->solve = MatSolve_SeqBAIJ_N; 237 } 238 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 239 240 C->assembled = PETSC_TRUE; 241 242 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */ 243 PetscFunctionReturn(PETSC_SUCCESS); 244 } 245 246 /* 247 ilu(0) with natural ordering under new data structure. 248 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 249 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 250 */ 251 252 static PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 253 { 254 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 255 PetscInt n = a->mbs, *ai = a->i, *aj, *adiag = a->diag, bs2 = a->bs2; 256 PetscInt i, j, nz, *bi, *bj, *bdiag, bi_temp; 257 258 PetscFunctionBegin; 259 PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_FALSE)); 260 b = (Mat_SeqBAIJ *)(fact)->data; 261 262 /* allocate matrix arrays for new data structure */ 263 PetscCall(PetscMalloc3(bs2 * ai[n] + 1, &b->a, ai[n] + 1, &b->j, n + 1, &b->i)); 264 265 b->singlemalloc = PETSC_TRUE; 266 b->free_a = PETSC_TRUE; 267 b->free_ij = PETSC_TRUE; 268 fact->preallocated = PETSC_TRUE; 269 fact->assembled = PETSC_TRUE; 270 if (!b->diag) PetscCall(PetscMalloc1(n + 1, &b->diag)); 271 bdiag = b->diag; 272 273 if (n > 0) PetscCall(PetscArrayzero(b->a, bs2 * ai[n])); 274 275 /* set bi and bj with new data structure */ 276 bi = b->i; 277 bj = b->j; 278 279 /* L part */ 280 bi[0] = 0; 281 for (i = 0; i < n; i++) { 282 nz = adiag[i] - ai[i]; 283 bi[i + 1] = bi[i] + nz; 284 aj = a->j + ai[i]; 285 for (j = 0; j < nz; j++) { 286 *bj = aj[j]; 287 bj++; 288 } 289 } 290 291 /* U part */ 292 bi_temp = bi[n]; 293 bdiag[n] = bi[n] - 1; 294 for (i = n - 1; i >= 0; i--) { 295 nz = ai[i + 1] - adiag[i] - 1; 296 bi_temp = bi_temp + nz + 1; 297 aj = a->j + adiag[i] + 1; 298 for (j = 0; j < nz; j++) { 299 *bj = aj[j]; 300 bj++; 301 } 302 /* diag[i] */ 303 *bj = i; 304 bj++; 305 bdiag[i] = bi_temp - 1; 306 } 307 PetscFunctionReturn(PETSC_SUCCESS); 308 } 309 310 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 311 { 312 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 313 IS isicol; 314 const PetscInt *r, *ic; 315 PetscInt n = a->mbs, *ai = a->i, *aj = a->j, d; 316 PetscInt *bi, *cols, nnz, *cols_lvl; 317 PetscInt *bdiag, prow, fm, nzbd, reallocs = 0, dcount = 0; 318 PetscInt i, levels, diagonal_fill; 319 PetscBool col_identity, row_identity, both_identity; 320 PetscReal f; 321 PetscInt nlnk, *lnk, *lnk_lvl = NULL; 322 PetscBT lnkbt; 323 PetscInt nzi, *bj, **bj_ptr, **bjlvl_ptr; 324 PetscFreeSpaceList free_space = NULL, current_space = NULL; 325 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 326 PetscBool missing; 327 PetscInt bs = A->rmap->bs, bs2 = a->bs2; 328 329 PetscFunctionBegin; 330 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 331 if (bs > 1) { /* check shifttype */ 332 PetscCheck(info->shifttype != (PetscReal)MAT_SHIFT_NONZERO && info->shifttype != (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 333 } 334 335 PetscCall(MatMissingDiagonal(A, &missing, &d)); 336 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d); 337 338 f = info->fill; 339 levels = (PetscInt)info->levels; 340 diagonal_fill = (PetscInt)info->diagonal_fill; 341 342 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 343 344 PetscCall(ISIdentity(isrow, &row_identity)); 345 PetscCall(ISIdentity(iscol, &col_identity)); 346 347 both_identity = (PetscBool)(row_identity && col_identity); 348 349 if (!levels && both_identity) { 350 /* special case: ilu(0) with natural ordering */ 351 PetscCall(MatILUFactorSymbolic_SeqBAIJ_ilu0(fact, A, isrow, iscol, info)); 352 PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity)); 353 354 fact->factortype = MAT_FACTOR_ILU; 355 (fact)->info.factor_mallocs = 0; 356 (fact)->info.fill_ratio_given = info->fill; 357 (fact)->info.fill_ratio_needed = 1.0; 358 359 b = (Mat_SeqBAIJ *)(fact)->data; 360 b->row = isrow; 361 b->col = iscol; 362 b->icol = isicol; 363 PetscCall(PetscObjectReference((PetscObject)isrow)); 364 PetscCall(PetscObjectReference((PetscObject)iscol)); 365 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 366 367 PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work)); 368 PetscFunctionReturn(PETSC_SUCCESS); 369 } 370 371 PetscCall(ISGetIndices(isrow, &r)); 372 PetscCall(ISGetIndices(isicol, &ic)); 373 374 /* get new row pointers */ 375 PetscCall(PetscMalloc1(n + 1, &bi)); 376 bi[0] = 0; 377 /* bdiag is location of diagonal in factor */ 378 PetscCall(PetscMalloc1(n + 1, &bdiag)); 379 bdiag[0] = 0; 380 381 PetscCall(PetscMalloc2(n, &bj_ptr, n, &bjlvl_ptr)); 382 383 /* create a linked list for storing column indices of the active row */ 384 nlnk = n + 1; 385 PetscCall(PetscIncompleteLLCreate(n, n, nlnk, lnk, lnk_lvl, lnkbt)); 386 387 /* initial FreeSpace size is f*(ai[n]+1) */ 388 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space)); 389 current_space = free_space; 390 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space_lvl)); 391 current_space_lvl = free_space_lvl; 392 393 for (i = 0; i < n; i++) { 394 nzi = 0; 395 /* copy current row into linked list */ 396 nnz = ai[r[i] + 1] - ai[r[i]]; 397 PetscCheck(nnz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 398 cols = aj + ai[r[i]]; 399 lnk[i] = -1; /* marker to indicate if diagonal exists */ 400 PetscCall(PetscIncompleteLLInit(nnz, cols, n, ic, &nlnk, lnk, lnk_lvl, lnkbt)); 401 nzi += nlnk; 402 403 /* make sure diagonal entry is included */ 404 if (diagonal_fill && lnk[i] == -1) { 405 fm = n; 406 while (lnk[fm] < i) fm = lnk[fm]; 407 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 408 lnk[fm] = i; 409 lnk_lvl[i] = 0; 410 nzi++; 411 dcount++; 412 } 413 414 /* add pivot rows into the active row */ 415 nzbd = 0; 416 prow = lnk[n]; 417 while (prow < i) { 418 nnz = bdiag[prow]; 419 cols = bj_ptr[prow] + nnz + 1; 420 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 421 nnz = bi[prow + 1] - bi[prow] - nnz - 1; 422 423 PetscCall(PetscILULLAddSorted(nnz, cols, levels, cols_lvl, prow, &nlnk, lnk, lnk_lvl, lnkbt, prow)); 424 nzi += nlnk; 425 prow = lnk[prow]; 426 nzbd++; 427 } 428 bdiag[i] = nzbd; 429 bi[i + 1] = bi[i] + nzi; 430 431 /* if free space is not available, make more free space */ 432 if (current_space->local_remaining < nzi) { 433 nnz = PetscIntMultTruncate(2, PetscIntMultTruncate(nzi, (n - i))); /* estimated and max additional space needed */ 434 PetscCall(PetscFreeSpaceGet(nnz, ¤t_space)); 435 PetscCall(PetscFreeSpaceGet(nnz, ¤t_space_lvl)); 436 reallocs++; 437 } 438 439 /* copy data into free_space and free_space_lvl, then initialize lnk */ 440 PetscCall(PetscIncompleteLLClean(n, n, nzi, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 441 442 bj_ptr[i] = current_space->array; 443 bjlvl_ptr[i] = current_space_lvl->array; 444 445 /* make sure the active row i has diagonal entry */ 446 PetscCheck(*(bj_ptr[i] + bdiag[i]) == i, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix, try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", i); 447 448 current_space->array += nzi; 449 current_space->local_used += nzi; 450 current_space->local_remaining -= nzi; 451 452 current_space_lvl->array += nzi; 453 current_space_lvl->local_used += nzi; 454 current_space_lvl->local_remaining -= nzi; 455 } 456 457 PetscCall(ISRestoreIndices(isrow, &r)); 458 PetscCall(ISRestoreIndices(isicol, &ic)); 459 460 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 461 PetscCall(PetscMalloc1(bi[n] + 1, &bj)); 462 PetscCall(PetscFreeSpaceContiguous_LU(&free_space, bj, n, bi, bdiag)); 463 464 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 465 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 466 PetscCall(PetscFree2(bj_ptr, bjlvl_ptr)); 467 468 #if defined(PETSC_USE_INFO) 469 { 470 PetscReal af = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]); 471 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af)); 472 PetscCall(PetscInfo(A, "Run with -[sub_]pc_factor_fill %g or use \n", (double)af)); 473 PetscCall(PetscInfo(A, "PCFactorSetFill([sub]pc,%g);\n", (double)af)); 474 PetscCall(PetscInfo(A, "for best performance.\n")); 475 if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount)); 476 } 477 #endif 478 479 /* put together the new matrix */ 480 PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 481 482 b = (Mat_SeqBAIJ *)(fact)->data; 483 b->free_a = PETSC_TRUE; 484 b->free_ij = PETSC_TRUE; 485 b->singlemalloc = PETSC_FALSE; 486 487 PetscCall(PetscMalloc1(bs2 * (bdiag[0] + 1), &b->a)); 488 489 b->j = bj; 490 b->i = bi; 491 b->diag = bdiag; 492 b->free_diag = PETSC_TRUE; 493 b->ilen = NULL; 494 b->imax = NULL; 495 b->row = isrow; 496 b->col = iscol; 497 PetscCall(PetscObjectReference((PetscObject)isrow)); 498 PetscCall(PetscObjectReference((PetscObject)iscol)); 499 b->icol = isicol; 500 501 PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work)); 502 /* In b structure: Free imax, ilen, old a, old j. 503 Allocate bdiag, solve_work, new a, new j */ 504 b->maxnz = b->nz = bdiag[0] + 1; 505 506 fact->info.factor_mallocs = reallocs; 507 fact->info.fill_ratio_given = f; 508 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]); 509 510 PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity)); 511 PetscFunctionReturn(PETSC_SUCCESS); 512 } 513 514 #if 0 515 // unused 516 /* 517 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 518 except that the data structure of Mat_SeqAIJ is slightly different. 519 Not a good example of code reuse. 520 */ 521 static PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 522 { 523 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 524 IS isicol; 525 const PetscInt *r, *ic, *ai = a->i, *aj = a->j, *xi; 526 PetscInt prow, n = a->mbs, *ainew, *ajnew, jmax, *fill, nz, *im, *ajfill, *flev, *xitmp; 527 PetscInt *dloc, idx, row, m, fm, nzf, nzi, reallocate = 0, dcount = 0; 528 PetscInt incrlev, nnz, i, bs = A->rmap->bs, bs2 = a->bs2, levels, diagonal_fill, dd; 529 PetscBool col_identity, row_identity, both_identity, flg; 530 PetscReal f; 531 532 PetscFunctionBegin; 533 PetscCall(MatMissingDiagonal_SeqBAIJ(A, &flg, &dd)); 534 PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix A is missing diagonal entry in row %" PetscInt_FMT, dd); 535 536 f = info->fill; 537 levels = (PetscInt)info->levels; 538 diagonal_fill = (PetscInt)info->diagonal_fill; 539 540 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 541 542 PetscCall(ISIdentity(isrow, &row_identity)); 543 PetscCall(ISIdentity(iscol, &col_identity)); 544 both_identity = (PetscBool)(row_identity && col_identity); 545 546 if (!levels && both_identity) { /* special case copy the nonzero structure */ 547 PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE)); 548 PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity)); 549 550 fact->factortype = MAT_FACTOR_ILU; 551 b = (Mat_SeqBAIJ *)fact->data; 552 b->row = isrow; 553 b->col = iscol; 554 PetscCall(PetscObjectReference((PetscObject)isrow)); 555 PetscCall(PetscObjectReference((PetscObject)iscol)); 556 b->icol = isicol; 557 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 558 559 PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work)); 560 PetscFunctionReturn(PETSC_SUCCESS); 561 } 562 563 /* general case perform the symbolic factorization */ 564 PetscCall(ISGetIndices(isrow, &r)); 565 PetscCall(ISGetIndices(isicol, &ic)); 566 567 /* get new row pointers */ 568 PetscCall(PetscMalloc1(n + 1, &ainew)); 569 ainew[0] = 0; 570 /* don't know how many column pointers are needed so estimate */ 571 jmax = (PetscInt)(f * ai[n] + 1); 572 PetscCall(PetscMalloc1(jmax, &ajnew)); 573 /* ajfill is level of fill for each fill entry */ 574 PetscCall(PetscMalloc1(jmax, &ajfill)); 575 /* fill is a linked list of nonzeros in active row */ 576 PetscCall(PetscMalloc1(n + 1, &fill)); 577 /* im is level for each filled value */ 578 PetscCall(PetscMalloc1(n + 1, &im)); 579 /* dloc is location of diagonal in factor */ 580 PetscCall(PetscMalloc1(n + 1, &dloc)); 581 dloc[0] = 0; 582 for (prow = 0; prow < n; prow++) { 583 /* copy prow into linked list */ 584 nzf = nz = ai[r[prow] + 1] - ai[r[prow]]; 585 PetscCheck(nz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[prow], prow); 586 xi = aj + ai[r[prow]]; 587 fill[n] = n; 588 fill[prow] = -1; /* marker for diagonal entry */ 589 while (nz--) { 590 fm = n; 591 idx = ic[*xi++]; 592 do { 593 m = fm; 594 fm = fill[m]; 595 } while (fm < idx); 596 fill[m] = idx; 597 fill[idx] = fm; 598 im[idx] = 0; 599 } 600 601 /* make sure diagonal entry is included */ 602 if (diagonal_fill && fill[prow] == -1) { 603 fm = n; 604 while (fill[fm] < prow) fm = fill[fm]; 605 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 606 fill[fm] = prow; 607 im[prow] = 0; 608 nzf++; 609 dcount++; 610 } 611 612 nzi = 0; 613 row = fill[n]; 614 while (row < prow) { 615 incrlev = im[row] + 1; 616 nz = dloc[row]; 617 xi = ajnew + ainew[row] + nz + 1; 618 flev = ajfill + ainew[row] + nz + 1; 619 nnz = ainew[row + 1] - ainew[row] - nz - 1; 620 fm = row; 621 while (nnz-- > 0) { 622 idx = *xi++; 623 if (*flev + incrlev > levels) { 624 flev++; 625 continue; 626 } 627 do { 628 m = fm; 629 fm = fill[m]; 630 } while (fm < idx); 631 if (fm != idx) { 632 im[idx] = *flev + incrlev; 633 fill[m] = idx; 634 fill[idx] = fm; 635 fm = idx; 636 nzf++; 637 } else if (im[idx] > *flev + incrlev) im[idx] = *flev + incrlev; 638 flev++; 639 } 640 row = fill[row]; 641 nzi++; 642 } 643 /* copy new filled row into permanent storage */ 644 ainew[prow + 1] = ainew[prow] + nzf; 645 if (ainew[prow + 1] > jmax) { 646 /* estimate how much additional space we will need */ 647 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 648 /* just double the memory each time */ 649 PetscInt maxadd = jmax; 650 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 651 if (maxadd < nzf) maxadd = (n - prow) * (nzf + 1); 652 jmax += maxadd; 653 654 /* allocate a longer ajnew and ajfill */ 655 PetscCall(PetscMalloc1(jmax, &xitmp)); 656 PetscCall(PetscArraycpy(xitmp, ajnew, ainew[prow])); 657 PetscCall(PetscFree(ajnew)); 658 ajnew = xitmp; 659 PetscCall(PetscMalloc1(jmax, &xitmp)); 660 PetscCall(PetscArraycpy(xitmp, ajfill, ainew[prow])); 661 PetscCall(PetscFree(ajfill)); 662 ajfill = xitmp; 663 reallocate++; /* count how many reallocations are needed */ 664 } 665 xitmp = ajnew + ainew[prow]; 666 flev = ajfill + ainew[prow]; 667 dloc[prow] = nzi; 668 fm = fill[n]; 669 while (nzf--) { 670 *xitmp++ = fm; 671 *flev++ = im[fm]; 672 fm = fill[fm]; 673 } 674 /* make sure row has diagonal entry */ 675 PetscCheck(ajnew[ainew[prow] + dloc[prow]] == prow, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix, try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", prow); 676 } 677 PetscCall(PetscFree(ajfill)); 678 PetscCall(ISRestoreIndices(isrow, &r)); 679 PetscCall(ISRestoreIndices(isicol, &ic)); 680 PetscCall(PetscFree(fill)); 681 PetscCall(PetscFree(im)); 682 683 #if defined(PETSC_USE_INFO) 684 { 685 PetscReal af = ((PetscReal)ainew[n]) / ((PetscReal)ai[n]); 686 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocate, (double)f, (double)af)); 687 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 688 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af)); 689 PetscCall(PetscInfo(A, "for best performance.\n")); 690 if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount)); 691 } 692 #endif 693 694 /* put together the new matrix */ 695 PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 696 b = (Mat_SeqBAIJ *)fact->data; 697 698 b->free_a = PETSC_TRUE; 699 b->free_ij = PETSC_TRUE; 700 b->singlemalloc = PETSC_FALSE; 701 702 PetscCall(PetscMalloc1(bs2 * ainew[n], &b->a)); 703 704 b->j = ajnew; 705 b->i = ainew; 706 for (i = 0; i < n; i++) dloc[i] += ainew[i]; 707 b->diag = dloc; 708 b->free_diag = PETSC_TRUE; 709 b->ilen = NULL; 710 b->imax = NULL; 711 b->row = isrow; 712 b->col = iscol; 713 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 714 715 PetscCall(PetscObjectReference((PetscObject)isrow)); 716 PetscCall(PetscObjectReference((PetscObject)iscol)); 717 b->icol = isicol; 718 PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work)); 719 /* In b structure: Free imax, ilen, old a, old j. 720 Allocate dloc, solve_work, new a, new j */ 721 b->maxnz = b->nz = ainew[n]; 722 723 fact->info.factor_mallocs = reallocate; 724 fact->info.fill_ratio_given = f; 725 fact->info.fill_ratio_needed = ((PetscReal)ainew[n]) / ((PetscReal)ai[prow]); 726 727 PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity)); 728 PetscFunctionReturn(PETSC_SUCCESS); 729 } 730 #endif 731 732 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 733 { 734 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */ 735 /* int i,*AJ=a->j,nz=a->nz; */ 736 737 PetscFunctionBegin; 738 /* Undo Column scaling */ 739 /* while (nz--) { */ 740 /* AJ[i] = AJ[i]/4; */ 741 /* } */ 742 /* This should really invoke a push/pop logic, but we don't have that yet. */ 743 A->ops->setunfactored = NULL; 744 PetscFunctionReturn(PETSC_SUCCESS); 745 } 746 747 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 748 { 749 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 750 PetscInt *AJ = a->j, nz = a->nz; 751 unsigned short *aj = (unsigned short *)AJ; 752 753 PetscFunctionBegin; 754 /* Is this really necessary? */ 755 while (nz--) { AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ } 756 A->ops->setunfactored = NULL; 757 PetscFunctionReturn(PETSC_SUCCESS); 758 } 759