1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include <../src/mat/impls/baij/seq/baij.h> 7 #include <petsc/private/kernels/blockinvert.h> 8 #include <petscbt.h> 9 #include <../src/mat/utils/freespace.h> 10 11 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat, Mat, MatDuplicateOption, PetscBool); 12 13 /* 14 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 15 */ 16 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 17 { 18 Mat C = B; 19 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 20 PetscInt i, j, k, ipvt[15]; 21 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *ajtmp, *bjtmp, *bdiag = b->diag, *pj; 22 PetscInt nz, nzL, row; 23 MatScalar *rtmp, *pc, *mwork, *pv, *vv, work[225]; 24 const MatScalar *v, *aa = a->a; 25 PetscInt bs2 = a->bs2, bs = A->rmap->bs, flg; 26 PetscInt sol_ver; 27 PetscBool allowzeropivot, zeropivotdetected; 28 29 PetscFunctionBegin; 30 allowzeropivot = PetscNot(A->erroriffailure); 31 PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)A)->prefix, "-sol_ver", &sol_ver, NULL)); 32 33 /* generate work space needed by the factorization */ 34 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 35 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 36 37 for (i = 0; i < n; i++) { 38 /* zero rtmp */ 39 /* L part */ 40 nz = bi[i + 1] - bi[i]; 41 bjtmp = bj + bi[i]; 42 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 43 44 /* U part */ 45 nz = bdiag[i] - bdiag[i + 1]; 46 bjtmp = bj + bdiag[i + 1] + 1; 47 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 48 49 /* load in initial (unfactored row) */ 50 nz = ai[i + 1] - ai[i]; 51 ajtmp = aj + ai[i]; 52 v = aa + bs2 * ai[i]; 53 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 54 55 /* elimination */ 56 bjtmp = bj + bi[i]; 57 nzL = bi[i + 1] - bi[i]; 58 for (k = 0; k < nzL; k++) { 59 row = bjtmp[k]; 60 pc = rtmp + bs2 * row; 61 for (flg = 0, j = 0; j < bs2; j++) { 62 if (pc[j] != 0.0) { 63 flg = 1; 64 break; 65 } 66 } 67 if (flg) { 68 pv = b->a + bs2 * bdiag[row]; 69 PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); 70 /* PetscCall(PetscKernel_A_gets_A_times_B_15(pc,pv,mwork)); */ 71 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 72 pv = b->a + bs2 * (bdiag[row + 1] + 1); 73 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 74 for (j = 0; j < nz; j++) { 75 vv = rtmp + bs2 * pj[j]; 76 PetscKernel_A_gets_A_minus_B_times_C(bs, vv, pc, pv); 77 /* PetscCall(PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv)); */ 78 pv += bs2; 79 } 80 PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 81 } 82 } 83 84 /* finished row so stick it into b->a */ 85 /* L part */ 86 pv = b->a + bs2 * bi[i]; 87 pj = b->j + bi[i]; 88 nz = bi[i + 1] - bi[i]; 89 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 90 91 /* Mark diagonal and invert diagonal for simpler triangular solves */ 92 pv = b->a + bs2 * bdiag[i]; 93 pj = b->j + bdiag[i]; 94 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 95 PetscCall(PetscKernel_A_gets_inverse_A_15(pv, ipvt, work, info->shiftamount, allowzeropivot, &zeropivotdetected)); 96 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 97 98 /* U part */ 99 pv = b->a + bs2 * (bdiag[i + 1] + 1); 100 pj = b->j + bdiag[i + 1] + 1; 101 nz = bdiag[i] - bdiag[i + 1] - 1; 102 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 103 } 104 105 PetscCall(PetscFree2(rtmp, mwork)); 106 107 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 108 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 109 C->assembled = PETSC_TRUE; 110 111 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */ 112 PetscFunctionReturn(PETSC_SUCCESS); 113 } 114 115 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B, Mat A, const MatFactorInfo *info) 116 { 117 Mat C = B; 118 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 119 IS isrow = b->row, isicol = b->icol; 120 const PetscInt *r, *ic; 121 PetscInt i, j, k, n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 122 PetscInt *ajtmp, *bjtmp, nz, nzL, row, *bdiag = b->diag, *pj; 123 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 124 PetscInt bs = A->rmap->bs, bs2 = a->bs2, *v_pivots, flg; 125 MatScalar *v_work; 126 PetscBool col_identity, row_identity, both_identity; 127 PetscBool allowzeropivot, zeropivotdetected; 128 129 PetscFunctionBegin; 130 PetscCall(ISGetIndices(isrow, &r)); 131 PetscCall(ISGetIndices(isicol, &ic)); 132 allowzeropivot = PetscNot(A->erroriffailure); 133 134 PetscCall(PetscCalloc1(bs2 * n, &rtmp)); 135 136 /* generate work space needed by dense LU factorization */ 137 PetscCall(PetscMalloc3(bs, &v_work, bs2, &mwork, bs, &v_pivots)); 138 139 for (i = 0; i < n; i++) { 140 /* zero rtmp */ 141 /* L part */ 142 nz = bi[i + 1] - bi[i]; 143 bjtmp = bj + bi[i]; 144 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 145 146 /* U part */ 147 nz = bdiag[i] - bdiag[i + 1]; 148 bjtmp = bj + bdiag[i + 1] + 1; 149 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 150 151 /* load in initial (unfactored row) */ 152 nz = ai[r[i] + 1] - ai[r[i]]; 153 ajtmp = aj + ai[r[i]]; 154 v = aa + bs2 * ai[r[i]]; 155 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 156 157 /* elimination */ 158 bjtmp = bj + bi[i]; 159 nzL = bi[i + 1] - bi[i]; 160 for (k = 0; k < nzL; k++) { 161 row = bjtmp[k]; 162 pc = rtmp + bs2 * row; 163 for (flg = 0, j = 0; j < bs2; j++) { 164 if (pc[j] != 0.0) { 165 flg = 1; 166 break; 167 } 168 } 169 if (flg) { 170 pv = b->a + bs2 * bdiag[row]; 171 PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); /* *pc = *pc * (*pv); */ 172 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 173 pv = b->a + bs2 * (bdiag[row + 1] + 1); 174 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 175 for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); 176 PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 177 } 178 } 179 180 /* finished row so stick it into b->a */ 181 /* L part */ 182 pv = b->a + bs2 * bi[i]; 183 pj = b->j + bi[i]; 184 nz = bi[i + 1] - bi[i]; 185 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 186 187 /* Mark diagonal and invert diagonal for simpler triangular solves */ 188 pv = b->a + bs2 * bdiag[i]; 189 pj = b->j + bdiag[i]; 190 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 191 192 PetscCall(PetscKernel_A_gets_inverse_A(bs, pv, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 193 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 194 195 /* U part */ 196 pv = b->a + bs2 * (bdiag[i + 1] + 1); 197 pj = b->j + bdiag[i + 1] + 1; 198 nz = bdiag[i] - bdiag[i + 1] - 1; 199 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 200 } 201 202 PetscCall(PetscFree(rtmp)); 203 PetscCall(PetscFree3(v_work, mwork, v_pivots)); 204 PetscCall(ISRestoreIndices(isicol, &ic)); 205 PetscCall(ISRestoreIndices(isrow, &r)); 206 207 PetscCall(ISIdentity(isrow, &row_identity)); 208 PetscCall(ISIdentity(isicol, &col_identity)); 209 210 both_identity = (PetscBool)(row_identity && col_identity); 211 if (both_identity) { 212 switch (bs) { 213 case 9: 214 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 215 C->ops->solve = MatSolve_SeqBAIJ_9_NaturalOrdering; 216 #else 217 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 218 #endif 219 break; 220 case 11: 221 C->ops->solve = MatSolve_SeqBAIJ_11_NaturalOrdering; 222 break; 223 case 12: 224 C->ops->solve = MatSolve_SeqBAIJ_12_NaturalOrdering; 225 break; 226 case 13: 227 C->ops->solve = MatSolve_SeqBAIJ_13_NaturalOrdering; 228 break; 229 case 14: 230 C->ops->solve = MatSolve_SeqBAIJ_14_NaturalOrdering; 231 break; 232 default: 233 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 234 break; 235 } 236 } else { 237 C->ops->solve = MatSolve_SeqBAIJ_N; 238 } 239 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 240 241 C->assembled = PETSC_TRUE; 242 243 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */ 244 PetscFunctionReturn(PETSC_SUCCESS); 245 } 246 247 /* 248 ilu(0) with natural ordering under new data structure. 249 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 250 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 251 */ 252 253 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 254 { 255 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 256 PetscInt n = a->mbs, *ai = a->i, *aj, *adiag = a->diag, bs2 = a->bs2; 257 PetscInt i, j, nz, *bi, *bj, *bdiag, bi_temp; 258 259 PetscFunctionBegin; 260 PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_FALSE)); 261 b = (Mat_SeqBAIJ *)(fact)->data; 262 263 /* allocate matrix arrays for new data structure */ 264 PetscCall(PetscMalloc3(bs2 * ai[n] + 1, &b->a, ai[n] + 1, &b->j, n + 1, &b->i)); 265 266 b->singlemalloc = PETSC_TRUE; 267 b->free_a = PETSC_TRUE; 268 b->free_ij = PETSC_TRUE; 269 fact->preallocated = PETSC_TRUE; 270 fact->assembled = PETSC_TRUE; 271 if (!b->diag) PetscCall(PetscMalloc1(n + 1, &b->diag)); 272 bdiag = b->diag; 273 274 if (n > 0) PetscCall(PetscArrayzero(b->a, bs2 * ai[n])); 275 276 /* set bi and bj with new data structure */ 277 bi = b->i; 278 bj = b->j; 279 280 /* L part */ 281 bi[0] = 0; 282 for (i = 0; i < n; i++) { 283 nz = adiag[i] - ai[i]; 284 bi[i + 1] = bi[i] + nz; 285 aj = a->j + ai[i]; 286 for (j = 0; j < nz; j++) { 287 *bj = aj[j]; 288 bj++; 289 } 290 } 291 292 /* U part */ 293 bi_temp = bi[n]; 294 bdiag[n] = bi[n] - 1; 295 for (i = n - 1; i >= 0; i--) { 296 nz = ai[i + 1] - adiag[i] - 1; 297 bi_temp = bi_temp + nz + 1; 298 aj = a->j + adiag[i] + 1; 299 for (j = 0; j < nz; j++) { 300 *bj = aj[j]; 301 bj++; 302 } 303 /* diag[i] */ 304 *bj = i; 305 bj++; 306 bdiag[i] = bi_temp - 1; 307 } 308 PetscFunctionReturn(PETSC_SUCCESS); 309 } 310 311 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 312 { 313 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 314 IS isicol; 315 const PetscInt *r, *ic; 316 PetscInt n = a->mbs, *ai = a->i, *aj = a->j, d; 317 PetscInt *bi, *cols, nnz, *cols_lvl; 318 PetscInt *bdiag, prow, fm, nzbd, reallocs = 0, dcount = 0; 319 PetscInt i, levels, diagonal_fill; 320 PetscBool col_identity, row_identity, both_identity; 321 PetscReal f; 322 PetscInt nlnk, *lnk, *lnk_lvl = NULL; 323 PetscBT lnkbt; 324 PetscInt nzi, *bj, **bj_ptr, **bjlvl_ptr; 325 PetscFreeSpaceList free_space = NULL, current_space = NULL; 326 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 327 PetscBool missing; 328 PetscInt bs = A->rmap->bs, bs2 = a->bs2; 329 330 PetscFunctionBegin; 331 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 332 if (bs > 1) { /* check shifttype */ 333 PetscCheck(info->shifttype != (PetscReal)MAT_SHIFT_NONZERO && info->shifttype != (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 334 } 335 336 PetscCall(MatMissingDiagonal(A, &missing, &d)); 337 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d); 338 339 f = info->fill; 340 levels = (PetscInt)info->levels; 341 diagonal_fill = (PetscInt)info->diagonal_fill; 342 343 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 344 345 PetscCall(ISIdentity(isrow, &row_identity)); 346 PetscCall(ISIdentity(iscol, &col_identity)); 347 348 both_identity = (PetscBool)(row_identity && col_identity); 349 350 if (!levels && both_identity) { 351 /* special case: ilu(0) with natural ordering */ 352 PetscCall(MatILUFactorSymbolic_SeqBAIJ_ilu0(fact, A, isrow, iscol, info)); 353 PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity)); 354 355 fact->factortype = MAT_FACTOR_ILU; 356 (fact)->info.factor_mallocs = 0; 357 (fact)->info.fill_ratio_given = info->fill; 358 (fact)->info.fill_ratio_needed = 1.0; 359 360 b = (Mat_SeqBAIJ *)(fact)->data; 361 b->row = isrow; 362 b->col = iscol; 363 b->icol = isicol; 364 PetscCall(PetscObjectReference((PetscObject)isrow)); 365 PetscCall(PetscObjectReference((PetscObject)iscol)); 366 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 367 368 PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work)); 369 PetscFunctionReturn(PETSC_SUCCESS); 370 } 371 372 PetscCall(ISGetIndices(isrow, &r)); 373 PetscCall(ISGetIndices(isicol, &ic)); 374 375 /* get new row pointers */ 376 PetscCall(PetscMalloc1(n + 1, &bi)); 377 bi[0] = 0; 378 /* bdiag is location of diagonal in factor */ 379 PetscCall(PetscMalloc1(n + 1, &bdiag)); 380 bdiag[0] = 0; 381 382 PetscCall(PetscMalloc2(n, &bj_ptr, n, &bjlvl_ptr)); 383 384 /* create a linked list for storing column indices of the active row */ 385 nlnk = n + 1; 386 PetscCall(PetscIncompleteLLCreate(n, n, nlnk, lnk, lnk_lvl, lnkbt)); 387 388 /* initial FreeSpace size is f*(ai[n]+1) */ 389 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space)); 390 current_space = free_space; 391 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space_lvl)); 392 current_space_lvl = free_space_lvl; 393 394 for (i = 0; i < n; i++) { 395 nzi = 0; 396 /* copy current row into linked list */ 397 nnz = ai[r[i] + 1] - ai[r[i]]; 398 PetscCheck(nnz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 399 cols = aj + ai[r[i]]; 400 lnk[i] = -1; /* marker to indicate if diagonal exists */ 401 PetscCall(PetscIncompleteLLInit(nnz, cols, n, ic, &nlnk, lnk, lnk_lvl, lnkbt)); 402 nzi += nlnk; 403 404 /* make sure diagonal entry is included */ 405 if (diagonal_fill && lnk[i] == -1) { 406 fm = n; 407 while (lnk[fm] < i) fm = lnk[fm]; 408 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 409 lnk[fm] = i; 410 lnk_lvl[i] = 0; 411 nzi++; 412 dcount++; 413 } 414 415 /* add pivot rows into the active row */ 416 nzbd = 0; 417 prow = lnk[n]; 418 while (prow < i) { 419 nnz = bdiag[prow]; 420 cols = bj_ptr[prow] + nnz + 1; 421 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 422 nnz = bi[prow + 1] - bi[prow] - nnz - 1; 423 424 PetscCall(PetscILULLAddSorted(nnz, cols, levels, cols_lvl, prow, &nlnk, lnk, lnk_lvl, lnkbt, prow)); 425 nzi += nlnk; 426 prow = lnk[prow]; 427 nzbd++; 428 } 429 bdiag[i] = nzbd; 430 bi[i + 1] = bi[i] + nzi; 431 432 /* if free space is not available, make more free space */ 433 if (current_space->local_remaining < nzi) { 434 nnz = PetscIntMultTruncate(2, PetscIntMultTruncate(nzi, (n - i))); /* estimated and max additional space needed */ 435 PetscCall(PetscFreeSpaceGet(nnz, ¤t_space)); 436 PetscCall(PetscFreeSpaceGet(nnz, ¤t_space_lvl)); 437 reallocs++; 438 } 439 440 /* copy data into free_space and free_space_lvl, then initialize lnk */ 441 PetscCall(PetscIncompleteLLClean(n, n, nzi, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 442 443 bj_ptr[i] = current_space->array; 444 bjlvl_ptr[i] = current_space_lvl->array; 445 446 /* make sure the active row i has diagonal entry */ 447 PetscCheck(*(bj_ptr[i] + bdiag[i]) == i, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", i); 448 449 current_space->array += nzi; 450 current_space->local_used += nzi; 451 current_space->local_remaining -= nzi; 452 453 current_space_lvl->array += nzi; 454 current_space_lvl->local_used += nzi; 455 current_space_lvl->local_remaining -= nzi; 456 } 457 458 PetscCall(ISRestoreIndices(isrow, &r)); 459 PetscCall(ISRestoreIndices(isicol, &ic)); 460 461 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 462 PetscCall(PetscMalloc1(bi[n] + 1, &bj)); 463 PetscCall(PetscFreeSpaceContiguous_LU(&free_space, bj, n, bi, bdiag)); 464 465 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 466 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 467 PetscCall(PetscFree2(bj_ptr, bjlvl_ptr)); 468 469 #if defined(PETSC_USE_INFO) 470 { 471 PetscReal af = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]); 472 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af)); 473 PetscCall(PetscInfo(A, "Run with -[sub_]pc_factor_fill %g or use \n", (double)af)); 474 PetscCall(PetscInfo(A, "PCFactorSetFill([sub]pc,%g);\n", (double)af)); 475 PetscCall(PetscInfo(A, "for best performance.\n")); 476 if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount)); 477 } 478 #endif 479 480 /* put together the new matrix */ 481 PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 482 483 b = (Mat_SeqBAIJ *)(fact)->data; 484 b->free_a = PETSC_TRUE; 485 b->free_ij = PETSC_TRUE; 486 b->singlemalloc = PETSC_FALSE; 487 488 PetscCall(PetscMalloc1(bs2 * (bdiag[0] + 1), &b->a)); 489 490 b->j = bj; 491 b->i = bi; 492 b->diag = bdiag; 493 b->free_diag = PETSC_TRUE; 494 b->ilen = NULL; 495 b->imax = NULL; 496 b->row = isrow; 497 b->col = iscol; 498 PetscCall(PetscObjectReference((PetscObject)isrow)); 499 PetscCall(PetscObjectReference((PetscObject)iscol)); 500 b->icol = isicol; 501 502 PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work)); 503 /* In b structure: Free imax, ilen, old a, old j. 504 Allocate bdiag, solve_work, new a, new j */ 505 b->maxnz = b->nz = bdiag[0] + 1; 506 507 fact->info.factor_mallocs = reallocs; 508 fact->info.fill_ratio_given = f; 509 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]); 510 511 PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity)); 512 PetscFunctionReturn(PETSC_SUCCESS); 513 } 514 515 #if 0 516 // unused 517 /* 518 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 519 except that the data structure of Mat_SeqAIJ is slightly different. 520 Not a good example of code reuse. 521 */ 522 static PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 523 { 524 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 525 IS isicol; 526 const PetscInt *r, *ic, *ai = a->i, *aj = a->j, *xi; 527 PetscInt prow, n = a->mbs, *ainew, *ajnew, jmax, *fill, nz, *im, *ajfill, *flev, *xitmp; 528 PetscInt *dloc, idx, row, m, fm, nzf, nzi, reallocate = 0, dcount = 0; 529 PetscInt incrlev, nnz, i, bs = A->rmap->bs, bs2 = a->bs2, levels, diagonal_fill, dd; 530 PetscBool col_identity, row_identity, both_identity, flg; 531 PetscReal f; 532 533 PetscFunctionBegin; 534 PetscCall(MatMissingDiagonal_SeqBAIJ(A, &flg, &dd)); 535 PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix A is missing diagonal entry in row %" PetscInt_FMT, dd); 536 537 f = info->fill; 538 levels = (PetscInt)info->levels; 539 diagonal_fill = (PetscInt)info->diagonal_fill; 540 541 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 542 543 PetscCall(ISIdentity(isrow, &row_identity)); 544 PetscCall(ISIdentity(iscol, &col_identity)); 545 both_identity = (PetscBool)(row_identity && col_identity); 546 547 if (!levels && both_identity) { /* special case copy the nonzero structure */ 548 PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE)); 549 PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity)); 550 551 fact->factortype = MAT_FACTOR_ILU; 552 b = (Mat_SeqBAIJ *)fact->data; 553 b->row = isrow; 554 b->col = iscol; 555 PetscCall(PetscObjectReference((PetscObject)isrow)); 556 PetscCall(PetscObjectReference((PetscObject)iscol)); 557 b->icol = isicol; 558 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 559 560 PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work)); 561 PetscFunctionReturn(PETSC_SUCCESS); 562 } 563 564 /* general case perform the symbolic factorization */ 565 PetscCall(ISGetIndices(isrow, &r)); 566 PetscCall(ISGetIndices(isicol, &ic)); 567 568 /* get new row pointers */ 569 PetscCall(PetscMalloc1(n + 1, &ainew)); 570 ainew[0] = 0; 571 /* don't know how many column pointers are needed so estimate */ 572 jmax = (PetscInt)(f * ai[n] + 1); 573 PetscCall(PetscMalloc1(jmax, &ajnew)); 574 /* ajfill is level of fill for each fill entry */ 575 PetscCall(PetscMalloc1(jmax, &ajfill)); 576 /* fill is a linked list of nonzeros in active row */ 577 PetscCall(PetscMalloc1(n + 1, &fill)); 578 /* im is level for each filled value */ 579 PetscCall(PetscMalloc1(n + 1, &im)); 580 /* dloc is location of diagonal in factor */ 581 PetscCall(PetscMalloc1(n + 1, &dloc)); 582 dloc[0] = 0; 583 for (prow = 0; prow < n; prow++) { 584 /* copy prow into linked list */ 585 nzf = nz = ai[r[prow] + 1] - ai[r[prow]]; 586 PetscCheck(nz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[prow], prow); 587 xi = aj + ai[r[prow]]; 588 fill[n] = n; 589 fill[prow] = -1; /* marker for diagonal entry */ 590 while (nz--) { 591 fm = n; 592 idx = ic[*xi++]; 593 do { 594 m = fm; 595 fm = fill[m]; 596 } while (fm < idx); 597 fill[m] = idx; 598 fill[idx] = fm; 599 im[idx] = 0; 600 } 601 602 /* make sure diagonal entry is included */ 603 if (diagonal_fill && fill[prow] == -1) { 604 fm = n; 605 while (fill[fm] < prow) fm = fill[fm]; 606 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 607 fill[fm] = prow; 608 im[prow] = 0; 609 nzf++; 610 dcount++; 611 } 612 613 nzi = 0; 614 row = fill[n]; 615 while (row < prow) { 616 incrlev = im[row] + 1; 617 nz = dloc[row]; 618 xi = ajnew + ainew[row] + nz + 1; 619 flev = ajfill + ainew[row] + nz + 1; 620 nnz = ainew[row + 1] - ainew[row] - nz - 1; 621 fm = row; 622 while (nnz-- > 0) { 623 idx = *xi++; 624 if (*flev + incrlev > levels) { 625 flev++; 626 continue; 627 } 628 do { 629 m = fm; 630 fm = fill[m]; 631 } while (fm < idx); 632 if (fm != idx) { 633 im[idx] = *flev + incrlev; 634 fill[m] = idx; 635 fill[idx] = fm; 636 fm = idx; 637 nzf++; 638 } else if (im[idx] > *flev + incrlev) im[idx] = *flev + incrlev; 639 flev++; 640 } 641 row = fill[row]; 642 nzi++; 643 } 644 /* copy new filled row into permanent storage */ 645 ainew[prow + 1] = ainew[prow] + nzf; 646 if (ainew[prow + 1] > jmax) { 647 /* estimate how much additional space we will need */ 648 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 649 /* just double the memory each time */ 650 PetscInt maxadd = jmax; 651 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 652 if (maxadd < nzf) maxadd = (n - prow) * (nzf + 1); 653 jmax += maxadd; 654 655 /* allocate a longer ajnew and ajfill */ 656 PetscCall(PetscMalloc1(jmax, &xitmp)); 657 PetscCall(PetscArraycpy(xitmp, ajnew, ainew[prow])); 658 PetscCall(PetscFree(ajnew)); 659 ajnew = xitmp; 660 PetscCall(PetscMalloc1(jmax, &xitmp)); 661 PetscCall(PetscArraycpy(xitmp, ajfill, ainew[prow])); 662 PetscCall(PetscFree(ajfill)); 663 ajfill = xitmp; 664 reallocate++; /* count how many reallocations are needed */ 665 } 666 xitmp = ajnew + ainew[prow]; 667 flev = ajfill + ainew[prow]; 668 dloc[prow] = nzi; 669 fm = fill[n]; 670 while (nzf--) { 671 *xitmp++ = fm; 672 *flev++ = im[fm]; 673 fm = fill[fm]; 674 } 675 /* make sure row has diagonal entry */ 676 PetscCheck(ajnew[ainew[prow] + dloc[prow]] == prow, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\n\ 677 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", 678 prow); 679 } 680 PetscCall(PetscFree(ajfill)); 681 PetscCall(ISRestoreIndices(isrow, &r)); 682 PetscCall(ISRestoreIndices(isicol, &ic)); 683 PetscCall(PetscFree(fill)); 684 PetscCall(PetscFree(im)); 685 686 #if defined(PETSC_USE_INFO) 687 { 688 PetscReal af = ((PetscReal)ainew[n]) / ((PetscReal)ai[n]); 689 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocate, (double)f, (double)af)); 690 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 691 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af)); 692 PetscCall(PetscInfo(A, "for best performance.\n")); 693 if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount)); 694 } 695 #endif 696 697 /* put together the new matrix */ 698 PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 699 b = (Mat_SeqBAIJ *)fact->data; 700 701 b->free_a = PETSC_TRUE; 702 b->free_ij = PETSC_TRUE; 703 b->singlemalloc = PETSC_FALSE; 704 705 PetscCall(PetscMalloc1(bs2 * ainew[n], &b->a)); 706 707 b->j = ajnew; 708 b->i = ainew; 709 for (i = 0; i < n; i++) dloc[i] += ainew[i]; 710 b->diag = dloc; 711 b->free_diag = PETSC_TRUE; 712 b->ilen = NULL; 713 b->imax = NULL; 714 b->row = isrow; 715 b->col = iscol; 716 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 717 718 PetscCall(PetscObjectReference((PetscObject)isrow)); 719 PetscCall(PetscObjectReference((PetscObject)iscol)); 720 b->icol = isicol; 721 PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work)); 722 /* In b structure: Free imax, ilen, old a, old j. 723 Allocate dloc, solve_work, new a, new j */ 724 b->maxnz = b->nz = ainew[n]; 725 726 fact->info.factor_mallocs = reallocate; 727 fact->info.fill_ratio_given = f; 728 fact->info.fill_ratio_needed = ((PetscReal)ainew[n]) / ((PetscReal)ai[prow]); 729 730 PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity)); 731 PetscFunctionReturn(PETSC_SUCCESS); 732 } 733 #endif 734 735 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 736 { 737 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */ 738 /* int i,*AJ=a->j,nz=a->nz; */ 739 740 PetscFunctionBegin; 741 /* Undo Column scaling */ 742 /* while (nz--) { */ 743 /* AJ[i] = AJ[i]/4; */ 744 /* } */ 745 /* This should really invoke a push/pop logic, but we don't have that yet. */ 746 A->ops->setunfactored = NULL; 747 PetscFunctionReturn(PETSC_SUCCESS); 748 } 749 750 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 751 { 752 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 753 PetscInt *AJ = a->j, nz = a->nz; 754 unsigned short *aj = (unsigned short *)AJ; 755 756 PetscFunctionBegin; 757 /* Is this really necessary? */ 758 while (nz--) { AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ } 759 A->ops->setunfactored = NULL; 760 PetscFunctionReturn(PETSC_SUCCESS); 761 } 762