1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include <../src/mat/impls/baij/seq/baij.h> 7 #include <petsc/private/kernels/blockinvert.h> 8 #include <petscbt.h> 9 #include <../src/mat/utils/freespace.h> 10 11 /* ----------------------------------------------------------------*/ 12 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat, Mat, MatDuplicateOption, PetscBool); 13 14 /* 15 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 16 */ 17 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) { 18 Mat C = B; 19 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 20 PetscInt i, j, k, ipvt[15]; 21 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *ajtmp, *bjtmp, *bdiag = b->diag, *pj; 22 PetscInt nz, nzL, row; 23 MatScalar *rtmp, *pc, *mwork, *pv, *vv, work[225]; 24 const MatScalar *v, *aa = a->a; 25 PetscInt bs2 = a->bs2, bs = A->rmap->bs, flg; 26 PetscInt sol_ver; 27 PetscBool allowzeropivot, zeropivotdetected; 28 29 PetscFunctionBegin; 30 allowzeropivot = PetscNot(A->erroriffailure); 31 PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)A)->prefix, "-sol_ver", &sol_ver, NULL)); 32 33 /* generate work space needed by the factorization */ 34 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 35 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 36 37 for (i = 0; i < n; i++) { 38 /* zero rtmp */ 39 /* L part */ 40 nz = bi[i + 1] - bi[i]; 41 bjtmp = bj + bi[i]; 42 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 43 44 /* U part */ 45 nz = bdiag[i] - bdiag[i + 1]; 46 bjtmp = bj + bdiag[i + 1] + 1; 47 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 48 49 /* load in initial (unfactored row) */ 50 nz = ai[i + 1] - ai[i]; 51 ajtmp = aj + ai[i]; 52 v = aa + bs2 * ai[i]; 53 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); } 54 55 /* elimination */ 56 bjtmp = bj + bi[i]; 57 nzL = bi[i + 1] - bi[i]; 58 for (k = 0; k < nzL; k++) { 59 row = bjtmp[k]; 60 pc = rtmp + bs2 * row; 61 for (flg = 0, j = 0; j < bs2; j++) { 62 if (pc[j] != 0.0) { 63 flg = 1; 64 break; 65 } 66 } 67 if (flg) { 68 pv = b->a + bs2 * bdiag[row]; 69 PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); 70 /* PetscCall(PetscKernel_A_gets_A_times_B_15(pc,pv,mwork)); */ 71 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 72 pv = b->a + bs2 * (bdiag[row + 1] + 1); 73 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 74 for (j = 0; j < nz; j++) { 75 vv = rtmp + bs2 * pj[j]; 76 PetscKernel_A_gets_A_minus_B_times_C(bs, vv, pc, pv); 77 /* PetscCall(PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv)); */ 78 pv += bs2; 79 } 80 PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 81 } 82 } 83 84 /* finished row so stick it into b->a */ 85 /* L part */ 86 pv = b->a + bs2 * bi[i]; 87 pj = b->j + bi[i]; 88 nz = bi[i + 1] - bi[i]; 89 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 90 91 /* Mark diagonal and invert diagonal for simpler triangular solves */ 92 pv = b->a + bs2 * bdiag[i]; 93 pj = b->j + bdiag[i]; 94 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 95 PetscCall(PetscKernel_A_gets_inverse_A_15(pv, ipvt, work, info->shiftamount, allowzeropivot, &zeropivotdetected)); 96 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 97 98 /* U part */ 99 pv = b->a + bs2 * (bdiag[i + 1] + 1); 100 pj = b->j + bdiag[i + 1] + 1; 101 nz = bdiag[i] - bdiag[i + 1] - 1; 102 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 103 } 104 105 PetscCall(PetscFree2(rtmp, mwork)); 106 107 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 108 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 109 C->assembled = PETSC_TRUE; 110 111 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */ 112 PetscFunctionReturn(0); 113 } 114 115 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B, Mat A, const MatFactorInfo *info) { 116 Mat C = B; 117 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 118 IS isrow = b->row, isicol = b->icol; 119 const PetscInt *r, *ic; 120 PetscInt i, j, k, n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 121 PetscInt *ajtmp, *bjtmp, nz, nzL, row, *bdiag = b->diag, *pj; 122 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 123 PetscInt bs = A->rmap->bs, bs2 = a->bs2, *v_pivots, flg; 124 MatScalar *v_work; 125 PetscBool col_identity, row_identity, both_identity; 126 PetscBool allowzeropivot, zeropivotdetected; 127 128 PetscFunctionBegin; 129 PetscCall(ISGetIndices(isrow, &r)); 130 PetscCall(ISGetIndices(isicol, &ic)); 131 allowzeropivot = PetscNot(A->erroriffailure); 132 133 PetscCall(PetscCalloc1(bs2 * n, &rtmp)); 134 135 /* generate work space needed by dense LU factorization */ 136 PetscCall(PetscMalloc3(bs, &v_work, bs2, &mwork, bs, &v_pivots)); 137 138 for (i = 0; i < n; i++) { 139 /* zero rtmp */ 140 /* L part */ 141 nz = bi[i + 1] - bi[i]; 142 bjtmp = bj + bi[i]; 143 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 144 145 /* U part */ 146 nz = bdiag[i] - bdiag[i + 1]; 147 bjtmp = bj + bdiag[i + 1] + 1; 148 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 149 150 /* load in initial (unfactored row) */ 151 nz = ai[r[i] + 1] - ai[r[i]]; 152 ajtmp = aj + ai[r[i]]; 153 v = aa + bs2 * ai[r[i]]; 154 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); } 155 156 /* elimination */ 157 bjtmp = bj + bi[i]; 158 nzL = bi[i + 1] - bi[i]; 159 for (k = 0; k < nzL; k++) { 160 row = bjtmp[k]; 161 pc = rtmp + bs2 * row; 162 for (flg = 0, j = 0; j < bs2; j++) { 163 if (pc[j] != 0.0) { 164 flg = 1; 165 break; 166 } 167 } 168 if (flg) { 169 pv = b->a + bs2 * bdiag[row]; 170 PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); /* *pc = *pc * (*pv); */ 171 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 172 pv = b->a + bs2 * (bdiag[row + 1] + 1); 173 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 174 for (j = 0; j < nz; j++) { PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); } 175 PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 176 } 177 } 178 179 /* finished row so stick it into b->a */ 180 /* L part */ 181 pv = b->a + bs2 * bi[i]; 182 pj = b->j + bi[i]; 183 nz = bi[i + 1] - bi[i]; 184 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 185 186 /* Mark diagonal and invert diagonal for simpler triangular solves */ 187 pv = b->a + bs2 * bdiag[i]; 188 pj = b->j + bdiag[i]; 189 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 190 191 PetscCall(PetscKernel_A_gets_inverse_A(bs, pv, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 192 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 193 194 /* U part */ 195 pv = b->a + bs2 * (bdiag[i + 1] + 1); 196 pj = b->j + bdiag[i + 1] + 1; 197 nz = bdiag[i] - bdiag[i + 1] - 1; 198 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 199 } 200 201 PetscCall(PetscFree(rtmp)); 202 PetscCall(PetscFree3(v_work, mwork, v_pivots)); 203 PetscCall(ISRestoreIndices(isicol, &ic)); 204 PetscCall(ISRestoreIndices(isrow, &r)); 205 206 PetscCall(ISIdentity(isrow, &row_identity)); 207 PetscCall(ISIdentity(isicol, &col_identity)); 208 209 both_identity = (PetscBool)(row_identity && col_identity); 210 if (both_identity) { 211 switch (bs) { 212 case 9: 213 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 214 C->ops->solve = MatSolve_SeqBAIJ_9_NaturalOrdering; 215 #else 216 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 217 #endif 218 break; 219 case 11: C->ops->solve = MatSolve_SeqBAIJ_11_NaturalOrdering; break; 220 case 12: C->ops->solve = MatSolve_SeqBAIJ_12_NaturalOrdering; break; 221 case 13: C->ops->solve = MatSolve_SeqBAIJ_13_NaturalOrdering; break; 222 case 14: C->ops->solve = MatSolve_SeqBAIJ_14_NaturalOrdering; break; 223 default: C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; break; 224 } 225 } else { 226 C->ops->solve = MatSolve_SeqBAIJ_N; 227 } 228 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 229 230 C->assembled = PETSC_TRUE; 231 232 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */ 233 PetscFunctionReturn(0); 234 } 235 236 /* 237 ilu(0) with natural ordering under new data structure. 238 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 239 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 240 */ 241 242 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 243 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 244 PetscInt n = a->mbs, *ai = a->i, *aj, *adiag = a->diag, bs2 = a->bs2; 245 PetscInt i, j, nz, *bi, *bj, *bdiag, bi_temp; 246 247 PetscFunctionBegin; 248 PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_FALSE)); 249 b = (Mat_SeqBAIJ *)(fact)->data; 250 251 /* allocate matrix arrays for new data structure */ 252 PetscCall(PetscMalloc3(bs2 * ai[n] + 1, &b->a, ai[n] + 1, &b->j, n + 1, &b->i)); 253 PetscCall(PetscLogObjectMemory((PetscObject)fact, ai[n] * (bs2 * sizeof(PetscScalar) + sizeof(PetscInt)) + (n + 1) * sizeof(PetscInt))); 254 255 b->singlemalloc = PETSC_TRUE; 256 b->free_a = PETSC_TRUE; 257 b->free_ij = PETSC_TRUE; 258 fact->preallocated = PETSC_TRUE; 259 fact->assembled = PETSC_TRUE; 260 if (!b->diag) { 261 PetscCall(PetscMalloc1(n + 1, &b->diag)); 262 PetscCall(PetscLogObjectMemory((PetscObject)fact, (n + 1) * sizeof(PetscInt))); 263 } 264 bdiag = b->diag; 265 266 if (n > 0) { PetscCall(PetscArrayzero(b->a, bs2 * ai[n])); } 267 268 /* set bi and bj with new data structure */ 269 bi = b->i; 270 bj = b->j; 271 272 /* L part */ 273 bi[0] = 0; 274 for (i = 0; i < n; i++) { 275 nz = adiag[i] - ai[i]; 276 bi[i + 1] = bi[i] + nz; 277 aj = a->j + ai[i]; 278 for (j = 0; j < nz; j++) { 279 *bj = aj[j]; 280 bj++; 281 } 282 } 283 284 /* U part */ 285 bi_temp = bi[n]; 286 bdiag[n] = bi[n] - 1; 287 for (i = n - 1; i >= 0; i--) { 288 nz = ai[i + 1] - adiag[i] - 1; 289 bi_temp = bi_temp + nz + 1; 290 aj = a->j + adiag[i] + 1; 291 for (j = 0; j < nz; j++) { 292 *bj = aj[j]; 293 bj++; 294 } 295 /* diag[i] */ 296 *bj = i; 297 bj++; 298 bdiag[i] = bi_temp - 1; 299 } 300 PetscFunctionReturn(0); 301 } 302 303 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 304 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 305 IS isicol; 306 const PetscInt *r, *ic; 307 PetscInt n = a->mbs, *ai = a->i, *aj = a->j, d; 308 PetscInt *bi, *cols, nnz, *cols_lvl; 309 PetscInt *bdiag, prow, fm, nzbd, reallocs = 0, dcount = 0; 310 PetscInt i, levels, diagonal_fill; 311 PetscBool col_identity, row_identity, both_identity; 312 PetscReal f; 313 PetscInt nlnk, *lnk, *lnk_lvl = NULL; 314 PetscBT lnkbt; 315 PetscInt nzi, *bj, **bj_ptr, **bjlvl_ptr; 316 PetscFreeSpaceList free_space = NULL, current_space = NULL; 317 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 318 PetscBool missing; 319 PetscInt bs = A->rmap->bs, bs2 = a->bs2; 320 321 PetscFunctionBegin; 322 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 323 if (bs > 1) { /* check shifttype */ 324 PetscCheck(info->shifttype != MAT_SHIFT_NONZERO && info->shifttype != MAT_SHIFT_POSITIVE_DEFINITE, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 325 } 326 327 PetscCall(MatMissingDiagonal(A, &missing, &d)); 328 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d); 329 330 f = info->fill; 331 levels = (PetscInt)info->levels; 332 diagonal_fill = (PetscInt)info->diagonal_fill; 333 334 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 335 336 PetscCall(ISIdentity(isrow, &row_identity)); 337 PetscCall(ISIdentity(iscol, &col_identity)); 338 339 both_identity = (PetscBool)(row_identity && col_identity); 340 341 if (!levels && both_identity) { 342 /* special case: ilu(0) with natural ordering */ 343 PetscCall(MatILUFactorSymbolic_SeqBAIJ_ilu0(fact, A, isrow, iscol, info)); 344 PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity)); 345 346 fact->factortype = MAT_FACTOR_ILU; 347 (fact)->info.factor_mallocs = 0; 348 (fact)->info.fill_ratio_given = info->fill; 349 (fact)->info.fill_ratio_needed = 1.0; 350 351 b = (Mat_SeqBAIJ *)(fact)->data; 352 b->row = isrow; 353 b->col = iscol; 354 b->icol = isicol; 355 PetscCall(PetscObjectReference((PetscObject)isrow)); 356 PetscCall(PetscObjectReference((PetscObject)iscol)); 357 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 358 359 PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work)); 360 PetscFunctionReturn(0); 361 } 362 363 PetscCall(ISGetIndices(isrow, &r)); 364 PetscCall(ISGetIndices(isicol, &ic)); 365 366 /* get new row pointers */ 367 PetscCall(PetscMalloc1(n + 1, &bi)); 368 bi[0] = 0; 369 /* bdiag is location of diagonal in factor */ 370 PetscCall(PetscMalloc1(n + 1, &bdiag)); 371 bdiag[0] = 0; 372 373 PetscCall(PetscMalloc2(n, &bj_ptr, n, &bjlvl_ptr)); 374 375 /* create a linked list for storing column indices of the active row */ 376 nlnk = n + 1; 377 PetscCall(PetscIncompleteLLCreate(n, n, nlnk, lnk, lnk_lvl, lnkbt)); 378 379 /* initial FreeSpace size is f*(ai[n]+1) */ 380 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space)); 381 current_space = free_space; 382 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space_lvl)); 383 current_space_lvl = free_space_lvl; 384 385 for (i = 0; i < n; i++) { 386 nzi = 0; 387 /* copy current row into linked list */ 388 nnz = ai[r[i] + 1] - ai[r[i]]; 389 PetscCheck(nnz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 390 cols = aj + ai[r[i]]; 391 lnk[i] = -1; /* marker to indicate if diagonal exists */ 392 PetscCall(PetscIncompleteLLInit(nnz, cols, n, ic, &nlnk, lnk, lnk_lvl, lnkbt)); 393 nzi += nlnk; 394 395 /* make sure diagonal entry is included */ 396 if (diagonal_fill && lnk[i] == -1) { 397 fm = n; 398 while (lnk[fm] < i) fm = lnk[fm]; 399 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 400 lnk[fm] = i; 401 lnk_lvl[i] = 0; 402 nzi++; 403 dcount++; 404 } 405 406 /* add pivot rows into the active row */ 407 nzbd = 0; 408 prow = lnk[n]; 409 while (prow < i) { 410 nnz = bdiag[prow]; 411 cols = bj_ptr[prow] + nnz + 1; 412 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 413 nnz = bi[prow + 1] - bi[prow] - nnz - 1; 414 415 PetscCall(PetscILULLAddSorted(nnz, cols, levels, cols_lvl, prow, &nlnk, lnk, lnk_lvl, lnkbt, prow)); 416 nzi += nlnk; 417 prow = lnk[prow]; 418 nzbd++; 419 } 420 bdiag[i] = nzbd; 421 bi[i + 1] = bi[i] + nzi; 422 423 /* if free space is not available, make more free space */ 424 if (current_space->local_remaining < nzi) { 425 nnz = PetscIntMultTruncate(2, PetscIntMultTruncate(nzi, (n - i))); /* estimated and max additional space needed */ 426 PetscCall(PetscFreeSpaceGet(nnz, ¤t_space)); 427 PetscCall(PetscFreeSpaceGet(nnz, ¤t_space_lvl)); 428 reallocs++; 429 } 430 431 /* copy data into free_space and free_space_lvl, then initialize lnk */ 432 PetscCall(PetscIncompleteLLClean(n, n, nzi, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 433 434 bj_ptr[i] = current_space->array; 435 bjlvl_ptr[i] = current_space_lvl->array; 436 437 /* make sure the active row i has diagonal entry */ 438 PetscCheck(*(bj_ptr[i] + bdiag[i]) == i, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", i); 439 440 current_space->array += nzi; 441 current_space->local_used += nzi; 442 current_space->local_remaining -= nzi; 443 444 current_space_lvl->array += nzi; 445 current_space_lvl->local_used += nzi; 446 current_space_lvl->local_remaining -= nzi; 447 } 448 449 PetscCall(ISRestoreIndices(isrow, &r)); 450 PetscCall(ISRestoreIndices(isicol, &ic)); 451 452 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 453 PetscCall(PetscMalloc1(bi[n] + 1, &bj)); 454 PetscCall(PetscFreeSpaceContiguous_LU(&free_space, bj, n, bi, bdiag)); 455 456 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 457 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 458 PetscCall(PetscFree2(bj_ptr, bjlvl_ptr)); 459 460 #if defined(PETSC_USE_INFO) 461 { 462 PetscReal af = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]); 463 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af)); 464 PetscCall(PetscInfo(A, "Run with -[sub_]pc_factor_fill %g or use \n", (double)af)); 465 PetscCall(PetscInfo(A, "PCFactorSetFill([sub]pc,%g);\n", (double)af)); 466 PetscCall(PetscInfo(A, "for best performance.\n")); 467 if (diagonal_fill) { PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount)); } 468 } 469 #endif 470 471 /* put together the new matrix */ 472 PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 473 PetscCall(PetscLogObjectParent((PetscObject)fact, (PetscObject)isicol)); 474 475 b = (Mat_SeqBAIJ *)(fact)->data; 476 b->free_a = PETSC_TRUE; 477 b->free_ij = PETSC_TRUE; 478 b->singlemalloc = PETSC_FALSE; 479 480 PetscCall(PetscMalloc1(bs2 * (bdiag[0] + 1), &b->a)); 481 482 b->j = bj; 483 b->i = bi; 484 b->diag = bdiag; 485 b->free_diag = PETSC_TRUE; 486 b->ilen = NULL; 487 b->imax = NULL; 488 b->row = isrow; 489 b->col = iscol; 490 PetscCall(PetscObjectReference((PetscObject)isrow)); 491 PetscCall(PetscObjectReference((PetscObject)iscol)); 492 b->icol = isicol; 493 494 PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work)); 495 /* In b structure: Free imax, ilen, old a, old j. 496 Allocate bdiag, solve_work, new a, new j */ 497 PetscCall(PetscLogObjectMemory((PetscObject)fact, (bdiag[0] + 1) * (sizeof(PetscInt) + bs2 * sizeof(PetscScalar)))); 498 b->maxnz = b->nz = bdiag[0] + 1; 499 500 fact->info.factor_mallocs = reallocs; 501 fact->info.fill_ratio_given = f; 502 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]); 503 504 PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity)); 505 PetscFunctionReturn(0); 506 } 507 508 /* 509 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 510 except that the data structure of Mat_SeqAIJ is slightly different. 511 Not a good example of code reuse. 512 */ 513 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 514 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 515 IS isicol; 516 const PetscInt *r, *ic, *ai = a->i, *aj = a->j, *xi; 517 PetscInt prow, n = a->mbs, *ainew, *ajnew, jmax, *fill, nz, *im, *ajfill, *flev, *xitmp; 518 PetscInt *dloc, idx, row, m, fm, nzf, nzi, reallocate = 0, dcount = 0; 519 PetscInt incrlev, nnz, i, bs = A->rmap->bs, bs2 = a->bs2, levels, diagonal_fill, dd; 520 PetscBool col_identity, row_identity, both_identity, flg; 521 PetscReal f; 522 523 PetscFunctionBegin; 524 PetscCall(MatMissingDiagonal_SeqBAIJ(A, &flg, &dd)); 525 PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix A is missing diagonal entry in row %" PetscInt_FMT, dd); 526 527 f = info->fill; 528 levels = (PetscInt)info->levels; 529 diagonal_fill = (PetscInt)info->diagonal_fill; 530 531 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 532 533 PetscCall(ISIdentity(isrow, &row_identity)); 534 PetscCall(ISIdentity(iscol, &col_identity)); 535 both_identity = (PetscBool)(row_identity && col_identity); 536 537 if (!levels && both_identity) { /* special case copy the nonzero structure */ 538 PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE)); 539 PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity)); 540 541 fact->factortype = MAT_FACTOR_ILU; 542 b = (Mat_SeqBAIJ *)fact->data; 543 b->row = isrow; 544 b->col = iscol; 545 PetscCall(PetscObjectReference((PetscObject)isrow)); 546 PetscCall(PetscObjectReference((PetscObject)iscol)); 547 b->icol = isicol; 548 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 549 550 PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work)); 551 PetscFunctionReturn(0); 552 } 553 554 /* general case perform the symbolic factorization */ 555 PetscCall(ISGetIndices(isrow, &r)); 556 PetscCall(ISGetIndices(isicol, &ic)); 557 558 /* get new row pointers */ 559 PetscCall(PetscMalloc1(n + 1, &ainew)); 560 ainew[0] = 0; 561 /* don't know how many column pointers are needed so estimate */ 562 jmax = (PetscInt)(f * ai[n] + 1); 563 PetscCall(PetscMalloc1(jmax, &ajnew)); 564 /* ajfill is level of fill for each fill entry */ 565 PetscCall(PetscMalloc1(jmax, &ajfill)); 566 /* fill is a linked list of nonzeros in active row */ 567 PetscCall(PetscMalloc1(n + 1, &fill)); 568 /* im is level for each filled value */ 569 PetscCall(PetscMalloc1(n + 1, &im)); 570 /* dloc is location of diagonal in factor */ 571 PetscCall(PetscMalloc1(n + 1, &dloc)); 572 dloc[0] = 0; 573 for (prow = 0; prow < n; prow++) { 574 /* copy prow into linked list */ 575 nzf = nz = ai[r[prow] + 1] - ai[r[prow]]; 576 PetscCheck(nz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[prow], prow); 577 xi = aj + ai[r[prow]]; 578 fill[n] = n; 579 fill[prow] = -1; /* marker for diagonal entry */ 580 while (nz--) { 581 fm = n; 582 idx = ic[*xi++]; 583 do { 584 m = fm; 585 fm = fill[m]; 586 } while (fm < idx); 587 fill[m] = idx; 588 fill[idx] = fm; 589 im[idx] = 0; 590 } 591 592 /* make sure diagonal entry is included */ 593 if (diagonal_fill && fill[prow] == -1) { 594 fm = n; 595 while (fill[fm] < prow) fm = fill[fm]; 596 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 597 fill[fm] = prow; 598 im[prow] = 0; 599 nzf++; 600 dcount++; 601 } 602 603 nzi = 0; 604 row = fill[n]; 605 while (row < prow) { 606 incrlev = im[row] + 1; 607 nz = dloc[row]; 608 xi = ajnew + ainew[row] + nz + 1; 609 flev = ajfill + ainew[row] + nz + 1; 610 nnz = ainew[row + 1] - ainew[row] - nz - 1; 611 fm = row; 612 while (nnz-- > 0) { 613 idx = *xi++; 614 if (*flev + incrlev > levels) { 615 flev++; 616 continue; 617 } 618 do { 619 m = fm; 620 fm = fill[m]; 621 } while (fm < idx); 622 if (fm != idx) { 623 im[idx] = *flev + incrlev; 624 fill[m] = idx; 625 fill[idx] = fm; 626 fm = idx; 627 nzf++; 628 } else if (im[idx] > *flev + incrlev) im[idx] = *flev + incrlev; 629 flev++; 630 } 631 row = fill[row]; 632 nzi++; 633 } 634 /* copy new filled row into permanent storage */ 635 ainew[prow + 1] = ainew[prow] + nzf; 636 if (ainew[prow + 1] > jmax) { 637 /* estimate how much additional space we will need */ 638 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 639 /* just double the memory each time */ 640 PetscInt maxadd = jmax; 641 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 642 if (maxadd < nzf) maxadd = (n - prow) * (nzf + 1); 643 jmax += maxadd; 644 645 /* allocate a longer ajnew and ajfill */ 646 PetscCall(PetscMalloc1(jmax, &xitmp)); 647 PetscCall(PetscArraycpy(xitmp, ajnew, ainew[prow])); 648 PetscCall(PetscFree(ajnew)); 649 ajnew = xitmp; 650 PetscCall(PetscMalloc1(jmax, &xitmp)); 651 PetscCall(PetscArraycpy(xitmp, ajfill, ainew[prow])); 652 PetscCall(PetscFree(ajfill)); 653 ajfill = xitmp; 654 reallocate++; /* count how many reallocations are needed */ 655 } 656 xitmp = ajnew + ainew[prow]; 657 flev = ajfill + ainew[prow]; 658 dloc[prow] = nzi; 659 fm = fill[n]; 660 while (nzf--) { 661 *xitmp++ = fm; 662 *flev++ = im[fm]; 663 fm = fill[fm]; 664 } 665 /* make sure row has diagonal entry */ 666 PetscCheck(ajnew[ainew[prow] + dloc[prow]] == prow, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\n\ 667 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", 668 prow); 669 } 670 PetscCall(PetscFree(ajfill)); 671 PetscCall(ISRestoreIndices(isrow, &r)); 672 PetscCall(ISRestoreIndices(isicol, &ic)); 673 PetscCall(PetscFree(fill)); 674 PetscCall(PetscFree(im)); 675 676 #if defined(PETSC_USE_INFO) 677 { 678 PetscReal af = ((PetscReal)ainew[n]) / ((PetscReal)ai[n]); 679 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocate, (double)f, (double)af)); 680 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 681 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af)); 682 PetscCall(PetscInfo(A, "for best performance.\n")); 683 if (diagonal_fill) { PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount)); } 684 } 685 #endif 686 687 /* put together the new matrix */ 688 PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 689 PetscCall(PetscLogObjectParent((PetscObject)fact, (PetscObject)isicol)); 690 b = (Mat_SeqBAIJ *)fact->data; 691 692 b->free_a = PETSC_TRUE; 693 b->free_ij = PETSC_TRUE; 694 b->singlemalloc = PETSC_FALSE; 695 696 PetscCall(PetscMalloc1(bs2 * ainew[n], &b->a)); 697 698 b->j = ajnew; 699 b->i = ainew; 700 for (i = 0; i < n; i++) dloc[i] += ainew[i]; 701 b->diag = dloc; 702 b->free_diag = PETSC_TRUE; 703 b->ilen = NULL; 704 b->imax = NULL; 705 b->row = isrow; 706 b->col = iscol; 707 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 708 709 PetscCall(PetscObjectReference((PetscObject)isrow)); 710 PetscCall(PetscObjectReference((PetscObject)iscol)); 711 b->icol = isicol; 712 PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work)); 713 /* In b structure: Free imax, ilen, old a, old j. 714 Allocate dloc, solve_work, new a, new j */ 715 PetscCall(PetscLogObjectMemory((PetscObject)fact, (ainew[n] - n) * (sizeof(PetscInt)) + bs2 * ainew[n] * sizeof(PetscScalar))); 716 b->maxnz = b->nz = ainew[n]; 717 718 fact->info.factor_mallocs = reallocate; 719 fact->info.fill_ratio_given = f; 720 fact->info.fill_ratio_needed = ((PetscReal)ainew[n]) / ((PetscReal)ai[prow]); 721 722 PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity)); 723 PetscFunctionReturn(0); 724 } 725 726 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) { 727 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */ 728 /* int i,*AJ=a->j,nz=a->nz; */ 729 730 PetscFunctionBegin; 731 /* Undo Column scaling */ 732 /* while (nz--) { */ 733 /* AJ[i] = AJ[i]/4; */ 734 /* } */ 735 /* This should really invoke a push/pop logic, but we don't have that yet. */ 736 A->ops->setunfactored = NULL; 737 PetscFunctionReturn(0); 738 } 739 740 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) { 741 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 742 PetscInt *AJ = a->j, nz = a->nz; 743 unsigned short *aj = (unsigned short *)AJ; 744 745 PetscFunctionBegin; 746 /* Is this really necessary? */ 747 while (nz--) { AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ } 748 A->ops->setunfactored = NULL; 749 PetscFunctionReturn(0); 750 } 751