1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include <../src/mat/impls/baij/seq/baij.h> 7 #include <petsc/private/kernels/blockinvert.h> 8 #include <petscbt.h> 9 #include <../src/mat/utils/freespace.h> 10 11 /* ----------------------------------------------------------------*/ 12 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat, Mat, MatDuplicateOption, PetscBool); 13 14 /* 15 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 16 */ 17 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) { 18 Mat C = B; 19 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 20 PetscInt i, j, k, ipvt[15]; 21 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *ajtmp, *bjtmp, *bdiag = b->diag, *pj; 22 PetscInt nz, nzL, row; 23 MatScalar *rtmp, *pc, *mwork, *pv, *vv, work[225]; 24 const MatScalar *v, *aa = a->a; 25 PetscInt bs2 = a->bs2, bs = A->rmap->bs, flg; 26 PetscInt sol_ver; 27 PetscBool allowzeropivot, zeropivotdetected; 28 29 PetscFunctionBegin; 30 allowzeropivot = PetscNot(A->erroriffailure); 31 PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)A)->prefix, "-sol_ver", &sol_ver, NULL)); 32 33 /* generate work space needed by the factorization */ 34 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 35 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 36 37 for (i = 0; i < n; i++) { 38 /* zero rtmp */ 39 /* L part */ 40 nz = bi[i + 1] - bi[i]; 41 bjtmp = bj + bi[i]; 42 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 43 44 /* U part */ 45 nz = bdiag[i] - bdiag[i + 1]; 46 bjtmp = bj + bdiag[i + 1] + 1; 47 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 48 49 /* load in initial (unfactored row) */ 50 nz = ai[i + 1] - ai[i]; 51 ajtmp = aj + ai[i]; 52 v = aa + bs2 * ai[i]; 53 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 54 55 /* elimination */ 56 bjtmp = bj + bi[i]; 57 nzL = bi[i + 1] - bi[i]; 58 for (k = 0; k < nzL; k++) { 59 row = bjtmp[k]; 60 pc = rtmp + bs2 * row; 61 for (flg = 0, j = 0; j < bs2; j++) { 62 if (pc[j] != 0.0) { 63 flg = 1; 64 break; 65 } 66 } 67 if (flg) { 68 pv = b->a + bs2 * bdiag[row]; 69 PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); 70 /* PetscCall(PetscKernel_A_gets_A_times_B_15(pc,pv,mwork)); */ 71 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 72 pv = b->a + bs2 * (bdiag[row + 1] + 1); 73 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 74 for (j = 0; j < nz; j++) { 75 vv = rtmp + bs2 * pj[j]; 76 PetscKernel_A_gets_A_minus_B_times_C(bs, vv, pc, pv); 77 /* PetscCall(PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv)); */ 78 pv += bs2; 79 } 80 PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 81 } 82 } 83 84 /* finished row so stick it into b->a */ 85 /* L part */ 86 pv = b->a + bs2 * bi[i]; 87 pj = b->j + bi[i]; 88 nz = bi[i + 1] - bi[i]; 89 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 90 91 /* Mark diagonal and invert diagonal for simpler triangular solves */ 92 pv = b->a + bs2 * bdiag[i]; 93 pj = b->j + bdiag[i]; 94 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 95 PetscCall(PetscKernel_A_gets_inverse_A_15(pv, ipvt, work, info->shiftamount, allowzeropivot, &zeropivotdetected)); 96 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 97 98 /* U part */ 99 pv = b->a + bs2 * (bdiag[i + 1] + 1); 100 pj = b->j + bdiag[i + 1] + 1; 101 nz = bdiag[i] - bdiag[i + 1] - 1; 102 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 103 } 104 105 PetscCall(PetscFree2(rtmp, mwork)); 106 107 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 108 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 109 C->assembled = PETSC_TRUE; 110 111 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */ 112 PetscFunctionReturn(0); 113 } 114 115 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B, Mat A, const MatFactorInfo *info) { 116 Mat C = B; 117 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 118 IS isrow = b->row, isicol = b->icol; 119 const PetscInt *r, *ic; 120 PetscInt i, j, k, n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 121 PetscInt *ajtmp, *bjtmp, nz, nzL, row, *bdiag = b->diag, *pj; 122 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 123 PetscInt bs = A->rmap->bs, bs2 = a->bs2, *v_pivots, flg; 124 MatScalar *v_work; 125 PetscBool col_identity, row_identity, both_identity; 126 PetscBool allowzeropivot, zeropivotdetected; 127 128 PetscFunctionBegin; 129 PetscCall(ISGetIndices(isrow, &r)); 130 PetscCall(ISGetIndices(isicol, &ic)); 131 allowzeropivot = PetscNot(A->erroriffailure); 132 133 PetscCall(PetscCalloc1(bs2 * n, &rtmp)); 134 135 /* generate work space needed by dense LU factorization */ 136 PetscCall(PetscMalloc3(bs, &v_work, bs2, &mwork, bs, &v_pivots)); 137 138 for (i = 0; i < n; i++) { 139 /* zero rtmp */ 140 /* L part */ 141 nz = bi[i + 1] - bi[i]; 142 bjtmp = bj + bi[i]; 143 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 144 145 /* U part */ 146 nz = bdiag[i] - bdiag[i + 1]; 147 bjtmp = bj + bdiag[i + 1] + 1; 148 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 149 150 /* load in initial (unfactored row) */ 151 nz = ai[r[i] + 1] - ai[r[i]]; 152 ajtmp = aj + ai[r[i]]; 153 v = aa + bs2 * ai[r[i]]; 154 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 155 156 /* elimination */ 157 bjtmp = bj + bi[i]; 158 nzL = bi[i + 1] - bi[i]; 159 for (k = 0; k < nzL; k++) { 160 row = bjtmp[k]; 161 pc = rtmp + bs2 * row; 162 for (flg = 0, j = 0; j < bs2; j++) { 163 if (pc[j] != 0.0) { 164 flg = 1; 165 break; 166 } 167 } 168 if (flg) { 169 pv = b->a + bs2 * bdiag[row]; 170 PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); /* *pc = *pc * (*pv); */ 171 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 172 pv = b->a + bs2 * (bdiag[row + 1] + 1); 173 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 174 for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); 175 PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 176 } 177 } 178 179 /* finished row so stick it into b->a */ 180 /* L part */ 181 pv = b->a + bs2 * bi[i]; 182 pj = b->j + bi[i]; 183 nz = bi[i + 1] - bi[i]; 184 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 185 186 /* Mark diagonal and invert diagonal for simpler triangular solves */ 187 pv = b->a + bs2 * bdiag[i]; 188 pj = b->j + bdiag[i]; 189 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 190 191 PetscCall(PetscKernel_A_gets_inverse_A(bs, pv, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 192 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 193 194 /* U part */ 195 pv = b->a + bs2 * (bdiag[i + 1] + 1); 196 pj = b->j + bdiag[i + 1] + 1; 197 nz = bdiag[i] - bdiag[i + 1] - 1; 198 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 199 } 200 201 PetscCall(PetscFree(rtmp)); 202 PetscCall(PetscFree3(v_work, mwork, v_pivots)); 203 PetscCall(ISRestoreIndices(isicol, &ic)); 204 PetscCall(ISRestoreIndices(isrow, &r)); 205 206 PetscCall(ISIdentity(isrow, &row_identity)); 207 PetscCall(ISIdentity(isicol, &col_identity)); 208 209 both_identity = (PetscBool)(row_identity && col_identity); 210 if (both_identity) { 211 switch (bs) { 212 case 9: 213 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 214 C->ops->solve = MatSolve_SeqBAIJ_9_NaturalOrdering; 215 #else 216 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 217 #endif 218 break; 219 case 11: C->ops->solve = MatSolve_SeqBAIJ_11_NaturalOrdering; break; 220 case 12: C->ops->solve = MatSolve_SeqBAIJ_12_NaturalOrdering; break; 221 case 13: C->ops->solve = MatSolve_SeqBAIJ_13_NaturalOrdering; break; 222 case 14: C->ops->solve = MatSolve_SeqBAIJ_14_NaturalOrdering; break; 223 default: C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; break; 224 } 225 } else { 226 C->ops->solve = MatSolve_SeqBAIJ_N; 227 } 228 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 229 230 C->assembled = PETSC_TRUE; 231 232 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */ 233 PetscFunctionReturn(0); 234 } 235 236 /* 237 ilu(0) with natural ordering under new data structure. 238 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 239 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 240 */ 241 242 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 243 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 244 PetscInt n = a->mbs, *ai = a->i, *aj, *adiag = a->diag, bs2 = a->bs2; 245 PetscInt i, j, nz, *bi, *bj, *bdiag, bi_temp; 246 247 PetscFunctionBegin; 248 PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_FALSE)); 249 b = (Mat_SeqBAIJ *)(fact)->data; 250 251 /* allocate matrix arrays for new data structure */ 252 PetscCall(PetscMalloc3(bs2 * ai[n] + 1, &b->a, ai[n] + 1, &b->j, n + 1, &b->i)); 253 254 b->singlemalloc = PETSC_TRUE; 255 b->free_a = PETSC_TRUE; 256 b->free_ij = PETSC_TRUE; 257 fact->preallocated = PETSC_TRUE; 258 fact->assembled = PETSC_TRUE; 259 if (!b->diag) { PetscCall(PetscMalloc1(n + 1, &b->diag)); } 260 bdiag = b->diag; 261 262 if (n > 0) PetscCall(PetscArrayzero(b->a, bs2 * ai[n])); 263 264 /* set bi and bj with new data structure */ 265 bi = b->i; 266 bj = b->j; 267 268 /* L part */ 269 bi[0] = 0; 270 for (i = 0; i < n; i++) { 271 nz = adiag[i] - ai[i]; 272 bi[i + 1] = bi[i] + nz; 273 aj = a->j + ai[i]; 274 for (j = 0; j < nz; j++) { 275 *bj = aj[j]; 276 bj++; 277 } 278 } 279 280 /* U part */ 281 bi_temp = bi[n]; 282 bdiag[n] = bi[n] - 1; 283 for (i = n - 1; i >= 0; i--) { 284 nz = ai[i + 1] - adiag[i] - 1; 285 bi_temp = bi_temp + nz + 1; 286 aj = a->j + adiag[i] + 1; 287 for (j = 0; j < nz; j++) { 288 *bj = aj[j]; 289 bj++; 290 } 291 /* diag[i] */ 292 *bj = i; 293 bj++; 294 bdiag[i] = bi_temp - 1; 295 } 296 PetscFunctionReturn(0); 297 } 298 299 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 300 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 301 IS isicol; 302 const PetscInt *r, *ic; 303 PetscInt n = a->mbs, *ai = a->i, *aj = a->j, d; 304 PetscInt *bi, *cols, nnz, *cols_lvl; 305 PetscInt *bdiag, prow, fm, nzbd, reallocs = 0, dcount = 0; 306 PetscInt i, levels, diagonal_fill; 307 PetscBool col_identity, row_identity, both_identity; 308 PetscReal f; 309 PetscInt nlnk, *lnk, *lnk_lvl = NULL; 310 PetscBT lnkbt; 311 PetscInt nzi, *bj, **bj_ptr, **bjlvl_ptr; 312 PetscFreeSpaceList free_space = NULL, current_space = NULL; 313 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 314 PetscBool missing; 315 PetscInt bs = A->rmap->bs, bs2 = a->bs2; 316 317 PetscFunctionBegin; 318 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 319 if (bs > 1) { /* check shifttype */ 320 PetscCheck(info->shifttype != MAT_SHIFT_NONZERO && info->shifttype != MAT_SHIFT_POSITIVE_DEFINITE, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 321 } 322 323 PetscCall(MatMissingDiagonal(A, &missing, &d)); 324 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d); 325 326 f = info->fill; 327 levels = (PetscInt)info->levels; 328 diagonal_fill = (PetscInt)info->diagonal_fill; 329 330 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 331 332 PetscCall(ISIdentity(isrow, &row_identity)); 333 PetscCall(ISIdentity(iscol, &col_identity)); 334 335 both_identity = (PetscBool)(row_identity && col_identity); 336 337 if (!levels && both_identity) { 338 /* special case: ilu(0) with natural ordering */ 339 PetscCall(MatILUFactorSymbolic_SeqBAIJ_ilu0(fact, A, isrow, iscol, info)); 340 PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity)); 341 342 fact->factortype = MAT_FACTOR_ILU; 343 (fact)->info.factor_mallocs = 0; 344 (fact)->info.fill_ratio_given = info->fill; 345 (fact)->info.fill_ratio_needed = 1.0; 346 347 b = (Mat_SeqBAIJ *)(fact)->data; 348 b->row = isrow; 349 b->col = iscol; 350 b->icol = isicol; 351 PetscCall(PetscObjectReference((PetscObject)isrow)); 352 PetscCall(PetscObjectReference((PetscObject)iscol)); 353 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 354 355 PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work)); 356 PetscFunctionReturn(0); 357 } 358 359 PetscCall(ISGetIndices(isrow, &r)); 360 PetscCall(ISGetIndices(isicol, &ic)); 361 362 /* get new row pointers */ 363 PetscCall(PetscMalloc1(n + 1, &bi)); 364 bi[0] = 0; 365 /* bdiag is location of diagonal in factor */ 366 PetscCall(PetscMalloc1(n + 1, &bdiag)); 367 bdiag[0] = 0; 368 369 PetscCall(PetscMalloc2(n, &bj_ptr, n, &bjlvl_ptr)); 370 371 /* create a linked list for storing column indices of the active row */ 372 nlnk = n + 1; 373 PetscCall(PetscIncompleteLLCreate(n, n, nlnk, lnk, lnk_lvl, lnkbt)); 374 375 /* initial FreeSpace size is f*(ai[n]+1) */ 376 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space)); 377 current_space = free_space; 378 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space_lvl)); 379 current_space_lvl = free_space_lvl; 380 381 for (i = 0; i < n; i++) { 382 nzi = 0; 383 /* copy current row into linked list */ 384 nnz = ai[r[i] + 1] - ai[r[i]]; 385 PetscCheck(nnz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 386 cols = aj + ai[r[i]]; 387 lnk[i] = -1; /* marker to indicate if diagonal exists */ 388 PetscCall(PetscIncompleteLLInit(nnz, cols, n, ic, &nlnk, lnk, lnk_lvl, lnkbt)); 389 nzi += nlnk; 390 391 /* make sure diagonal entry is included */ 392 if (diagonal_fill && lnk[i] == -1) { 393 fm = n; 394 while (lnk[fm] < i) fm = lnk[fm]; 395 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 396 lnk[fm] = i; 397 lnk_lvl[i] = 0; 398 nzi++; 399 dcount++; 400 } 401 402 /* add pivot rows into the active row */ 403 nzbd = 0; 404 prow = lnk[n]; 405 while (prow < i) { 406 nnz = bdiag[prow]; 407 cols = bj_ptr[prow] + nnz + 1; 408 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 409 nnz = bi[prow + 1] - bi[prow] - nnz - 1; 410 411 PetscCall(PetscILULLAddSorted(nnz, cols, levels, cols_lvl, prow, &nlnk, lnk, lnk_lvl, lnkbt, prow)); 412 nzi += nlnk; 413 prow = lnk[prow]; 414 nzbd++; 415 } 416 bdiag[i] = nzbd; 417 bi[i + 1] = bi[i] + nzi; 418 419 /* if free space is not available, make more free space */ 420 if (current_space->local_remaining < nzi) { 421 nnz = PetscIntMultTruncate(2, PetscIntMultTruncate(nzi, (n - i))); /* estimated and max additional space needed */ 422 PetscCall(PetscFreeSpaceGet(nnz, ¤t_space)); 423 PetscCall(PetscFreeSpaceGet(nnz, ¤t_space_lvl)); 424 reallocs++; 425 } 426 427 /* copy data into free_space and free_space_lvl, then initialize lnk */ 428 PetscCall(PetscIncompleteLLClean(n, n, nzi, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 429 430 bj_ptr[i] = current_space->array; 431 bjlvl_ptr[i] = current_space_lvl->array; 432 433 /* make sure the active row i has diagonal entry */ 434 PetscCheck(*(bj_ptr[i] + bdiag[i]) == i, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", i); 435 436 current_space->array += nzi; 437 current_space->local_used += nzi; 438 current_space->local_remaining -= nzi; 439 440 current_space_lvl->array += nzi; 441 current_space_lvl->local_used += nzi; 442 current_space_lvl->local_remaining -= nzi; 443 } 444 445 PetscCall(ISRestoreIndices(isrow, &r)); 446 PetscCall(ISRestoreIndices(isicol, &ic)); 447 448 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 449 PetscCall(PetscMalloc1(bi[n] + 1, &bj)); 450 PetscCall(PetscFreeSpaceContiguous_LU(&free_space, bj, n, bi, bdiag)); 451 452 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 453 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 454 PetscCall(PetscFree2(bj_ptr, bjlvl_ptr)); 455 456 #if defined(PETSC_USE_INFO) 457 { 458 PetscReal af = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]); 459 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af)); 460 PetscCall(PetscInfo(A, "Run with -[sub_]pc_factor_fill %g or use \n", (double)af)); 461 PetscCall(PetscInfo(A, "PCFactorSetFill([sub]pc,%g);\n", (double)af)); 462 PetscCall(PetscInfo(A, "for best performance.\n")); 463 if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount)); 464 } 465 #endif 466 467 /* put together the new matrix */ 468 PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 469 470 b = (Mat_SeqBAIJ *)(fact)->data; 471 b->free_a = PETSC_TRUE; 472 b->free_ij = PETSC_TRUE; 473 b->singlemalloc = PETSC_FALSE; 474 475 PetscCall(PetscMalloc1(bs2 * (bdiag[0] + 1), &b->a)); 476 477 b->j = bj; 478 b->i = bi; 479 b->diag = bdiag; 480 b->free_diag = PETSC_TRUE; 481 b->ilen = NULL; 482 b->imax = NULL; 483 b->row = isrow; 484 b->col = iscol; 485 PetscCall(PetscObjectReference((PetscObject)isrow)); 486 PetscCall(PetscObjectReference((PetscObject)iscol)); 487 b->icol = isicol; 488 489 PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work)); 490 /* In b structure: Free imax, ilen, old a, old j. 491 Allocate bdiag, solve_work, new a, new j */ 492 b->maxnz = b->nz = bdiag[0] + 1; 493 494 fact->info.factor_mallocs = reallocs; 495 fact->info.fill_ratio_given = f; 496 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]); 497 498 PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity)); 499 PetscFunctionReturn(0); 500 } 501 502 /* 503 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 504 except that the data structure of Mat_SeqAIJ is slightly different. 505 Not a good example of code reuse. 506 */ 507 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 508 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 509 IS isicol; 510 const PetscInt *r, *ic, *ai = a->i, *aj = a->j, *xi; 511 PetscInt prow, n = a->mbs, *ainew, *ajnew, jmax, *fill, nz, *im, *ajfill, *flev, *xitmp; 512 PetscInt *dloc, idx, row, m, fm, nzf, nzi, reallocate = 0, dcount = 0; 513 PetscInt incrlev, nnz, i, bs = A->rmap->bs, bs2 = a->bs2, levels, diagonal_fill, dd; 514 PetscBool col_identity, row_identity, both_identity, flg; 515 PetscReal f; 516 517 PetscFunctionBegin; 518 PetscCall(MatMissingDiagonal_SeqBAIJ(A, &flg, &dd)); 519 PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix A is missing diagonal entry in row %" PetscInt_FMT, dd); 520 521 f = info->fill; 522 levels = (PetscInt)info->levels; 523 diagonal_fill = (PetscInt)info->diagonal_fill; 524 525 PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 526 527 PetscCall(ISIdentity(isrow, &row_identity)); 528 PetscCall(ISIdentity(iscol, &col_identity)); 529 both_identity = (PetscBool)(row_identity && col_identity); 530 531 if (!levels && both_identity) { /* special case copy the nonzero structure */ 532 PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE)); 533 PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity)); 534 535 fact->factortype = MAT_FACTOR_ILU; 536 b = (Mat_SeqBAIJ *)fact->data; 537 b->row = isrow; 538 b->col = iscol; 539 PetscCall(PetscObjectReference((PetscObject)isrow)); 540 PetscCall(PetscObjectReference((PetscObject)iscol)); 541 b->icol = isicol; 542 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 543 544 PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work)); 545 PetscFunctionReturn(0); 546 } 547 548 /* general case perform the symbolic factorization */ 549 PetscCall(ISGetIndices(isrow, &r)); 550 PetscCall(ISGetIndices(isicol, &ic)); 551 552 /* get new row pointers */ 553 PetscCall(PetscMalloc1(n + 1, &ainew)); 554 ainew[0] = 0; 555 /* don't know how many column pointers are needed so estimate */ 556 jmax = (PetscInt)(f * ai[n] + 1); 557 PetscCall(PetscMalloc1(jmax, &ajnew)); 558 /* ajfill is level of fill for each fill entry */ 559 PetscCall(PetscMalloc1(jmax, &ajfill)); 560 /* fill is a linked list of nonzeros in active row */ 561 PetscCall(PetscMalloc1(n + 1, &fill)); 562 /* im is level for each filled value */ 563 PetscCall(PetscMalloc1(n + 1, &im)); 564 /* dloc is location of diagonal in factor */ 565 PetscCall(PetscMalloc1(n + 1, &dloc)); 566 dloc[0] = 0; 567 for (prow = 0; prow < n; prow++) { 568 /* copy prow into linked list */ 569 nzf = nz = ai[r[prow] + 1] - ai[r[prow]]; 570 PetscCheck(nz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[prow], prow); 571 xi = aj + ai[r[prow]]; 572 fill[n] = n; 573 fill[prow] = -1; /* marker for diagonal entry */ 574 while (nz--) { 575 fm = n; 576 idx = ic[*xi++]; 577 do { 578 m = fm; 579 fm = fill[m]; 580 } while (fm < idx); 581 fill[m] = idx; 582 fill[idx] = fm; 583 im[idx] = 0; 584 } 585 586 /* make sure diagonal entry is included */ 587 if (diagonal_fill && fill[prow] == -1) { 588 fm = n; 589 while (fill[fm] < prow) fm = fill[fm]; 590 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 591 fill[fm] = prow; 592 im[prow] = 0; 593 nzf++; 594 dcount++; 595 } 596 597 nzi = 0; 598 row = fill[n]; 599 while (row < prow) { 600 incrlev = im[row] + 1; 601 nz = dloc[row]; 602 xi = ajnew + ainew[row] + nz + 1; 603 flev = ajfill + ainew[row] + nz + 1; 604 nnz = ainew[row + 1] - ainew[row] - nz - 1; 605 fm = row; 606 while (nnz-- > 0) { 607 idx = *xi++; 608 if (*flev + incrlev > levels) { 609 flev++; 610 continue; 611 } 612 do { 613 m = fm; 614 fm = fill[m]; 615 } while (fm < idx); 616 if (fm != idx) { 617 im[idx] = *flev + incrlev; 618 fill[m] = idx; 619 fill[idx] = fm; 620 fm = idx; 621 nzf++; 622 } else if (im[idx] > *flev + incrlev) im[idx] = *flev + incrlev; 623 flev++; 624 } 625 row = fill[row]; 626 nzi++; 627 } 628 /* copy new filled row into permanent storage */ 629 ainew[prow + 1] = ainew[prow] + nzf; 630 if (ainew[prow + 1] > jmax) { 631 /* estimate how much additional space we will need */ 632 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 633 /* just double the memory each time */ 634 PetscInt maxadd = jmax; 635 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 636 if (maxadd < nzf) maxadd = (n - prow) * (nzf + 1); 637 jmax += maxadd; 638 639 /* allocate a longer ajnew and ajfill */ 640 PetscCall(PetscMalloc1(jmax, &xitmp)); 641 PetscCall(PetscArraycpy(xitmp, ajnew, ainew[prow])); 642 PetscCall(PetscFree(ajnew)); 643 ajnew = xitmp; 644 PetscCall(PetscMalloc1(jmax, &xitmp)); 645 PetscCall(PetscArraycpy(xitmp, ajfill, ainew[prow])); 646 PetscCall(PetscFree(ajfill)); 647 ajfill = xitmp; 648 reallocate++; /* count how many reallocations are needed */ 649 } 650 xitmp = ajnew + ainew[prow]; 651 flev = ajfill + ainew[prow]; 652 dloc[prow] = nzi; 653 fm = fill[n]; 654 while (nzf--) { 655 *xitmp++ = fm; 656 *flev++ = im[fm]; 657 fm = fill[fm]; 658 } 659 /* make sure row has diagonal entry */ 660 PetscCheck(ajnew[ainew[prow] + dloc[prow]] == prow, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\n\ 661 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", 662 prow); 663 } 664 PetscCall(PetscFree(ajfill)); 665 PetscCall(ISRestoreIndices(isrow, &r)); 666 PetscCall(ISRestoreIndices(isicol, &ic)); 667 PetscCall(PetscFree(fill)); 668 PetscCall(PetscFree(im)); 669 670 #if defined(PETSC_USE_INFO) 671 { 672 PetscReal af = ((PetscReal)ainew[n]) / ((PetscReal)ai[n]); 673 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocate, (double)f, (double)af)); 674 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 675 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af)); 676 PetscCall(PetscInfo(A, "for best performance.\n")); 677 if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount)); 678 } 679 #endif 680 681 /* put together the new matrix */ 682 PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 683 b = (Mat_SeqBAIJ *)fact->data; 684 685 b->free_a = PETSC_TRUE; 686 b->free_ij = PETSC_TRUE; 687 b->singlemalloc = PETSC_FALSE; 688 689 PetscCall(PetscMalloc1(bs2 * ainew[n], &b->a)); 690 691 b->j = ajnew; 692 b->i = ainew; 693 for (i = 0; i < n; i++) dloc[i] += ainew[i]; 694 b->diag = dloc; 695 b->free_diag = PETSC_TRUE; 696 b->ilen = NULL; 697 b->imax = NULL; 698 b->row = isrow; 699 b->col = iscol; 700 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 701 702 PetscCall(PetscObjectReference((PetscObject)isrow)); 703 PetscCall(PetscObjectReference((PetscObject)iscol)); 704 b->icol = isicol; 705 PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work)); 706 /* In b structure: Free imax, ilen, old a, old j. 707 Allocate dloc, solve_work, new a, new j */ 708 b->maxnz = b->nz = ainew[n]; 709 710 fact->info.factor_mallocs = reallocate; 711 fact->info.fill_ratio_given = f; 712 fact->info.fill_ratio_needed = ((PetscReal)ainew[n]) / ((PetscReal)ai[prow]); 713 714 PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity)); 715 PetscFunctionReturn(0); 716 } 717 718 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) { 719 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */ 720 /* int i,*AJ=a->j,nz=a->nz; */ 721 722 PetscFunctionBegin; 723 /* Undo Column scaling */ 724 /* while (nz--) { */ 725 /* AJ[i] = AJ[i]/4; */ 726 /* } */ 727 /* This should really invoke a push/pop logic, but we don't have that yet. */ 728 A->ops->setunfactored = NULL; 729 PetscFunctionReturn(0); 730 } 731 732 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) { 733 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 734 PetscInt *AJ = a->j, nz = a->nz; 735 unsigned short *aj = (unsigned short *)AJ; 736 737 PetscFunctionBegin; 738 /* Is this really necessary? */ 739 while (nz--) { AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ } 740 A->ops->setunfactored = NULL; 741 PetscFunctionReturn(0); 742 } 743