xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 8d47944aebcd8a3a8a4a1aebd4b3f440b4d70fab)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 
6 #include <../src/mat/impls/baij/seq/baij.h>
7 #include <petsc/private/kernels/blockinvert.h>
8 #include <petscbt.h>
9 #include <../src/mat/utils/freespace.h>
10 
11 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat, Mat, MatDuplicateOption, PetscBool);
12 
13 /*
14    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
15 */
16 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
17 {
18   Mat              C = B;
19   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
20   PetscInt         i, j, k, ipvt[15];
21   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *ajtmp, *bjtmp, *bdiag = b->diag, *pj;
22   PetscInt         nz, nzL, row;
23   MatScalar       *rtmp, *pc, *mwork, *pv, *vv, work[225];
24   const MatScalar *v, *aa = a->a;
25   PetscInt         bs2 = a->bs2, bs = A->rmap->bs, flg;
26   PetscInt         sol_ver;
27   PetscBool        allowzeropivot, zeropivotdetected;
28 
29   PetscFunctionBegin;
30   allowzeropivot = PetscNot(A->erroriffailure);
31   PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)A)->prefix, "-sol_ver", &sol_ver, NULL));
32 
33   /* generate work space needed by the factorization */
34   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
35   PetscCall(PetscArrayzero(rtmp, bs2 * n));
36 
37   for (i = 0; i < n; i++) {
38     /* zero rtmp */
39     /* L part */
40     nz    = bi[i + 1] - bi[i];
41     bjtmp = bj + bi[i];
42     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
43 
44     /* U part */
45     nz    = bdiag[i] - bdiag[i + 1];
46     bjtmp = bj + bdiag[i + 1] + 1;
47     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
48 
49     /* load in initial (unfactored row) */
50     nz    = ai[i + 1] - ai[i];
51     ajtmp = aj + ai[i];
52     v     = aa + bs2 * ai[i];
53     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
54 
55     /* elimination */
56     bjtmp = bj + bi[i];
57     nzL   = bi[i + 1] - bi[i];
58     for (k = 0; k < nzL; k++) {
59       row = bjtmp[k];
60       pc  = rtmp + bs2 * row;
61       for (flg = 0, j = 0; j < bs2; j++) {
62         if (pc[j] != 0.0) {
63           flg = 1;
64           break;
65         }
66       }
67       if (flg) {
68         pv = b->a + bs2 * bdiag[row];
69         PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork);
70         /* PetscCall(PetscKernel_A_gets_A_times_B_15(pc,pv,mwork)); */
71         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
72         pv = b->a + bs2 * (bdiag[row + 1] + 1);
73         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
74         for (j = 0; j < nz; j++) {
75           vv = rtmp + bs2 * pj[j];
76           PetscKernel_A_gets_A_minus_B_times_C(bs, vv, pc, pv);
77           /* PetscCall(PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv)); */
78           pv += bs2;
79         }
80         PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
81       }
82     }
83 
84     /* finished row so stick it into b->a */
85     /* L part */
86     pv = b->a + bs2 * bi[i];
87     pj = b->j + bi[i];
88     nz = bi[i + 1] - bi[i];
89     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
90 
91     /* Mark diagonal and invert diagonal for simpler triangular solves */
92     pv = b->a + bs2 * bdiag[i];
93     pj = b->j + bdiag[i];
94     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
95     PetscCall(PetscKernel_A_gets_inverse_A_15(pv, ipvt, work, info->shiftamount, allowzeropivot, &zeropivotdetected));
96     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
97 
98     /* U part */
99     pv = b->a + bs2 * (bdiag[i + 1] + 1);
100     pj = b->j + bdiag[i + 1] + 1;
101     nz = bdiag[i] - bdiag[i + 1] - 1;
102     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103   }
104 
105   PetscCall(PetscFree2(rtmp, mwork));
106 
107   C->ops->solve          = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
108   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
109   C->assembled           = PETSC_TRUE;
110 
111   PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
112   PetscFunctionReturn(PETSC_SUCCESS);
113 }
114 
115 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B, Mat A, const MatFactorInfo *info)
116 {
117   Mat             C = B;
118   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
119   IS              isrow = b->row, isicol = b->icol;
120   const PetscInt *r, *ic;
121   PetscInt        i, j, k, n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
122   PetscInt       *ajtmp, *bjtmp, nz, nzL, row, *bdiag = b->diag, *pj;
123   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa     = a->a;
124   PetscInt        bs = A->rmap->bs, bs2 = a->bs2, *v_pivots, flg;
125   MatScalar      *v_work;
126   PetscBool       col_identity, row_identity, both_identity;
127   PetscBool       allowzeropivot, zeropivotdetected;
128 
129   PetscFunctionBegin;
130   PetscCall(ISGetIndices(isrow, &r));
131   PetscCall(ISGetIndices(isicol, &ic));
132   allowzeropivot = PetscNot(A->erroriffailure);
133 
134   PetscCall(PetscCalloc1(bs2 * n, &rtmp));
135 
136   /* generate work space needed by dense LU factorization */
137   PetscCall(PetscMalloc3(bs, &v_work, bs2, &mwork, bs, &v_pivots));
138 
139   for (i = 0; i < n; i++) {
140     /* zero rtmp */
141     /* L part */
142     nz    = bi[i + 1] - bi[i];
143     bjtmp = bj + bi[i];
144     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
145 
146     /* U part */
147     nz    = bdiag[i] - bdiag[i + 1];
148     bjtmp = bj + bdiag[i + 1] + 1;
149     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
150 
151     /* load in initial (unfactored row) */
152     nz    = ai[r[i] + 1] - ai[r[i]];
153     ajtmp = aj + ai[r[i]];
154     v     = aa + bs2 * ai[r[i]];
155     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
156 
157     /* elimination */
158     bjtmp = bj + bi[i];
159     nzL   = bi[i + 1] - bi[i];
160     for (k = 0; k < nzL; k++) {
161       row = bjtmp[k];
162       pc  = rtmp + bs2 * row;
163       for (flg = 0, j = 0; j < bs2; j++) {
164         if (pc[j] != 0.0) {
165           flg = 1;
166           break;
167         }
168       }
169       if (flg) {
170         pv = b->a + bs2 * bdiag[row];
171         PetscKernel_A_gets_A_times_B(bs, pc, pv, mwork); /* *pc = *pc * (*pv); */
172         pj = b->j + bdiag[row + 1] + 1;                  /* beginning of U(row,:) */
173         pv = b->a + bs2 * (bdiag[row + 1] + 1);
174         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
175         for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
176         PetscCall(PetscLogFlops(2.0 * bs2 * bs * (nz + 1) - bs2)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
177       }
178     }
179 
180     /* finished row so stick it into b->a */
181     /* L part */
182     pv = b->a + bs2 * bi[i];
183     pj = b->j + bi[i];
184     nz = bi[i + 1] - bi[i];
185     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
186 
187     /* Mark diagonal and invert diagonal for simpler triangular solves */
188     pv = b->a + bs2 * bdiag[i];
189     pj = b->j + bdiag[i];
190     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
191 
192     PetscCall(PetscKernel_A_gets_inverse_A(bs, pv, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
193     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
194 
195     /* U part */
196     pv = b->a + bs2 * (bdiag[i + 1] + 1);
197     pj = b->j + bdiag[i + 1] + 1;
198     nz = bdiag[i] - bdiag[i + 1] - 1;
199     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
200   }
201 
202   PetscCall(PetscFree(rtmp));
203   PetscCall(PetscFree3(v_work, mwork, v_pivots));
204   PetscCall(ISRestoreIndices(isicol, &ic));
205   PetscCall(ISRestoreIndices(isrow, &r));
206 
207   PetscCall(ISIdentity(isrow, &row_identity));
208   PetscCall(ISIdentity(isicol, &col_identity));
209 
210   both_identity = (PetscBool)(row_identity && col_identity);
211   if (both_identity) {
212     switch (bs) {
213     case 9:
214 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
215       C->ops->solve = MatSolve_SeqBAIJ_9_NaturalOrdering;
216 #else
217       C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
218 #endif
219       break;
220     case 11:
221       C->ops->solve = MatSolve_SeqBAIJ_11_NaturalOrdering;
222       break;
223     case 12:
224       C->ops->solve = MatSolve_SeqBAIJ_12_NaturalOrdering;
225       break;
226     case 13:
227       C->ops->solve = MatSolve_SeqBAIJ_13_NaturalOrdering;
228       break;
229     case 14:
230       C->ops->solve = MatSolve_SeqBAIJ_14_NaturalOrdering;
231       break;
232     default:
233       C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
234       break;
235     }
236   } else {
237     C->ops->solve = MatSolve_SeqBAIJ_N;
238   }
239   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
240 
241   C->assembled = PETSC_TRUE;
242 
243   PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); /* from inverting diagonal blocks */
244   PetscFunctionReturn(PETSC_SUCCESS);
245 }
246 
247 /*
248    ilu(0) with natural ordering under new data structure.
249    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
250    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
251 */
252 
253 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
254 {
255   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b;
256   PetscInt     n = a->mbs, *ai = a->i, *aj, *adiag = a->diag, bs2 = a->bs2;
257   PetscInt     i, j, nz, *bi, *bj, *bdiag, bi_temp;
258 
259   PetscFunctionBegin;
260   PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_FALSE));
261   b = (Mat_SeqBAIJ *)(fact)->data;
262 
263   /* allocate matrix arrays for new data structure */
264   PetscCall(PetscMalloc3(bs2 * ai[n] + 1, &b->a, ai[n] + 1, &b->j, n + 1, &b->i));
265 
266   b->singlemalloc    = PETSC_TRUE;
267   b->free_a          = PETSC_TRUE;
268   b->free_ij         = PETSC_TRUE;
269   fact->preallocated = PETSC_TRUE;
270   fact->assembled    = PETSC_TRUE;
271   if (!b->diag) PetscCall(PetscMalloc1(n + 1, &b->diag));
272   bdiag = b->diag;
273 
274   if (n > 0) PetscCall(PetscArrayzero(b->a, bs2 * ai[n]));
275 
276   /* set bi and bj with new data structure */
277   bi = b->i;
278   bj = b->j;
279 
280   /* L part */
281   bi[0] = 0;
282   for (i = 0; i < n; i++) {
283     nz        = adiag[i] - ai[i];
284     bi[i + 1] = bi[i] + nz;
285     aj        = a->j + ai[i];
286     for (j = 0; j < nz; j++) {
287       *bj = aj[j];
288       bj++;
289     }
290   }
291 
292   /* U part */
293   bi_temp  = bi[n];
294   bdiag[n] = bi[n] - 1;
295   for (i = n - 1; i >= 0; i--) {
296     nz      = ai[i + 1] - adiag[i] - 1;
297     bi_temp = bi_temp + nz + 1;
298     aj      = a->j + adiag[i] + 1;
299     for (j = 0; j < nz; j++) {
300       *bj = aj[j];
301       bj++;
302     }
303     /* diag[i] */
304     *bj = i;
305     bj++;
306     bdiag[i] = bi_temp - 1;
307   }
308   PetscFunctionReturn(PETSC_SUCCESS);
309 }
310 
311 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
312 {
313   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data, *b;
314   IS                 isicol;
315   const PetscInt    *r, *ic;
316   PetscInt           n = a->mbs, *ai = a->i, *aj = a->j, d;
317   PetscInt          *bi, *cols, nnz, *cols_lvl;
318   PetscInt          *bdiag, prow, fm, nzbd, reallocs = 0, dcount = 0;
319   PetscInt           i, levels, diagonal_fill;
320   PetscBool          col_identity, row_identity, both_identity;
321   PetscReal          f;
322   PetscInt           nlnk, *lnk, *lnk_lvl = NULL;
323   PetscBT            lnkbt;
324   PetscInt           nzi, *bj, **bj_ptr, **bjlvl_ptr;
325   PetscFreeSpaceList free_space = NULL, current_space = NULL;
326   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
327   PetscBool          missing;
328   PetscInt           bs = A->rmap->bs, bs2 = a->bs2;
329 
330   PetscFunctionBegin;
331   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
332   if (bs > 1) { /* check shifttype */
333     PetscCheck(info->shifttype != (PetscReal)MAT_SHIFT_NONZERO && info->shifttype != (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
334   }
335 
336   PetscCall(MatMissingDiagonal(A, &missing, &d));
337   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);
338 
339   f             = info->fill;
340   levels        = (PetscInt)info->levels;
341   diagonal_fill = (PetscInt)info->diagonal_fill;
342 
343   PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
344 
345   PetscCall(ISIdentity(isrow, &row_identity));
346   PetscCall(ISIdentity(iscol, &col_identity));
347 
348   both_identity = (PetscBool)(row_identity && col_identity);
349 
350   if (!levels && both_identity) {
351     /* special case: ilu(0) with natural ordering */
352     PetscCall(MatILUFactorSymbolic_SeqBAIJ_ilu0(fact, A, isrow, iscol, info));
353     PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity));
354 
355     fact->factortype               = MAT_FACTOR_ILU;
356     (fact)->info.factor_mallocs    = 0;
357     (fact)->info.fill_ratio_given  = info->fill;
358     (fact)->info.fill_ratio_needed = 1.0;
359 
360     b       = (Mat_SeqBAIJ *)(fact)->data;
361     b->row  = isrow;
362     b->col  = iscol;
363     b->icol = isicol;
364     PetscCall(PetscObjectReference((PetscObject)isrow));
365     PetscCall(PetscObjectReference((PetscObject)iscol));
366     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
367 
368     PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work));
369     PetscFunctionReturn(PETSC_SUCCESS);
370   }
371 
372   PetscCall(ISGetIndices(isrow, &r));
373   PetscCall(ISGetIndices(isicol, &ic));
374 
375   /* get new row pointers */
376   PetscCall(PetscMalloc1(n + 1, &bi));
377   bi[0] = 0;
378   /* bdiag is location of diagonal in factor */
379   PetscCall(PetscMalloc1(n + 1, &bdiag));
380   bdiag[0] = 0;
381 
382   PetscCall(PetscMalloc2(n, &bj_ptr, n, &bjlvl_ptr));
383 
384   /* create a linked list for storing column indices of the active row */
385   nlnk = n + 1;
386   PetscCall(PetscIncompleteLLCreate(n, n, nlnk, lnk, lnk_lvl, lnkbt));
387 
388   /* initial FreeSpace size is f*(ai[n]+1) */
389   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space));
390   current_space = free_space;
391   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(f, ai[n] + 1), &free_space_lvl));
392   current_space_lvl = free_space_lvl;
393 
394   for (i = 0; i < n; i++) {
395     nzi = 0;
396     /* copy current row into linked list */
397     nnz = ai[r[i] + 1] - ai[r[i]];
398     PetscCheck(nnz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
399     cols   = aj + ai[r[i]];
400     lnk[i] = -1; /* marker to indicate if diagonal exists */
401     PetscCall(PetscIncompleteLLInit(nnz, cols, n, ic, &nlnk, lnk, lnk_lvl, lnkbt));
402     nzi += nlnk;
403 
404     /* make sure diagonal entry is included */
405     if (diagonal_fill && lnk[i] == -1) {
406       fm = n;
407       while (lnk[fm] < i) fm = lnk[fm];
408       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
409       lnk[fm]    = i;
410       lnk_lvl[i] = 0;
411       nzi++;
412       dcount++;
413     }
414 
415     /* add pivot rows into the active row */
416     nzbd = 0;
417     prow = lnk[n];
418     while (prow < i) {
419       nnz      = bdiag[prow];
420       cols     = bj_ptr[prow] + nnz + 1;
421       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
422       nnz      = bi[prow + 1] - bi[prow] - nnz - 1;
423 
424       PetscCall(PetscILULLAddSorted(nnz, cols, levels, cols_lvl, prow, &nlnk, lnk, lnk_lvl, lnkbt, prow));
425       nzi += nlnk;
426       prow = lnk[prow];
427       nzbd++;
428     }
429     bdiag[i]  = nzbd;
430     bi[i + 1] = bi[i] + nzi;
431 
432     /* if free space is not available, make more free space */
433     if (current_space->local_remaining < nzi) {
434       nnz = PetscIntMultTruncate(2, PetscIntMultTruncate(nzi, (n - i))); /* estimated and max additional space needed */
435       PetscCall(PetscFreeSpaceGet(nnz, &current_space));
436       PetscCall(PetscFreeSpaceGet(nnz, &current_space_lvl));
437       reallocs++;
438     }
439 
440     /* copy data into free_space and free_space_lvl, then initialize lnk */
441     PetscCall(PetscIncompleteLLClean(n, n, nzi, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
442 
443     bj_ptr[i]    = current_space->array;
444     bjlvl_ptr[i] = current_space_lvl->array;
445 
446     /* make sure the active row i has diagonal entry */
447     PetscCheck(*(bj_ptr[i] + bdiag[i]) == i, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill", i);
448 
449     current_space->array += nzi;
450     current_space->local_used += nzi;
451     current_space->local_remaining -= nzi;
452 
453     current_space_lvl->array += nzi;
454     current_space_lvl->local_used += nzi;
455     current_space_lvl->local_remaining -= nzi;
456   }
457 
458   PetscCall(ISRestoreIndices(isrow, &r));
459   PetscCall(ISRestoreIndices(isicol, &ic));
460 
461   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
462   PetscCall(PetscMalloc1(bi[n] + 1, &bj));
463   PetscCall(PetscFreeSpaceContiguous_LU(&free_space, bj, n, bi, bdiag));
464 
465   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
466   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
467   PetscCall(PetscFree2(bj_ptr, bjlvl_ptr));
468 
469 #if defined(PETSC_USE_INFO)
470   {
471     PetscReal af = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]);
472     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
473     PetscCall(PetscInfo(A, "Run with -[sub_]pc_factor_fill %g or use \n", (double)af));
474     PetscCall(PetscInfo(A, "PCFactorSetFill([sub]pc,%g);\n", (double)af));
475     PetscCall(PetscInfo(A, "for best performance.\n"));
476     if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount));
477   }
478 #endif
479 
480   /* put together the new matrix */
481   PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
482 
483   b               = (Mat_SeqBAIJ *)(fact)->data;
484   b->free_a       = PETSC_TRUE;
485   b->free_ij      = PETSC_TRUE;
486   b->singlemalloc = PETSC_FALSE;
487 
488   PetscCall(PetscMalloc1(bs2 * (bdiag[0] + 1), &b->a));
489 
490   b->j         = bj;
491   b->i         = bi;
492   b->diag      = bdiag;
493   b->free_diag = PETSC_TRUE;
494   b->ilen      = NULL;
495   b->imax      = NULL;
496   b->row       = isrow;
497   b->col       = iscol;
498   PetscCall(PetscObjectReference((PetscObject)isrow));
499   PetscCall(PetscObjectReference((PetscObject)iscol));
500   b->icol = isicol;
501 
502   PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work));
503   /* In b structure:  Free imax, ilen, old a, old j.
504      Allocate bdiag, solve_work, new a, new j */
505   b->maxnz = b->nz = bdiag[0] + 1;
506 
507   fact->info.factor_mallocs    = reallocs;
508   fact->info.fill_ratio_given  = f;
509   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0] + 1)) / ((PetscReal)ai[n]);
510 
511   PetscCall(MatSeqBAIJSetNumericFactorization(fact, both_identity));
512   PetscFunctionReturn(PETSC_SUCCESS);
513 }
514 
515 #if 0
516 // unused
517 /*
518      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
519    except that the data structure of Mat_SeqAIJ is slightly different.
520    Not a good example of code reuse.
521 */
522 static PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
523 {
524   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b;
525   IS              isicol;
526   const PetscInt *r, *ic, *ai = a->i, *aj = a->j, *xi;
527   PetscInt        prow, n = a->mbs, *ainew, *ajnew, jmax, *fill, nz, *im, *ajfill, *flev, *xitmp;
528   PetscInt       *dloc, idx, row, m, fm, nzf, nzi, reallocate = 0, dcount = 0;
529   PetscInt        incrlev, nnz, i, bs = A->rmap->bs, bs2 = a->bs2, levels, diagonal_fill, dd;
530   PetscBool       col_identity, row_identity, both_identity, flg;
531   PetscReal       f;
532 
533   PetscFunctionBegin;
534   PetscCall(MatMissingDiagonal_SeqBAIJ(A, &flg, &dd));
535   PetscCheck(!flg, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix A is missing diagonal entry in row %" PetscInt_FMT, dd);
536 
537   f             = info->fill;
538   levels        = (PetscInt)info->levels;
539   diagonal_fill = (PetscInt)info->diagonal_fill;
540 
541   PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
542 
543   PetscCall(ISIdentity(isrow, &row_identity));
544   PetscCall(ISIdentity(iscol, &col_identity));
545   both_identity = (PetscBool)(row_identity && col_identity);
546 
547   if (!levels && both_identity) { /* special case copy the nonzero structure */
548     PetscCall(MatDuplicateNoCreate_SeqBAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE));
549     PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity));
550 
551     fact->factortype = MAT_FACTOR_ILU;
552     b                = (Mat_SeqBAIJ *)fact->data;
553     b->row           = isrow;
554     b->col           = iscol;
555     PetscCall(PetscObjectReference((PetscObject)isrow));
556     PetscCall(PetscObjectReference((PetscObject)iscol));
557     b->icol          = isicol;
558     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
559 
560     PetscCall(PetscMalloc1((n + 1) * bs, &b->solve_work));
561     PetscFunctionReturn(PETSC_SUCCESS);
562   }
563 
564   /* general case perform the symbolic factorization */
565   PetscCall(ISGetIndices(isrow, &r));
566   PetscCall(ISGetIndices(isicol, &ic));
567 
568   /* get new row pointers */
569   PetscCall(PetscMalloc1(n + 1, &ainew));
570   ainew[0] = 0;
571   /* don't know how many column pointers are needed so estimate */
572   jmax = (PetscInt)(f * ai[n] + 1);
573   PetscCall(PetscMalloc1(jmax, &ajnew));
574   /* ajfill is level of fill for each fill entry */
575   PetscCall(PetscMalloc1(jmax, &ajfill));
576   /* fill is a linked list of nonzeros in active row */
577   PetscCall(PetscMalloc1(n + 1, &fill));
578   /* im is level for each filled value */
579   PetscCall(PetscMalloc1(n + 1, &im));
580   /* dloc is location of diagonal in factor */
581   PetscCall(PetscMalloc1(n + 1, &dloc));
582   dloc[0] = 0;
583   for (prow = 0; prow < n; prow++) {
584     /* copy prow into linked list */
585     nzf = nz = ai[r[prow] + 1] - ai[r[prow]];
586     PetscCheck(nz, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[prow], prow);
587     xi         = aj + ai[r[prow]];
588     fill[n]    = n;
589     fill[prow] = -1; /* marker for diagonal entry */
590     while (nz--) {
591       fm  = n;
592       idx = ic[*xi++];
593       do {
594         m  = fm;
595         fm = fill[m];
596       } while (fm < idx);
597       fill[m]   = idx;
598       fill[idx] = fm;
599       im[idx]   = 0;
600     }
601 
602     /* make sure diagonal entry is included */
603     if (diagonal_fill && fill[prow] == -1) {
604       fm = n;
605       while (fill[fm] < prow) fm = fill[fm];
606       fill[prow] = fill[fm]; /* insert diagonal into linked list */
607       fill[fm]   = prow;
608       im[prow]   = 0;
609       nzf++;
610       dcount++;
611     }
612 
613     nzi = 0;
614     row = fill[n];
615     while (row < prow) {
616       incrlev = im[row] + 1;
617       nz      = dloc[row];
618       xi      = ajnew + ainew[row] + nz + 1;
619       flev    = ajfill + ainew[row] + nz + 1;
620       nnz     = ainew[row + 1] - ainew[row] - nz - 1;
621       fm      = row;
622       while (nnz-- > 0) {
623         idx = *xi++;
624         if (*flev + incrlev > levels) {
625           flev++;
626           continue;
627         }
628         do {
629           m  = fm;
630           fm = fill[m];
631         } while (fm < idx);
632         if (fm != idx) {
633           im[idx]   = *flev + incrlev;
634           fill[m]   = idx;
635           fill[idx] = fm;
636           fm        = idx;
637           nzf++;
638         } else if (im[idx] > *flev + incrlev) im[idx] = *flev + incrlev;
639         flev++;
640       }
641       row = fill[row];
642       nzi++;
643     }
644     /* copy new filled row into permanent storage */
645     ainew[prow + 1] = ainew[prow] + nzf;
646     if (ainew[prow + 1] > jmax) {
647       /* estimate how much additional space we will need */
648       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
649       /* just double the memory each time */
650       PetscInt maxadd = jmax;
651       /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
652       if (maxadd < nzf) maxadd = (n - prow) * (nzf + 1);
653       jmax += maxadd;
654 
655       /* allocate a longer ajnew and ajfill */
656       PetscCall(PetscMalloc1(jmax, &xitmp));
657       PetscCall(PetscArraycpy(xitmp, ajnew, ainew[prow]));
658       PetscCall(PetscFree(ajnew));
659       ajnew = xitmp;
660       PetscCall(PetscMalloc1(jmax, &xitmp));
661       PetscCall(PetscArraycpy(xitmp, ajfill, ainew[prow]));
662       PetscCall(PetscFree(ajfill));
663       ajfill = xitmp;
664       reallocate++; /* count how many reallocations are needed */
665     }
666     xitmp      = ajnew + ainew[prow];
667     flev       = ajfill + ainew[prow];
668     dloc[prow] = nzi;
669     fm         = fill[n];
670     while (nzf--) {
671       *xitmp++ = fm;
672       *flev++  = im[fm];
673       fm       = fill[fm];
674     }
675     /* make sure row has diagonal entry */
676     PetscCheck(ajnew[ainew[prow] + dloc[prow]] == prow, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Row %" PetscInt_FMT " has missing diagonal in factored matrix\n\
677                                                         try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",
678                prow);
679   }
680   PetscCall(PetscFree(ajfill));
681   PetscCall(ISRestoreIndices(isrow, &r));
682   PetscCall(ISRestoreIndices(isicol, &ic));
683   PetscCall(PetscFree(fill));
684   PetscCall(PetscFree(im));
685 
686   #if defined(PETSC_USE_INFO)
687   {
688     PetscReal af = ((PetscReal)ainew[n]) / ((PetscReal)ai[n]);
689     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocate, (double)f, (double)af));
690     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
691     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
692     PetscCall(PetscInfo(A, "for best performance.\n"));
693     if (diagonal_fill) PetscCall(PetscInfo(A, "Detected and replaced %" PetscInt_FMT " missing diagonals\n", dcount));
694   }
695   #endif
696 
697   /* put together the new matrix */
698   PetscCall(MatSeqBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
699   b = (Mat_SeqBAIJ *)fact->data;
700 
701   b->free_a       = PETSC_TRUE;
702   b->free_ij      = PETSC_TRUE;
703   b->singlemalloc = PETSC_FALSE;
704 
705   PetscCall(PetscMalloc1(bs2 * ainew[n], &b->a));
706 
707   b->j = ajnew;
708   b->i = ainew;
709   for (i = 0; i < n; i++) dloc[i] += ainew[i];
710   b->diag          = dloc;
711   b->free_diag     = PETSC_TRUE;
712   b->ilen          = NULL;
713   b->imax          = NULL;
714   b->row           = isrow;
715   b->col           = iscol;
716   b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
717 
718   PetscCall(PetscObjectReference((PetscObject)isrow));
719   PetscCall(PetscObjectReference((PetscObject)iscol));
720   b->icol = isicol;
721   PetscCall(PetscMalloc1(bs * n + bs, &b->solve_work));
722   /* In b structure:  Free imax, ilen, old a, old j.
723      Allocate dloc, solve_work, new a, new j */
724   b->maxnz = b->nz = ainew[n];
725 
726   fact->info.factor_mallocs    = reallocate;
727   fact->info.fill_ratio_given  = f;
728   fact->info.fill_ratio_needed = ((PetscReal)ainew[n]) / ((PetscReal)ai[prow]);
729 
730   PetscCall(MatSeqBAIJSetNumericFactorization_inplace(fact, both_identity));
731   PetscFunctionReturn(PETSC_SUCCESS);
732 }
733 #endif
734 
735 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
736 {
737   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */
738   /* int i,*AJ=a->j,nz=a->nz; */
739 
740   PetscFunctionBegin;
741   /* Undo Column scaling */
742   /*    while (nz--) { */
743   /*      AJ[i] = AJ[i]/4; */
744   /*    } */
745   /* This should really invoke a push/pop logic, but we don't have that yet. */
746   A->ops->setunfactored = NULL;
747   PetscFunctionReturn(PETSC_SUCCESS);
748 }
749 
750 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
751 {
752   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
753   PetscInt       *AJ = a->j, nz = a->nz;
754   unsigned short *aj = (unsigned short *)AJ;
755 
756   PetscFunctionBegin;
757   /* Is this really necessary? */
758   while (nz--) { AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ }
759   A->ops->setunfactored = NULL;
760   PetscFunctionReturn(PETSC_SUCCESS);
761 }
762