xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision f13dfd9ea68e0ddeee984e65c377a1819eab8a8a)
1 /*
2     Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 
7 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
8 {
9   Mat             C = B;
10   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
11   IS              isrow = b->row, isicol = b->icol;
12   const PetscInt *r, *ic;
13   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
14   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
15   MatScalar      *rtmp, *pc, *mwork, *pv;
16   MatScalar      *aa             = a->a, *v;
17   PetscReal       shift          = info->shiftamount;
18   const PetscBool allowzeropivot = PetscNot(A->erroriffailure);
19 
20   PetscFunctionBegin;
21   PetscCall(ISGetIndices(isrow, &r));
22   PetscCall(ISGetIndices(isicol, &ic));
23 
24   /* generate work space needed by the factorization */
25   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
26   PetscCall(PetscArrayzero(rtmp, bs2 * n));
27 
28   for (PetscInt i = 0; i < n; i++) {
29     /* zero rtmp */
30     /* L part */
31     PetscInt *pj;
32     PetscInt  nzL, nz = bi[i + 1] - bi[i];
33     bjtmp = bj + bi[i];
34     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
35 
36     /* U part */
37     nz    = bdiag[i] - bdiag[i + 1];
38     bjtmp = bj + bdiag[i + 1] + 1;
39     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
40 
41     /* load in initial (unfactored row) */
42     nz    = ai[r[i] + 1] - ai[r[i]];
43     ajtmp = aj + ai[r[i]];
44     v     = aa + bs2 * ai[r[i]];
45     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
46 
47     /* elimination */
48     bjtmp = bj + bi[i];
49     nzL   = bi[i + 1] - bi[i];
50     for (PetscInt k = 0; k < nzL; k++) {
51       PetscBool flg = PETSC_FALSE;
52       PetscInt  row = bjtmp[k];
53 
54       pc = rtmp + bs2 * row;
55       for (PetscInt j = 0; j < bs2; j++) {
56         if (pc[j] != (PetscScalar)0.0) {
57           flg = PETSC_TRUE;
58           break;
59         }
60       }
61       if (flg) {
62         pv = b->a + bs2 * bdiag[row];
63         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
64         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
65 
66         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
67         pv = b->a + bs2 * (bdiag[row + 1] + 1);
68         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
69         for (PetscInt j = 0; j < nz; j++) {
70           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
71           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
72           v = rtmp + 4 * pj[j];
73           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
74           pv += 4;
75         }
76         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
77       }
78     }
79 
80     /* finished row so stick it into b->a */
81     /* L part */
82     pv = b->a + bs2 * bi[i];
83     pj = b->j + bi[i];
84     nz = bi[i + 1] - bi[i];
85     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
86 
87     /* Mark diagonal and invert diagonal for simpler triangular solves */
88     pv = b->a + bs2 * bdiag[i];
89     pj = b->j + bdiag[i];
90     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
91     {
92       PetscBool zeropivotdetected;
93 
94       PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
95       if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
96     }
97 
98     /* U part */
99     pv = b->a + bs2 * (bdiag[i + 1] + 1);
100     pj = b->j + bdiag[i + 1] + 1;
101     nz = bdiag[i] - bdiag[i + 1] - 1;
102     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103   }
104 
105   PetscCall(PetscFree2(rtmp, mwork));
106   PetscCall(ISRestoreIndices(isicol, &ic));
107   PetscCall(ISRestoreIndices(isrow, &r));
108 
109   C->ops->solve          = MatSolve_SeqBAIJ_2;
110   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
111   C->assembled           = PETSC_TRUE;
112 
113   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
114   PetscFunctionReturn(PETSC_SUCCESS);
115 }
116 
117 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
118 {
119   Mat             C = B;
120   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
121   PetscInt        i, j, k, nz, nzL, row, *pj;
122   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
123   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
124   MatScalar      *rtmp, *pc, *mwork, *pv;
125   MatScalar      *aa = a->a, *v;
126   PetscInt        flg;
127   PetscReal       shift = info->shiftamount;
128   PetscBool       allowzeropivot, zeropivotdetected;
129 
130   PetscFunctionBegin;
131   allowzeropivot = PetscNot(A->erroriffailure);
132 
133   /* generate work space needed by the factorization */
134   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
135   PetscCall(PetscArrayzero(rtmp, bs2 * n));
136 
137   for (i = 0; i < n; i++) {
138     /* zero rtmp */
139     /* L part */
140     nz    = bi[i + 1] - bi[i];
141     bjtmp = bj + bi[i];
142     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
143 
144     /* U part */
145     nz    = bdiag[i] - bdiag[i + 1];
146     bjtmp = bj + bdiag[i + 1] + 1;
147     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
148 
149     /* load in initial (unfactored row) */
150     nz    = ai[i + 1] - ai[i];
151     ajtmp = aj + ai[i];
152     v     = aa + bs2 * ai[i];
153     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
154 
155     /* elimination */
156     bjtmp = bj + bi[i];
157     nzL   = bi[i + 1] - bi[i];
158     for (k = 0; k < nzL; k++) {
159       row = bjtmp[k];
160       pc  = rtmp + bs2 * row;
161       for (flg = 0, j = 0; j < bs2; j++) {
162         if (pc[j] != (PetscScalar)0.0) {
163           flg = 1;
164           break;
165         }
166       }
167       if (flg) {
168         pv = b->a + bs2 * bdiag[row];
169         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
170         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
171 
172         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
173         pv = b->a + bs2 * (bdiag[row + 1] + 1);
174         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
175         for (j = 0; j < nz; j++) {
176           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
177           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
178           v = rtmp + 4 * pj[j];
179           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
180           pv += 4;
181         }
182         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
183       }
184     }
185 
186     /* finished row so stick it into b->a */
187     /* L part */
188     pv = b->a + bs2 * bi[i];
189     pj = b->j + bi[i];
190     nz = bi[i + 1] - bi[i];
191     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
192 
193     /* Mark diagonal and invert diagonal for simpler triangular solves */
194     pv = b->a + bs2 * bdiag[i];
195     pj = b->j + bdiag[i];
196     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
197     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
198     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
199 
200     /* U part */
201     /*
202     pv = b->a + bs2*bi[2*n-i];
203     pj = b->j + bi[2*n-i];
204     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
205     */
206     pv = b->a + bs2 * (bdiag[i + 1] + 1);
207     pj = b->j + bdiag[i + 1] + 1;
208     nz = bdiag[i] - bdiag[i + 1] - 1;
209     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
210   }
211   PetscCall(PetscFree2(rtmp, mwork));
212 
213   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
214   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
215   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
216   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
217   C->assembled           = PETSC_TRUE;
218 
219   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
220   PetscFunctionReturn(PETSC_SUCCESS);
221 }
222 
223 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
224 {
225   Mat             C = B;
226   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
227   IS              isrow = b->row, isicol = b->icol;
228   const PetscInt *r, *ic;
229   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
230   PetscInt       *ajtmpold, *ajtmp, nz, row;
231   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
232   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
233   MatScalar       p1, p2, p3, p4;
234   MatScalar      *ba = b->a, *aa = a->a;
235   PetscReal       shift = info->shiftamount;
236   PetscBool       allowzeropivot, zeropivotdetected;
237 
238   PetscFunctionBegin;
239   allowzeropivot = PetscNot(A->erroriffailure);
240   PetscCall(ISGetIndices(isrow, &r));
241   PetscCall(ISGetIndices(isicol, &ic));
242   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
243 
244   for (i = 0; i < n; i++) {
245     nz    = bi[i + 1] - bi[i];
246     ajtmp = bj + bi[i];
247     for (j = 0; j < nz; j++) {
248       x    = rtmp + 4 * ajtmp[j];
249       x[0] = x[1] = x[2] = x[3] = 0.0;
250     }
251     /* load in initial (unfactored row) */
252     idx      = r[i];
253     nz       = ai[idx + 1] - ai[idx];
254     ajtmpold = aj + ai[idx];
255     v        = aa + 4 * ai[idx];
256     for (j = 0; j < nz; j++) {
257       x    = rtmp + 4 * ic[ajtmpold[j]];
258       x[0] = v[0];
259       x[1] = v[1];
260       x[2] = v[2];
261       x[3] = v[3];
262       v += 4;
263     }
264     row = *ajtmp++;
265     while (row < i) {
266       pc = rtmp + 4 * row;
267       p1 = pc[0];
268       p2 = pc[1];
269       p3 = pc[2];
270       p4 = pc[3];
271       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
272         pv    = ba + 4 * diag_offset[row];
273         pj    = bj + diag_offset[row] + 1;
274         x1    = pv[0];
275         x2    = pv[1];
276         x3    = pv[2];
277         x4    = pv[3];
278         pc[0] = m1 = p1 * x1 + p3 * x2;
279         pc[1] = m2 = p2 * x1 + p4 * x2;
280         pc[2] = m3 = p1 * x3 + p3 * x4;
281         pc[3] = m4 = p2 * x3 + p4 * x4;
282         nz         = bi[row + 1] - diag_offset[row] - 1;
283         pv += 4;
284         for (j = 0; j < nz; j++) {
285           x1 = pv[0];
286           x2 = pv[1];
287           x3 = pv[2];
288           x4 = pv[3];
289           x  = rtmp + 4 * pj[j];
290           x[0] -= m1 * x1 + m3 * x2;
291           x[1] -= m2 * x1 + m4 * x2;
292           x[2] -= m1 * x3 + m3 * x4;
293           x[3] -= m2 * x3 + m4 * x4;
294           pv += 4;
295         }
296         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
297       }
298       row = *ajtmp++;
299     }
300     /* finished row so stick it into b->a */
301     pv = ba + 4 * bi[i];
302     pj = bj + bi[i];
303     nz = bi[i + 1] - bi[i];
304     for (j = 0; j < nz; j++) {
305       x     = rtmp + 4 * pj[j];
306       pv[0] = x[0];
307       pv[1] = x[1];
308       pv[2] = x[2];
309       pv[3] = x[3];
310       pv += 4;
311     }
312     /* invert diagonal block */
313     w = ba + 4 * diag_offset[i];
314     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
315     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
316   }
317 
318   PetscCall(PetscFree(rtmp));
319   PetscCall(ISRestoreIndices(isicol, &ic));
320   PetscCall(ISRestoreIndices(isrow, &r));
321 
322   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
323   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
324   C->assembled           = PETSC_TRUE;
325 
326   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
327   PetscFunctionReturn(PETSC_SUCCESS);
328 }
329 /*
330       Version for when blocks are 2 by 2 Using natural ordering
331 */
332 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
333 {
334   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
335   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
336   PetscInt    *ajtmpold, *ajtmp, nz, row;
337   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
338   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
339   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
340   MatScalar   *ba = b->a, *aa = a->a;
341   PetscReal    shift = info->shiftamount;
342   PetscBool    allowzeropivot, zeropivotdetected;
343 
344   PetscFunctionBegin;
345   allowzeropivot = PetscNot(A->erroriffailure);
346   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
347   for (i = 0; i < n; i++) {
348     nz    = bi[i + 1] - bi[i];
349     ajtmp = bj + bi[i];
350     for (j = 0; j < nz; j++) {
351       x    = rtmp + 4 * ajtmp[j];
352       x[0] = x[1] = x[2] = x[3] = 0.0;
353     }
354     /* load in initial (unfactored row) */
355     nz       = ai[i + 1] - ai[i];
356     ajtmpold = aj + ai[i];
357     v        = aa + 4 * ai[i];
358     for (j = 0; j < nz; j++) {
359       x    = rtmp + 4 * ajtmpold[j];
360       x[0] = v[0];
361       x[1] = v[1];
362       x[2] = v[2];
363       x[3] = v[3];
364       v += 4;
365     }
366     row = *ajtmp++;
367     while (row < i) {
368       pc = rtmp + 4 * row;
369       p1 = pc[0];
370       p2 = pc[1];
371       p3 = pc[2];
372       p4 = pc[3];
373       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
374         pv    = ba + 4 * diag_offset[row];
375         pj    = bj + diag_offset[row] + 1;
376         x1    = pv[0];
377         x2    = pv[1];
378         x3    = pv[2];
379         x4    = pv[3];
380         pc[0] = m1 = p1 * x1 + p3 * x2;
381         pc[1] = m2 = p2 * x1 + p4 * x2;
382         pc[2] = m3 = p1 * x3 + p3 * x4;
383         pc[3] = m4 = p2 * x3 + p4 * x4;
384         nz         = bi[row + 1] - diag_offset[row] - 1;
385         pv += 4;
386         for (j = 0; j < nz; j++) {
387           x1 = pv[0];
388           x2 = pv[1];
389           x3 = pv[2];
390           x4 = pv[3];
391           x  = rtmp + 4 * pj[j];
392           x[0] -= m1 * x1 + m3 * x2;
393           x[1] -= m2 * x1 + m4 * x2;
394           x[2] -= m1 * x3 + m3 * x4;
395           x[3] -= m2 * x3 + m4 * x4;
396           pv += 4;
397         }
398         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
399       }
400       row = *ajtmp++;
401     }
402     /* finished row so stick it into b->a */
403     pv = ba + 4 * bi[i];
404     pj = bj + bi[i];
405     nz = bi[i + 1] - bi[i];
406     for (j = 0; j < nz; j++) {
407       x     = rtmp + 4 * pj[j];
408       pv[0] = x[0];
409       pv[1] = x[1];
410       pv[2] = x[2];
411       pv[3] = x[3];
412       /*
413       printf(" col %d:",pj[j]);
414       PetscInt j1;
415       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
416       printf("\n");
417       */
418       pv += 4;
419     }
420     /* invert diagonal block */
421     w = ba + 4 * diag_offset[i];
422     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
423     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
424   }
425 
426   PetscCall(PetscFree(rtmp));
427 
428   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
429   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
430   C->assembled           = PETSC_TRUE;
431 
432   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
433   PetscFunctionReturn(PETSC_SUCCESS);
434 }
435 
436 /*
437      Version for when blocks are 1 by 1.
438 */
439 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
440 {
441   Mat              C = B;
442   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
443   IS               isrow = b->row, isicol = b->icol;
444   const PetscInt  *r, *ic, *ics;
445   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
446   PetscInt         i, j, k, nz, nzL, row, *pj;
447   const PetscInt  *ajtmp, *bjtmp;
448   MatScalar       *rtmp, *pc, multiplier, *pv;
449   const MatScalar *aa = a->a, *v;
450   PetscBool        row_identity, col_identity;
451   FactorShiftCtx   sctx;
452   const PetscInt  *ddiag;
453   PetscReal        rs;
454   MatScalar        d;
455 
456   PetscFunctionBegin;
457   /* MatPivotSetUp(): initialize shift context sctx */
458   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
459 
460   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
461     ddiag          = a->diag;
462     sctx.shift_top = info->zeropivot;
463     for (i = 0; i < n; i++) {
464       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
465       d  = (aa)[ddiag[i]];
466       rs = -PetscAbsScalar(d) - PetscRealPart(d);
467       v  = aa + ai[i];
468       nz = ai[i + 1] - ai[i];
469       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
470       if (rs > sctx.shift_top) sctx.shift_top = rs;
471     }
472     sctx.shift_top *= 1.1;
473     sctx.nshift_max = 5;
474     sctx.shift_lo   = 0.;
475     sctx.shift_hi   = 1.;
476   }
477 
478   PetscCall(ISGetIndices(isrow, &r));
479   PetscCall(ISGetIndices(isicol, &ic));
480   PetscCall(PetscMalloc1(n + 1, &rtmp));
481   ics = ic;
482 
483   do {
484     sctx.newshift = PETSC_FALSE;
485     for (i = 0; i < n; i++) {
486       /* zero rtmp */
487       /* L part */
488       nz    = bi[i + 1] - bi[i];
489       bjtmp = bj + bi[i];
490       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
491 
492       /* U part */
493       nz    = bdiag[i] - bdiag[i + 1];
494       bjtmp = bj + bdiag[i + 1] + 1;
495       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
496 
497       /* load in initial (unfactored row) */
498       nz    = ai[r[i] + 1] - ai[r[i]];
499       ajtmp = aj + ai[r[i]];
500       v     = aa + ai[r[i]];
501       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
502 
503       /* ZeropivotApply() */
504       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
505 
506       /* elimination */
507       bjtmp = bj + bi[i];
508       row   = *bjtmp++;
509       nzL   = bi[i + 1] - bi[i];
510       for (k = 0; k < nzL; k++) {
511         pc = rtmp + row;
512         if (*pc != (PetscScalar)0.0) {
513           pv         = b->a + bdiag[row];
514           multiplier = *pc * (*pv);
515           *pc        = multiplier;
516 
517           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
518           pv = b->a + bdiag[row + 1] + 1;
519           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
520           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
521           PetscCall(PetscLogFlops(2.0 * nz));
522         }
523         row = *bjtmp++;
524       }
525 
526       /* finished row so stick it into b->a */
527       rs = 0.0;
528       /* L part */
529       pv = b->a + bi[i];
530       pj = b->j + bi[i];
531       nz = bi[i + 1] - bi[i];
532       for (j = 0; j < nz; j++) {
533         pv[j] = rtmp[pj[j]];
534         rs += PetscAbsScalar(pv[j]);
535       }
536 
537       /* U part */
538       pv = b->a + bdiag[i + 1] + 1;
539       pj = b->j + bdiag[i + 1] + 1;
540       nz = bdiag[i] - bdiag[i + 1] - 1;
541       for (j = 0; j < nz; j++) {
542         pv[j] = rtmp[pj[j]];
543         rs += PetscAbsScalar(pv[j]);
544       }
545 
546       sctx.rs = rs;
547       sctx.pv = rtmp[i];
548       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
549       if (sctx.newshift) break; /* break for-loop */
550       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
551 
552       /* Mark diagonal and invert diagonal for simpler triangular solves */
553       pv  = b->a + bdiag[i];
554       *pv = (PetscScalar)1.0 / rtmp[i];
555 
556     } /* endof for (i=0; i<n; i++) { */
557 
558     /* MatPivotRefine() */
559     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
560       /*
561        * if no shift in this attempt & shifting & started shifting & can refine,
562        * then try lower shift
563        */
564       sctx.shift_hi       = sctx.shift_fraction;
565       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
566       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
567       sctx.newshift       = PETSC_TRUE;
568       sctx.nshift++;
569     }
570   } while (sctx.newshift);
571 
572   PetscCall(PetscFree(rtmp));
573   PetscCall(ISRestoreIndices(isicol, &ic));
574   PetscCall(ISRestoreIndices(isrow, &r));
575 
576   PetscCall(ISIdentity(isrow, &row_identity));
577   PetscCall(ISIdentity(isicol, &col_identity));
578   if (row_identity && col_identity) {
579     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
580     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
581     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
582     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
583   } else {
584     C->ops->solve          = MatSolve_SeqBAIJ_1;
585     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
586   }
587   C->assembled = PETSC_TRUE;
588   PetscCall(PetscLogFlops(C->cmap->n));
589 
590   /* MatShiftView(A,info,&sctx) */
591   if (sctx.nshift) {
592     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
593       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
594     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
595       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
596     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
597       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
598     }
599   }
600   PetscFunctionReturn(PETSC_SUCCESS);
601 }
602 
603 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
604 {
605   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
606   IS              isrow = b->row, isicol = b->icol;
607   const PetscInt *r, *ic;
608   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
609   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
610   PetscInt       *diag_offset = b->diag, diag, *pj;
611   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
612   MatScalar      *ba = b->a, *aa = a->a;
613   PetscBool       row_identity, col_identity;
614 
615   PetscFunctionBegin;
616   PetscCall(ISGetIndices(isrow, &r));
617   PetscCall(ISGetIndices(isicol, &ic));
618   PetscCall(PetscMalloc1(n + 1, &rtmp));
619 
620   for (i = 0; i < n; i++) {
621     nz    = bi[i + 1] - bi[i];
622     ajtmp = bj + bi[i];
623     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
624 
625     /* load in initial (unfactored row) */
626     nz       = ai[r[i] + 1] - ai[r[i]];
627     ajtmpold = aj + ai[r[i]];
628     v        = aa + ai[r[i]];
629     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
630 
631     row = *ajtmp++;
632     while (row < i) {
633       pc = rtmp + row;
634       if (*pc != 0.0) {
635         pv         = ba + diag_offset[row];
636         pj         = bj + diag_offset[row] + 1;
637         multiplier = *pc * *pv++;
638         *pc        = multiplier;
639         nz         = bi[row + 1] - diag_offset[row] - 1;
640         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
641         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
642       }
643       row = *ajtmp++;
644     }
645     /* finished row so stick it into b->a */
646     pv = ba + bi[i];
647     pj = bj + bi[i];
648     nz = bi[i + 1] - bi[i];
649     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
650     diag = diag_offset[i] - bi[i];
651     /* check pivot entry for current row */
652     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
653     pv[diag] = 1.0 / pv[diag];
654   }
655 
656   PetscCall(PetscFree(rtmp));
657   PetscCall(ISRestoreIndices(isicol, &ic));
658   PetscCall(ISRestoreIndices(isrow, &r));
659   PetscCall(ISIdentity(isrow, &row_identity));
660   PetscCall(ISIdentity(isicol, &col_identity));
661   if (row_identity && col_identity) {
662     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
663     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
664   } else {
665     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
666     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
667   }
668   C->assembled = PETSC_TRUE;
669   PetscCall(PetscLogFlops(C->cmap->n));
670   PetscFunctionReturn(PETSC_SUCCESS);
671 }
672 
673 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
674 {
675   PetscFunctionBegin;
676   *type = MATSOLVERPETSC;
677   PetscFunctionReturn(PETSC_SUCCESS);
678 }
679 
680 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
681 {
682   PetscInt n = A->rmap->n;
683 
684   PetscFunctionBegin;
685 #if defined(PETSC_USE_COMPLEX)
686   PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
687 #endif
688   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
689   PetscCall(MatSetSizes(*B, n, n, n, n));
690   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
691     PetscCall(MatSetType(*B, MATSEQBAIJ));
692 
693     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
694     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
695     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
696     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
697     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
698   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
699     PetscCall(MatSetType(*B, MATSEQSBAIJ));
700     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
701 
702     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
703     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
704     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
705     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
706     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
707   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
708   (*B)->factortype     = ftype;
709   (*B)->canuseordering = PETSC_TRUE;
710 
711   PetscCall(PetscFree((*B)->solvertype));
712   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
713   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
714   PetscFunctionReturn(PETSC_SUCCESS);
715 }
716 
717 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
718 {
719   Mat C;
720 
721   PetscFunctionBegin;
722   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
723   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
724   PetscCall(MatLUFactorNumeric(C, A, info));
725 
726   A->ops->solve          = C->ops->solve;
727   A->ops->solvetranspose = C->ops->solvetranspose;
728 
729   PetscCall(MatHeaderMerge(A, &C));
730   PetscFunctionReturn(PETSC_SUCCESS);
731 }
732 
733 #include <../src/mat/impls/sbaij/seq/sbaij.h>
734 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
735 {
736   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
737   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
738   IS              ip = b->row;
739   const PetscInt *rip;
740   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
741   PetscInt       *ai = a->i, *aj = a->j;
742   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
743   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
744   PetscReal       rs;
745   FactorShiftCtx  sctx;
746 
747   PetscFunctionBegin;
748   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
749     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
750     PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info));
751     PetscCall(MatDestroy(&a->sbaijMat));
752     PetscFunctionReturn(PETSC_SUCCESS);
753   }
754 
755   /* MatPivotSetUp(): initialize shift context sctx */
756   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
757 
758   PetscCall(ISGetIndices(ip, &rip));
759   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
760 
761   sctx.shift_amount = 0.;
762   sctx.nshift       = 0;
763   do {
764     sctx.newshift = PETSC_FALSE;
765     for (i = 0; i < mbs; i++) {
766       rtmp[i] = 0.0;
767       jl[i]   = mbs;
768       il[0]   = 0;
769     }
770 
771     for (k = 0; k < mbs; k++) {
772       bval = ba + bi[k];
773       /* initialize k-th row by the perm[k]-th row of A */
774       jmin = ai[rip[k]];
775       jmax = ai[rip[k] + 1];
776       for (j = jmin; j < jmax; j++) {
777         col = rip[aj[j]];
778         if (col >= k) { /* only take upper triangular entry */
779           rtmp[col] = aa[j];
780           *bval++   = 0.0; /* for in-place factorization */
781         }
782       }
783 
784       /* shift the diagonal of the matrix */
785       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
786 
787       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
788       dk = rtmp[k];
789       i  = jl[k]; /* first row to be added to k_th row  */
790 
791       while (i < k) {
792         nexti = jl[i]; /* next row to be added to k_th row */
793 
794         /* compute multiplier, update diag(k) and U(i,k) */
795         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
796         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
797         dk += uikdi * ba[ili];
798         ba[ili] = uikdi; /* -U(i,k) */
799 
800         /* add multiple of row i to k-th row */
801         jmin = ili + 1;
802         jmax = bi[i + 1];
803         if (jmin < jmax) {
804           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
805           /* update il and jl for row i */
806           il[i] = jmin;
807           j     = bj[jmin];
808           jl[i] = jl[j];
809           jl[j] = i;
810         }
811         i = nexti;
812       }
813 
814       /* shift the diagonals when zero pivot is detected */
815       /* compute rs=sum of abs(off-diagonal) */
816       rs   = 0.0;
817       jmin = bi[k] + 1;
818       nz   = bi[k + 1] - jmin;
819       if (nz) {
820         bcol = bj + jmin;
821         while (nz--) {
822           rs += PetscAbsScalar(rtmp[*bcol]);
823           bcol++;
824         }
825       }
826 
827       sctx.rs = rs;
828       sctx.pv = dk;
829       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
830       if (sctx.newshift) break;
831       dk = sctx.pv;
832 
833       /* copy data into U(k,:) */
834       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
835       jmin      = bi[k] + 1;
836       jmax      = bi[k + 1];
837       if (jmin < jmax) {
838         for (j = jmin; j < jmax; j++) {
839           col       = bj[j];
840           ba[j]     = rtmp[col];
841           rtmp[col] = 0.0;
842         }
843         /* add the k-th row into il and jl */
844         il[k] = jmin;
845         i     = bj[jmin];
846         jl[k] = jl[i];
847         jl[i] = k;
848       }
849     }
850   } while (sctx.newshift);
851   PetscCall(PetscFree3(rtmp, il, jl));
852 
853   PetscCall(ISRestoreIndices(ip, &rip));
854 
855   C->assembled    = PETSC_TRUE;
856   C->preallocated = PETSC_TRUE;
857 
858   PetscCall(PetscLogFlops(C->rmap->N));
859   if (sctx.nshift) {
860     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
861       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
862     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
863       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
864     }
865   }
866   PetscFunctionReturn(PETSC_SUCCESS);
867 }
868 
869 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
870 {
871   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
872   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
873   PetscInt       i, j, am = a->mbs;
874   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
875   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
876   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
877   PetscReal      rs;
878   FactorShiftCtx sctx;
879 
880   PetscFunctionBegin;
881   /* MatPivotSetUp(): initialize shift context sctx */
882   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
883 
884   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
885 
886   do {
887     sctx.newshift = PETSC_FALSE;
888     for (i = 0; i < am; i++) {
889       rtmp[i] = 0.0;
890       jl[i]   = am;
891       il[0]   = 0;
892     }
893 
894     for (k = 0; k < am; k++) {
895       /* initialize k-th row with elements nonzero in row perm(k) of A */
896       nz   = ai[k + 1] - ai[k];
897       acol = aj + ai[k];
898       aval = aa + ai[k];
899       bval = ba + bi[k];
900       while (nz--) {
901         if (*acol < k) { /* skip lower triangular entries */
902           acol++;
903           aval++;
904         } else {
905           rtmp[*acol++] = *aval++;
906           *bval++       = 0.0; /* for in-place factorization */
907         }
908       }
909 
910       /* shift the diagonal of the matrix */
911       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
912 
913       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
914       dk = rtmp[k];
915       i  = jl[k]; /* first row to be added to k_th row  */
916 
917       while (i < k) {
918         nexti = jl[i]; /* next row to be added to k_th row */
919         /* compute multiplier, update D(k) and U(i,k) */
920         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
921         uikdi = -ba[ili] * ba[bi[i]];
922         dk += uikdi * ba[ili];
923         ba[ili] = uikdi; /* -U(i,k) */
924 
925         /* add multiple of row i to k-th row ... */
926         jmin = ili + 1;
927         nz   = bi[i + 1] - jmin;
928         if (nz > 0) {
929           bcol = bj + jmin;
930           bval = ba + jmin;
931           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
932           /* update il and jl for i-th row */
933           il[i] = jmin;
934           j     = bj[jmin];
935           jl[i] = jl[j];
936           jl[j] = i;
937         }
938         i = nexti;
939       }
940 
941       /* shift the diagonals when zero pivot is detected */
942       /* compute rs=sum of abs(off-diagonal) */
943       rs   = 0.0;
944       jmin = bi[k] + 1;
945       nz   = bi[k + 1] - jmin;
946       if (nz) {
947         bcol = bj + jmin;
948         while (nz--) {
949           rs += PetscAbsScalar(rtmp[*bcol]);
950           bcol++;
951         }
952       }
953 
954       sctx.rs = rs;
955       sctx.pv = dk;
956       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
957       if (sctx.newshift) break; /* sctx.shift_amount is updated */
958       dk = sctx.pv;
959 
960       /* copy data into U(k,:) */
961       ba[bi[k]] = 1.0 / dk;
962       jmin      = bi[k] + 1;
963       nz        = bi[k + 1] - jmin;
964       if (nz) {
965         bcol = bj + jmin;
966         bval = ba + jmin;
967         while (nz--) {
968           *bval++       = rtmp[*bcol];
969           rtmp[*bcol++] = 0.0;
970         }
971         /* add k-th row into il and jl */
972         il[k] = jmin;
973         i     = bj[jmin];
974         jl[k] = jl[i];
975         jl[i] = k;
976       }
977     }
978   } while (sctx.newshift);
979   PetscCall(PetscFree3(rtmp, il, jl));
980 
981   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
982   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
983   C->assembled           = PETSC_TRUE;
984   C->preallocated        = PETSC_TRUE;
985 
986   PetscCall(PetscLogFlops(C->rmap->N));
987   if (sctx.nshift) {
988     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
989       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
990     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
991       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
992     }
993   }
994   PetscFunctionReturn(PETSC_SUCCESS);
995 }
996 
997 #include <petscbt.h>
998 #include <../src/mat/utils/freespace.h>
999 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1000 {
1001   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1002   Mat_SeqSBAIJ      *b;
1003   Mat                B;
1004   PetscBool          perm_identity, missing;
1005   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1006   const PetscInt    *rip;
1007   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1008   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1009   PetscReal          fill = info->fill, levels = info->levels;
1010   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1011   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1012   PetscBT            lnkbt;
1013 
1014   PetscFunctionBegin;
1015   PetscCall(MatMissingDiagonal(A, &missing, &i));
1016   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1017 
1018   if (bs > 1) {
1019     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1020     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1021 
1022     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1023     PetscFunctionReturn(PETSC_SUCCESS);
1024   }
1025 
1026   PetscCall(ISIdentity(perm, &perm_identity));
1027   PetscCall(ISGetIndices(perm, &rip));
1028 
1029   /* special case that simply copies fill pattern */
1030   if (!levels && perm_identity) {
1031     PetscCall(PetscMalloc1(am + 1, &ui));
1032     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1033     B = fact;
1034     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1035 
1036     b  = (Mat_SeqSBAIJ *)B->data;
1037     uj = b->j;
1038     for (i = 0; i < am; i++) {
1039       aj = a->j + a->diag[i];
1040       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1041       b->ilen[i] = ui[i];
1042     }
1043     PetscCall(PetscFree(ui));
1044 
1045     B->factortype = MAT_FACTOR_NONE;
1046 
1047     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1048     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1049     B->factortype = MAT_FACTOR_ICC;
1050 
1051     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1052     PetscFunctionReturn(PETSC_SUCCESS);
1053   }
1054 
1055   /* initialization */
1056   PetscCall(PetscMalloc1(am + 1, &ui));
1057   ui[0] = 0;
1058   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1059 
1060   /* jl: linked list for storing indices of the pivot rows
1061      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1062   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1063   for (i = 0; i < am; i++) {
1064     jl[i] = am;
1065     il[i] = 0;
1066   }
1067 
1068   /* create and initialize a linked list for storing column indices of the active row k */
1069   nlnk = am + 1;
1070   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1071 
1072   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1073   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1074 
1075   current_space = free_space;
1076 
1077   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1078   current_space_lvl = free_space_lvl;
1079 
1080   for (k = 0; k < am; k++) { /* for each active row k */
1081     /* initialize lnk by the column indices of row rip[k] of A */
1082     nzk         = 0;
1083     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1084     ncols_upper = 0;
1085     cols        = cols_lvl + am;
1086     for (j = 0; j < ncols; j++) {
1087       i = rip[*(aj + ai[rip[k]] + j)];
1088       if (i >= k) { /* only take upper triangular entry */
1089         cols[ncols_upper]     = i;
1090         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1091         ncols_upper++;
1092       }
1093     }
1094     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1095     nzk += nlnk;
1096 
1097     /* update lnk by computing fill-in for each pivot row to be merged in */
1098     prow = jl[k]; /* 1st pivot row */
1099 
1100     while (prow < k) {
1101       nextprow = jl[prow];
1102 
1103       /* merge prow into k-th row */
1104       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1105       jmax  = ui[prow + 1];
1106       ncols = jmax - jmin;
1107       i     = jmin - ui[prow];
1108       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1109       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1110       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1111       nzk += nlnk;
1112 
1113       /* update il and jl for prow */
1114       if (jmin < jmax) {
1115         il[prow] = jmin;
1116 
1117         j        = *cols;
1118         jl[prow] = jl[j];
1119         jl[j]    = prow;
1120       }
1121       prow = nextprow;
1122     }
1123 
1124     /* if free space is not available, make more free space */
1125     if (current_space->local_remaining < nzk) {
1126       i = am - k + 1;                                                             /* num of unfactored rows */
1127       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1128       PetscCall(PetscFreeSpaceGet(i, &current_space));
1129       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1130       reallocs++;
1131     }
1132 
1133     /* copy data into free_space and free_space_lvl, then initialize lnk */
1134     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1135 
1136     /* add the k-th row into il and jl */
1137     if (nzk - 1 > 0) {
1138       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1139       jl[k] = jl[i];
1140       jl[i] = k;
1141       il[k] = ui[k] + 1;
1142     }
1143     uj_ptr[k]     = current_space->array;
1144     uj_lvl_ptr[k] = current_space_lvl->array;
1145 
1146     current_space->array += nzk;
1147     current_space->local_used += nzk;
1148     current_space->local_remaining -= nzk;
1149 
1150     current_space_lvl->array += nzk;
1151     current_space_lvl->local_used += nzk;
1152     current_space_lvl->local_remaining -= nzk;
1153 
1154     ui[k + 1] = ui[k] + nzk;
1155   }
1156 
1157   PetscCall(ISRestoreIndices(perm, &rip));
1158   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1159   PetscCall(PetscFree(cols_lvl));
1160 
1161   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1162   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1163   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1164   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1165   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1166 
1167   /* put together the new matrix in MATSEQSBAIJ format */
1168   B = fact;
1169   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1170 
1171   b               = (Mat_SeqSBAIJ *)B->data;
1172   b->singlemalloc = PETSC_FALSE;
1173   b->free_a       = PETSC_TRUE;
1174   b->free_ij      = PETSC_TRUE;
1175 
1176   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1177 
1178   b->j             = uj;
1179   b->i             = ui;
1180   b->diag          = NULL;
1181   b->ilen          = NULL;
1182   b->imax          = NULL;
1183   b->row           = perm;
1184   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1185 
1186   PetscCall(PetscObjectReference((PetscObject)perm));
1187 
1188   b->icol = perm;
1189 
1190   PetscCall(PetscObjectReference((PetscObject)perm));
1191   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1192 
1193   b->maxnz = b->nz = ui[am];
1194 
1195   B->info.factor_mallocs   = reallocs;
1196   B->info.fill_ratio_given = fill;
1197   if (ai[am] != 0.) {
1198     /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1199     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1200   } else {
1201     B->info.fill_ratio_needed = 0.0;
1202   }
1203 #if defined(PETSC_USE_INFO)
1204   if (ai[am] != 0) {
1205     PetscReal af = B->info.fill_ratio_needed;
1206     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1207     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1208     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1209   } else {
1210     PetscCall(PetscInfo(A, "Empty matrix\n"));
1211   }
1212 #endif
1213   if (perm_identity) {
1214     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1215     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1216     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1217   } else {
1218     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1219   }
1220   PetscFunctionReturn(PETSC_SUCCESS);
1221 }
1222 
1223 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1224 {
1225   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1226   Mat_SeqSBAIJ      *b;
1227   Mat                B;
1228   PetscBool          perm_identity, missing;
1229   PetscReal          fill = info->fill;
1230   const PetscInt    *rip;
1231   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1232   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1233   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1234   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1235   PetscBT            lnkbt;
1236 
1237   PetscFunctionBegin;
1238   if (bs > 1) { /* convert to seqsbaij */
1239     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1240     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1241 
1242     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1243     PetscFunctionReturn(PETSC_SUCCESS);
1244   }
1245 
1246   PetscCall(MatMissingDiagonal(A, &missing, &i));
1247   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1248 
1249   /* check whether perm is the identity mapping */
1250   PetscCall(ISIdentity(perm, &perm_identity));
1251   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1252   PetscCall(ISGetIndices(perm, &rip));
1253 
1254   /* initialization */
1255   PetscCall(PetscMalloc1(mbs + 1, &ui));
1256   ui[0] = 0;
1257 
1258   /* jl: linked list for storing indices of the pivot rows
1259      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1260   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1261   for (i = 0; i < mbs; i++) {
1262     jl[i] = mbs;
1263     il[i] = 0;
1264   }
1265 
1266   /* create and initialize a linked list for storing column indices of the active row k */
1267   nlnk = mbs + 1;
1268   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1269 
1270   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1271   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1272 
1273   current_space = free_space;
1274 
1275   for (k = 0; k < mbs; k++) { /* for each active row k */
1276     /* initialize lnk by the column indices of row rip[k] of A */
1277     nzk         = 0;
1278     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1279     ncols_upper = 0;
1280     for (j = 0; j < ncols; j++) {
1281       i = rip[*(aj + ai[rip[k]] + j)];
1282       if (i >= k) { /* only take upper triangular entry */
1283         cols[ncols_upper] = i;
1284         ncols_upper++;
1285       }
1286     }
1287     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1288     nzk += nlnk;
1289 
1290     /* update lnk by computing fill-in for each pivot row to be merged in */
1291     prow = jl[k]; /* 1st pivot row */
1292 
1293     while (prow < k) {
1294       nextprow = jl[prow];
1295       /* merge prow into k-th row */
1296       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1297       jmax   = ui[prow + 1];
1298       ncols  = jmax - jmin;
1299       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1300       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1301       nzk += nlnk;
1302 
1303       /* update il and jl for prow */
1304       if (jmin < jmax) {
1305         il[prow] = jmin;
1306         j        = *uj_ptr;
1307         jl[prow] = jl[j];
1308         jl[j]    = prow;
1309       }
1310       prow = nextprow;
1311     }
1312 
1313     /* if free space is not available, make more free space */
1314     if (current_space->local_remaining < nzk) {
1315       i = mbs - k + 1;                                                            /* num of unfactored rows */
1316       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1317       PetscCall(PetscFreeSpaceGet(i, &current_space));
1318       reallocs++;
1319     }
1320 
1321     /* copy data into free space, then initialize lnk */
1322     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1323 
1324     /* add the k-th row into il and jl */
1325     if (nzk - 1 > 0) {
1326       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1327       jl[k] = jl[i];
1328       jl[i] = k;
1329       il[k] = ui[k] + 1;
1330     }
1331     ui_ptr[k] = current_space->array;
1332     current_space->array += nzk;
1333     current_space->local_used += nzk;
1334     current_space->local_remaining -= nzk;
1335 
1336     ui[k + 1] = ui[k] + nzk;
1337   }
1338 
1339   PetscCall(ISRestoreIndices(perm, &rip));
1340   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1341 
1342   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1343   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1344   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1345   PetscCall(PetscLLDestroy(lnk, lnkbt));
1346 
1347   /* put together the new matrix in MATSEQSBAIJ format */
1348   B = fact;
1349   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1350 
1351   b               = (Mat_SeqSBAIJ *)B->data;
1352   b->singlemalloc = PETSC_FALSE;
1353   b->free_a       = PETSC_TRUE;
1354   b->free_ij      = PETSC_TRUE;
1355 
1356   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1357 
1358   b->j             = uj;
1359   b->i             = ui;
1360   b->diag          = NULL;
1361   b->ilen          = NULL;
1362   b->imax          = NULL;
1363   b->row           = perm;
1364   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1365 
1366   PetscCall(PetscObjectReference((PetscObject)perm));
1367   b->icol = perm;
1368   PetscCall(PetscObjectReference((PetscObject)perm));
1369   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1370   b->maxnz = b->nz = ui[mbs];
1371 
1372   B->info.factor_mallocs   = reallocs;
1373   B->info.fill_ratio_given = fill;
1374   if (ai[mbs] != 0.) {
1375     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1376     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1377   } else {
1378     B->info.fill_ratio_needed = 0.0;
1379   }
1380 #if defined(PETSC_USE_INFO)
1381   if (ai[mbs] != 0.) {
1382     PetscReal af = B->info.fill_ratio_needed;
1383     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1384     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1385     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1386   } else {
1387     PetscCall(PetscInfo(A, "Empty matrix\n"));
1388   }
1389 #endif
1390   if (perm_identity) {
1391     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1392   } else {
1393     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1394   }
1395   PetscFunctionReturn(PETSC_SUCCESS);
1396 }
1397 
1398 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1399 {
1400   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1401   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1402   PetscInt           i, k, n                       = a->mbs;
1403   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1404   const MatScalar   *aa = a->a, *v;
1405   PetscScalar       *x, *s, *t, *ls;
1406   const PetscScalar *b;
1407 
1408   PetscFunctionBegin;
1409   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1410   PetscCall(VecGetArrayRead(bb, &b));
1411   PetscCall(VecGetArray(xx, &x));
1412   t = a->solve_work;
1413 
1414   /* forward solve the lower triangular */
1415   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1416 
1417   for (i = 1; i < n; i++) {
1418     v  = aa + bs2 * ai[i];
1419     vi = aj + ai[i];
1420     nz = ai[i + 1] - ai[i];
1421     s  = t + bs * i;
1422     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1423     for (k = 0; k < nz; k++) {
1424       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1425       v += bs2;
1426     }
1427   }
1428 
1429   /* backward solve the upper triangular */
1430   ls = a->solve_work + A->cmap->n;
1431   for (i = n - 1; i >= 0; i--) {
1432     v  = aa + bs2 * (adiag[i + 1] + 1);
1433     vi = aj + adiag[i + 1] + 1;
1434     nz = adiag[i] - adiag[i + 1] - 1;
1435     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1436     for (k = 0; k < nz; k++) {
1437       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1438       v += bs2;
1439     }
1440     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1441     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1442   }
1443 
1444   PetscCall(VecRestoreArrayRead(bb, &b));
1445   PetscCall(VecRestoreArray(xx, &x));
1446   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1447   PetscFunctionReturn(PETSC_SUCCESS);
1448 }
1449 
1450 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1451 {
1452   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1453   IS                 iscol = a->col, isrow = a->row;
1454   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1455   PetscInt           i, m, n = a->mbs;
1456   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1457   const MatScalar   *aa = a->a, *v;
1458   PetscScalar       *x, *s, *t, *ls;
1459   const PetscScalar *b;
1460 
1461   PetscFunctionBegin;
1462   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1463   PetscCall(VecGetArrayRead(bb, &b));
1464   PetscCall(VecGetArray(xx, &x));
1465   t = a->solve_work;
1466 
1467   PetscCall(ISGetIndices(isrow, &rout));
1468   r = rout;
1469   PetscCall(ISGetIndices(iscol, &cout));
1470   c = cout;
1471 
1472   /* forward solve the lower triangular */
1473   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1474   for (i = 1; i < n; i++) {
1475     v  = aa + bs2 * ai[i];
1476     vi = aj + ai[i];
1477     nz = ai[i + 1] - ai[i];
1478     s  = t + bs * i;
1479     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1480     for (m = 0; m < nz; m++) {
1481       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1482       v += bs2;
1483     }
1484   }
1485 
1486   /* backward solve the upper triangular */
1487   ls = a->solve_work + A->cmap->n;
1488   for (i = n - 1; i >= 0; i--) {
1489     v  = aa + bs2 * (adiag[i + 1] + 1);
1490     vi = aj + adiag[i + 1] + 1;
1491     nz = adiag[i] - adiag[i + 1] - 1;
1492     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1493     for (m = 0; m < nz; m++) {
1494       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1495       v += bs2;
1496     }
1497     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1498     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1499   }
1500   PetscCall(ISRestoreIndices(isrow, &rout));
1501   PetscCall(ISRestoreIndices(iscol, &cout));
1502   PetscCall(VecRestoreArrayRead(bb, &b));
1503   PetscCall(VecRestoreArray(xx, &x));
1504   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1505   PetscFunctionReturn(PETSC_SUCCESS);
1506 }
1507 
1508 /*
1509     For each block in an block array saves the largest absolute value in the block into another array
1510 */
1511 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray)
1512 {
1513   PetscInt i, j;
1514 
1515   PetscFunctionBegin;
1516   PetscCall(PetscArrayzero(absarray, nbs + 1));
1517   for (i = 0; i < nbs; i++) {
1518     for (j = 0; j < bs2; j++) {
1519       if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1520     }
1521   }
1522   PetscFunctionReturn(PETSC_SUCCESS);
1523 }
1524 
1525 /*
1526      This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1527 */
1528 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact)
1529 {
1530   Mat             B = *fact;
1531   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b;
1532   IS              isicol;
1533   const PetscInt *r, *ic;
1534   PetscInt        i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1535   PetscInt       *bi, *bj, *bdiag;
1536 
1537   PetscInt   row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1538   PetscInt   nlnk, *lnk;
1539   PetscBT    lnkbt;
1540   PetscBool  row_identity, icol_identity;
1541   MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1542   PetscInt   j, nz, *pj, *bjtmp, k, ncut, *jtmp;
1543 
1544   PetscReal  dt = info->dt; /* shift=info->shiftamount; */
1545   PetscInt   nnz_max;
1546   PetscBool  missing;
1547   PetscReal *vtmp_abs;
1548   MatScalar *v_work;
1549   PetscInt  *v_pivots;
1550   PetscBool  allowzeropivot, zeropivotdetected = PETSC_FALSE;
1551 
1552   PetscFunctionBegin;
1553   /* symbolic factorization, can be reused */
1554   PetscCall(MatMissingDiagonal(A, &missing, &i));
1555   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1556   adiag = a->diag;
1557 
1558   PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
1559 
1560   /* bdiag is location of diagonal in factor */
1561   PetscCall(PetscMalloc1(mbs + 1, &bdiag));
1562 
1563   /* allocate row pointers bi */
1564   PetscCall(PetscMalloc1(2 * mbs + 2, &bi));
1565 
1566   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1567   dtcount = (PetscInt)info->dtcount;
1568   if (dtcount > mbs - 1) dtcount = mbs - 1;
1569   nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1570   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1571   PetscCall(PetscMalloc1(nnz_max, &bj));
1572   nnz_max = nnz_max * bs2;
1573   PetscCall(PetscMalloc1(nnz_max, &ba));
1574 
1575   /* put together the new matrix */
1576   PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1577 
1578   b               = (Mat_SeqBAIJ *)(B)->data;
1579   b->free_a       = PETSC_TRUE;
1580   b->free_ij      = PETSC_TRUE;
1581   b->singlemalloc = PETSC_FALSE;
1582 
1583   b->a    = ba;
1584   b->j    = bj;
1585   b->i    = bi;
1586   b->diag = bdiag;
1587   b->ilen = NULL;
1588   b->imax = NULL;
1589   b->row  = isrow;
1590   b->col  = iscol;
1591 
1592   PetscCall(PetscObjectReference((PetscObject)isrow));
1593   PetscCall(PetscObjectReference((PetscObject)iscol));
1594 
1595   b->icol = isicol;
1596   PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work));
1597   b->maxnz = nnz_max / bs2;
1598 
1599   (B)->factortype            = MAT_FACTOR_ILUDT;
1600   (B)->info.factor_mallocs   = 0;
1601   (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1602   /* end of symbolic factorization */
1603   PetscCall(ISGetIndices(isrow, &r));
1604   PetscCall(ISGetIndices(isicol, &ic));
1605 
1606   /* linked list for storing column indices of the active row */
1607   nlnk = mbs + 1;
1608   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1609 
1610   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1611   PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp));
1612   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1613   PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp));
1614   PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs));
1615   PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots));
1616 
1617   allowzeropivot  = PetscNot(A->erroriffailure);
1618   bi[0]           = 0;
1619   bdiag[0]        = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1620   bi[2 * mbs + 1] = bdiag[0] + 1;        /* endof bj and ba array */
1621   for (i = 0; i < mbs; i++) {
1622     /* copy initial fill into linked list */
1623     nzi = ai[r[i] + 1] - ai[r[i]];
1624     PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
1625     nzi_al = adiag[r[i]] - ai[r[i]];
1626     nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;
1627 
1628     /* load in initial unfactored row */
1629     ajtmp = aj + ai[r[i]];
1630     PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt));
1631     PetscCall(PetscArrayzero(rtmp, mbs * bs2));
1632     aatmp = a->a + bs2 * ai[r[i]];
1633     for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2));
1634 
1635     /* add pivot rows into linked list */
1636     row = lnk[mbs];
1637     while (row < i) {
1638       nzi_bl = bi[row + 1] - bi[row] + 1;
1639       bjtmp  = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1640       PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im));
1641       nzi += nlnk;
1642       row = lnk[row];
1643     }
1644 
1645     /* copy data from lnk into jtmp, then initialize lnk */
1646     PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt));
1647 
1648     /* numerical factorization */
1649     bjtmp = jtmp;
1650     row   = *bjtmp++; /* 1st pivot row */
1651 
1652     while (row < i) {
1653       pc = rtmp + bs2 * row;
1654       pv = ba + bs2 * bdiag[row];                           /* inv(diag) of the pivot row */
1655       PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1656       PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs));
1657       if (vtmp_abs[0] > dt) {         /* apply tolerance dropping rule */
1658         pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1659         pv = ba + bs2 * (bdiag[row + 1] + 1);
1660         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1661         for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
1662         /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */
1663       }
1664       row = *bjtmp++;
1665     }
1666 
1667     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1668     nzi_bl = 0;
1669     j      = 0;
1670     while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1671       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1672       nzi_bl++;
1673       j++;
1674     }
1675     nzi_bu = nzi - nzi_bl - 1;
1676 
1677     while (j < nzi) { /* U-part */
1678       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1679       j++;
1680     }
1681 
1682     PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs));
1683 
1684     bjtmp = bj + bi[i];
1685     batmp = ba + bs2 * bi[i];
1686     /* apply level dropping rule to L part */
1687     ncut = nzi_al + dtcount;
1688     if (ncut < nzi_bl) {
1689       PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp));
1690       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp));
1691     } else {
1692       ncut = nzi_bl;
1693     }
1694     for (j = 0; j < ncut; j++) {
1695       bjtmp[j] = jtmp[j];
1696       PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2));
1697     }
1698     bi[i + 1] = bi[i] + ncut;
1699     nzi       = ncut + 1;
1700 
1701     /* apply level dropping rule to U part */
1702     ncut = nzi_au + dtcount;
1703     if (ncut < nzi_bu) {
1704       PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1));
1705       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1));
1706     } else {
1707       ncut = nzi_bu;
1708     }
1709     nzi += ncut;
1710 
1711     /* mark bdiagonal */
1712     bdiag[i + 1]    = bdiag[i] - (ncut + 1);
1713     bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);
1714 
1715     bjtmp = bj + bdiag[i];
1716     batmp = ba + bs2 * bdiag[i];
1717     PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2));
1718     *bjtmp = i;
1719 
1720     bjtmp = bj + bdiag[i + 1] + 1;
1721     batmp = ba + (bdiag[i + 1] + 1) * bs2;
1722 
1723     for (k = 0; k < ncut; k++) {
1724       bjtmp[k] = jtmp[nzi_bl + 1 + k];
1725       PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2));
1726     }
1727 
1728     im[i] = nzi; /* used by PetscLLAddSortedLU() */
1729 
1730     /* invert diagonal block for simpler triangular solves - add shift??? */
1731     batmp = ba + bs2 * bdiag[i];
1732 
1733     PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
1734     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1735   } /* for (i=0; i<mbs; i++) */
1736   PetscCall(PetscFree3(v_work, multiplier, v_pivots));
1737 
1738   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1739   PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]);
1740 
1741   PetscCall(ISRestoreIndices(isrow, &r));
1742   PetscCall(ISRestoreIndices(isicol, &ic));
1743 
1744   PetscCall(PetscLLDestroy(lnk, lnkbt));
1745 
1746   PetscCall(PetscFree2(im, jtmp));
1747   PetscCall(PetscFree2(rtmp, vtmp));
1748 
1749   PetscCall(PetscLogFlops(bs2 * B->cmap->n));
1750   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1751 
1752   PetscCall(ISIdentity(isrow, &row_identity));
1753   PetscCall(ISIdentity(isicol, &icol_identity));
1754   if (row_identity && icol_identity) {
1755     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1756   } else {
1757     B->ops->solve = MatSolve_SeqBAIJ_N;
1758   }
1759 
1760   B->ops->solveadd          = NULL;
1761   B->ops->solvetranspose    = NULL;
1762   B->ops->solvetransposeadd = NULL;
1763   B->ops->matsolve          = NULL;
1764   B->assembled              = PETSC_TRUE;
1765   B->preallocated           = PETSC_TRUE;
1766   PetscFunctionReturn(PETSC_SUCCESS);
1767 }
1768