xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision 7bccdc57bcbb04bf81915df4b7fb460de8a2065b)
1 /*
2     Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 
7 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
8 {
9   Mat             C = B;
10   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
11   IS              isrow = b->row, isicol = b->icol;
12   const PetscInt *r, *ic;
13   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
14   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
15   MatScalar      *rtmp, *pc, *mwork, *pv;
16   MatScalar      *aa             = a->a, *v;
17   PetscReal       shift          = info->shiftamount;
18   const PetscBool allowzeropivot = PetscNot(A->erroriffailure);
19 
20   PetscFunctionBegin;
21   PetscCall(ISGetIndices(isrow, &r));
22   PetscCall(ISGetIndices(isicol, &ic));
23 
24   /* generate work space needed by the factorization */
25   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
26   PetscCall(PetscArrayzero(rtmp, bs2 * n));
27 
28   for (PetscInt i = 0; i < n; i++) {
29     /* zero rtmp */
30     /* L part */
31     PetscInt *pj;
32     PetscInt  nzL, nz = bi[i + 1] - bi[i];
33     bjtmp = bj + bi[i];
34     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
35 
36     /* U part */
37     nz    = bdiag[i] - bdiag[i + 1];
38     bjtmp = bj + bdiag[i + 1] + 1;
39     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
40 
41     /* load in initial (unfactored row) */
42     nz    = ai[r[i] + 1] - ai[r[i]];
43     ajtmp = aj + ai[r[i]];
44     v     = aa + bs2 * ai[r[i]];
45     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
46 
47     /* elimination */
48     bjtmp = bj + bi[i];
49     nzL   = bi[i + 1] - bi[i];
50     for (PetscInt k = 0; k < nzL; k++) {
51       PetscBool flg = PETSC_FALSE;
52       PetscInt  row = bjtmp[k];
53 
54       pc = rtmp + bs2 * row;
55       for (PetscInt j = 0; j < bs2; j++) {
56         if (pc[j] != (PetscScalar)0.0) {
57           flg = PETSC_TRUE;
58           break;
59         }
60       }
61       if (flg) {
62         pv = b->a + bs2 * bdiag[row];
63         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
64         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
65 
66         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
67         pv = b->a + bs2 * (bdiag[row + 1] + 1);
68         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
69         for (PetscInt j = 0; j < nz; j++) {
70           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
71           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
72           v = rtmp + 4 * pj[j];
73           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
74           pv += 4;
75         }
76         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
77       }
78     }
79 
80     /* finished row so stick it into b->a */
81     /* L part */
82     pv = b->a + bs2 * bi[i];
83     pj = b->j + bi[i];
84     nz = bi[i + 1] - bi[i];
85     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
86 
87     /* Mark diagonal and invert diagonal for simpler triangular solves */
88     pv = b->a + bs2 * bdiag[i];
89     pj = b->j + bdiag[i];
90     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
91     {
92       PetscBool zeropivotdetected;
93 
94       PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
95       if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
96     }
97 
98     /* U part */
99     pv = b->a + bs2 * (bdiag[i + 1] + 1);
100     pj = b->j + bdiag[i + 1] + 1;
101     nz = bdiag[i] - bdiag[i + 1] - 1;
102     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103   }
104 
105   PetscCall(PetscFree2(rtmp, mwork));
106   PetscCall(ISRestoreIndices(isicol, &ic));
107   PetscCall(ISRestoreIndices(isrow, &r));
108 
109   C->ops->solve          = MatSolve_SeqBAIJ_2;
110   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
111   C->assembled           = PETSC_TRUE;
112 
113   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
114   PetscFunctionReturn(PETSC_SUCCESS);
115 }
116 
117 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
118 {
119   Mat             C = B;
120   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
121   PetscInt        i, j, k, nz, nzL, row, *pj;
122   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
123   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
124   MatScalar      *rtmp, *pc, *mwork, *pv;
125   MatScalar      *aa = a->a, *v;
126   PetscInt        flg;
127   PetscReal       shift = info->shiftamount;
128   PetscBool       allowzeropivot, zeropivotdetected;
129 
130   PetscFunctionBegin;
131   allowzeropivot = PetscNot(A->erroriffailure);
132 
133   /* generate work space needed by the factorization */
134   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
135   PetscCall(PetscArrayzero(rtmp, bs2 * n));
136 
137   for (i = 0; i < n; i++) {
138     /* zero rtmp */
139     /* L part */
140     nz    = bi[i + 1] - bi[i];
141     bjtmp = bj + bi[i];
142     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
143 
144     /* U part */
145     nz    = bdiag[i] - bdiag[i + 1];
146     bjtmp = bj + bdiag[i + 1] + 1;
147     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
148 
149     /* load in initial (unfactored row) */
150     nz    = ai[i + 1] - ai[i];
151     ajtmp = aj + ai[i];
152     v     = aa + bs2 * ai[i];
153     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
154 
155     /* elimination */
156     bjtmp = bj + bi[i];
157     nzL   = bi[i + 1] - bi[i];
158     for (k = 0; k < nzL; k++) {
159       row = bjtmp[k];
160       pc  = rtmp + bs2 * row;
161       for (flg = 0, j = 0; j < bs2; j++) {
162         if (pc[j] != (PetscScalar)0.0) {
163           flg = 1;
164           break;
165         }
166       }
167       if (flg) {
168         pv = b->a + bs2 * bdiag[row];
169         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
170         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
171 
172         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
173         pv = b->a + bs2 * (bdiag[row + 1] + 1);
174         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
175         for (j = 0; j < nz; j++) {
176           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
177           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
178           v = rtmp + 4 * pj[j];
179           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
180           pv += 4;
181         }
182         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
183       }
184     }
185 
186     /* finished row so stick it into b->a */
187     /* L part */
188     pv = b->a + bs2 * bi[i];
189     pj = b->j + bi[i];
190     nz = bi[i + 1] - bi[i];
191     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
192 
193     /* Mark diagonal and invert diagonal for simpler triangular solves */
194     pv = b->a + bs2 * bdiag[i];
195     pj = b->j + bdiag[i];
196     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
197     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
198     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
199 
200     /* U part */
201     /*
202     pv = b->a + bs2*bi[2*n-i];
203     pj = b->j + bi[2*n-i];
204     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
205     */
206     pv = b->a + bs2 * (bdiag[i + 1] + 1);
207     pj = b->j + bdiag[i + 1] + 1;
208     nz = bdiag[i] - bdiag[i + 1] - 1;
209     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
210   }
211   PetscCall(PetscFree2(rtmp, mwork));
212 
213   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
214   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
215   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
216   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
217   C->assembled           = PETSC_TRUE;
218 
219   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
220   PetscFunctionReturn(PETSC_SUCCESS);
221 }
222 
223 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
224 {
225   Mat             C = B;
226   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
227   IS              isrow = b->row, isicol = b->icol;
228   const PetscInt *r, *ic;
229   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
230   PetscInt       *ajtmpold, *ajtmp, nz, row;
231   PetscInt        idx, *ai = a->i, *aj = a->j, *pj;
232   const PetscInt *diag_offset;
233   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
234   MatScalar       p1, p2, p3, p4;
235   MatScalar      *ba = b->a, *aa = a->a;
236   PetscReal       shift = info->shiftamount;
237   PetscBool       allowzeropivot, zeropivotdetected;
238 
239   PetscFunctionBegin;
240   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
241   A->factortype = MAT_FACTOR_NONE;
242   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
243   A->factortype  = MAT_FACTOR_ILU;
244   allowzeropivot = PetscNot(A->erroriffailure);
245   PetscCall(ISGetIndices(isrow, &r));
246   PetscCall(ISGetIndices(isicol, &ic));
247   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
248 
249   for (i = 0; i < n; i++) {
250     nz    = bi[i + 1] - bi[i];
251     ajtmp = bj + bi[i];
252     for (j = 0; j < nz; j++) {
253       x    = rtmp + 4 * ajtmp[j];
254       x[0] = x[1] = x[2] = x[3] = 0.0;
255     }
256     /* load in initial (unfactored row) */
257     idx      = r[i];
258     nz       = ai[idx + 1] - ai[idx];
259     ajtmpold = aj + ai[idx];
260     v        = aa + 4 * ai[idx];
261     for (j = 0; j < nz; j++) {
262       x    = rtmp + 4 * ic[ajtmpold[j]];
263       x[0] = v[0];
264       x[1] = v[1];
265       x[2] = v[2];
266       x[3] = v[3];
267       v += 4;
268     }
269     row = *ajtmp++;
270     while (row < i) {
271       pc = rtmp + 4 * row;
272       p1 = pc[0];
273       p2 = pc[1];
274       p3 = pc[2];
275       p4 = pc[3];
276       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
277         pv    = ba + 4 * diag_offset[row];
278         pj    = bj + diag_offset[row] + 1;
279         x1    = pv[0];
280         x2    = pv[1];
281         x3    = pv[2];
282         x4    = pv[3];
283         pc[0] = m1 = p1 * x1 + p3 * x2;
284         pc[1] = m2 = p2 * x1 + p4 * x2;
285         pc[2] = m3 = p1 * x3 + p3 * x4;
286         pc[3] = m4 = p2 * x3 + p4 * x4;
287         nz         = bi[row + 1] - diag_offset[row] - 1;
288         pv += 4;
289         for (j = 0; j < nz; j++) {
290           x1 = pv[0];
291           x2 = pv[1];
292           x3 = pv[2];
293           x4 = pv[3];
294           x  = rtmp + 4 * pj[j];
295           x[0] -= m1 * x1 + m3 * x2;
296           x[1] -= m2 * x1 + m4 * x2;
297           x[2] -= m1 * x3 + m3 * x4;
298           x[3] -= m2 * x3 + m4 * x4;
299           pv += 4;
300         }
301         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
302       }
303       row = *ajtmp++;
304     }
305     /* finished row so stick it into b->a */
306     pv = ba + 4 * bi[i];
307     pj = bj + bi[i];
308     nz = bi[i + 1] - bi[i];
309     for (j = 0; j < nz; j++) {
310       x     = rtmp + 4 * pj[j];
311       pv[0] = x[0];
312       pv[1] = x[1];
313       pv[2] = x[2];
314       pv[3] = x[3];
315       pv += 4;
316     }
317     /* invert diagonal block */
318     w = ba + 4 * diag_offset[i];
319     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
320     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
321   }
322 
323   PetscCall(PetscFree(rtmp));
324   PetscCall(ISRestoreIndices(isicol, &ic));
325   PetscCall(ISRestoreIndices(isrow, &r));
326 
327   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
328   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
329   C->assembled           = PETSC_TRUE;
330 
331   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
332   PetscFunctionReturn(PETSC_SUCCESS);
333 }
334 /*
335       Version for when blocks are 2 by 2 Using natural ordering
336 */
337 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
338 {
339   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
340   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
341   PetscInt       *ajtmpold, *ajtmp, nz, row;
342   PetscInt       *ai = a->i, *aj = a->j, *pj;
343   const PetscInt *diag_offset;
344   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
345   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
346   MatScalar      *ba = b->a, *aa = a->a;
347   PetscReal       shift = info->shiftamount;
348   PetscBool       allowzeropivot, zeropivotdetected;
349 
350   PetscFunctionBegin;
351   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
352   A->factortype = MAT_FACTOR_NONE;
353   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
354   A->factortype  = MAT_FACTOR_ILU;
355   allowzeropivot = PetscNot(A->erroriffailure);
356   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
357   for (i = 0; i < n; i++) {
358     nz    = bi[i + 1] - bi[i];
359     ajtmp = bj + bi[i];
360     for (j = 0; j < nz; j++) {
361       x    = rtmp + 4 * ajtmp[j];
362       x[0] = x[1] = x[2] = x[3] = 0.0;
363     }
364     /* load in initial (unfactored row) */
365     nz       = ai[i + 1] - ai[i];
366     ajtmpold = aj + ai[i];
367     v        = aa + 4 * ai[i];
368     for (j = 0; j < nz; j++) {
369       x    = rtmp + 4 * ajtmpold[j];
370       x[0] = v[0];
371       x[1] = v[1];
372       x[2] = v[2];
373       x[3] = v[3];
374       v += 4;
375     }
376     row = *ajtmp++;
377     while (row < i) {
378       pc = rtmp + 4 * row;
379       p1 = pc[0];
380       p2 = pc[1];
381       p3 = pc[2];
382       p4 = pc[3];
383       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
384         pv    = ba + 4 * diag_offset[row];
385         pj    = bj + diag_offset[row] + 1;
386         x1    = pv[0];
387         x2    = pv[1];
388         x3    = pv[2];
389         x4    = pv[3];
390         pc[0] = m1 = p1 * x1 + p3 * x2;
391         pc[1] = m2 = p2 * x1 + p4 * x2;
392         pc[2] = m3 = p1 * x3 + p3 * x4;
393         pc[3] = m4 = p2 * x3 + p4 * x4;
394         nz         = bi[row + 1] - diag_offset[row] - 1;
395         pv += 4;
396         for (j = 0; j < nz; j++) {
397           x1 = pv[0];
398           x2 = pv[1];
399           x3 = pv[2];
400           x4 = pv[3];
401           x  = rtmp + 4 * pj[j];
402           x[0] -= m1 * x1 + m3 * x2;
403           x[1] -= m2 * x1 + m4 * x2;
404           x[2] -= m1 * x3 + m3 * x4;
405           x[3] -= m2 * x3 + m4 * x4;
406           pv += 4;
407         }
408         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
409       }
410       row = *ajtmp++;
411     }
412     /* finished row so stick it into b->a */
413     pv = ba + 4 * bi[i];
414     pj = bj + bi[i];
415     nz = bi[i + 1] - bi[i];
416     for (j = 0; j < nz; j++) {
417       x     = rtmp + 4 * pj[j];
418       pv[0] = x[0];
419       pv[1] = x[1];
420       pv[2] = x[2];
421       pv[3] = x[3];
422       /*
423       printf(" col %d:",pj[j]);
424       PetscInt j1;
425       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
426       printf("\n");
427       */
428       pv += 4;
429     }
430     /* invert diagonal block */
431     w = ba + 4 * diag_offset[i];
432     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
433     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
434   }
435 
436   PetscCall(PetscFree(rtmp));
437 
438   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
439   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
440   C->assembled           = PETSC_TRUE;
441 
442   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
443   PetscFunctionReturn(PETSC_SUCCESS);
444 }
445 
446 /*
447      Version for when blocks are 1 by 1.
448 */
449 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
450 {
451   Mat              C = B;
452   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
453   IS               isrow = b->row, isicol = b->icol;
454   const PetscInt  *r, *ic, *ics;
455   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
456   PetscInt         i, j, k, nz, nzL, row, *pj;
457   const PetscInt  *ajtmp, *bjtmp;
458   MatScalar       *rtmp, *pc, multiplier, *pv;
459   const MatScalar *aa = a->a, *v;
460   PetscBool        row_identity, col_identity;
461   FactorShiftCtx   sctx;
462   const PetscInt  *ddiag;
463   PetscReal        rs;
464   MatScalar        d;
465 
466   PetscFunctionBegin;
467   /* MatPivotSetUp(): initialize shift context sctx */
468   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
469 
470   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
471     PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &ddiag, NULL));
472     sctx.shift_top = info->zeropivot;
473     for (i = 0; i < n; i++) {
474       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
475       d  = (aa)[ddiag[i]];
476       rs = -PetscAbsScalar(d) - PetscRealPart(d);
477       v  = aa + ai[i];
478       nz = ai[i + 1] - ai[i];
479       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
480       if (rs > sctx.shift_top) sctx.shift_top = rs;
481     }
482     sctx.shift_top *= 1.1;
483     sctx.nshift_max = 5;
484     sctx.shift_lo   = 0.;
485     sctx.shift_hi   = 1.;
486   }
487 
488   PetscCall(ISGetIndices(isrow, &r));
489   PetscCall(ISGetIndices(isicol, &ic));
490   PetscCall(PetscMalloc1(n + 1, &rtmp));
491   ics = ic;
492 
493   do {
494     sctx.newshift = PETSC_FALSE;
495     for (i = 0; i < n; i++) {
496       /* zero rtmp */
497       /* L part */
498       nz    = bi[i + 1] - bi[i];
499       bjtmp = bj + bi[i];
500       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
501 
502       /* U part */
503       nz    = bdiag[i] - bdiag[i + 1];
504       bjtmp = bj + bdiag[i + 1] + 1;
505       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
506 
507       /* load in initial (unfactored row) */
508       nz    = ai[r[i] + 1] - ai[r[i]];
509       ajtmp = aj + ai[r[i]];
510       v     = aa + ai[r[i]];
511       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
512 
513       /* ZeropivotApply() */
514       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
515 
516       /* elimination */
517       bjtmp = bj + bi[i];
518       row   = *bjtmp++;
519       nzL   = bi[i + 1] - bi[i];
520       for (k = 0; k < nzL; k++) {
521         pc = rtmp + row;
522         if (*pc != (PetscScalar)0.0) {
523           pv         = b->a + bdiag[row];
524           multiplier = *pc * (*pv);
525           *pc        = multiplier;
526 
527           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
528           pv = b->a + bdiag[row + 1] + 1;
529           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
530           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
531           PetscCall(PetscLogFlops(2.0 * nz));
532         }
533         row = *bjtmp++;
534       }
535 
536       /* finished row so stick it into b->a */
537       rs = 0.0;
538       /* L part */
539       pv = b->a + bi[i];
540       pj = b->j + bi[i];
541       nz = bi[i + 1] - bi[i];
542       for (j = 0; j < nz; j++) {
543         pv[j] = rtmp[pj[j]];
544         rs += PetscAbsScalar(pv[j]);
545       }
546 
547       /* U part */
548       pv = b->a + bdiag[i + 1] + 1;
549       pj = b->j + bdiag[i + 1] + 1;
550       nz = bdiag[i] - bdiag[i + 1] - 1;
551       for (j = 0; j < nz; j++) {
552         pv[j] = rtmp[pj[j]];
553         rs += PetscAbsScalar(pv[j]);
554       }
555 
556       sctx.rs = rs;
557       sctx.pv = rtmp[i];
558       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
559       if (sctx.newshift) break; /* break for-loop */
560       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
561 
562       /* Mark diagonal and invert diagonal for simpler triangular solves */
563       pv  = b->a + bdiag[i];
564       *pv = (PetscScalar)1.0 / rtmp[i];
565 
566     } /* endof for (i=0; i<n; i++) { */
567 
568     /* MatPivotRefine() */
569     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
570       /*
571        * if no shift in this attempt & shifting & started shifting & can refine,
572        * then try lower shift
573        */
574       sctx.shift_hi       = sctx.shift_fraction;
575       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
576       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
577       sctx.newshift       = PETSC_TRUE;
578       sctx.nshift++;
579     }
580   } while (sctx.newshift);
581 
582   PetscCall(PetscFree(rtmp));
583   PetscCall(ISRestoreIndices(isicol, &ic));
584   PetscCall(ISRestoreIndices(isrow, &r));
585 
586   PetscCall(ISIdentity(isrow, &row_identity));
587   PetscCall(ISIdentity(isicol, &col_identity));
588   if (row_identity && col_identity) {
589     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
590     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
591     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
592     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
593   } else {
594     C->ops->solve          = MatSolve_SeqBAIJ_1;
595     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
596   }
597   C->assembled = PETSC_TRUE;
598   PetscCall(PetscLogFlops(C->cmap->n));
599 
600   /* MatShiftView(A,info,&sctx) */
601   if (sctx.nshift) {
602     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
603       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
604     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
605       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
606     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
607       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
608     }
609   }
610   PetscFunctionReturn(PETSC_SUCCESS);
611 }
612 
613 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
614 {
615   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
616   IS              isrow = b->row, isicol = b->icol;
617   const PetscInt *r, *ic;
618   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
619   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
620   PetscInt        diag, *pj;
621   const PetscInt *diag_offset;
622   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
623   MatScalar      *ba = b->a, *aa = a->a;
624   PetscBool       row_identity, col_identity;
625 
626   PetscFunctionBegin;
627   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
628   A->factortype = MAT_FACTOR_NONE;
629   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
630   A->factortype = MAT_FACTOR_ILU;
631   PetscCall(ISGetIndices(isrow, &r));
632   PetscCall(ISGetIndices(isicol, &ic));
633   PetscCall(PetscMalloc1(n + 1, &rtmp));
634 
635   for (i = 0; i < n; i++) {
636     nz    = bi[i + 1] - bi[i];
637     ajtmp = bj + bi[i];
638     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
639 
640     /* load in initial (unfactored row) */
641     nz       = ai[r[i] + 1] - ai[r[i]];
642     ajtmpold = aj + ai[r[i]];
643     v        = aa + ai[r[i]];
644     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
645 
646     row = *ajtmp++;
647     while (row < i) {
648       pc = rtmp + row;
649       if (*pc != 0.0) {
650         pv         = ba + diag_offset[row];
651         pj         = bj + diag_offset[row] + 1;
652         multiplier = *pc * *pv++;
653         *pc        = multiplier;
654         nz         = bi[row + 1] - diag_offset[row] - 1;
655         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
656         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
657       }
658       row = *ajtmp++;
659     }
660     /* finished row so stick it into b->a */
661     pv = ba + bi[i];
662     pj = bj + bi[i];
663     nz = bi[i + 1] - bi[i];
664     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
665     diag = diag_offset[i] - bi[i];
666     /* check pivot entry for current row */
667     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
668     pv[diag] = 1.0 / pv[diag];
669   }
670 
671   PetscCall(PetscFree(rtmp));
672   PetscCall(ISRestoreIndices(isicol, &ic));
673   PetscCall(ISRestoreIndices(isrow, &r));
674   PetscCall(ISIdentity(isrow, &row_identity));
675   PetscCall(ISIdentity(isicol, &col_identity));
676   if (row_identity && col_identity) {
677     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
678     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
679   } else {
680     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
681     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
682   }
683   C->assembled = PETSC_TRUE;
684   PetscCall(PetscLogFlops(C->cmap->n));
685   PetscFunctionReturn(PETSC_SUCCESS);
686 }
687 
688 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
689 {
690   PetscFunctionBegin;
691   *type = MATSOLVERPETSC;
692   PetscFunctionReturn(PETSC_SUCCESS);
693 }
694 
695 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
696 {
697   PetscInt n = A->rmap->n;
698 
699   PetscFunctionBegin;
700 #if defined(PETSC_USE_COMPLEX)
701   PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
702 #endif
703   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
704   PetscCall(MatSetSizes(*B, n, n, n, n));
705   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
706     PetscCall(MatSetType(*B, MATSEQBAIJ));
707 
708     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
709     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
710     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
711     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
712     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
713   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
714     PetscCall(MatSetType(*B, MATSEQSBAIJ));
715     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
716 
717     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
718     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
719     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
720     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
721     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
722   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
723   (*B)->factortype     = ftype;
724   (*B)->canuseordering = PETSC_TRUE;
725 
726   PetscCall(PetscFree((*B)->solvertype));
727   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
728   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
729   PetscFunctionReturn(PETSC_SUCCESS);
730 }
731 
732 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
733 {
734   Mat C;
735 
736   PetscFunctionBegin;
737   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
738   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
739   PetscCall(MatLUFactorNumeric(C, A, info));
740 
741   A->ops->solve          = C->ops->solve;
742   A->ops->solvetranspose = C->ops->solvetranspose;
743 
744   PetscCall(MatHeaderMerge(A, &C));
745   PetscFunctionReturn(PETSC_SUCCESS);
746 }
747 
748 #include <../src/mat/impls/sbaij/seq/sbaij.h>
749 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
750 {
751   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
752   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
753   IS              ip = b->row;
754   const PetscInt *rip;
755   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
756   PetscInt       *ai = a->i, *aj = a->j;
757   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
758   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
759   PetscReal       rs;
760   FactorShiftCtx  sctx;
761 
762   PetscFunctionBegin;
763   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
764     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
765     PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info));
766     PetscCall(MatDestroy(&a->sbaijMat));
767     PetscFunctionReturn(PETSC_SUCCESS);
768   }
769 
770   /* MatPivotSetUp(): initialize shift context sctx */
771   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
772 
773   PetscCall(ISGetIndices(ip, &rip));
774   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
775 
776   sctx.shift_amount = 0.;
777   sctx.nshift       = 0;
778   do {
779     sctx.newshift = PETSC_FALSE;
780     for (i = 0; i < mbs; i++) {
781       rtmp[i] = 0.0;
782       jl[i]   = mbs;
783       il[0]   = 0;
784     }
785 
786     for (k = 0; k < mbs; k++) {
787       bval = ba + bi[k];
788       /* initialize k-th row by the perm[k]-th row of A */
789       jmin = ai[rip[k]];
790       jmax = ai[rip[k] + 1];
791       for (j = jmin; j < jmax; j++) {
792         col = rip[aj[j]];
793         if (col >= k) { /* only take upper triangular entry */
794           rtmp[col] = aa[j];
795           *bval++   = 0.0; /* for in-place factorization */
796         }
797       }
798 
799       /* shift the diagonal of the matrix */
800       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
801 
802       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
803       dk = rtmp[k];
804       i  = jl[k]; /* first row to be added to k_th row  */
805 
806       while (i < k) {
807         nexti = jl[i]; /* next row to be added to k_th row */
808 
809         /* compute multiplier, update diag(k) and U(i,k) */
810         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
811         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
812         dk += uikdi * ba[ili];
813         ba[ili] = uikdi; /* -U(i,k) */
814 
815         /* add multiple of row i to k-th row */
816         jmin = ili + 1;
817         jmax = bi[i + 1];
818         if (jmin < jmax) {
819           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
820           /* update il and jl for row i */
821           il[i] = jmin;
822           j     = bj[jmin];
823           jl[i] = jl[j];
824           jl[j] = i;
825         }
826         i = nexti;
827       }
828 
829       /* shift the diagonals when zero pivot is detected */
830       /* compute rs=sum of abs(off-diagonal) */
831       rs   = 0.0;
832       jmin = bi[k] + 1;
833       nz   = bi[k + 1] - jmin;
834       if (nz) {
835         bcol = bj + jmin;
836         while (nz--) {
837           rs += PetscAbsScalar(rtmp[*bcol]);
838           bcol++;
839         }
840       }
841 
842       sctx.rs = rs;
843       sctx.pv = dk;
844       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
845       if (sctx.newshift) break;
846       dk = sctx.pv;
847 
848       /* copy data into U(k,:) */
849       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
850       jmin      = bi[k] + 1;
851       jmax      = bi[k + 1];
852       if (jmin < jmax) {
853         for (j = jmin; j < jmax; j++) {
854           col       = bj[j];
855           ba[j]     = rtmp[col];
856           rtmp[col] = 0.0;
857         }
858         /* add the k-th row into il and jl */
859         il[k] = jmin;
860         i     = bj[jmin];
861         jl[k] = jl[i];
862         jl[i] = k;
863       }
864     }
865   } while (sctx.newshift);
866   PetscCall(PetscFree3(rtmp, il, jl));
867 
868   PetscCall(ISRestoreIndices(ip, &rip));
869 
870   C->assembled    = PETSC_TRUE;
871   C->preallocated = PETSC_TRUE;
872 
873   PetscCall(PetscLogFlops(C->rmap->N));
874   if (sctx.nshift) {
875     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
876       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
877     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
878       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
879     }
880   }
881   PetscFunctionReturn(PETSC_SUCCESS);
882 }
883 
884 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
885 {
886   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
887   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
888   PetscInt       i, j, am = a->mbs;
889   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
890   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
891   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
892   PetscReal      rs;
893   FactorShiftCtx sctx;
894 
895   PetscFunctionBegin;
896   /* MatPivotSetUp(): initialize shift context sctx */
897   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
898 
899   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
900 
901   do {
902     sctx.newshift = PETSC_FALSE;
903     for (i = 0; i < am; i++) {
904       rtmp[i] = 0.0;
905       jl[i]   = am;
906       il[0]   = 0;
907     }
908 
909     for (k = 0; k < am; k++) {
910       /* initialize k-th row with elements nonzero in row perm(k) of A */
911       nz   = ai[k + 1] - ai[k];
912       acol = aj + ai[k];
913       aval = aa + ai[k];
914       bval = ba + bi[k];
915       while (nz--) {
916         if (*acol < k) { /* skip lower triangular entries */
917           acol++;
918           aval++;
919         } else {
920           rtmp[*acol++] = *aval++;
921           *bval++       = 0.0; /* for in-place factorization */
922         }
923       }
924 
925       /* shift the diagonal of the matrix */
926       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
927 
928       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
929       dk = rtmp[k];
930       i  = jl[k]; /* first row to be added to k_th row  */
931 
932       while (i < k) {
933         nexti = jl[i]; /* next row to be added to k_th row */
934         /* compute multiplier, update D(k) and U(i,k) */
935         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
936         uikdi = -ba[ili] * ba[bi[i]];
937         dk += uikdi * ba[ili];
938         ba[ili] = uikdi; /* -U(i,k) */
939 
940         /* add multiple of row i to k-th row ... */
941         jmin = ili + 1;
942         nz   = bi[i + 1] - jmin;
943         if (nz > 0) {
944           bcol = bj + jmin;
945           bval = ba + jmin;
946           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
947           /* update il and jl for i-th row */
948           il[i] = jmin;
949           j     = bj[jmin];
950           jl[i] = jl[j];
951           jl[j] = i;
952         }
953         i = nexti;
954       }
955 
956       /* shift the diagonals when zero pivot is detected */
957       /* compute rs=sum of abs(off-diagonal) */
958       rs   = 0.0;
959       jmin = bi[k] + 1;
960       nz   = bi[k + 1] - jmin;
961       if (nz) {
962         bcol = bj + jmin;
963         while (nz--) {
964           rs += PetscAbsScalar(rtmp[*bcol]);
965           bcol++;
966         }
967       }
968 
969       sctx.rs = rs;
970       sctx.pv = dk;
971       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
972       if (sctx.newshift) break; /* sctx.shift_amount is updated */
973       dk = sctx.pv;
974 
975       /* copy data into U(k,:) */
976       ba[bi[k]] = 1.0 / dk;
977       jmin      = bi[k] + 1;
978       nz        = bi[k + 1] - jmin;
979       if (nz) {
980         bcol = bj + jmin;
981         bval = ba + jmin;
982         while (nz--) {
983           *bval++       = rtmp[*bcol];
984           rtmp[*bcol++] = 0.0;
985         }
986         /* add k-th row into il and jl */
987         il[k] = jmin;
988         i     = bj[jmin];
989         jl[k] = jl[i];
990         jl[i] = k;
991       }
992     }
993   } while (sctx.newshift);
994   PetscCall(PetscFree3(rtmp, il, jl));
995 
996   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
997   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
998   C->assembled           = PETSC_TRUE;
999   C->preallocated        = PETSC_TRUE;
1000 
1001   PetscCall(PetscLogFlops(C->rmap->N));
1002   if (sctx.nshift) {
1003     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1004       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1005     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1006       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1007     }
1008   }
1009   PetscFunctionReturn(PETSC_SUCCESS);
1010 }
1011 
1012 #include <petscbt.h>
1013 #include <../src/mat/utils/freespace.h>
1014 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1015 {
1016   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1017   Mat_SeqSBAIJ      *b;
1018   Mat                B;
1019   PetscBool          perm_identity;
1020   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1021   const PetscInt    *rip;
1022   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1023   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1024   PetscReal          fill = info->fill, levels = info->levels;
1025   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1026   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1027   PetscBT            lnkbt;
1028   PetscBool          diagDense;
1029   const PetscInt    *adiag;
1030 
1031   PetscFunctionBegin;
1032   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense));
1033   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1034 
1035   if (bs > 1) {
1036     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1037     fact->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1038 
1039     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1040     PetscFunctionReturn(PETSC_SUCCESS);
1041   }
1042 
1043   PetscCall(ISIdentity(perm, &perm_identity));
1044   PetscCall(ISGetIndices(perm, &rip));
1045 
1046   /* special case that simply copies fill pattern */
1047   if (!levels && perm_identity) {
1048     PetscCall(PetscMalloc1(am + 1, &ui));
1049     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - adiag[i]; /* ui: rowlengths - changes when !perm_identity */
1050     B = fact;
1051     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1052 
1053     b  = (Mat_SeqSBAIJ *)B->data;
1054     uj = b->j;
1055     for (i = 0; i < am; i++) {
1056       aj = a->j + adiag[i];
1057       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1058       b->ilen[i] = ui[i];
1059     }
1060     PetscCall(PetscFree(ui));
1061 
1062     B->factortype = MAT_FACTOR_NONE;
1063 
1064     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1065     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1066     B->factortype = MAT_FACTOR_ICC;
1067 
1068     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1069     PetscFunctionReturn(PETSC_SUCCESS);
1070   }
1071 
1072   /* initialization */
1073   PetscCall(PetscMalloc1(am + 1, &ui));
1074   ui[0] = 0;
1075   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1076 
1077   /* jl: linked list for storing indices of the pivot rows
1078      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1079   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1080   for (i = 0; i < am; i++) {
1081     jl[i] = am;
1082     il[i] = 0;
1083   }
1084 
1085   /* create and initialize a linked list for storing column indices of the active row k */
1086   nlnk = am + 1;
1087   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1088 
1089   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1090   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1091 
1092   current_space = free_space;
1093 
1094   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1095   current_space_lvl = free_space_lvl;
1096 
1097   for (k = 0; k < am; k++) { /* for each active row k */
1098     /* initialize lnk by the column indices of row rip[k] of A */
1099     nzk         = 0;
1100     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1101     ncols_upper = 0;
1102     cols        = cols_lvl + am;
1103     for (j = 0; j < ncols; j++) {
1104       i = rip[*(aj + ai[rip[k]] + j)];
1105       if (i >= k) { /* only take upper triangular entry */
1106         cols[ncols_upper]     = i;
1107         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1108         ncols_upper++;
1109       }
1110     }
1111     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1112     nzk += nlnk;
1113 
1114     /* update lnk by computing fill-in for each pivot row to be merged in */
1115     prow = jl[k]; /* 1st pivot row */
1116 
1117     while (prow < k) {
1118       nextprow = jl[prow];
1119 
1120       /* merge prow into k-th row */
1121       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1122       jmax  = ui[prow + 1];
1123       ncols = jmax - jmin;
1124       i     = jmin - ui[prow];
1125       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1126       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1127       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1128       nzk += nlnk;
1129 
1130       /* update il and jl for prow */
1131       if (jmin < jmax) {
1132         il[prow] = jmin;
1133 
1134         j        = *cols;
1135         jl[prow] = jl[j];
1136         jl[j]    = prow;
1137       }
1138       prow = nextprow;
1139     }
1140 
1141     /* if free space is not available, make more free space */
1142     if (current_space->local_remaining < nzk) {
1143       i = am - k + 1;                                                             /* num of unfactored rows */
1144       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1145       PetscCall(PetscFreeSpaceGet(i, &current_space));
1146       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1147       reallocs++;
1148     }
1149 
1150     /* copy data into free_space and free_space_lvl, then initialize lnk */
1151     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1152 
1153     /* add the k-th row into il and jl */
1154     if (nzk - 1 > 0) {
1155       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1156       jl[k] = jl[i];
1157       jl[i] = k;
1158       il[k] = ui[k] + 1;
1159     }
1160     uj_ptr[k]     = current_space->array;
1161     uj_lvl_ptr[k] = current_space_lvl->array;
1162 
1163     current_space->array += nzk;
1164     current_space->local_used += nzk;
1165     current_space->local_remaining -= nzk;
1166 
1167     current_space_lvl->array += nzk;
1168     current_space_lvl->local_used += nzk;
1169     current_space_lvl->local_remaining -= nzk;
1170 
1171     ui[k + 1] = ui[k] + nzk;
1172   }
1173 
1174   PetscCall(ISRestoreIndices(perm, &rip));
1175   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1176   PetscCall(PetscFree(cols_lvl));
1177 
1178   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1179   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1180   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1181   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1182   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1183 
1184   /* put together the new matrix in MATSEQSBAIJ format */
1185   B = fact;
1186   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1187 
1188   b          = (Mat_SeqSBAIJ *)B->data;
1189   b->free_a  = PETSC_TRUE;
1190   b->free_ij = PETSC_TRUE;
1191 
1192   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1193 
1194   b->j             = uj;
1195   b->i             = ui;
1196   b->diag          = NULL;
1197   b->ilen          = NULL;
1198   b->imax          = NULL;
1199   b->row           = perm;
1200   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1201 
1202   PetscCall(PetscObjectReference((PetscObject)perm));
1203 
1204   b->icol = perm;
1205 
1206   PetscCall(PetscObjectReference((PetscObject)perm));
1207   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1208 
1209   b->maxnz = b->nz = ui[am];
1210 
1211   B->info.factor_mallocs   = reallocs;
1212   B->info.fill_ratio_given = fill;
1213   if (ai[am] != 0.) {
1214     /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1215     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1216   } else {
1217     B->info.fill_ratio_needed = 0.0;
1218   }
1219 #if defined(PETSC_USE_INFO)
1220   if (ai[am] != 0) {
1221     PetscReal af = B->info.fill_ratio_needed;
1222     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1223     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1224     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1225   } else {
1226     PetscCall(PetscInfo(A, "Empty matrix\n"));
1227   }
1228 #endif
1229   if (perm_identity) {
1230     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1231     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1232     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1233   } else {
1234     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1235   }
1236   PetscFunctionReturn(PETSC_SUCCESS);
1237 }
1238 
1239 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1240 {
1241   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1242   Mat_SeqSBAIJ      *b;
1243   Mat                B;
1244   PetscBool          perm_identity;
1245   PetscReal          fill = info->fill;
1246   const PetscInt    *rip;
1247   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1248   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1249   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1250   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1251   PetscBT            lnkbt;
1252   PetscBool          diagDense;
1253 
1254   PetscFunctionBegin;
1255   if (bs > 1) { /* convert to seqsbaij */
1256     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1257     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1258 
1259     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1260     PetscFunctionReturn(PETSC_SUCCESS);
1261   }
1262   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, NULL, &diagDense));
1263   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1264 
1265   /* check whether perm is the identity mapping */
1266   PetscCall(ISIdentity(perm, &perm_identity));
1267   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1268   PetscCall(ISGetIndices(perm, &rip));
1269 
1270   /* initialization */
1271   PetscCall(PetscMalloc1(mbs + 1, &ui));
1272   ui[0] = 0;
1273 
1274   /* jl: linked list for storing indices of the pivot rows
1275      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1276   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1277   for (i = 0; i < mbs; i++) {
1278     jl[i] = mbs;
1279     il[i] = 0;
1280   }
1281 
1282   /* create and initialize a linked list for storing column indices of the active row k */
1283   nlnk = mbs + 1;
1284   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1285 
1286   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1287   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1288 
1289   current_space = free_space;
1290 
1291   for (k = 0; k < mbs; k++) { /* for each active row k */
1292     /* initialize lnk by the column indices of row rip[k] of A */
1293     nzk         = 0;
1294     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1295     ncols_upper = 0;
1296     for (j = 0; j < ncols; j++) {
1297       i = rip[*(aj + ai[rip[k]] + j)];
1298       if (i >= k) { /* only take upper triangular entry */
1299         cols[ncols_upper] = i;
1300         ncols_upper++;
1301       }
1302     }
1303     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1304     nzk += nlnk;
1305 
1306     /* update lnk by computing fill-in for each pivot row to be merged in */
1307     prow = jl[k]; /* 1st pivot row */
1308 
1309     while (prow < k) {
1310       nextprow = jl[prow];
1311       /* merge prow into k-th row */
1312       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1313       jmax   = ui[prow + 1];
1314       ncols  = jmax - jmin;
1315       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1316       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1317       nzk += nlnk;
1318 
1319       /* update il and jl for prow */
1320       if (jmin < jmax) {
1321         il[prow] = jmin;
1322         j        = *uj_ptr;
1323         jl[prow] = jl[j];
1324         jl[j]    = prow;
1325       }
1326       prow = nextprow;
1327     }
1328 
1329     /* if free space is not available, make more free space */
1330     if (current_space->local_remaining < nzk) {
1331       i = mbs - k + 1;                                                            /* num of unfactored rows */
1332       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1333       PetscCall(PetscFreeSpaceGet(i, &current_space));
1334       reallocs++;
1335     }
1336 
1337     /* copy data into free space, then initialize lnk */
1338     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1339 
1340     /* add the k-th row into il and jl */
1341     if (nzk - 1 > 0) {
1342       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1343       jl[k] = jl[i];
1344       jl[i] = k;
1345       il[k] = ui[k] + 1;
1346     }
1347     ui_ptr[k] = current_space->array;
1348     current_space->array += nzk;
1349     current_space->local_used += nzk;
1350     current_space->local_remaining -= nzk;
1351 
1352     ui[k + 1] = ui[k] + nzk;
1353   }
1354 
1355   PetscCall(ISRestoreIndices(perm, &rip));
1356   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1357 
1358   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1359   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1360   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1361   PetscCall(PetscLLDestroy(lnk, lnkbt));
1362 
1363   /* put together the new matrix in MATSEQSBAIJ format */
1364   B = fact;
1365   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1366 
1367   b          = (Mat_SeqSBAIJ *)B->data;
1368   b->free_a  = PETSC_TRUE;
1369   b->free_ij = PETSC_TRUE;
1370 
1371   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1372 
1373   b->j             = uj;
1374   b->i             = ui;
1375   b->diag          = NULL;
1376   b->ilen          = NULL;
1377   b->imax          = NULL;
1378   b->row           = perm;
1379   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1380 
1381   PetscCall(PetscObjectReference((PetscObject)perm));
1382   b->icol = perm;
1383   PetscCall(PetscObjectReference((PetscObject)perm));
1384   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1385   b->maxnz = b->nz = ui[mbs];
1386 
1387   B->info.factor_mallocs   = reallocs;
1388   B->info.fill_ratio_given = fill;
1389   if (ai[mbs] != 0.) {
1390     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1391     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1392   } else {
1393     B->info.fill_ratio_needed = 0.0;
1394   }
1395 #if defined(PETSC_USE_INFO)
1396   if (ai[mbs] != 0.) {
1397     PetscReal af = B->info.fill_ratio_needed;
1398     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1399     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1400     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1401   } else {
1402     PetscCall(PetscInfo(A, "Empty matrix\n"));
1403   }
1404 #endif
1405   if (perm_identity) {
1406     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1407   } else {
1408     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1409   }
1410   PetscFunctionReturn(PETSC_SUCCESS);
1411 }
1412 
1413 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1414 {
1415   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1416   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1417   PetscInt           i, k, n                       = a->mbs;
1418   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1419   const MatScalar   *aa = a->a, *v;
1420   PetscScalar       *x, *s, *t, *ls;
1421   const PetscScalar *b;
1422 
1423   PetscFunctionBegin;
1424   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1425   PetscCall(VecGetArrayRead(bb, &b));
1426   PetscCall(VecGetArray(xx, &x));
1427   t = a->solve_work;
1428 
1429   /* forward solve the lower triangular */
1430   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1431 
1432   for (i = 1; i < n; i++) {
1433     v  = aa + bs2 * ai[i];
1434     vi = aj + ai[i];
1435     nz = ai[i + 1] - ai[i];
1436     s  = t + bs * i;
1437     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1438     for (k = 0; k < nz; k++) {
1439       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1440       v += bs2;
1441     }
1442   }
1443 
1444   /* backward solve the upper triangular */
1445   ls = a->solve_work + A->cmap->n;
1446   for (i = n - 1; i >= 0; i--) {
1447     v  = aa + bs2 * (adiag[i + 1] + 1);
1448     vi = aj + adiag[i + 1] + 1;
1449     nz = adiag[i] - adiag[i + 1] - 1;
1450     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1451     for (k = 0; k < nz; k++) {
1452       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1453       v += bs2;
1454     }
1455     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1456     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1457   }
1458 
1459   PetscCall(VecRestoreArrayRead(bb, &b));
1460   PetscCall(VecRestoreArray(xx, &x));
1461   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1462   PetscFunctionReturn(PETSC_SUCCESS);
1463 }
1464 
1465 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1466 {
1467   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1468   IS                 iscol = a->col, isrow = a->row;
1469   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1470   PetscInt           i, m, n = a->mbs;
1471   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1472   const MatScalar   *aa = a->a, *v;
1473   PetscScalar       *x, *s, *t, *ls;
1474   const PetscScalar *b;
1475 
1476   PetscFunctionBegin;
1477   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1478   PetscCall(VecGetArrayRead(bb, &b));
1479   PetscCall(VecGetArray(xx, &x));
1480   t = a->solve_work;
1481 
1482   PetscCall(ISGetIndices(isrow, &rout));
1483   r = rout;
1484   PetscCall(ISGetIndices(iscol, &cout));
1485   c = cout;
1486 
1487   /* forward solve the lower triangular */
1488   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1489   for (i = 1; i < n; i++) {
1490     v  = aa + bs2 * ai[i];
1491     vi = aj + ai[i];
1492     nz = ai[i + 1] - ai[i];
1493     s  = t + bs * i;
1494     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1495     for (m = 0; m < nz; m++) {
1496       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1497       v += bs2;
1498     }
1499   }
1500 
1501   /* backward solve the upper triangular */
1502   ls = a->solve_work + A->cmap->n;
1503   for (i = n - 1; i >= 0; i--) {
1504     v  = aa + bs2 * (adiag[i + 1] + 1);
1505     vi = aj + adiag[i + 1] + 1;
1506     nz = adiag[i] - adiag[i + 1] - 1;
1507     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1508     for (m = 0; m < nz; m++) {
1509       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1510       v += bs2;
1511     }
1512     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1513     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1514   }
1515   PetscCall(ISRestoreIndices(isrow, &rout));
1516   PetscCall(ISRestoreIndices(iscol, &cout));
1517   PetscCall(VecRestoreArrayRead(bb, &b));
1518   PetscCall(VecRestoreArray(xx, &x));
1519   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1520   PetscFunctionReturn(PETSC_SUCCESS);
1521 }
1522