xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision 7cd49bdee00a26e142bff6e7df7c3fb9209fa795)
1 /*
2     Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 
7 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
8 {
9   Mat             C = B;
10   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
11   IS              isrow = b->row, isicol = b->icol;
12   const PetscInt *r, *ic;
13   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
14   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
15   MatScalar      *rtmp, *pc, *mwork, *pv;
16   MatScalar      *aa             = a->a, *v;
17   PetscReal       shift          = info->shiftamount;
18   const PetscBool allowzeropivot = PetscNot(A->erroriffailure);
19 
20   PetscFunctionBegin;
21   PetscCall(ISGetIndices(isrow, &r));
22   PetscCall(ISGetIndices(isicol, &ic));
23 
24   /* generate work space needed by the factorization */
25   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
26   PetscCall(PetscArrayzero(rtmp, bs2 * n));
27 
28   for (PetscInt i = 0; i < n; i++) {
29     /* zero rtmp */
30     /* L part */
31     PetscInt *pj;
32     PetscInt  nzL, nz = bi[i + 1] - bi[i];
33     bjtmp = bj + bi[i];
34     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
35 
36     /* U part */
37     nz    = bdiag[i] - bdiag[i + 1];
38     bjtmp = bj + bdiag[i + 1] + 1;
39     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
40 
41     /* load in initial (unfactored row) */
42     nz    = ai[r[i] + 1] - ai[r[i]];
43     ajtmp = aj + ai[r[i]];
44     v     = aa + bs2 * ai[r[i]];
45     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
46 
47     /* elimination */
48     bjtmp = bj + bi[i];
49     nzL   = bi[i + 1] - bi[i];
50     for (PetscInt k = 0; k < nzL; k++) {
51       PetscBool flg = PETSC_FALSE;
52       PetscInt  row = bjtmp[k];
53 
54       pc = rtmp + bs2 * row;
55       for (PetscInt j = 0; j < bs2; j++) {
56         if (pc[j] != (PetscScalar)0.0) {
57           flg = PETSC_TRUE;
58           break;
59         }
60       }
61       if (flg) {
62         pv = b->a + bs2 * bdiag[row];
63         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
64         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
65 
66         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
67         pv = b->a + bs2 * (bdiag[row + 1] + 1);
68         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
69         for (PetscInt j = 0; j < nz; j++) {
70           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
71           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
72           v = rtmp + 4 * pj[j];
73           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
74           pv += 4;
75         }
76         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
77       }
78     }
79 
80     /* finished row so stick it into b->a */
81     /* L part */
82     pv = b->a + bs2 * bi[i];
83     pj = b->j + bi[i];
84     nz = bi[i + 1] - bi[i];
85     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
86 
87     /* Mark diagonal and invert diagonal for simpler triangular solves */
88     pv = b->a + bs2 * bdiag[i];
89     pj = b->j + bdiag[i];
90     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
91     {
92       PetscBool zeropivotdetected;
93 
94       PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
95       if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
96     }
97 
98     /* U part */
99     pv = b->a + bs2 * (bdiag[i + 1] + 1);
100     pj = b->j + bdiag[i + 1] + 1;
101     nz = bdiag[i] - bdiag[i + 1] - 1;
102     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103   }
104 
105   PetscCall(PetscFree2(rtmp, mwork));
106   PetscCall(ISRestoreIndices(isicol, &ic));
107   PetscCall(ISRestoreIndices(isrow, &r));
108 
109   C->ops->solve          = MatSolve_SeqBAIJ_2;
110   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
111   C->assembled           = PETSC_TRUE;
112 
113   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
114   PetscFunctionReturn(PETSC_SUCCESS);
115 }
116 
117 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
118 {
119   Mat             C = B;
120   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
121   PetscInt        i, j, k, nz, nzL, row, *pj;
122   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
123   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
124   MatScalar      *rtmp, *pc, *mwork, *pv;
125   MatScalar      *aa = a->a, *v;
126   PetscInt        flg;
127   PetscReal       shift = info->shiftamount;
128   PetscBool       allowzeropivot, zeropivotdetected;
129 
130   PetscFunctionBegin;
131   allowzeropivot = PetscNot(A->erroriffailure);
132 
133   /* generate work space needed by the factorization */
134   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
135   PetscCall(PetscArrayzero(rtmp, bs2 * n));
136 
137   for (i = 0; i < n; i++) {
138     /* zero rtmp */
139     /* L part */
140     nz    = bi[i + 1] - bi[i];
141     bjtmp = bj + bi[i];
142     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
143 
144     /* U part */
145     nz    = bdiag[i] - bdiag[i + 1];
146     bjtmp = bj + bdiag[i + 1] + 1;
147     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
148 
149     /* load in initial (unfactored row) */
150     nz    = ai[i + 1] - ai[i];
151     ajtmp = aj + ai[i];
152     v     = aa + bs2 * ai[i];
153     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
154 
155     /* elimination */
156     bjtmp = bj + bi[i];
157     nzL   = bi[i + 1] - bi[i];
158     for (k = 0; k < nzL; k++) {
159       row = bjtmp[k];
160       pc  = rtmp + bs2 * row;
161       for (flg = 0, j = 0; j < bs2; j++) {
162         if (pc[j] != (PetscScalar)0.0) {
163           flg = 1;
164           break;
165         }
166       }
167       if (flg) {
168         pv = b->a + bs2 * bdiag[row];
169         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
170         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
171 
172         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
173         pv = b->a + bs2 * (bdiag[row + 1] + 1);
174         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
175         for (j = 0; j < nz; j++) {
176           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
177           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
178           v = rtmp + 4 * pj[j];
179           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
180           pv += 4;
181         }
182         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
183       }
184     }
185 
186     /* finished row so stick it into b->a */
187     /* L part */
188     pv = b->a + bs2 * bi[i];
189     pj = b->j + bi[i];
190     nz = bi[i + 1] - bi[i];
191     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
192 
193     /* Mark diagonal and invert diagonal for simpler triangular solves */
194     pv = b->a + bs2 * bdiag[i];
195     pj = b->j + bdiag[i];
196     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
197     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
198     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
199 
200     /* U part */
201     /*
202     pv = b->a + bs2*bi[2*n-i];
203     pj = b->j + bi[2*n-i];
204     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
205     */
206     pv = b->a + bs2 * (bdiag[i + 1] + 1);
207     pj = b->j + bdiag[i + 1] + 1;
208     nz = bdiag[i] - bdiag[i + 1] - 1;
209     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
210   }
211   PetscCall(PetscFree2(rtmp, mwork));
212 
213   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
214   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
215   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
216   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
217   C->assembled           = PETSC_TRUE;
218 
219   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
220   PetscFunctionReturn(PETSC_SUCCESS);
221 }
222 
223 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
224 {
225   Mat             C = B;
226   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
227   IS              isrow = b->row, isicol = b->icol;
228   const PetscInt *r, *ic;
229   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
230   PetscInt       *ajtmpold, *ajtmp, nz, row;
231   PetscInt        idx, *ai = a->i, *aj = a->j, *pj;
232   const PetscInt *diag_offset;
233   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
234   MatScalar       p1, p2, p3, p4;
235   MatScalar      *ba = b->a, *aa = a->a;
236   PetscReal       shift = info->shiftamount;
237   PetscBool       allowzeropivot, zeropivotdetected;
238 
239   PetscFunctionBegin;
240   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
241   A->factortype = MAT_FACTOR_NONE;
242   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
243   A->factortype  = MAT_FACTOR_ILU;
244   allowzeropivot = PetscNot(A->erroriffailure);
245   PetscCall(ISGetIndices(isrow, &r));
246   PetscCall(ISGetIndices(isicol, &ic));
247   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
248 
249   for (i = 0; i < n; i++) {
250     nz    = bi[i + 1] - bi[i];
251     ajtmp = bj + bi[i];
252     for (j = 0; j < nz; j++) {
253       x    = rtmp + 4 * ajtmp[j];
254       x[0] = x[1] = x[2] = x[3] = 0.0;
255     }
256     /* load in initial (unfactored row) */
257     idx      = r[i];
258     nz       = ai[idx + 1] - ai[idx];
259     ajtmpold = aj + ai[idx];
260     v        = aa + 4 * ai[idx];
261     for (j = 0; j < nz; j++) {
262       x    = rtmp + 4 * ic[ajtmpold[j]];
263       x[0] = v[0];
264       x[1] = v[1];
265       x[2] = v[2];
266       x[3] = v[3];
267       v += 4;
268     }
269     row = *ajtmp++;
270     while (row < i) {
271       pc = rtmp + 4 * row;
272       p1 = pc[0];
273       p2 = pc[1];
274       p3 = pc[2];
275       p4 = pc[3];
276       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
277         pv    = ba + 4 * diag_offset[row];
278         pj    = bj + diag_offset[row] + 1;
279         x1    = pv[0];
280         x2    = pv[1];
281         x3    = pv[2];
282         x4    = pv[3];
283         pc[0] = m1 = p1 * x1 + p3 * x2;
284         pc[1] = m2 = p2 * x1 + p4 * x2;
285         pc[2] = m3 = p1 * x3 + p3 * x4;
286         pc[3] = m4 = p2 * x3 + p4 * x4;
287         nz         = bi[row + 1] - diag_offset[row] - 1;
288         pv += 4;
289         for (j = 0; j < nz; j++) {
290           x1 = pv[0];
291           x2 = pv[1];
292           x3 = pv[2];
293           x4 = pv[3];
294           x  = rtmp + 4 * pj[j];
295           x[0] -= m1 * x1 + m3 * x2;
296           x[1] -= m2 * x1 + m4 * x2;
297           x[2] -= m1 * x3 + m3 * x4;
298           x[3] -= m2 * x3 + m4 * x4;
299           pv += 4;
300         }
301         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
302       }
303       row = *ajtmp++;
304     }
305     /* finished row so stick it into b->a */
306     pv = ba + 4 * bi[i];
307     pj = bj + bi[i];
308     nz = bi[i + 1] - bi[i];
309     for (j = 0; j < nz; j++) {
310       x     = rtmp + 4 * pj[j];
311       pv[0] = x[0];
312       pv[1] = x[1];
313       pv[2] = x[2];
314       pv[3] = x[3];
315       pv += 4;
316     }
317     /* invert diagonal block */
318     w = ba + 4 * diag_offset[i];
319     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
320     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
321   }
322 
323   PetscCall(PetscFree(rtmp));
324   PetscCall(ISRestoreIndices(isicol, &ic));
325   PetscCall(ISRestoreIndices(isrow, &r));
326 
327   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
328   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
329   C->assembled           = PETSC_TRUE;
330 
331   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
332   PetscFunctionReturn(PETSC_SUCCESS);
333 }
334 /*
335       Version for when blocks are 2 by 2 Using natural ordering
336 */
337 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
338 {
339   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
340   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
341   PetscInt       *ajtmpold, *ajtmp, nz, row;
342   PetscInt       *ai = a->i, *aj = a->j, *pj;
343   const PetscInt *diag_offset;
344   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
345   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
346   MatScalar      *ba = b->a, *aa = a->a;
347   PetscReal       shift = info->shiftamount;
348   PetscBool       allowzeropivot, zeropivotdetected;
349 
350   PetscFunctionBegin;
351   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
352   A->factortype = MAT_FACTOR_NONE;
353   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
354   A->factortype  = MAT_FACTOR_ILU;
355   allowzeropivot = PetscNot(A->erroriffailure);
356   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
357   for (i = 0; i < n; i++) {
358     nz    = bi[i + 1] - bi[i];
359     ajtmp = bj + bi[i];
360     for (j = 0; j < nz; j++) {
361       x    = rtmp + 4 * ajtmp[j];
362       x[0] = x[1] = x[2] = x[3] = 0.0;
363     }
364     /* load in initial (unfactored row) */
365     nz       = ai[i + 1] - ai[i];
366     ajtmpold = aj + ai[i];
367     v        = aa + 4 * ai[i];
368     for (j = 0; j < nz; j++) {
369       x    = rtmp + 4 * ajtmpold[j];
370       x[0] = v[0];
371       x[1] = v[1];
372       x[2] = v[2];
373       x[3] = v[3];
374       v += 4;
375     }
376     row = *ajtmp++;
377     while (row < i) {
378       pc = rtmp + 4 * row;
379       p1 = pc[0];
380       p2 = pc[1];
381       p3 = pc[2];
382       p4 = pc[3];
383       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
384         pv    = ba + 4 * diag_offset[row];
385         pj    = bj + diag_offset[row] + 1;
386         x1    = pv[0];
387         x2    = pv[1];
388         x3    = pv[2];
389         x4    = pv[3];
390         pc[0] = m1 = p1 * x1 + p3 * x2;
391         pc[1] = m2 = p2 * x1 + p4 * x2;
392         pc[2] = m3 = p1 * x3 + p3 * x4;
393         pc[3] = m4 = p2 * x3 + p4 * x4;
394         nz         = bi[row + 1] - diag_offset[row] - 1;
395         pv += 4;
396         for (j = 0; j < nz; j++) {
397           x1 = pv[0];
398           x2 = pv[1];
399           x3 = pv[2];
400           x4 = pv[3];
401           x  = rtmp + 4 * pj[j];
402           x[0] -= m1 * x1 + m3 * x2;
403           x[1] -= m2 * x1 + m4 * x2;
404           x[2] -= m1 * x3 + m3 * x4;
405           x[3] -= m2 * x3 + m4 * x4;
406           pv += 4;
407         }
408         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
409       }
410       row = *ajtmp++;
411     }
412     /* finished row so stick it into b->a */
413     pv = ba + 4 * bi[i];
414     pj = bj + bi[i];
415     nz = bi[i + 1] - bi[i];
416     for (j = 0; j < nz; j++) {
417       x     = rtmp + 4 * pj[j];
418       pv[0] = x[0];
419       pv[1] = x[1];
420       pv[2] = x[2];
421       pv[3] = x[3];
422       /*
423       printf(" col %d:",pj[j]);
424       PetscInt j1;
425       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
426       printf("\n");
427       */
428       pv += 4;
429     }
430     /* invert diagonal block */
431     w = ba + 4 * diag_offset[i];
432     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
433     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
434   }
435 
436   PetscCall(PetscFree(rtmp));
437 
438   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
439   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
440   C->assembled           = PETSC_TRUE;
441 
442   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
443   PetscFunctionReturn(PETSC_SUCCESS);
444 }
445 
446 /*
447      Version for when blocks are 1 by 1.
448 */
449 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
450 {
451   Mat              C = B;
452   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
453   IS               isrow = b->row, isicol = b->icol;
454   const PetscInt  *r, *ic, *ics;
455   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
456   PetscInt         i, j, k, nz, nzL, row, *pj;
457   const PetscInt  *ajtmp, *bjtmp;
458   MatScalar       *rtmp, *pc, multiplier, *pv;
459   const MatScalar *aa = a->a, *v;
460   PetscBool        row_identity, col_identity;
461   FactorShiftCtx   sctx;
462   const PetscInt  *ddiag;
463   PetscReal        rs;
464   MatScalar        d;
465 
466   PetscFunctionBegin;
467   /* MatPivotSetUp(): initialize shift context sctx */
468   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
469 
470   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
471     PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &ddiag, NULL));
472     sctx.shift_top = info->zeropivot;
473     for (i = 0; i < n; i++) {
474       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
475       d  = (aa)[ddiag[i]];
476       rs = -PetscAbsScalar(d) - PetscRealPart(d);
477       v  = aa + ai[i];
478       nz = ai[i + 1] - ai[i];
479       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
480       if (rs > sctx.shift_top) sctx.shift_top = rs;
481     }
482     sctx.shift_top *= 1.1;
483     sctx.nshift_max = 5;
484     sctx.shift_lo   = 0.;
485     sctx.shift_hi   = 1.;
486   }
487 
488   PetscCall(ISGetIndices(isrow, &r));
489   PetscCall(ISGetIndices(isicol, &ic));
490   PetscCall(PetscMalloc1(n + 1, &rtmp));
491   ics = ic;
492 
493   do {
494     sctx.newshift = PETSC_FALSE;
495     for (i = 0; i < n; i++) {
496       /* zero rtmp */
497       /* L part */
498       nz    = bi[i + 1] - bi[i];
499       bjtmp = bj + bi[i];
500       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
501 
502       /* U part */
503       nz    = bdiag[i] - bdiag[i + 1];
504       bjtmp = bj + bdiag[i + 1] + 1;
505       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
506 
507       /* load in initial (unfactored row) */
508       nz    = ai[r[i] + 1] - ai[r[i]];
509       ajtmp = aj + ai[r[i]];
510       v     = aa + ai[r[i]];
511       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
512 
513       /* ZeropivotApply() */
514       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
515 
516       /* elimination */
517       bjtmp = bj + bi[i];
518       row   = *bjtmp++;
519       nzL   = bi[i + 1] - bi[i];
520       for (k = 0; k < nzL; k++) {
521         pc = rtmp + row;
522         if (*pc != (PetscScalar)0.0) {
523           pv         = b->a + bdiag[row];
524           multiplier = *pc * (*pv);
525           *pc        = multiplier;
526 
527           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
528           pv = b->a + bdiag[row + 1] + 1;
529           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
530           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
531           PetscCall(PetscLogFlops(2.0 * nz));
532         }
533         row = *bjtmp++;
534       }
535 
536       /* finished row so stick it into b->a */
537       rs = 0.0;
538       /* L part */
539       pv = b->a + bi[i];
540       pj = b->j + bi[i];
541       nz = bi[i + 1] - bi[i];
542       for (j = 0; j < nz; j++) {
543         pv[j] = rtmp[pj[j]];
544         rs += PetscAbsScalar(pv[j]);
545       }
546 
547       /* U part */
548       pv = b->a + bdiag[i + 1] + 1;
549       pj = b->j + bdiag[i + 1] + 1;
550       nz = bdiag[i] - bdiag[i + 1] - 1;
551       for (j = 0; j < nz; j++) {
552         pv[j] = rtmp[pj[j]];
553         rs += PetscAbsScalar(pv[j]);
554       }
555 
556       sctx.rs = rs;
557       sctx.pv = rtmp[i];
558       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
559       if (sctx.newshift) break; /* break for-loop */
560       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
561 
562       /* Mark diagonal and invert diagonal for simpler triangular solves */
563       pv  = b->a + bdiag[i];
564       *pv = (PetscScalar)1.0 / rtmp[i];
565 
566     } /* endof for (i=0; i<n; i++) { */
567 
568     /* MatPivotRefine() */
569     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
570       /*
571        * if no shift in this attempt & shifting & started shifting & can refine,
572        * then try lower shift
573        */
574       sctx.shift_hi       = sctx.shift_fraction;
575       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
576       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
577       sctx.newshift       = PETSC_TRUE;
578       sctx.nshift++;
579     }
580   } while (sctx.newshift);
581 
582   PetscCall(PetscFree(rtmp));
583   PetscCall(ISRestoreIndices(isicol, &ic));
584   PetscCall(ISRestoreIndices(isrow, &r));
585 
586   PetscCall(ISIdentity(isrow, &row_identity));
587   PetscCall(ISIdentity(isicol, &col_identity));
588   if (row_identity && col_identity) {
589     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
590     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
591     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
592     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
593   } else {
594     C->ops->solve          = MatSolve_SeqBAIJ_1;
595     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
596   }
597   C->assembled = PETSC_TRUE;
598   PetscCall(PetscLogFlops(C->cmap->n));
599 
600   /* MatShiftView(A,info,&sctx) */
601   if (sctx.nshift) {
602     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
603       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
604     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
605       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
606     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
607       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
608     }
609   }
610   PetscFunctionReturn(PETSC_SUCCESS);
611 }
612 
613 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
614 {
615   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
616   IS              isrow = b->row, isicol = b->icol;
617   const PetscInt *r, *ic;
618   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
619   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
620   PetscInt        diag, *pj;
621   const PetscInt *diag_offset;
622   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
623   MatScalar      *ba = b->a, *aa = a->a;
624   PetscBool       row_identity, col_identity;
625 
626   PetscFunctionBegin;
627   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
628   A->factortype = MAT_FACTOR_NONE;
629   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
630   A->factortype = MAT_FACTOR_ILU;
631   PetscCall(ISGetIndices(isrow, &r));
632   PetscCall(ISGetIndices(isicol, &ic));
633   PetscCall(PetscMalloc1(n + 1, &rtmp));
634 
635   for (i = 0; i < n; i++) {
636     nz    = bi[i + 1] - bi[i];
637     ajtmp = bj + bi[i];
638     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
639 
640     /* load in initial (unfactored row) */
641     nz       = ai[r[i] + 1] - ai[r[i]];
642     ajtmpold = aj + ai[r[i]];
643     v        = aa + ai[r[i]];
644     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
645 
646     row = *ajtmp++;
647     while (row < i) {
648       pc = rtmp + row;
649       if (*pc != 0.0) {
650         pv         = ba + diag_offset[row];
651         pj         = bj + diag_offset[row] + 1;
652         multiplier = *pc * *pv++;
653         *pc        = multiplier;
654         nz         = bi[row + 1] - diag_offset[row] - 1;
655         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
656         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
657       }
658       row = *ajtmp++;
659     }
660     /* finished row so stick it into b->a */
661     pv = ba + bi[i];
662     pj = bj + bi[i];
663     nz = bi[i + 1] - bi[i];
664     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
665     diag = diag_offset[i] - bi[i];
666     /* check pivot entry for current row */
667     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
668     pv[diag] = 1.0 / pv[diag];
669   }
670 
671   PetscCall(PetscFree(rtmp));
672   PetscCall(ISRestoreIndices(isicol, &ic));
673   PetscCall(ISRestoreIndices(isrow, &r));
674   PetscCall(ISIdentity(isrow, &row_identity));
675   PetscCall(ISIdentity(isicol, &col_identity));
676   if (row_identity && col_identity) {
677     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
678     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
679   } else {
680     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
681     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
682   }
683   C->assembled = PETSC_TRUE;
684   PetscCall(PetscLogFlops(C->cmap->n));
685   PetscFunctionReturn(PETSC_SUCCESS);
686 }
687 
688 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
689 {
690   PetscFunctionBegin;
691   *type = MATSOLVERPETSC;
692   PetscFunctionReturn(PETSC_SUCCESS);
693 }
694 
695 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
696 {
697   PetscInt n = A->rmap->n;
698 
699   PetscFunctionBegin;
700 #if defined(PETSC_USE_COMPLEX)
701   PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
702 #endif
703   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
704   PetscCall(MatSetSizes(*B, n, n, n, n));
705   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
706     PetscCall(MatSetType(*B, MATSEQBAIJ));
707 
708     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
709     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
710     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
711     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
712     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
713   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
714     PetscCall(MatSetType(*B, MATSEQSBAIJ));
715     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
716 
717     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
718     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
719     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
720     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
721     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
722   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
723   (*B)->factortype     = ftype;
724   (*B)->canuseordering = PETSC_TRUE;
725 
726   PetscCall(PetscFree((*B)->solvertype));
727   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
728   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
729   PetscFunctionReturn(PETSC_SUCCESS);
730 }
731 
732 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
733 {
734   Mat C;
735 
736   PetscFunctionBegin;
737   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
738   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
739   PetscCall(MatLUFactorNumeric(C, A, info));
740 
741   A->ops->solve          = C->ops->solve;
742   A->ops->solvetranspose = C->ops->solvetranspose;
743 
744   PetscCall(MatHeaderMerge(A, &C));
745   PetscFunctionReturn(PETSC_SUCCESS);
746 }
747 
748 #include <../src/mat/impls/sbaij/seq/sbaij.h>
749 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
750 {
751   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
752   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
753   IS              ip = b->row;
754   const PetscInt *rip;
755   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
756   PetscInt       *ai = a->i, *aj = a->j;
757   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
758   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
759   PetscReal       rs;
760   FactorShiftCtx  sctx;
761 
762   PetscFunctionBegin;
763   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
764     if (!a->sbaijMat) {
765       A->symmetric = PETSC_BOOL3_TRUE;
766       if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
767       PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
768     }
769     PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info));
770     PetscCall(MatDestroy(&a->sbaijMat));
771     PetscFunctionReturn(PETSC_SUCCESS);
772   }
773 
774   /* MatPivotSetUp(): initialize shift context sctx */
775   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
776 
777   PetscCall(ISGetIndices(ip, &rip));
778   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
779 
780   sctx.shift_amount = 0.;
781   sctx.nshift       = 0;
782   do {
783     sctx.newshift = PETSC_FALSE;
784     for (i = 0; i < mbs; i++) {
785       rtmp[i] = 0.0;
786       jl[i]   = mbs;
787       il[0]   = 0;
788     }
789 
790     for (k = 0; k < mbs; k++) {
791       bval = ba + bi[k];
792       /* initialize k-th row by the perm[k]-th row of A */
793       jmin = ai[rip[k]];
794       jmax = ai[rip[k] + 1];
795       for (j = jmin; j < jmax; j++) {
796         col = rip[aj[j]];
797         if (col >= k) { /* only take upper triangular entry */
798           rtmp[col] = aa[j];
799           *bval++   = 0.0; /* for in-place factorization */
800         }
801       }
802 
803       /* shift the diagonal of the matrix */
804       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
805 
806       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
807       dk = rtmp[k];
808       i  = jl[k]; /* first row to be added to k_th row  */
809 
810       while (i < k) {
811         nexti = jl[i]; /* next row to be added to k_th row */
812 
813         /* compute multiplier, update diag(k) and U(i,k) */
814         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
815         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
816         dk += uikdi * ba[ili];
817         ba[ili] = uikdi; /* -U(i,k) */
818 
819         /* add multiple of row i to k-th row */
820         jmin = ili + 1;
821         jmax = bi[i + 1];
822         if (jmin < jmax) {
823           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
824           /* update il and jl for row i */
825           il[i] = jmin;
826           j     = bj[jmin];
827           jl[i] = jl[j];
828           jl[j] = i;
829         }
830         i = nexti;
831       }
832 
833       /* shift the diagonals when zero pivot is detected */
834       /* compute rs=sum of abs(off-diagonal) */
835       rs   = 0.0;
836       jmin = bi[k] + 1;
837       nz   = bi[k + 1] - jmin;
838       if (nz) {
839         bcol = bj + jmin;
840         while (nz--) {
841           rs += PetscAbsScalar(rtmp[*bcol]);
842           bcol++;
843         }
844       }
845 
846       sctx.rs = rs;
847       sctx.pv = dk;
848       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
849       if (sctx.newshift) break;
850       dk = sctx.pv;
851 
852       /* copy data into U(k,:) */
853       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
854       jmin      = bi[k] + 1;
855       jmax      = bi[k + 1];
856       if (jmin < jmax) {
857         for (j = jmin; j < jmax; j++) {
858           col       = bj[j];
859           ba[j]     = rtmp[col];
860           rtmp[col] = 0.0;
861         }
862         /* add the k-th row into il and jl */
863         il[k] = jmin;
864         i     = bj[jmin];
865         jl[k] = jl[i];
866         jl[i] = k;
867       }
868     }
869   } while (sctx.newshift);
870   PetscCall(PetscFree3(rtmp, il, jl));
871 
872   PetscCall(ISRestoreIndices(ip, &rip));
873 
874   C->assembled    = PETSC_TRUE;
875   C->preallocated = PETSC_TRUE;
876 
877   PetscCall(PetscLogFlops(C->rmap->N));
878   if (sctx.nshift) {
879     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
880       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
881     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
882       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
883     }
884   }
885   PetscFunctionReturn(PETSC_SUCCESS);
886 }
887 
888 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
889 {
890   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
891   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
892   PetscInt       i, j, am = a->mbs;
893   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
894   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
895   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
896   PetscReal      rs;
897   FactorShiftCtx sctx;
898 
899   PetscFunctionBegin;
900   /* MatPivotSetUp(): initialize shift context sctx */
901   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
902 
903   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
904 
905   do {
906     sctx.newshift = PETSC_FALSE;
907     for (i = 0; i < am; i++) {
908       rtmp[i] = 0.0;
909       jl[i]   = am;
910       il[0]   = 0;
911     }
912 
913     for (k = 0; k < am; k++) {
914       /* initialize k-th row with elements nonzero in row perm(k) of A */
915       nz   = ai[k + 1] - ai[k];
916       acol = aj + ai[k];
917       aval = aa + ai[k];
918       bval = ba + bi[k];
919       while (nz--) {
920         if (*acol < k) { /* skip lower triangular entries */
921           acol++;
922           aval++;
923         } else {
924           rtmp[*acol++] = *aval++;
925           *bval++       = 0.0; /* for in-place factorization */
926         }
927       }
928 
929       /* shift the diagonal of the matrix */
930       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
931 
932       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
933       dk = rtmp[k];
934       i  = jl[k]; /* first row to be added to k_th row  */
935 
936       while (i < k) {
937         nexti = jl[i]; /* next row to be added to k_th row */
938         /* compute multiplier, update D(k) and U(i,k) */
939         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
940         uikdi = -ba[ili] * ba[bi[i]];
941         dk += uikdi * ba[ili];
942         ba[ili] = uikdi; /* -U(i,k) */
943 
944         /* add multiple of row i to k-th row ... */
945         jmin = ili + 1;
946         nz   = bi[i + 1] - jmin;
947         if (nz > 0) {
948           bcol = bj + jmin;
949           bval = ba + jmin;
950           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
951           /* update il and jl for i-th row */
952           il[i] = jmin;
953           j     = bj[jmin];
954           jl[i] = jl[j];
955           jl[j] = i;
956         }
957         i = nexti;
958       }
959 
960       /* shift the diagonals when zero pivot is detected */
961       /* compute rs=sum of abs(off-diagonal) */
962       rs   = 0.0;
963       jmin = bi[k] + 1;
964       nz   = bi[k + 1] - jmin;
965       if (nz) {
966         bcol = bj + jmin;
967         while (nz--) {
968           rs += PetscAbsScalar(rtmp[*bcol]);
969           bcol++;
970         }
971       }
972 
973       sctx.rs = rs;
974       sctx.pv = dk;
975       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
976       if (sctx.newshift) break; /* sctx.shift_amount is updated */
977       dk = sctx.pv;
978 
979       /* copy data into U(k,:) */
980       ba[bi[k]] = 1.0 / dk;
981       jmin      = bi[k] + 1;
982       nz        = bi[k + 1] - jmin;
983       if (nz) {
984         bcol = bj + jmin;
985         bval = ba + jmin;
986         while (nz--) {
987           *bval++       = rtmp[*bcol];
988           rtmp[*bcol++] = 0.0;
989         }
990         /* add k-th row into il and jl */
991         il[k] = jmin;
992         i     = bj[jmin];
993         jl[k] = jl[i];
994         jl[i] = k;
995       }
996     }
997   } while (sctx.newshift);
998   PetscCall(PetscFree3(rtmp, il, jl));
999 
1000   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1001   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1002   C->assembled           = PETSC_TRUE;
1003   C->preallocated        = PETSC_TRUE;
1004 
1005   PetscCall(PetscLogFlops(C->rmap->N));
1006   if (sctx.nshift) {
1007     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1008       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1009     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1010       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1011     }
1012   }
1013   PetscFunctionReturn(PETSC_SUCCESS);
1014 }
1015 
1016 #include <petscbt.h>
1017 #include <../src/mat/utils/freespace.h>
1018 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1019 {
1020   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1021   Mat_SeqSBAIJ      *b;
1022   Mat                B;
1023   PetscBool          perm_identity;
1024   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1025   const PetscInt    *rip;
1026   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1027   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1028   PetscReal          fill = info->fill, levels = info->levels;
1029   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1030   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1031   PetscBT            lnkbt;
1032   PetscBool          diagDense;
1033   const PetscInt    *adiag;
1034 
1035   PetscFunctionBegin;
1036   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense));
1037   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1038 
1039   if (bs > 1) {
1040     if (!a->sbaijMat) {
1041       A->symmetric = PETSC_BOOL3_TRUE;
1042       if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
1043       PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1044     }
1045     fact->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1046 
1047     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1048     PetscFunctionReturn(PETSC_SUCCESS);
1049   }
1050 
1051   PetscCall(ISIdentity(perm, &perm_identity));
1052   PetscCall(ISGetIndices(perm, &rip));
1053 
1054   /* special case that simply copies fill pattern */
1055   if (!levels && perm_identity) {
1056     PetscCall(PetscMalloc1(am + 1, &ui));
1057     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - adiag[i]; /* ui: rowlengths - changes when !perm_identity */
1058     B = fact;
1059     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1060 
1061     b  = (Mat_SeqSBAIJ *)B->data;
1062     uj = b->j;
1063     for (i = 0; i < am; i++) {
1064       aj = a->j + adiag[i];
1065       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1066       b->ilen[i] = ui[i];
1067     }
1068     PetscCall(PetscFree(ui));
1069 
1070     B->factortype = MAT_FACTOR_NONE;
1071 
1072     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1073     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1074     B->factortype = MAT_FACTOR_ICC;
1075 
1076     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1077     PetscFunctionReturn(PETSC_SUCCESS);
1078   }
1079 
1080   /* initialization */
1081   PetscCall(PetscMalloc1(am + 1, &ui));
1082   ui[0] = 0;
1083   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1084 
1085   /* jl: linked list for storing indices of the pivot rows
1086      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1087   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1088   for (i = 0; i < am; i++) {
1089     jl[i] = am;
1090     il[i] = 0;
1091   }
1092 
1093   /* create and initialize a linked list for storing column indices of the active row k */
1094   nlnk = am + 1;
1095   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1096 
1097   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1098   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1099 
1100   current_space = free_space;
1101 
1102   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1103   current_space_lvl = free_space_lvl;
1104 
1105   for (k = 0; k < am; k++) { /* for each active row k */
1106     /* initialize lnk by the column indices of row rip[k] of A */
1107     nzk         = 0;
1108     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1109     ncols_upper = 0;
1110     cols        = cols_lvl + am;
1111     for (j = 0; j < ncols; j++) {
1112       i = rip[*(aj + ai[rip[k]] + j)];
1113       if (i >= k) { /* only take upper triangular entry */
1114         cols[ncols_upper]     = i;
1115         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1116         ncols_upper++;
1117       }
1118     }
1119     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1120     nzk += nlnk;
1121 
1122     /* update lnk by computing fill-in for each pivot row to be merged in */
1123     prow = jl[k]; /* 1st pivot row */
1124 
1125     while (prow < k) {
1126       nextprow = jl[prow];
1127 
1128       /* merge prow into k-th row */
1129       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1130       jmax  = ui[prow + 1];
1131       ncols = jmax - jmin;
1132       i     = jmin - ui[prow];
1133       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1134       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1135       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1136       nzk += nlnk;
1137 
1138       /* update il and jl for prow */
1139       if (jmin < jmax) {
1140         il[prow] = jmin;
1141 
1142         j        = *cols;
1143         jl[prow] = jl[j];
1144         jl[j]    = prow;
1145       }
1146       prow = nextprow;
1147     }
1148 
1149     /* if free space is not available, make more free space */
1150     if (current_space->local_remaining < nzk) {
1151       i = am - k + 1;                                                             /* num of unfactored rows */
1152       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1153       PetscCall(PetscFreeSpaceGet(i, &current_space));
1154       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1155       reallocs++;
1156     }
1157 
1158     /* copy data into free_space and free_space_lvl, then initialize lnk */
1159     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1160 
1161     /* add the k-th row into il and jl */
1162     if (nzk - 1 > 0) {
1163       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1164       jl[k] = jl[i];
1165       jl[i] = k;
1166       il[k] = ui[k] + 1;
1167     }
1168     uj_ptr[k]     = current_space->array;
1169     uj_lvl_ptr[k] = current_space_lvl->array;
1170 
1171     current_space->array += nzk;
1172     current_space->local_used += nzk;
1173     current_space->local_remaining -= nzk;
1174 
1175     current_space_lvl->array += nzk;
1176     current_space_lvl->local_used += nzk;
1177     current_space_lvl->local_remaining -= nzk;
1178 
1179     ui[k + 1] = ui[k] + nzk;
1180   }
1181 
1182   PetscCall(ISRestoreIndices(perm, &rip));
1183   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1184   PetscCall(PetscFree(cols_lvl));
1185 
1186   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1187   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1188   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1189   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1190   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1191 
1192   /* put together the new matrix in MATSEQSBAIJ format */
1193   B = fact;
1194   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1195 
1196   b          = (Mat_SeqSBAIJ *)B->data;
1197   b->free_a  = PETSC_TRUE;
1198   b->free_ij = PETSC_TRUE;
1199 
1200   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1201 
1202   b->j             = uj;
1203   b->i             = ui;
1204   b->diag          = NULL;
1205   b->ilen          = NULL;
1206   b->imax          = NULL;
1207   b->row           = perm;
1208   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1209 
1210   PetscCall(PetscObjectReference((PetscObject)perm));
1211 
1212   b->icol = perm;
1213 
1214   PetscCall(PetscObjectReference((PetscObject)perm));
1215   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1216 
1217   b->maxnz = b->nz = ui[am];
1218 
1219   B->info.factor_mallocs   = reallocs;
1220   B->info.fill_ratio_given = fill;
1221   if (ai[am] != 0.) {
1222     /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1223     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1224   } else {
1225     B->info.fill_ratio_needed = 0.0;
1226   }
1227 #if defined(PETSC_USE_INFO)
1228   if (ai[am] != 0) {
1229     PetscReal af = B->info.fill_ratio_needed;
1230     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1231     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1232     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1233   } else {
1234     PetscCall(PetscInfo(A, "Empty matrix\n"));
1235   }
1236 #endif
1237   if (perm_identity) {
1238     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1239     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1240     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1241   } else {
1242     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1243   }
1244   PetscFunctionReturn(PETSC_SUCCESS);
1245 }
1246 
1247 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1248 {
1249   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1250   Mat_SeqSBAIJ      *b;
1251   Mat                B;
1252   PetscBool          perm_identity;
1253   PetscReal          fill = info->fill;
1254   const PetscInt    *rip;
1255   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1256   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1257   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1258   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1259   PetscBT            lnkbt;
1260   PetscBool          diagDense;
1261 
1262   PetscFunctionBegin;
1263   if (bs > 1) { /* convert to seqsbaij */
1264     if (!a->sbaijMat) {
1265       A->symmetric = PETSC_BOOL3_TRUE;
1266       if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
1267       PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1268     }
1269     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1270 
1271     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1272     PetscFunctionReturn(PETSC_SUCCESS);
1273   }
1274   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, NULL, &diagDense));
1275   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1276 
1277   /* check whether perm is the identity mapping */
1278   PetscCall(ISIdentity(perm, &perm_identity));
1279   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1280   PetscCall(ISGetIndices(perm, &rip));
1281 
1282   /* initialization */
1283   PetscCall(PetscMalloc1(mbs + 1, &ui));
1284   ui[0] = 0;
1285 
1286   /* jl: linked list for storing indices of the pivot rows
1287      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1288   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1289   for (i = 0; i < mbs; i++) {
1290     jl[i] = mbs;
1291     il[i] = 0;
1292   }
1293 
1294   /* create and initialize a linked list for storing column indices of the active row k */
1295   nlnk = mbs + 1;
1296   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1297 
1298   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1299   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1300 
1301   current_space = free_space;
1302 
1303   for (k = 0; k < mbs; k++) { /* for each active row k */
1304     /* initialize lnk by the column indices of row rip[k] of A */
1305     nzk         = 0;
1306     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1307     ncols_upper = 0;
1308     for (j = 0; j < ncols; j++) {
1309       i = rip[*(aj + ai[rip[k]] + j)];
1310       if (i >= k) { /* only take upper triangular entry */
1311         cols[ncols_upper] = i;
1312         ncols_upper++;
1313       }
1314     }
1315     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1316     nzk += nlnk;
1317 
1318     /* update lnk by computing fill-in for each pivot row to be merged in */
1319     prow = jl[k]; /* 1st pivot row */
1320 
1321     while (prow < k) {
1322       nextprow = jl[prow];
1323       /* merge prow into k-th row */
1324       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1325       jmax   = ui[prow + 1];
1326       ncols  = jmax - jmin;
1327       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1328       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1329       nzk += nlnk;
1330 
1331       /* update il and jl for prow */
1332       if (jmin < jmax) {
1333         il[prow] = jmin;
1334         j        = *uj_ptr;
1335         jl[prow] = jl[j];
1336         jl[j]    = prow;
1337       }
1338       prow = nextprow;
1339     }
1340 
1341     /* if free space is not available, make more free space */
1342     if (current_space->local_remaining < nzk) {
1343       i = mbs - k + 1;                                                            /* num of unfactored rows */
1344       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1345       PetscCall(PetscFreeSpaceGet(i, &current_space));
1346       reallocs++;
1347     }
1348 
1349     /* copy data into free space, then initialize lnk */
1350     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1351 
1352     /* add the k-th row into il and jl */
1353     if (nzk - 1 > 0) {
1354       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1355       jl[k] = jl[i];
1356       jl[i] = k;
1357       il[k] = ui[k] + 1;
1358     }
1359     ui_ptr[k] = current_space->array;
1360     current_space->array += nzk;
1361     current_space->local_used += nzk;
1362     current_space->local_remaining -= nzk;
1363 
1364     ui[k + 1] = ui[k] + nzk;
1365   }
1366 
1367   PetscCall(ISRestoreIndices(perm, &rip));
1368   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1369 
1370   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1371   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1372   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1373   PetscCall(PetscLLDestroy(lnk, lnkbt));
1374 
1375   /* put together the new matrix in MATSEQSBAIJ format */
1376   B = fact;
1377   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1378 
1379   b          = (Mat_SeqSBAIJ *)B->data;
1380   b->free_a  = PETSC_TRUE;
1381   b->free_ij = PETSC_TRUE;
1382 
1383   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1384 
1385   b->j             = uj;
1386   b->i             = ui;
1387   b->diag          = NULL;
1388   b->ilen          = NULL;
1389   b->imax          = NULL;
1390   b->row           = perm;
1391   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1392 
1393   PetscCall(PetscObjectReference((PetscObject)perm));
1394   b->icol = perm;
1395   PetscCall(PetscObjectReference((PetscObject)perm));
1396   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1397   b->maxnz = b->nz = ui[mbs];
1398 
1399   B->info.factor_mallocs   = reallocs;
1400   B->info.fill_ratio_given = fill;
1401   if (ai[mbs] != 0.) {
1402     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1403     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1404   } else {
1405     B->info.fill_ratio_needed = 0.0;
1406   }
1407 #if defined(PETSC_USE_INFO)
1408   if (ai[mbs] != 0.) {
1409     PetscReal af = B->info.fill_ratio_needed;
1410     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1411     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1412     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1413   } else {
1414     PetscCall(PetscInfo(A, "Empty matrix\n"));
1415   }
1416 #endif
1417   if (perm_identity) {
1418     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1419   } else {
1420     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1421   }
1422   PetscFunctionReturn(PETSC_SUCCESS);
1423 }
1424 
1425 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1426 {
1427   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1428   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1429   PetscInt           i, k, n                       = a->mbs;
1430   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1431   const MatScalar   *aa = a->a, *v;
1432   PetscScalar       *x, *s, *t, *ls;
1433   const PetscScalar *b;
1434 
1435   PetscFunctionBegin;
1436   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1437   PetscCall(VecGetArrayRead(bb, &b));
1438   PetscCall(VecGetArray(xx, &x));
1439   t = a->solve_work;
1440 
1441   /* forward solve the lower triangular */
1442   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1443 
1444   for (i = 1; i < n; i++) {
1445     v  = aa + bs2 * ai[i];
1446     vi = aj + ai[i];
1447     nz = ai[i + 1] - ai[i];
1448     s  = t + bs * i;
1449     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1450     for (k = 0; k < nz; k++) {
1451       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1452       v += bs2;
1453     }
1454   }
1455 
1456   /* backward solve the upper triangular */
1457   ls = a->solve_work + A->cmap->n;
1458   for (i = n - 1; i >= 0; i--) {
1459     v  = aa + bs2 * (adiag[i + 1] + 1);
1460     vi = aj + adiag[i + 1] + 1;
1461     nz = adiag[i] - adiag[i + 1] - 1;
1462     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1463     for (k = 0; k < nz; k++) {
1464       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1465       v += bs2;
1466     }
1467     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1468     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1469   }
1470 
1471   PetscCall(VecRestoreArrayRead(bb, &b));
1472   PetscCall(VecRestoreArray(xx, &x));
1473   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1474   PetscFunctionReturn(PETSC_SUCCESS);
1475 }
1476 
1477 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1478 {
1479   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1480   IS                 iscol = a->col, isrow = a->row;
1481   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1482   PetscInt           i, m, n = a->mbs;
1483   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1484   const MatScalar   *aa = a->a, *v;
1485   PetscScalar       *x, *s, *t, *ls;
1486   const PetscScalar *b;
1487 
1488   PetscFunctionBegin;
1489   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1490   PetscCall(VecGetArrayRead(bb, &b));
1491   PetscCall(VecGetArray(xx, &x));
1492   t = a->solve_work;
1493 
1494   PetscCall(ISGetIndices(isrow, &rout));
1495   r = rout;
1496   PetscCall(ISGetIndices(iscol, &cout));
1497   c = cout;
1498 
1499   /* forward solve the lower triangular */
1500   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1501   for (i = 1; i < n; i++) {
1502     v  = aa + bs2 * ai[i];
1503     vi = aj + ai[i];
1504     nz = ai[i + 1] - ai[i];
1505     s  = t + bs * i;
1506     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1507     for (m = 0; m < nz; m++) {
1508       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1509       v += bs2;
1510     }
1511   }
1512 
1513   /* backward solve the upper triangular */
1514   ls = a->solve_work + A->cmap->n;
1515   for (i = n - 1; i >= 0; i--) {
1516     v  = aa + bs2 * (adiag[i + 1] + 1);
1517     vi = aj + adiag[i + 1] + 1;
1518     nz = adiag[i] - adiag[i + 1] - 1;
1519     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1520     for (m = 0; m < nz; m++) {
1521       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1522       v += bs2;
1523     }
1524     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1525     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1526   }
1527   PetscCall(ISRestoreIndices(isrow, &rout));
1528   PetscCall(ISRestoreIndices(iscol, &cout));
1529   PetscCall(VecRestoreArrayRead(bb, &b));
1530   PetscCall(VecRestoreArray(xx, &x));
1531   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1532   PetscFunctionReturn(PETSC_SUCCESS);
1533 }
1534