xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision d2522c19e8fa9bca20aaca277941d9a63e71db6a)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <petsc/private/kernels/blockinvert.h>
7 
8 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) {
9   Mat             C = B;
10   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
11   IS              isrow = b->row, isicol = b->icol;
12   const PetscInt *r, *ic;
13   PetscInt        i, j, k, nz, nzL, row, *pj;
14   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
15   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
16   MatScalar      *rtmp, *pc, *mwork, *pv;
17   MatScalar      *aa = a->a, *v;
18   PetscInt        flg;
19   PetscReal       shift = info->shiftamount;
20   PetscBool       allowzeropivot, zeropivotdetected;
21 
22   PetscFunctionBegin;
23   PetscCall(ISGetIndices(isrow, &r));
24   PetscCall(ISGetIndices(isicol, &ic));
25   allowzeropivot = PetscNot(A->erroriffailure);
26 
27   /* generate work space needed by the factorization */
28   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
29   PetscCall(PetscArrayzero(rtmp, bs2 * n));
30 
31   for (i = 0; i < n; i++) {
32     /* zero rtmp */
33     /* L part */
34     nz    = bi[i + 1] - bi[i];
35     bjtmp = bj + bi[i];
36     for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }
37 
38     /* U part */
39     nz    = bdiag[i] - bdiag[i + 1];
40     bjtmp = bj + bdiag[i + 1] + 1;
41     for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }
42 
43     /* load in initial (unfactored row) */
44     nz    = ai[r[i] + 1] - ai[r[i]];
45     ajtmp = aj + ai[r[i]];
46     v     = aa + bs2 * ai[r[i]];
47     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); }
48 
49     /* elimination */
50     bjtmp = bj + bi[i];
51     nzL   = bi[i + 1] - bi[i];
52     for (k = 0; k < nzL; k++) {
53       row = bjtmp[k];
54       pc  = rtmp + bs2 * row;
55       for (flg = 0, j = 0; j < bs2; j++) {
56         if (pc[j] != (PetscScalar)0.0) {
57           flg = 1;
58           break;
59         }
60       }
61       if (flg) {
62         pv = b->a + bs2 * bdiag[row];
63         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
64         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
65 
66         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
67         pv = b->a + bs2 * (bdiag[row + 1] + 1);
68         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
69         for (j = 0; j < nz; j++) {
70           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
71           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
72           v = rtmp + 4 * pj[j];
73           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
74           pv += 4;
75         }
76         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
77       }
78     }
79 
80     /* finished row so stick it into b->a */
81     /* L part */
82     pv = b->a + bs2 * bi[i];
83     pj = b->j + bi[i];
84     nz = bi[i + 1] - bi[i];
85     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }
86 
87     /* Mark diagonal and invert diagonal for simpler triangular solves */
88     pv = b->a + bs2 * bdiag[i];
89     pj = b->j + bdiag[i];
90     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
91     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
92     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
93 
94     /* U part */
95     pv = b->a + bs2 * (bdiag[i + 1] + 1);
96     pj = b->j + bdiag[i + 1] + 1;
97     nz = bdiag[i] - bdiag[i + 1] - 1;
98     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }
99   }
100 
101   PetscCall(PetscFree2(rtmp, mwork));
102   PetscCall(ISRestoreIndices(isicol, &ic));
103   PetscCall(ISRestoreIndices(isrow, &r));
104 
105   C->ops->solve          = MatSolve_SeqBAIJ_2;
106   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
107   C->assembled           = PETSC_TRUE;
108 
109   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
110   PetscFunctionReturn(0);
111 }
112 
113 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) {
114   Mat             C = B;
115   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
116   PetscInt        i, j, k, nz, nzL, row, *pj;
117   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
118   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
119   MatScalar      *rtmp, *pc, *mwork, *pv;
120   MatScalar      *aa = a->a, *v;
121   PetscInt        flg;
122   PetscReal       shift = info->shiftamount;
123   PetscBool       allowzeropivot, zeropivotdetected;
124 
125   PetscFunctionBegin;
126   allowzeropivot = PetscNot(A->erroriffailure);
127 
128   /* generate work space needed by the factorization */
129   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
130   PetscCall(PetscArrayzero(rtmp, bs2 * n));
131 
132   for (i = 0; i < n; i++) {
133     /* zero rtmp */
134     /* L part */
135     nz    = bi[i + 1] - bi[i];
136     bjtmp = bj + bi[i];
137     for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }
138 
139     /* U part */
140     nz    = bdiag[i] - bdiag[i + 1];
141     bjtmp = bj + bdiag[i + 1] + 1;
142     for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }
143 
144     /* load in initial (unfactored row) */
145     nz    = ai[i + 1] - ai[i];
146     ajtmp = aj + ai[i];
147     v     = aa + bs2 * ai[i];
148     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); }
149 
150     /* elimination */
151     bjtmp = bj + bi[i];
152     nzL   = bi[i + 1] - bi[i];
153     for (k = 0; k < nzL; k++) {
154       row = bjtmp[k];
155       pc  = rtmp + bs2 * row;
156       for (flg = 0, j = 0; j < bs2; j++) {
157         if (pc[j] != (PetscScalar)0.0) {
158           flg = 1;
159           break;
160         }
161       }
162       if (flg) {
163         pv = b->a + bs2 * bdiag[row];
164         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
165         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
166 
167         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
168         pv = b->a + bs2 * (bdiag[row + 1] + 1);
169         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
170         for (j = 0; j < nz; j++) {
171           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
172           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
173           v = rtmp + 4 * pj[j];
174           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
175           pv += 4;
176         }
177         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
178       }
179     }
180 
181     /* finished row so stick it into b->a */
182     /* L part */
183     pv = b->a + bs2 * bi[i];
184     pj = b->j + bi[i];
185     nz = bi[i + 1] - bi[i];
186     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }
187 
188     /* Mark diagonal and invert diagonal for simpler triangular solves */
189     pv = b->a + bs2 * bdiag[i];
190     pj = b->j + bdiag[i];
191     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
192     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
193     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
194 
195     /* U part */
196     /*
197     pv = b->a + bs2*bi[2*n-i];
198     pj = b->j + bi[2*n-i];
199     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
200     */
201     pv = b->a + bs2 * (bdiag[i + 1] + 1);
202     pj = b->j + bdiag[i + 1] + 1;
203     nz = bdiag[i] - bdiag[i + 1] - 1;
204     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }
205   }
206   PetscCall(PetscFree2(rtmp, mwork));
207 
208   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
209   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
210   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
211   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
212   C->assembled           = PETSC_TRUE;
213 
214   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
215   PetscFunctionReturn(0);
216 }
217 
218 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) {
219   Mat             C = B;
220   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
221   IS              isrow = b->row, isicol = b->icol;
222   const PetscInt *r, *ic;
223   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
224   PetscInt       *ajtmpold, *ajtmp, nz, row;
225   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
226   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
227   MatScalar       p1, p2, p3, p4;
228   MatScalar      *ba = b->a, *aa = a->a;
229   PetscReal       shift = info->shiftamount;
230   PetscBool       allowzeropivot, zeropivotdetected;
231 
232   PetscFunctionBegin;
233   allowzeropivot = PetscNot(A->erroriffailure);
234   PetscCall(ISGetIndices(isrow, &r));
235   PetscCall(ISGetIndices(isicol, &ic));
236   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
237 
238   for (i = 0; i < n; i++) {
239     nz    = bi[i + 1] - bi[i];
240     ajtmp = bj + bi[i];
241     for (j = 0; j < nz; j++) {
242       x    = rtmp + 4 * ajtmp[j];
243       x[0] = x[1] = x[2] = x[3] = 0.0;
244     }
245     /* load in initial (unfactored row) */
246     idx      = r[i];
247     nz       = ai[idx + 1] - ai[idx];
248     ajtmpold = aj + ai[idx];
249     v        = aa + 4 * ai[idx];
250     for (j = 0; j < nz; j++) {
251       x    = rtmp + 4 * ic[ajtmpold[j]];
252       x[0] = v[0];
253       x[1] = v[1];
254       x[2] = v[2];
255       x[3] = v[3];
256       v += 4;
257     }
258     row = *ajtmp++;
259     while (row < i) {
260       pc = rtmp + 4 * row;
261       p1 = pc[0];
262       p2 = pc[1];
263       p3 = pc[2];
264       p4 = pc[3];
265       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
266         pv    = ba + 4 * diag_offset[row];
267         pj    = bj + diag_offset[row] + 1;
268         x1    = pv[0];
269         x2    = pv[1];
270         x3    = pv[2];
271         x4    = pv[3];
272         pc[0] = m1 = p1 * x1 + p3 * x2;
273         pc[1] = m2 = p2 * x1 + p4 * x2;
274         pc[2] = m3 = p1 * x3 + p3 * x4;
275         pc[3] = m4 = p2 * x3 + p4 * x4;
276         nz         = bi[row + 1] - diag_offset[row] - 1;
277         pv += 4;
278         for (j = 0; j < nz; j++) {
279           x1 = pv[0];
280           x2 = pv[1];
281           x3 = pv[2];
282           x4 = pv[3];
283           x  = rtmp + 4 * pj[j];
284           x[0] -= m1 * x1 + m3 * x2;
285           x[1] -= m2 * x1 + m4 * x2;
286           x[2] -= m1 * x3 + m3 * x4;
287           x[3] -= m2 * x3 + m4 * x4;
288           pv += 4;
289         }
290         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
291       }
292       row = *ajtmp++;
293     }
294     /* finished row so stick it into b->a */
295     pv = ba + 4 * bi[i];
296     pj = bj + bi[i];
297     nz = bi[i + 1] - bi[i];
298     for (j = 0; j < nz; j++) {
299       x     = rtmp + 4 * pj[j];
300       pv[0] = x[0];
301       pv[1] = x[1];
302       pv[2] = x[2];
303       pv[3] = x[3];
304       pv += 4;
305     }
306     /* invert diagonal block */
307     w = ba + 4 * diag_offset[i];
308     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
309     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
310   }
311 
312   PetscCall(PetscFree(rtmp));
313   PetscCall(ISRestoreIndices(isicol, &ic));
314   PetscCall(ISRestoreIndices(isrow, &r));
315 
316   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
317   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
318   C->assembled           = PETSC_TRUE;
319 
320   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
321   PetscFunctionReturn(0);
322 }
323 /*
324       Version for when blocks are 2 by 2 Using natural ordering
325 */
326 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) {
327   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
328   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
329   PetscInt    *ajtmpold, *ajtmp, nz, row;
330   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
331   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
332   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
333   MatScalar   *ba = b->a, *aa = a->a;
334   PetscReal    shift = info->shiftamount;
335   PetscBool    allowzeropivot, zeropivotdetected;
336 
337   PetscFunctionBegin;
338   allowzeropivot = PetscNot(A->erroriffailure);
339   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
340   for (i = 0; i < n; i++) {
341     nz    = bi[i + 1] - bi[i];
342     ajtmp = bj + bi[i];
343     for (j = 0; j < nz; j++) {
344       x    = rtmp + 4 * ajtmp[j];
345       x[0] = x[1] = x[2] = x[3] = 0.0;
346     }
347     /* load in initial (unfactored row) */
348     nz       = ai[i + 1] - ai[i];
349     ajtmpold = aj + ai[i];
350     v        = aa + 4 * ai[i];
351     for (j = 0; j < nz; j++) {
352       x    = rtmp + 4 * ajtmpold[j];
353       x[0] = v[0];
354       x[1] = v[1];
355       x[2] = v[2];
356       x[3] = v[3];
357       v += 4;
358     }
359     row = *ajtmp++;
360     while (row < i) {
361       pc = rtmp + 4 * row;
362       p1 = pc[0];
363       p2 = pc[1];
364       p3 = pc[2];
365       p4 = pc[3];
366       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
367         pv    = ba + 4 * diag_offset[row];
368         pj    = bj + diag_offset[row] + 1;
369         x1    = pv[0];
370         x2    = pv[1];
371         x3    = pv[2];
372         x4    = pv[3];
373         pc[0] = m1 = p1 * x1 + p3 * x2;
374         pc[1] = m2 = p2 * x1 + p4 * x2;
375         pc[2] = m3 = p1 * x3 + p3 * x4;
376         pc[3] = m4 = p2 * x3 + p4 * x4;
377         nz         = bi[row + 1] - diag_offset[row] - 1;
378         pv += 4;
379         for (j = 0; j < nz; j++) {
380           x1 = pv[0];
381           x2 = pv[1];
382           x3 = pv[2];
383           x4 = pv[3];
384           x  = rtmp + 4 * pj[j];
385           x[0] -= m1 * x1 + m3 * x2;
386           x[1] -= m2 * x1 + m4 * x2;
387           x[2] -= m1 * x3 + m3 * x4;
388           x[3] -= m2 * x3 + m4 * x4;
389           pv += 4;
390         }
391         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
392       }
393       row = *ajtmp++;
394     }
395     /* finished row so stick it into b->a */
396     pv = ba + 4 * bi[i];
397     pj = bj + bi[i];
398     nz = bi[i + 1] - bi[i];
399     for (j = 0; j < nz; j++) {
400       x     = rtmp + 4 * pj[j];
401       pv[0] = x[0];
402       pv[1] = x[1];
403       pv[2] = x[2];
404       pv[3] = x[3];
405       /*
406       printf(" col %d:",pj[j]);
407       PetscInt j1;
408       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
409       printf("\n");
410       */
411       pv += 4;
412     }
413     /* invert diagonal block */
414     w = ba + 4 * diag_offset[i];
415     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
416     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
417   }
418 
419   PetscCall(PetscFree(rtmp));
420 
421   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
422   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
423   C->assembled           = PETSC_TRUE;
424 
425   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
426   PetscFunctionReturn(0);
427 }
428 
429 /* ----------------------------------------------------------- */
430 /*
431      Version for when blocks are 1 by 1.
432 */
433 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) {
434   Mat              C = B;
435   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
436   IS               isrow = b->row, isicol = b->icol;
437   const PetscInt  *r, *ic, *ics;
438   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
439   PetscInt         i, j, k, nz, nzL, row, *pj;
440   const PetscInt  *ajtmp, *bjtmp;
441   MatScalar       *rtmp, *pc, multiplier, *pv;
442   const MatScalar *aa = a->a, *v;
443   PetscBool        row_identity, col_identity;
444   FactorShiftCtx   sctx;
445   const PetscInt  *ddiag;
446   PetscReal        rs;
447   MatScalar        d;
448 
449   PetscFunctionBegin;
450   /* MatPivotSetUp(): initialize shift context sctx */
451   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
452 
453   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
454     ddiag          = a->diag;
455     sctx.shift_top = info->zeropivot;
456     for (i = 0; i < n; i++) {
457       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
458       d  = (aa)[ddiag[i]];
459       rs = -PetscAbsScalar(d) - PetscRealPart(d);
460       v  = aa + ai[i];
461       nz = ai[i + 1] - ai[i];
462       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
463       if (rs > sctx.shift_top) sctx.shift_top = rs;
464     }
465     sctx.shift_top *= 1.1;
466     sctx.nshift_max = 5;
467     sctx.shift_lo   = 0.;
468     sctx.shift_hi   = 1.;
469   }
470 
471   PetscCall(ISGetIndices(isrow, &r));
472   PetscCall(ISGetIndices(isicol, &ic));
473   PetscCall(PetscMalloc1(n + 1, &rtmp));
474   ics = ic;
475 
476   do {
477     sctx.newshift = PETSC_FALSE;
478     for (i = 0; i < n; i++) {
479       /* zero rtmp */
480       /* L part */
481       nz    = bi[i + 1] - bi[i];
482       bjtmp = bj + bi[i];
483       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
484 
485       /* U part */
486       nz    = bdiag[i] - bdiag[i + 1];
487       bjtmp = bj + bdiag[i + 1] + 1;
488       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
489 
490       /* load in initial (unfactored row) */
491       nz    = ai[r[i] + 1] - ai[r[i]];
492       ajtmp = aj + ai[r[i]];
493       v     = aa + ai[r[i]];
494       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
495 
496       /* ZeropivotApply() */
497       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
498 
499       /* elimination */
500       bjtmp = bj + bi[i];
501       row   = *bjtmp++;
502       nzL   = bi[i + 1] - bi[i];
503       for (k = 0; k < nzL; k++) {
504         pc = rtmp + row;
505         if (*pc != (PetscScalar)0.0) {
506           pv         = b->a + bdiag[row];
507           multiplier = *pc * (*pv);
508           *pc        = multiplier;
509 
510           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
511           pv = b->a + bdiag[row + 1] + 1;
512           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
513           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
514           PetscCall(PetscLogFlops(2.0 * nz));
515         }
516         row = *bjtmp++;
517       }
518 
519       /* finished row so stick it into b->a */
520       rs = 0.0;
521       /* L part */
522       pv = b->a + bi[i];
523       pj = b->j + bi[i];
524       nz = bi[i + 1] - bi[i];
525       for (j = 0; j < nz; j++) {
526         pv[j] = rtmp[pj[j]];
527         rs += PetscAbsScalar(pv[j]);
528       }
529 
530       /* U part */
531       pv = b->a + bdiag[i + 1] + 1;
532       pj = b->j + bdiag[i + 1] + 1;
533       nz = bdiag[i] - bdiag[i + 1] - 1;
534       for (j = 0; j < nz; j++) {
535         pv[j] = rtmp[pj[j]];
536         rs += PetscAbsScalar(pv[j]);
537       }
538 
539       sctx.rs = rs;
540       sctx.pv = rtmp[i];
541       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
542       if (sctx.newshift) break; /* break for-loop */
543       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
544 
545       /* Mark diagonal and invert diagonal for simpler triangular solves */
546       pv  = b->a + bdiag[i];
547       *pv = (PetscScalar)1.0 / rtmp[i];
548 
549     } /* endof for (i=0; i<n; i++) { */
550 
551     /* MatPivotRefine() */
552     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
553       /*
554        * if no shift in this attempt & shifting & started shifting & can refine,
555        * then try lower shift
556        */
557       sctx.shift_hi       = sctx.shift_fraction;
558       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
559       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
560       sctx.newshift       = PETSC_TRUE;
561       sctx.nshift++;
562     }
563   } while (sctx.newshift);
564 
565   PetscCall(PetscFree(rtmp));
566   PetscCall(ISRestoreIndices(isicol, &ic));
567   PetscCall(ISRestoreIndices(isrow, &r));
568 
569   PetscCall(ISIdentity(isrow, &row_identity));
570   PetscCall(ISIdentity(isicol, &col_identity));
571   if (row_identity && col_identity) {
572     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
573     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
574     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
575     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
576   } else {
577     C->ops->solve          = MatSolve_SeqBAIJ_1;
578     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
579   }
580   C->assembled = PETSC_TRUE;
581   PetscCall(PetscLogFlops(C->cmap->n));
582 
583   /* MatShiftView(A,info,&sctx) */
584   if (sctx.nshift) {
585     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
586       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
587     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
588       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
589     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
590       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
591     }
592   }
593   PetscFunctionReturn(0);
594 }
595 
596 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) {
597   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
598   IS              isrow = b->row, isicol = b->icol;
599   const PetscInt *r, *ic;
600   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
601   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
602   PetscInt       *diag_offset = b->diag, diag, *pj;
603   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
604   MatScalar      *ba = b->a, *aa = a->a;
605   PetscBool       row_identity, col_identity;
606 
607   PetscFunctionBegin;
608   PetscCall(ISGetIndices(isrow, &r));
609   PetscCall(ISGetIndices(isicol, &ic));
610   PetscCall(PetscMalloc1(n + 1, &rtmp));
611 
612   for (i = 0; i < n; i++) {
613     nz    = bi[i + 1] - bi[i];
614     ajtmp = bj + bi[i];
615     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
616 
617     /* load in initial (unfactored row) */
618     nz       = ai[r[i] + 1] - ai[r[i]];
619     ajtmpold = aj + ai[r[i]];
620     v        = aa + ai[r[i]];
621     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
622 
623     row = *ajtmp++;
624     while (row < i) {
625       pc = rtmp + row;
626       if (*pc != 0.0) {
627         pv         = ba + diag_offset[row];
628         pj         = bj + diag_offset[row] + 1;
629         multiplier = *pc * *pv++;
630         *pc        = multiplier;
631         nz         = bi[row + 1] - diag_offset[row] - 1;
632         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
633         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
634       }
635       row = *ajtmp++;
636     }
637     /* finished row so stick it into b->a */
638     pv = ba + bi[i];
639     pj = bj + bi[i];
640     nz = bi[i + 1] - bi[i];
641     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
642     diag = diag_offset[i] - bi[i];
643     /* check pivot entry for current row */
644     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
645     pv[diag] = 1.0 / pv[diag];
646   }
647 
648   PetscCall(PetscFree(rtmp));
649   PetscCall(ISRestoreIndices(isicol, &ic));
650   PetscCall(ISRestoreIndices(isrow, &r));
651   PetscCall(ISIdentity(isrow, &row_identity));
652   PetscCall(ISIdentity(isicol, &col_identity));
653   if (row_identity && col_identity) {
654     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
655     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
656   } else {
657     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
658     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
659   }
660   C->assembled = PETSC_TRUE;
661   PetscCall(PetscLogFlops(C->cmap->n));
662   PetscFunctionReturn(0);
663 }
664 
665 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) {
666   PetscFunctionBegin;
667   *type = MATSOLVERPETSC;
668   PetscFunctionReturn(0);
669 }
670 
671 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) {
672   PetscInt n = A->rmap->n;
673 
674   PetscFunctionBegin;
675 #if defined(PETSC_USE_COMPLEX)
676   PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
677 #endif
678   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
679   PetscCall(MatSetSizes(*B, n, n, n, n));
680   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
681     PetscCall(MatSetType(*B, MATSEQBAIJ));
682 
683     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
684     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
685     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
686     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
687     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
688   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
689     PetscCall(MatSetType(*B, MATSEQSBAIJ));
690     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
691 
692     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
693     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
694     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
695     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
696     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
697   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
698   (*B)->factortype     = ftype;
699   (*B)->canuseordering = PETSC_TRUE;
700 
701   PetscCall(PetscFree((*B)->solvertype));
702   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
703   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
704   PetscFunctionReturn(0);
705 }
706 
707 /* ----------------------------------------------------------- */
708 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) {
709   Mat C;
710 
711   PetscFunctionBegin;
712   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
713   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
714   PetscCall(MatLUFactorNumeric(C, A, info));
715 
716   A->ops->solve          = C->ops->solve;
717   A->ops->solvetranspose = C->ops->solvetranspose;
718 
719   PetscCall(MatHeaderMerge(A, &C));
720   PetscCall(PetscLogObjectParent((PetscObject)A, (PetscObject)((Mat_SeqBAIJ *)(A->data))->icol));
721   PetscFunctionReturn(0);
722 }
723 
724 #include <../src/mat/impls/sbaij/seq/sbaij.h>
725 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) {
726   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
727   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
728   IS              ip = b->row;
729   const PetscInt *rip;
730   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
731   PetscInt       *ai = a->i, *aj = a->j;
732   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
733   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
734   PetscReal       rs;
735   FactorShiftCtx  sctx;
736 
737   PetscFunctionBegin;
738   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
739     if (!a->sbaijMat) { PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); }
740     PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info));
741     PetscCall(MatDestroy(&a->sbaijMat));
742     PetscFunctionReturn(0);
743   }
744 
745   /* MatPivotSetUp(): initialize shift context sctx */
746   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
747 
748   PetscCall(ISGetIndices(ip, &rip));
749   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
750 
751   sctx.shift_amount = 0.;
752   sctx.nshift       = 0;
753   do {
754     sctx.newshift = PETSC_FALSE;
755     for (i = 0; i < mbs; i++) {
756       rtmp[i] = 0.0;
757       jl[i]   = mbs;
758       il[0]   = 0;
759     }
760 
761     for (k = 0; k < mbs; k++) {
762       bval = ba + bi[k];
763       /* initialize k-th row by the perm[k]-th row of A */
764       jmin = ai[rip[k]];
765       jmax = ai[rip[k] + 1];
766       for (j = jmin; j < jmax; j++) {
767         col = rip[aj[j]];
768         if (col >= k) { /* only take upper triangular entry */
769           rtmp[col] = aa[j];
770           *bval++   = 0.0; /* for in-place factorization */
771         }
772       }
773 
774       /* shift the diagonal of the matrix */
775       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
776 
777       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
778       dk = rtmp[k];
779       i  = jl[k]; /* first row to be added to k_th row  */
780 
781       while (i < k) {
782         nexti = jl[i]; /* next row to be added to k_th row */
783 
784         /* compute multiplier, update diag(k) and U(i,k) */
785         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
786         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
787         dk += uikdi * ba[ili];
788         ba[ili] = uikdi; /* -U(i,k) */
789 
790         /* add multiple of row i to k-th row */
791         jmin = ili + 1;
792         jmax = bi[i + 1];
793         if (jmin < jmax) {
794           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
795           /* update il and jl for row i */
796           il[i] = jmin;
797           j     = bj[jmin];
798           jl[i] = jl[j];
799           jl[j] = i;
800         }
801         i = nexti;
802       }
803 
804       /* shift the diagonals when zero pivot is detected */
805       /* compute rs=sum of abs(off-diagonal) */
806       rs   = 0.0;
807       jmin = bi[k] + 1;
808       nz   = bi[k + 1] - jmin;
809       if (nz) {
810         bcol = bj + jmin;
811         while (nz--) {
812           rs += PetscAbsScalar(rtmp[*bcol]);
813           bcol++;
814         }
815       }
816 
817       sctx.rs = rs;
818       sctx.pv = dk;
819       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
820       if (sctx.newshift) break;
821       dk = sctx.pv;
822 
823       /* copy data into U(k,:) */
824       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
825       jmin      = bi[k] + 1;
826       jmax      = bi[k + 1];
827       if (jmin < jmax) {
828         for (j = jmin; j < jmax; j++) {
829           col       = bj[j];
830           ba[j]     = rtmp[col];
831           rtmp[col] = 0.0;
832         }
833         /* add the k-th row into il and jl */
834         il[k] = jmin;
835         i     = bj[jmin];
836         jl[k] = jl[i];
837         jl[i] = k;
838       }
839     }
840   } while (sctx.newshift);
841   PetscCall(PetscFree3(rtmp, il, jl));
842 
843   PetscCall(ISRestoreIndices(ip, &rip));
844 
845   C->assembled    = PETSC_TRUE;
846   C->preallocated = PETSC_TRUE;
847 
848   PetscCall(PetscLogFlops(C->rmap->N));
849   if (sctx.nshift) {
850     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
851       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
852     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
853       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
854     }
855   }
856   PetscFunctionReturn(0);
857 }
858 
859 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) {
860   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
861   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
862   PetscInt       i, j, am = a->mbs;
863   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
864   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
865   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
866   PetscReal      rs;
867   FactorShiftCtx sctx;
868 
869   PetscFunctionBegin;
870   /* MatPivotSetUp(): initialize shift context sctx */
871   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
872 
873   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
874 
875   do {
876     sctx.newshift = PETSC_FALSE;
877     for (i = 0; i < am; i++) {
878       rtmp[i] = 0.0;
879       jl[i]   = am;
880       il[0]   = 0;
881     }
882 
883     for (k = 0; k < am; k++) {
884       /* initialize k-th row with elements nonzero in row perm(k) of A */
885       nz   = ai[k + 1] - ai[k];
886       acol = aj + ai[k];
887       aval = aa + ai[k];
888       bval = ba + bi[k];
889       while (nz--) {
890         if (*acol < k) { /* skip lower triangular entries */
891           acol++;
892           aval++;
893         } else {
894           rtmp[*acol++] = *aval++;
895           *bval++       = 0.0; /* for in-place factorization */
896         }
897       }
898 
899       /* shift the diagonal of the matrix */
900       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
901 
902       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
903       dk = rtmp[k];
904       i  = jl[k]; /* first row to be added to k_th row  */
905 
906       while (i < k) {
907         nexti = jl[i]; /* next row to be added to k_th row */
908         /* compute multiplier, update D(k) and U(i,k) */
909         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
910         uikdi = -ba[ili] * ba[bi[i]];
911         dk += uikdi * ba[ili];
912         ba[ili] = uikdi; /* -U(i,k) */
913 
914         /* add multiple of row i to k-th row ... */
915         jmin = ili + 1;
916         nz   = bi[i + 1] - jmin;
917         if (nz > 0) {
918           bcol = bj + jmin;
919           bval = ba + jmin;
920           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
921           /* update il and jl for i-th row */
922           il[i] = jmin;
923           j     = bj[jmin];
924           jl[i] = jl[j];
925           jl[j] = i;
926         }
927         i = nexti;
928       }
929 
930       /* shift the diagonals when zero pivot is detected */
931       /* compute rs=sum of abs(off-diagonal) */
932       rs   = 0.0;
933       jmin = bi[k] + 1;
934       nz   = bi[k + 1] - jmin;
935       if (nz) {
936         bcol = bj + jmin;
937         while (nz--) {
938           rs += PetscAbsScalar(rtmp[*bcol]);
939           bcol++;
940         }
941       }
942 
943       sctx.rs = rs;
944       sctx.pv = dk;
945       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
946       if (sctx.newshift) break; /* sctx.shift_amount is updated */
947       dk = sctx.pv;
948 
949       /* copy data into U(k,:) */
950       ba[bi[k]] = 1.0 / dk;
951       jmin      = bi[k] + 1;
952       nz        = bi[k + 1] - jmin;
953       if (nz) {
954         bcol = bj + jmin;
955         bval = ba + jmin;
956         while (nz--) {
957           *bval++       = rtmp[*bcol];
958           rtmp[*bcol++] = 0.0;
959         }
960         /* add k-th row into il and jl */
961         il[k] = jmin;
962         i     = bj[jmin];
963         jl[k] = jl[i];
964         jl[i] = k;
965       }
966     }
967   } while (sctx.newshift);
968   PetscCall(PetscFree3(rtmp, il, jl));
969 
970   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
971   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
972   C->assembled           = PETSC_TRUE;
973   C->preallocated        = PETSC_TRUE;
974 
975   PetscCall(PetscLogFlops(C->rmap->N));
976   if (sctx.nshift) {
977     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
978       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
979     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
980       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
981     }
982   }
983   PetscFunctionReturn(0);
984 }
985 
986 #include <petscbt.h>
987 #include <../src/mat/utils/freespace.h>
988 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
989   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
990   Mat_SeqSBAIJ      *b;
991   Mat                B;
992   PetscBool          perm_identity, missing;
993   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
994   const PetscInt    *rip;
995   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
996   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
997   PetscReal          fill = info->fill, levels = info->levels;
998   PetscFreeSpaceList free_space = NULL, current_space = NULL;
999   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1000   PetscBT            lnkbt;
1001 
1002   PetscFunctionBegin;
1003   PetscCall(MatMissingDiagonal(A, &missing, &i));
1004   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1005 
1006   if (bs > 1) {
1007     if (!a->sbaijMat) { PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); }
1008     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1009 
1010     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1011     PetscFunctionReturn(0);
1012   }
1013 
1014   PetscCall(ISIdentity(perm, &perm_identity));
1015   PetscCall(ISGetIndices(perm, &rip));
1016 
1017   /* special case that simply copies fill pattern */
1018   if (!levels && perm_identity) {
1019     PetscCall(PetscMalloc1(am + 1, &ui));
1020     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1021     B = fact;
1022     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1023 
1024     b  = (Mat_SeqSBAIJ *)B->data;
1025     uj = b->j;
1026     for (i = 0; i < am; i++) {
1027       aj = a->j + a->diag[i];
1028       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1029       b->ilen[i] = ui[i];
1030     }
1031     PetscCall(PetscFree(ui));
1032 
1033     B->factortype = MAT_FACTOR_NONE;
1034 
1035     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1036     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1037     B->factortype = MAT_FACTOR_ICC;
1038 
1039     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1040     PetscFunctionReturn(0);
1041   }
1042 
1043   /* initialization */
1044   PetscCall(PetscMalloc1(am + 1, &ui));
1045   ui[0] = 0;
1046   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1047 
1048   /* jl: linked list for storing indices of the pivot rows
1049      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1050   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1051   for (i = 0; i < am; i++) {
1052     jl[i] = am;
1053     il[i] = 0;
1054   }
1055 
1056   /* create and initialize a linked list for storing column indices of the active row k */
1057   nlnk = am + 1;
1058   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1059 
1060   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1061   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1062 
1063   current_space = free_space;
1064 
1065   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1066   current_space_lvl = free_space_lvl;
1067 
1068   for (k = 0; k < am; k++) { /* for each active row k */
1069     /* initialize lnk by the column indices of row rip[k] of A */
1070     nzk         = 0;
1071     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1072     ncols_upper = 0;
1073     cols        = cols_lvl + am;
1074     for (j = 0; j < ncols; j++) {
1075       i = rip[*(aj + ai[rip[k]] + j)];
1076       if (i >= k) { /* only take upper triangular entry */
1077         cols[ncols_upper]     = i;
1078         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1079         ncols_upper++;
1080       }
1081     }
1082     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1083     nzk += nlnk;
1084 
1085     /* update lnk by computing fill-in for each pivot row to be merged in */
1086     prow = jl[k]; /* 1st pivot row */
1087 
1088     while (prow < k) {
1089       nextprow = jl[prow];
1090 
1091       /* merge prow into k-th row */
1092       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1093       jmax  = ui[prow + 1];
1094       ncols = jmax - jmin;
1095       i     = jmin - ui[prow];
1096       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1097       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1098       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1099       nzk += nlnk;
1100 
1101       /* update il and jl for prow */
1102       if (jmin < jmax) {
1103         il[prow] = jmin;
1104 
1105         j        = *cols;
1106         jl[prow] = jl[j];
1107         jl[j]    = prow;
1108       }
1109       prow = nextprow;
1110     }
1111 
1112     /* if free space is not available, make more free space */
1113     if (current_space->local_remaining < nzk) {
1114       i = am - k + 1;                                                             /* num of unfactored rows */
1115       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1116       PetscCall(PetscFreeSpaceGet(i, &current_space));
1117       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1118       reallocs++;
1119     }
1120 
1121     /* copy data into free_space and free_space_lvl, then initialize lnk */
1122     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1123 
1124     /* add the k-th row into il and jl */
1125     if (nzk - 1 > 0) {
1126       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1127       jl[k] = jl[i];
1128       jl[i] = k;
1129       il[k] = ui[k] + 1;
1130     }
1131     uj_ptr[k]     = current_space->array;
1132     uj_lvl_ptr[k] = current_space_lvl->array;
1133 
1134     current_space->array += nzk;
1135     current_space->local_used += nzk;
1136     current_space->local_remaining -= nzk;
1137 
1138     current_space_lvl->array += nzk;
1139     current_space_lvl->local_used += nzk;
1140     current_space_lvl->local_remaining -= nzk;
1141 
1142     ui[k + 1] = ui[k] + nzk;
1143   }
1144 
1145   PetscCall(ISRestoreIndices(perm, &rip));
1146   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1147   PetscCall(PetscFree(cols_lvl));
1148 
1149   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1150   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1151   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1152   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1153   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1154 
1155   /* put together the new matrix in MATSEQSBAIJ format */
1156   B = fact;
1157   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1158 
1159   b               = (Mat_SeqSBAIJ *)B->data;
1160   b->singlemalloc = PETSC_FALSE;
1161   b->free_a       = PETSC_TRUE;
1162   b->free_ij      = PETSC_TRUE;
1163 
1164   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1165 
1166   b->j             = uj;
1167   b->i             = ui;
1168   b->diag          = NULL;
1169   b->ilen          = NULL;
1170   b->imax          = NULL;
1171   b->row           = perm;
1172   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1173 
1174   PetscCall(PetscObjectReference((PetscObject)perm));
1175 
1176   b->icol = perm;
1177 
1178   PetscCall(PetscObjectReference((PetscObject)perm));
1179   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1180   PetscCall(PetscLogObjectMemory((PetscObject)B, (ui[am] - am) * (sizeof(PetscInt) + sizeof(MatScalar))));
1181 
1182   b->maxnz = b->nz = ui[am];
1183 
1184   B->info.factor_mallocs   = reallocs;
1185   B->info.fill_ratio_given = fill;
1186   if (ai[am] != 0.) {
1187     /* nonzeros in lower triangular part of A (includign diagonals)= (ai[am]+am)/2 */
1188     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1189   } else {
1190     B->info.fill_ratio_needed = 0.0;
1191   }
1192 #if defined(PETSC_USE_INFO)
1193   if (ai[am] != 0) {
1194     PetscReal af = B->info.fill_ratio_needed;
1195     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1196     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1197     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1198   } else {
1199     PetscCall(PetscInfo(A, "Empty matrix\n"));
1200   }
1201 #endif
1202   if (perm_identity) {
1203     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1204     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1205     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1206   } else {
1207     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1208   }
1209   PetscFunctionReturn(0);
1210 }
1211 
1212 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
1213   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1214   Mat_SeqSBAIJ      *b;
1215   Mat                B;
1216   PetscBool          perm_identity, missing;
1217   PetscReal          fill = info->fill;
1218   const PetscInt    *rip;
1219   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1220   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1221   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1222   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1223   PetscBT            lnkbt;
1224 
1225   PetscFunctionBegin;
1226   if (bs > 1) { /* convert to seqsbaij */
1227     if (!a->sbaijMat) { PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); }
1228     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1229 
1230     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1231     PetscFunctionReturn(0);
1232   }
1233 
1234   PetscCall(MatMissingDiagonal(A, &missing, &i));
1235   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1236 
1237   /* check whether perm is the identity mapping */
1238   PetscCall(ISIdentity(perm, &perm_identity));
1239   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1240   PetscCall(ISGetIndices(perm, &rip));
1241 
1242   /* initialization */
1243   PetscCall(PetscMalloc1(mbs + 1, &ui));
1244   ui[0] = 0;
1245 
1246   /* jl: linked list for storing indices of the pivot rows
1247      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1248   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1249   for (i = 0; i < mbs; i++) {
1250     jl[i] = mbs;
1251     il[i] = 0;
1252   }
1253 
1254   /* create and initialize a linked list for storing column indices of the active row k */
1255   nlnk = mbs + 1;
1256   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1257 
1258   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1259   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1260 
1261   current_space = free_space;
1262 
1263   for (k = 0; k < mbs; k++) { /* for each active row k */
1264     /* initialize lnk by the column indices of row rip[k] of A */
1265     nzk         = 0;
1266     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1267     ncols_upper = 0;
1268     for (j = 0; j < ncols; j++) {
1269       i = rip[*(aj + ai[rip[k]] + j)];
1270       if (i >= k) { /* only take upper triangular entry */
1271         cols[ncols_upper] = i;
1272         ncols_upper++;
1273       }
1274     }
1275     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1276     nzk += nlnk;
1277 
1278     /* update lnk by computing fill-in for each pivot row to be merged in */
1279     prow = jl[k]; /* 1st pivot row */
1280 
1281     while (prow < k) {
1282       nextprow = jl[prow];
1283       /* merge prow into k-th row */
1284       jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1285       jmax     = ui[prow + 1];
1286       ncols    = jmax - jmin;
1287       uj_ptr   = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1288       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1289       nzk += nlnk;
1290 
1291       /* update il and jl for prow */
1292       if (jmin < jmax) {
1293         il[prow] = jmin;
1294         j        = *uj_ptr;
1295         jl[prow] = jl[j];
1296         jl[j]    = prow;
1297       }
1298       prow = nextprow;
1299     }
1300 
1301     /* if free space is not available, make more free space */
1302     if (current_space->local_remaining < nzk) {
1303       i = mbs - k + 1;                                                            /* num of unfactored rows */
1304       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1305       PetscCall(PetscFreeSpaceGet(i, &current_space));
1306       reallocs++;
1307     }
1308 
1309     /* copy data into free space, then initialize lnk */
1310     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1311 
1312     /* add the k-th row into il and jl */
1313     if (nzk - 1 > 0) {
1314       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1315       jl[k] = jl[i];
1316       jl[i] = k;
1317       il[k] = ui[k] + 1;
1318     }
1319     ui_ptr[k] = current_space->array;
1320     current_space->array += nzk;
1321     current_space->local_used += nzk;
1322     current_space->local_remaining -= nzk;
1323 
1324     ui[k + 1] = ui[k] + nzk;
1325   }
1326 
1327   PetscCall(ISRestoreIndices(perm, &rip));
1328   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1329 
1330   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1331   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1332   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1333   PetscCall(PetscLLDestroy(lnk, lnkbt));
1334 
1335   /* put together the new matrix in MATSEQSBAIJ format */
1336   B = fact;
1337   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1338 
1339   b               = (Mat_SeqSBAIJ *)B->data;
1340   b->singlemalloc = PETSC_FALSE;
1341   b->free_a       = PETSC_TRUE;
1342   b->free_ij      = PETSC_TRUE;
1343 
1344   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1345 
1346   b->j             = uj;
1347   b->i             = ui;
1348   b->diag          = NULL;
1349   b->ilen          = NULL;
1350   b->imax          = NULL;
1351   b->row           = perm;
1352   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1353 
1354   PetscCall(PetscObjectReference((PetscObject)perm));
1355   b->icol = perm;
1356   PetscCall(PetscObjectReference((PetscObject)perm));
1357   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1358   PetscCall(PetscLogObjectMemory((PetscObject)B, (ui[mbs] - mbs) * (sizeof(PetscInt) + sizeof(MatScalar))));
1359   b->maxnz = b->nz = ui[mbs];
1360 
1361   B->info.factor_mallocs   = reallocs;
1362   B->info.fill_ratio_given = fill;
1363   if (ai[mbs] != 0.) {
1364     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1365     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1366   } else {
1367     B->info.fill_ratio_needed = 0.0;
1368   }
1369 #if defined(PETSC_USE_INFO)
1370   if (ai[mbs] != 0.) {
1371     PetscReal af = B->info.fill_ratio_needed;
1372     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1373     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1374     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1375   } else {
1376     PetscCall(PetscInfo(A, "Empty matrix\n"));
1377   }
1378 #endif
1379   if (perm_identity) {
1380     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1381   } else {
1382     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1383   }
1384   PetscFunctionReturn(0);
1385 }
1386 
1387 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) {
1388   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1389   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1390   PetscInt           i, k, n                       = a->mbs;
1391   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1392   const MatScalar   *aa = a->a, *v;
1393   PetscScalar       *x, *s, *t, *ls;
1394   const PetscScalar *b;
1395 
1396   PetscFunctionBegin;
1397   PetscCall(VecGetArrayRead(bb, &b));
1398   PetscCall(VecGetArray(xx, &x));
1399   t = a->solve_work;
1400 
1401   /* forward solve the lower triangular */
1402   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1403 
1404   for (i = 1; i < n; i++) {
1405     v  = aa + bs2 * ai[i];
1406     vi = aj + ai[i];
1407     nz = ai[i + 1] - ai[i];
1408     s  = t + bs * i;
1409     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1410     for (k = 0; k < nz; k++) {
1411       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1412       v += bs2;
1413     }
1414   }
1415 
1416   /* backward solve the upper triangular */
1417   ls = a->solve_work + A->cmap->n;
1418   for (i = n - 1; i >= 0; i--) {
1419     v  = aa + bs2 * (adiag[i + 1] + 1);
1420     vi = aj + adiag[i + 1] + 1;
1421     nz = adiag[i] - adiag[i + 1] - 1;
1422     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1423     for (k = 0; k < nz; k++) {
1424       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1425       v += bs2;
1426     }
1427     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1428     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1429   }
1430 
1431   PetscCall(VecRestoreArrayRead(bb, &b));
1432   PetscCall(VecRestoreArray(xx, &x));
1433   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1434   PetscFunctionReturn(0);
1435 }
1436 
1437 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) {
1438   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1439   IS                 iscol = a->col, isrow = a->row;
1440   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1441   PetscInt           i, m, n = a->mbs;
1442   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1443   const MatScalar   *aa = a->a, *v;
1444   PetscScalar       *x, *s, *t, *ls;
1445   const PetscScalar *b;
1446 
1447   PetscFunctionBegin;
1448   PetscCall(VecGetArrayRead(bb, &b));
1449   PetscCall(VecGetArray(xx, &x));
1450   t = a->solve_work;
1451 
1452   PetscCall(ISGetIndices(isrow, &rout));
1453   r = rout;
1454   PetscCall(ISGetIndices(iscol, &cout));
1455   c = cout;
1456 
1457   /* forward solve the lower triangular */
1458   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1459   for (i = 1; i < n; i++) {
1460     v  = aa + bs2 * ai[i];
1461     vi = aj + ai[i];
1462     nz = ai[i + 1] - ai[i];
1463     s  = t + bs * i;
1464     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1465     for (m = 0; m < nz; m++) {
1466       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1467       v += bs2;
1468     }
1469   }
1470 
1471   /* backward solve the upper triangular */
1472   ls = a->solve_work + A->cmap->n;
1473   for (i = n - 1; i >= 0; i--) {
1474     v  = aa + bs2 * (adiag[i + 1] + 1);
1475     vi = aj + adiag[i + 1] + 1;
1476     nz = adiag[i] - adiag[i + 1] - 1;
1477     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1478     for (m = 0; m < nz; m++) {
1479       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1480       v += bs2;
1481     }
1482     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1483     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1484   }
1485   PetscCall(ISRestoreIndices(isrow, &rout));
1486   PetscCall(ISRestoreIndices(iscol, &cout));
1487   PetscCall(VecRestoreArrayRead(bb, &b));
1488   PetscCall(VecRestoreArray(xx, &x));
1489   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1490   PetscFunctionReturn(0);
1491 }
1492 
1493 /*
1494     For each block in an block array saves the largest absolute value in the block into another array
1495 */
1496 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray) {
1497   PetscInt i, j;
1498 
1499   PetscFunctionBegin;
1500   PetscCall(PetscArrayzero(absarray, nbs + 1));
1501   for (i = 0; i < nbs; i++) {
1502     for (j = 0; j < bs2; j++) {
1503       if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1504     }
1505   }
1506   PetscFunctionReturn(0);
1507 }
1508 
1509 /*
1510      This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1511 */
1512 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact) {
1513   Mat             B = *fact;
1514   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b;
1515   IS              isicol;
1516   const PetscInt *r, *ic;
1517   PetscInt        i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1518   PetscInt       *bi, *bj, *bdiag;
1519 
1520   PetscInt   row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1521   PetscInt   nlnk, *lnk;
1522   PetscBT    lnkbt;
1523   PetscBool  row_identity, icol_identity;
1524   MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1525   PetscInt   j, nz, *pj, *bjtmp, k, ncut, *jtmp;
1526 
1527   PetscReal  dt = info->dt; /* shift=info->shiftamount; */
1528   PetscInt   nnz_max;
1529   PetscBool  missing;
1530   PetscReal *vtmp_abs;
1531   MatScalar *v_work;
1532   PetscInt  *v_pivots;
1533   PetscBool  allowzeropivot, zeropivotdetected = PETSC_FALSE;
1534 
1535   PetscFunctionBegin;
1536   /* ------- symbolic factorization, can be reused ---------*/
1537   PetscCall(MatMissingDiagonal(A, &missing, &i));
1538   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1539   adiag = a->diag;
1540 
1541   PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
1542 
1543   /* bdiag is location of diagonal in factor */
1544   PetscCall(PetscMalloc1(mbs + 1, &bdiag));
1545 
1546   /* allocate row pointers bi */
1547   PetscCall(PetscMalloc1(2 * mbs + 2, &bi));
1548 
1549   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1550   dtcount = (PetscInt)info->dtcount;
1551   if (dtcount > mbs - 1) dtcount = mbs - 1;
1552   nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1553   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1554   PetscCall(PetscMalloc1(nnz_max, &bj));
1555   nnz_max = nnz_max * bs2;
1556   PetscCall(PetscMalloc1(nnz_max, &ba));
1557 
1558   /* put together the new matrix */
1559   PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1560   PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)isicol));
1561 
1562   b               = (Mat_SeqBAIJ *)(B)->data;
1563   b->free_a       = PETSC_TRUE;
1564   b->free_ij      = PETSC_TRUE;
1565   b->singlemalloc = PETSC_FALSE;
1566 
1567   b->a    = ba;
1568   b->j    = bj;
1569   b->i    = bi;
1570   b->diag = bdiag;
1571   b->ilen = NULL;
1572   b->imax = NULL;
1573   b->row  = isrow;
1574   b->col  = iscol;
1575 
1576   PetscCall(PetscObjectReference((PetscObject)isrow));
1577   PetscCall(PetscObjectReference((PetscObject)iscol));
1578 
1579   b->icol = isicol;
1580   PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work));
1581   PetscCall(PetscLogObjectMemory((PetscObject)B, nnz_max * (sizeof(PetscInt) + sizeof(MatScalar))));
1582   b->maxnz = nnz_max / bs2;
1583 
1584   (B)->factortype            = MAT_FACTOR_ILUDT;
1585   (B)->info.factor_mallocs   = 0;
1586   (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1587   /* ------- end of symbolic factorization ---------*/
1588   PetscCall(ISGetIndices(isrow, &r));
1589   PetscCall(ISGetIndices(isicol, &ic));
1590 
1591   /* linked list for storing column indices of the active row */
1592   nlnk = mbs + 1;
1593   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1594 
1595   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1596   PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp));
1597   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1598   PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp));
1599   PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs));
1600   PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots));
1601 
1602   allowzeropivot  = PetscNot(A->erroriffailure);
1603   bi[0]           = 0;
1604   bdiag[0]        = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1605   bi[2 * mbs + 1] = bdiag[0] + 1;        /* endof bj and ba array */
1606   for (i = 0; i < mbs; i++) {
1607     /* copy initial fill into linked list */
1608     nzi = ai[r[i] + 1] - ai[r[i]];
1609     PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
1610     nzi_al = adiag[r[i]] - ai[r[i]];
1611     nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;
1612 
1613     /* load in initial unfactored row */
1614     ajtmp = aj + ai[r[i]];
1615     PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt));
1616     PetscCall(PetscArrayzero(rtmp, mbs * bs2));
1617     aatmp = a->a + bs2 * ai[r[i]];
1618     for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2));
1619 
1620     /* add pivot rows into linked list */
1621     row = lnk[mbs];
1622     while (row < i) {
1623       nzi_bl = bi[row + 1] - bi[row] + 1;
1624       bjtmp  = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1625       PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im));
1626       nzi += nlnk;
1627       row = lnk[row];
1628     }
1629 
1630     /* copy data from lnk into jtmp, then initialize lnk */
1631     PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt));
1632 
1633     /* numerical factorization */
1634     bjtmp = jtmp;
1635     row   = *bjtmp++; /* 1st pivot row */
1636 
1637     while (row < i) {
1638       pc = rtmp + bs2 * row;
1639       pv = ba + bs2 * bdiag[row];                           /* inv(diag) of the pivot row */
1640       PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1641       PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs));
1642       if (vtmp_abs[0] > dt) {         /* apply tolerance dropping rule */
1643         pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1644         pv = ba + bs2 * (bdiag[row + 1] + 1);
1645         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1646         for (j = 0; j < nz; j++) { PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); }
1647         /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */
1648       }
1649       row = *bjtmp++;
1650     }
1651 
1652     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1653     nzi_bl = 0;
1654     j      = 0;
1655     while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1656       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1657       nzi_bl++;
1658       j++;
1659     }
1660     nzi_bu = nzi - nzi_bl - 1;
1661 
1662     while (j < nzi) { /* U-part */
1663       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1664       j++;
1665     }
1666 
1667     PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs));
1668 
1669     bjtmp = bj + bi[i];
1670     batmp = ba + bs2 * bi[i];
1671     /* apply level dropping rule to L part */
1672     ncut  = nzi_al + dtcount;
1673     if (ncut < nzi_bl) {
1674       PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp));
1675       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp));
1676     } else {
1677       ncut = nzi_bl;
1678     }
1679     for (j = 0; j < ncut; j++) {
1680       bjtmp[j] = jtmp[j];
1681       PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2));
1682     }
1683     bi[i + 1] = bi[i] + ncut;
1684     nzi       = ncut + 1;
1685 
1686     /* apply level dropping rule to U part */
1687     ncut = nzi_au + dtcount;
1688     if (ncut < nzi_bu) {
1689       PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1));
1690       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1));
1691     } else {
1692       ncut = nzi_bu;
1693     }
1694     nzi += ncut;
1695 
1696     /* mark bdiagonal */
1697     bdiag[i + 1]    = bdiag[i] - (ncut + 1);
1698     bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);
1699 
1700     bjtmp = bj + bdiag[i];
1701     batmp = ba + bs2 * bdiag[i];
1702     PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2));
1703     *bjtmp = i;
1704 
1705     bjtmp = bj + bdiag[i + 1] + 1;
1706     batmp = ba + (bdiag[i + 1] + 1) * bs2;
1707 
1708     for (k = 0; k < ncut; k++) {
1709       bjtmp[k] = jtmp[nzi_bl + 1 + k];
1710       PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2));
1711     }
1712 
1713     im[i] = nzi; /* used by PetscLLAddSortedLU() */
1714 
1715     /* invert diagonal block for simpler triangular solves - add shift??? */
1716     batmp = ba + bs2 * bdiag[i];
1717 
1718     PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
1719     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1720   } /* for (i=0; i<mbs; i++) */
1721   PetscCall(PetscFree3(v_work, multiplier, v_pivots));
1722 
1723   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1724   PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]);
1725 
1726   PetscCall(ISRestoreIndices(isrow, &r));
1727   PetscCall(ISRestoreIndices(isicol, &ic));
1728 
1729   PetscCall(PetscLLDestroy(lnk, lnkbt));
1730 
1731   PetscCall(PetscFree2(im, jtmp));
1732   PetscCall(PetscFree2(rtmp, vtmp));
1733 
1734   PetscCall(PetscLogFlops(bs2 * B->cmap->n));
1735   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1736 
1737   PetscCall(ISIdentity(isrow, &row_identity));
1738   PetscCall(ISIdentity(isicol, &icol_identity));
1739   if (row_identity && icol_identity) {
1740     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1741   } else {
1742     B->ops->solve = MatSolve_SeqBAIJ_N;
1743   }
1744 
1745   B->ops->solveadd          = NULL;
1746   B->ops->solvetranspose    = NULL;
1747   B->ops->solvetransposeadd = NULL;
1748   B->ops->matsolve          = NULL;
1749   B->assembled              = PETSC_TRUE;
1750   B->preallocated           = PETSC_TRUE;
1751   PetscFunctionReturn(0);
1752 }
1753