xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision b6e6beb40ac0a73ccc19b70153df96d5058dce43)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <petsc/private/kernels/blockinvert.h>
7 
8 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
9 {
10   Mat             C = B;
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
12   IS              isrow = b->row, isicol = b->icol;
13   const PetscInt *r, *ic;
14   PetscInt        i, j, k, nz, nzL, row, *pj;
15   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
16   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
17   MatScalar      *rtmp, *pc, *mwork, *pv;
18   MatScalar      *aa = a->a, *v;
19   PetscInt        flg;
20   PetscReal       shift = info->shiftamount;
21   PetscBool       allowzeropivot, zeropivotdetected;
22 
23   PetscFunctionBegin;
24   PetscCall(ISGetIndices(isrow, &r));
25   PetscCall(ISGetIndices(isicol, &ic));
26   allowzeropivot = PetscNot(A->erroriffailure);
27 
28   /* generate work space needed by the factorization */
29   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
30   PetscCall(PetscArrayzero(rtmp, bs2 * n));
31 
32   for (i = 0; i < n; i++) {
33     /* zero rtmp */
34     /* L part */
35     nz    = bi[i + 1] - bi[i];
36     bjtmp = bj + bi[i];
37     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
38 
39     /* U part */
40     nz    = bdiag[i] - bdiag[i + 1];
41     bjtmp = bj + bdiag[i + 1] + 1;
42     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
43 
44     /* load in initial (unfactored row) */
45     nz    = ai[r[i] + 1] - ai[r[i]];
46     ajtmp = aj + ai[r[i]];
47     v     = aa + bs2 * ai[r[i]];
48     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
49 
50     /* elimination */
51     bjtmp = bj + bi[i];
52     nzL   = bi[i + 1] - bi[i];
53     for (k = 0; k < nzL; k++) {
54       row = bjtmp[k];
55       pc  = rtmp + bs2 * row;
56       for (flg = 0, j = 0; j < bs2; j++) {
57         if (pc[j] != (PetscScalar)0.0) {
58           flg = 1;
59           break;
60         }
61       }
62       if (flg) {
63         pv = b->a + bs2 * bdiag[row];
64         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
65         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
66 
67         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
68         pv = b->a + bs2 * (bdiag[row + 1] + 1);
69         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
70         for (j = 0; j < nz; j++) {
71           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
72           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
73           v = rtmp + 4 * pj[j];
74           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
75           pv += 4;
76         }
77         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
78       }
79     }
80 
81     /* finished row so stick it into b->a */
82     /* L part */
83     pv = b->a + bs2 * bi[i];
84     pj = b->j + bi[i];
85     nz = bi[i + 1] - bi[i];
86     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
87 
88     /* Mark diagonal and invert diagonal for simpler triangular solves */
89     pv = b->a + bs2 * bdiag[i];
90     pj = b->j + bdiag[i];
91     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
92     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
93     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
94 
95     /* U part */
96     pv = b->a + bs2 * (bdiag[i + 1] + 1);
97     pj = b->j + bdiag[i + 1] + 1;
98     nz = bdiag[i] - bdiag[i + 1] - 1;
99     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
100   }
101 
102   PetscCall(PetscFree2(rtmp, mwork));
103   PetscCall(ISRestoreIndices(isicol, &ic));
104   PetscCall(ISRestoreIndices(isrow, &r));
105 
106   C->ops->solve          = MatSolve_SeqBAIJ_2;
107   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
108   C->assembled           = PETSC_TRUE;
109 
110   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
111   PetscFunctionReturn(PETSC_SUCCESS);
112 }
113 
114 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
115 {
116   Mat             C = B;
117   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
118   PetscInt        i, j, k, nz, nzL, row, *pj;
119   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
120   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
121   MatScalar      *rtmp, *pc, *mwork, *pv;
122   MatScalar      *aa = a->a, *v;
123   PetscInt        flg;
124   PetscReal       shift = info->shiftamount;
125   PetscBool       allowzeropivot, zeropivotdetected;
126 
127   PetscFunctionBegin;
128   allowzeropivot = PetscNot(A->erroriffailure);
129 
130   /* generate work space needed by the factorization */
131   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
132   PetscCall(PetscArrayzero(rtmp, bs2 * n));
133 
134   for (i = 0; i < n; i++) {
135     /* zero rtmp */
136     /* L part */
137     nz    = bi[i + 1] - bi[i];
138     bjtmp = bj + bi[i];
139     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
140 
141     /* U part */
142     nz    = bdiag[i] - bdiag[i + 1];
143     bjtmp = bj + bdiag[i + 1] + 1;
144     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
145 
146     /* load in initial (unfactored row) */
147     nz    = ai[i + 1] - ai[i];
148     ajtmp = aj + ai[i];
149     v     = aa + bs2 * ai[i];
150     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
151 
152     /* elimination */
153     bjtmp = bj + bi[i];
154     nzL   = bi[i + 1] - bi[i];
155     for (k = 0; k < nzL; k++) {
156       row = bjtmp[k];
157       pc  = rtmp + bs2 * row;
158       for (flg = 0, j = 0; j < bs2; j++) {
159         if (pc[j] != (PetscScalar)0.0) {
160           flg = 1;
161           break;
162         }
163       }
164       if (flg) {
165         pv = b->a + bs2 * bdiag[row];
166         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
167         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
168 
169         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
170         pv = b->a + bs2 * (bdiag[row + 1] + 1);
171         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
172         for (j = 0; j < nz; j++) {
173           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
174           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
175           v = rtmp + 4 * pj[j];
176           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
177           pv += 4;
178         }
179         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
180       }
181     }
182 
183     /* finished row so stick it into b->a */
184     /* L part */
185     pv = b->a + bs2 * bi[i];
186     pj = b->j + bi[i];
187     nz = bi[i + 1] - bi[i];
188     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
189 
190     /* Mark diagonal and invert diagonal for simpler triangular solves */
191     pv = b->a + bs2 * bdiag[i];
192     pj = b->j + bdiag[i];
193     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
194     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
195     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
196 
197     /* U part */
198     /*
199     pv = b->a + bs2*bi[2*n-i];
200     pj = b->j + bi[2*n-i];
201     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
202     */
203     pv = b->a + bs2 * (bdiag[i + 1] + 1);
204     pj = b->j + bdiag[i + 1] + 1;
205     nz = bdiag[i] - bdiag[i + 1] - 1;
206     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
207   }
208   PetscCall(PetscFree2(rtmp, mwork));
209 
210   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
211   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
212   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
213   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
214   C->assembled           = PETSC_TRUE;
215 
216   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
217   PetscFunctionReturn(PETSC_SUCCESS);
218 }
219 
220 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
221 {
222   Mat             C = B;
223   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
224   IS              isrow = b->row, isicol = b->icol;
225   const PetscInt *r, *ic;
226   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
227   PetscInt       *ajtmpold, *ajtmp, nz, row;
228   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
229   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
230   MatScalar       p1, p2, p3, p4;
231   MatScalar      *ba = b->a, *aa = a->a;
232   PetscReal       shift = info->shiftamount;
233   PetscBool       allowzeropivot, zeropivotdetected;
234 
235   PetscFunctionBegin;
236   allowzeropivot = PetscNot(A->erroriffailure);
237   PetscCall(ISGetIndices(isrow, &r));
238   PetscCall(ISGetIndices(isicol, &ic));
239   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
240 
241   for (i = 0; i < n; i++) {
242     nz    = bi[i + 1] - bi[i];
243     ajtmp = bj + bi[i];
244     for (j = 0; j < nz; j++) {
245       x    = rtmp + 4 * ajtmp[j];
246       x[0] = x[1] = x[2] = x[3] = 0.0;
247     }
248     /* load in initial (unfactored row) */
249     idx      = r[i];
250     nz       = ai[idx + 1] - ai[idx];
251     ajtmpold = aj + ai[idx];
252     v        = aa + 4 * ai[idx];
253     for (j = 0; j < nz; j++) {
254       x    = rtmp + 4 * ic[ajtmpold[j]];
255       x[0] = v[0];
256       x[1] = v[1];
257       x[2] = v[2];
258       x[3] = v[3];
259       v += 4;
260     }
261     row = *ajtmp++;
262     while (row < i) {
263       pc = rtmp + 4 * row;
264       p1 = pc[0];
265       p2 = pc[1];
266       p3 = pc[2];
267       p4 = pc[3];
268       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
269         pv    = ba + 4 * diag_offset[row];
270         pj    = bj + diag_offset[row] + 1;
271         x1    = pv[0];
272         x2    = pv[1];
273         x3    = pv[2];
274         x4    = pv[3];
275         pc[0] = m1 = p1 * x1 + p3 * x2;
276         pc[1] = m2 = p2 * x1 + p4 * x2;
277         pc[2] = m3 = p1 * x3 + p3 * x4;
278         pc[3] = m4 = p2 * x3 + p4 * x4;
279         nz         = bi[row + 1] - diag_offset[row] - 1;
280         pv += 4;
281         for (j = 0; j < nz; j++) {
282           x1 = pv[0];
283           x2 = pv[1];
284           x3 = pv[2];
285           x4 = pv[3];
286           x  = rtmp + 4 * pj[j];
287           x[0] -= m1 * x1 + m3 * x2;
288           x[1] -= m2 * x1 + m4 * x2;
289           x[2] -= m1 * x3 + m3 * x4;
290           x[3] -= m2 * x3 + m4 * x4;
291           pv += 4;
292         }
293         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
294       }
295       row = *ajtmp++;
296     }
297     /* finished row so stick it into b->a */
298     pv = ba + 4 * bi[i];
299     pj = bj + bi[i];
300     nz = bi[i + 1] - bi[i];
301     for (j = 0; j < nz; j++) {
302       x     = rtmp + 4 * pj[j];
303       pv[0] = x[0];
304       pv[1] = x[1];
305       pv[2] = x[2];
306       pv[3] = x[3];
307       pv += 4;
308     }
309     /* invert diagonal block */
310     w = ba + 4 * diag_offset[i];
311     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
312     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
313   }
314 
315   PetscCall(PetscFree(rtmp));
316   PetscCall(ISRestoreIndices(isicol, &ic));
317   PetscCall(ISRestoreIndices(isrow, &r));
318 
319   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
320   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
321   C->assembled           = PETSC_TRUE;
322 
323   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
324   PetscFunctionReturn(PETSC_SUCCESS);
325 }
326 /*
327       Version for when blocks are 2 by 2 Using natural ordering
328 */
329 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
330 {
331   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
332   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
333   PetscInt    *ajtmpold, *ajtmp, nz, row;
334   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
335   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
336   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
337   MatScalar   *ba = b->a, *aa = a->a;
338   PetscReal    shift = info->shiftamount;
339   PetscBool    allowzeropivot, zeropivotdetected;
340 
341   PetscFunctionBegin;
342   allowzeropivot = PetscNot(A->erroriffailure);
343   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
344   for (i = 0; i < n; i++) {
345     nz    = bi[i + 1] - bi[i];
346     ajtmp = bj + bi[i];
347     for (j = 0; j < nz; j++) {
348       x    = rtmp + 4 * ajtmp[j];
349       x[0] = x[1] = x[2] = x[3] = 0.0;
350     }
351     /* load in initial (unfactored row) */
352     nz       = ai[i + 1] - ai[i];
353     ajtmpold = aj + ai[i];
354     v        = aa + 4 * ai[i];
355     for (j = 0; j < nz; j++) {
356       x    = rtmp + 4 * ajtmpold[j];
357       x[0] = v[0];
358       x[1] = v[1];
359       x[2] = v[2];
360       x[3] = v[3];
361       v += 4;
362     }
363     row = *ajtmp++;
364     while (row < i) {
365       pc = rtmp + 4 * row;
366       p1 = pc[0];
367       p2 = pc[1];
368       p3 = pc[2];
369       p4 = pc[3];
370       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
371         pv    = ba + 4 * diag_offset[row];
372         pj    = bj + diag_offset[row] + 1;
373         x1    = pv[0];
374         x2    = pv[1];
375         x3    = pv[2];
376         x4    = pv[3];
377         pc[0] = m1 = p1 * x1 + p3 * x2;
378         pc[1] = m2 = p2 * x1 + p4 * x2;
379         pc[2] = m3 = p1 * x3 + p3 * x4;
380         pc[3] = m4 = p2 * x3 + p4 * x4;
381         nz         = bi[row + 1] - diag_offset[row] - 1;
382         pv += 4;
383         for (j = 0; j < nz; j++) {
384           x1 = pv[0];
385           x2 = pv[1];
386           x3 = pv[2];
387           x4 = pv[3];
388           x  = rtmp + 4 * pj[j];
389           x[0] -= m1 * x1 + m3 * x2;
390           x[1] -= m2 * x1 + m4 * x2;
391           x[2] -= m1 * x3 + m3 * x4;
392           x[3] -= m2 * x3 + m4 * x4;
393           pv += 4;
394         }
395         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
396       }
397       row = *ajtmp++;
398     }
399     /* finished row so stick it into b->a */
400     pv = ba + 4 * bi[i];
401     pj = bj + bi[i];
402     nz = bi[i + 1] - bi[i];
403     for (j = 0; j < nz; j++) {
404       x     = rtmp + 4 * pj[j];
405       pv[0] = x[0];
406       pv[1] = x[1];
407       pv[2] = x[2];
408       pv[3] = x[3];
409       /*
410       printf(" col %d:",pj[j]);
411       PetscInt j1;
412       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
413       printf("\n");
414       */
415       pv += 4;
416     }
417     /* invert diagonal block */
418     w = ba + 4 * diag_offset[i];
419     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
420     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
421   }
422 
423   PetscCall(PetscFree(rtmp));
424 
425   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
426   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
427   C->assembled           = PETSC_TRUE;
428 
429   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
430   PetscFunctionReturn(PETSC_SUCCESS);
431 }
432 
433 /* ----------------------------------------------------------- */
434 /*
435      Version for when blocks are 1 by 1.
436 */
437 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
438 {
439   Mat              C = B;
440   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
441   IS               isrow = b->row, isicol = b->icol;
442   const PetscInt  *r, *ic, *ics;
443   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
444   PetscInt         i, j, k, nz, nzL, row, *pj;
445   const PetscInt  *ajtmp, *bjtmp;
446   MatScalar       *rtmp, *pc, multiplier, *pv;
447   const MatScalar *aa = a->a, *v;
448   PetscBool        row_identity, col_identity;
449   FactorShiftCtx   sctx;
450   const PetscInt  *ddiag;
451   PetscReal        rs;
452   MatScalar        d;
453 
454   PetscFunctionBegin;
455   /* MatPivotSetUp(): initialize shift context sctx */
456   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
457 
458   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
459     ddiag          = a->diag;
460     sctx.shift_top = info->zeropivot;
461     for (i = 0; i < n; i++) {
462       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
463       d  = (aa)[ddiag[i]];
464       rs = -PetscAbsScalar(d) - PetscRealPart(d);
465       v  = aa + ai[i];
466       nz = ai[i + 1] - ai[i];
467       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
468       if (rs > sctx.shift_top) sctx.shift_top = rs;
469     }
470     sctx.shift_top *= 1.1;
471     sctx.nshift_max = 5;
472     sctx.shift_lo   = 0.;
473     sctx.shift_hi   = 1.;
474   }
475 
476   PetscCall(ISGetIndices(isrow, &r));
477   PetscCall(ISGetIndices(isicol, &ic));
478   PetscCall(PetscMalloc1(n + 1, &rtmp));
479   ics = ic;
480 
481   do {
482     sctx.newshift = PETSC_FALSE;
483     for (i = 0; i < n; i++) {
484       /* zero rtmp */
485       /* L part */
486       nz    = bi[i + 1] - bi[i];
487       bjtmp = bj + bi[i];
488       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
489 
490       /* U part */
491       nz    = bdiag[i] - bdiag[i + 1];
492       bjtmp = bj + bdiag[i + 1] + 1;
493       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
494 
495       /* load in initial (unfactored row) */
496       nz    = ai[r[i] + 1] - ai[r[i]];
497       ajtmp = aj + ai[r[i]];
498       v     = aa + ai[r[i]];
499       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
500 
501       /* ZeropivotApply() */
502       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
503 
504       /* elimination */
505       bjtmp = bj + bi[i];
506       row   = *bjtmp++;
507       nzL   = bi[i + 1] - bi[i];
508       for (k = 0; k < nzL; k++) {
509         pc = rtmp + row;
510         if (*pc != (PetscScalar)0.0) {
511           pv         = b->a + bdiag[row];
512           multiplier = *pc * (*pv);
513           *pc        = multiplier;
514 
515           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
516           pv = b->a + bdiag[row + 1] + 1;
517           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
518           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
519           PetscCall(PetscLogFlops(2.0 * nz));
520         }
521         row = *bjtmp++;
522       }
523 
524       /* finished row so stick it into b->a */
525       rs = 0.0;
526       /* L part */
527       pv = b->a + bi[i];
528       pj = b->j + bi[i];
529       nz = bi[i + 1] - bi[i];
530       for (j = 0; j < nz; j++) {
531         pv[j] = rtmp[pj[j]];
532         rs += PetscAbsScalar(pv[j]);
533       }
534 
535       /* U part */
536       pv = b->a + bdiag[i + 1] + 1;
537       pj = b->j + bdiag[i + 1] + 1;
538       nz = bdiag[i] - bdiag[i + 1] - 1;
539       for (j = 0; j < nz; j++) {
540         pv[j] = rtmp[pj[j]];
541         rs += PetscAbsScalar(pv[j]);
542       }
543 
544       sctx.rs = rs;
545       sctx.pv = rtmp[i];
546       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
547       if (sctx.newshift) break; /* break for-loop */
548       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
549 
550       /* Mark diagonal and invert diagonal for simpler triangular solves */
551       pv  = b->a + bdiag[i];
552       *pv = (PetscScalar)1.0 / rtmp[i];
553 
554     } /* endof for (i=0; i<n; i++) { */
555 
556     /* MatPivotRefine() */
557     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
558       /*
559        * if no shift in this attempt & shifting & started shifting & can refine,
560        * then try lower shift
561        */
562       sctx.shift_hi       = sctx.shift_fraction;
563       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
564       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
565       sctx.newshift       = PETSC_TRUE;
566       sctx.nshift++;
567     }
568   } while (sctx.newshift);
569 
570   PetscCall(PetscFree(rtmp));
571   PetscCall(ISRestoreIndices(isicol, &ic));
572   PetscCall(ISRestoreIndices(isrow, &r));
573 
574   PetscCall(ISIdentity(isrow, &row_identity));
575   PetscCall(ISIdentity(isicol, &col_identity));
576   if (row_identity && col_identity) {
577     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
578     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
579     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
580     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
581   } else {
582     C->ops->solve          = MatSolve_SeqBAIJ_1;
583     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
584   }
585   C->assembled = PETSC_TRUE;
586   PetscCall(PetscLogFlops(C->cmap->n));
587 
588   /* MatShiftView(A,info,&sctx) */
589   if (sctx.nshift) {
590     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
591       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
592     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
593       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
594     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
595       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
596     }
597   }
598   PetscFunctionReturn(PETSC_SUCCESS);
599 }
600 
601 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
602 {
603   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
604   IS              isrow = b->row, isicol = b->icol;
605   const PetscInt *r, *ic;
606   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
607   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
608   PetscInt       *diag_offset = b->diag, diag, *pj;
609   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
610   MatScalar      *ba = b->a, *aa = a->a;
611   PetscBool       row_identity, col_identity;
612 
613   PetscFunctionBegin;
614   PetscCall(ISGetIndices(isrow, &r));
615   PetscCall(ISGetIndices(isicol, &ic));
616   PetscCall(PetscMalloc1(n + 1, &rtmp));
617 
618   for (i = 0; i < n; i++) {
619     nz    = bi[i + 1] - bi[i];
620     ajtmp = bj + bi[i];
621     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
622 
623     /* load in initial (unfactored row) */
624     nz       = ai[r[i] + 1] - ai[r[i]];
625     ajtmpold = aj + ai[r[i]];
626     v        = aa + ai[r[i]];
627     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
628 
629     row = *ajtmp++;
630     while (row < i) {
631       pc = rtmp + row;
632       if (*pc != 0.0) {
633         pv         = ba + diag_offset[row];
634         pj         = bj + diag_offset[row] + 1;
635         multiplier = *pc * *pv++;
636         *pc        = multiplier;
637         nz         = bi[row + 1] - diag_offset[row] - 1;
638         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
639         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
640       }
641       row = *ajtmp++;
642     }
643     /* finished row so stick it into b->a */
644     pv = ba + bi[i];
645     pj = bj + bi[i];
646     nz = bi[i + 1] - bi[i];
647     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
648     diag = diag_offset[i] - bi[i];
649     /* check pivot entry for current row */
650     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
651     pv[diag] = 1.0 / pv[diag];
652   }
653 
654   PetscCall(PetscFree(rtmp));
655   PetscCall(ISRestoreIndices(isicol, &ic));
656   PetscCall(ISRestoreIndices(isrow, &r));
657   PetscCall(ISIdentity(isrow, &row_identity));
658   PetscCall(ISIdentity(isicol, &col_identity));
659   if (row_identity && col_identity) {
660     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
661     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
662   } else {
663     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
664     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
665   }
666   C->assembled = PETSC_TRUE;
667   PetscCall(PetscLogFlops(C->cmap->n));
668   PetscFunctionReturn(PETSC_SUCCESS);
669 }
670 
671 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
672 {
673   PetscFunctionBegin;
674   *type = MATSOLVERPETSC;
675   PetscFunctionReturn(PETSC_SUCCESS);
676 }
677 
678 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
679 {
680   PetscInt n = A->rmap->n;
681 
682   PetscFunctionBegin;
683 #if defined(PETSC_USE_COMPLEX)
684   PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
685 #endif
686   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
687   PetscCall(MatSetSizes(*B, n, n, n, n));
688   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
689     PetscCall(MatSetType(*B, MATSEQBAIJ));
690 
691     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
692     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
693     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
694     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
695     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
696   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
697     PetscCall(MatSetType(*B, MATSEQSBAIJ));
698     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
699 
700     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
701     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
702     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
703     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
704     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
705   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
706   (*B)->factortype     = ftype;
707   (*B)->canuseordering = PETSC_TRUE;
708 
709   PetscCall(PetscFree((*B)->solvertype));
710   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
711   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
712   PetscFunctionReturn(PETSC_SUCCESS);
713 }
714 
715 /* ----------------------------------------------------------- */
716 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
717 {
718   Mat C;
719 
720   PetscFunctionBegin;
721   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
722   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
723   PetscCall(MatLUFactorNumeric(C, A, info));
724 
725   A->ops->solve          = C->ops->solve;
726   A->ops->solvetranspose = C->ops->solvetranspose;
727 
728   PetscCall(MatHeaderMerge(A, &C));
729   PetscFunctionReturn(PETSC_SUCCESS);
730 }
731 
732 #include <../src/mat/impls/sbaij/seq/sbaij.h>
733 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
734 {
735   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
736   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
737   IS              ip = b->row;
738   const PetscInt *rip;
739   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
740   PetscInt       *ai = a->i, *aj = a->j;
741   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
742   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
743   PetscReal       rs;
744   FactorShiftCtx  sctx;
745 
746   PetscFunctionBegin;
747   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
748     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
749     PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info));
750     PetscCall(MatDestroy(&a->sbaijMat));
751     PetscFunctionReturn(PETSC_SUCCESS);
752   }
753 
754   /* MatPivotSetUp(): initialize shift context sctx */
755   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
756 
757   PetscCall(ISGetIndices(ip, &rip));
758   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
759 
760   sctx.shift_amount = 0.;
761   sctx.nshift       = 0;
762   do {
763     sctx.newshift = PETSC_FALSE;
764     for (i = 0; i < mbs; i++) {
765       rtmp[i] = 0.0;
766       jl[i]   = mbs;
767       il[0]   = 0;
768     }
769 
770     for (k = 0; k < mbs; k++) {
771       bval = ba + bi[k];
772       /* initialize k-th row by the perm[k]-th row of A */
773       jmin = ai[rip[k]];
774       jmax = ai[rip[k] + 1];
775       for (j = jmin; j < jmax; j++) {
776         col = rip[aj[j]];
777         if (col >= k) { /* only take upper triangular entry */
778           rtmp[col] = aa[j];
779           *bval++   = 0.0; /* for in-place factorization */
780         }
781       }
782 
783       /* shift the diagonal of the matrix */
784       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
785 
786       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
787       dk = rtmp[k];
788       i  = jl[k]; /* first row to be added to k_th row  */
789 
790       while (i < k) {
791         nexti = jl[i]; /* next row to be added to k_th row */
792 
793         /* compute multiplier, update diag(k) and U(i,k) */
794         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
795         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
796         dk += uikdi * ba[ili];
797         ba[ili] = uikdi; /* -U(i,k) */
798 
799         /* add multiple of row i to k-th row */
800         jmin = ili + 1;
801         jmax = bi[i + 1];
802         if (jmin < jmax) {
803           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
804           /* update il and jl for row i */
805           il[i] = jmin;
806           j     = bj[jmin];
807           jl[i] = jl[j];
808           jl[j] = i;
809         }
810         i = nexti;
811       }
812 
813       /* shift the diagonals when zero pivot is detected */
814       /* compute rs=sum of abs(off-diagonal) */
815       rs   = 0.0;
816       jmin = bi[k] + 1;
817       nz   = bi[k + 1] - jmin;
818       if (nz) {
819         bcol = bj + jmin;
820         while (nz--) {
821           rs += PetscAbsScalar(rtmp[*bcol]);
822           bcol++;
823         }
824       }
825 
826       sctx.rs = rs;
827       sctx.pv = dk;
828       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
829       if (sctx.newshift) break;
830       dk = sctx.pv;
831 
832       /* copy data into U(k,:) */
833       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
834       jmin      = bi[k] + 1;
835       jmax      = bi[k + 1];
836       if (jmin < jmax) {
837         for (j = jmin; j < jmax; j++) {
838           col       = bj[j];
839           ba[j]     = rtmp[col];
840           rtmp[col] = 0.0;
841         }
842         /* add the k-th row into il and jl */
843         il[k] = jmin;
844         i     = bj[jmin];
845         jl[k] = jl[i];
846         jl[i] = k;
847       }
848     }
849   } while (sctx.newshift);
850   PetscCall(PetscFree3(rtmp, il, jl));
851 
852   PetscCall(ISRestoreIndices(ip, &rip));
853 
854   C->assembled    = PETSC_TRUE;
855   C->preallocated = PETSC_TRUE;
856 
857   PetscCall(PetscLogFlops(C->rmap->N));
858   if (sctx.nshift) {
859     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
860       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
861     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
862       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
863     }
864   }
865   PetscFunctionReturn(PETSC_SUCCESS);
866 }
867 
868 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
869 {
870   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
871   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
872   PetscInt       i, j, am = a->mbs;
873   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
874   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
875   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
876   PetscReal      rs;
877   FactorShiftCtx sctx;
878 
879   PetscFunctionBegin;
880   /* MatPivotSetUp(): initialize shift context sctx */
881   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
882 
883   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
884 
885   do {
886     sctx.newshift = PETSC_FALSE;
887     for (i = 0; i < am; i++) {
888       rtmp[i] = 0.0;
889       jl[i]   = am;
890       il[0]   = 0;
891     }
892 
893     for (k = 0; k < am; k++) {
894       /* initialize k-th row with elements nonzero in row perm(k) of A */
895       nz   = ai[k + 1] - ai[k];
896       acol = aj + ai[k];
897       aval = aa + ai[k];
898       bval = ba + bi[k];
899       while (nz--) {
900         if (*acol < k) { /* skip lower triangular entries */
901           acol++;
902           aval++;
903         } else {
904           rtmp[*acol++] = *aval++;
905           *bval++       = 0.0; /* for in-place factorization */
906         }
907       }
908 
909       /* shift the diagonal of the matrix */
910       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
911 
912       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
913       dk = rtmp[k];
914       i  = jl[k]; /* first row to be added to k_th row  */
915 
916       while (i < k) {
917         nexti = jl[i]; /* next row to be added to k_th row */
918         /* compute multiplier, update D(k) and U(i,k) */
919         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
920         uikdi = -ba[ili] * ba[bi[i]];
921         dk += uikdi * ba[ili];
922         ba[ili] = uikdi; /* -U(i,k) */
923 
924         /* add multiple of row i to k-th row ... */
925         jmin = ili + 1;
926         nz   = bi[i + 1] - jmin;
927         if (nz > 0) {
928           bcol = bj + jmin;
929           bval = ba + jmin;
930           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
931           /* update il and jl for i-th row */
932           il[i] = jmin;
933           j     = bj[jmin];
934           jl[i] = jl[j];
935           jl[j] = i;
936         }
937         i = nexti;
938       }
939 
940       /* shift the diagonals when zero pivot is detected */
941       /* compute rs=sum of abs(off-diagonal) */
942       rs   = 0.0;
943       jmin = bi[k] + 1;
944       nz   = bi[k + 1] - jmin;
945       if (nz) {
946         bcol = bj + jmin;
947         while (nz--) {
948           rs += PetscAbsScalar(rtmp[*bcol]);
949           bcol++;
950         }
951       }
952 
953       sctx.rs = rs;
954       sctx.pv = dk;
955       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
956       if (sctx.newshift) break; /* sctx.shift_amount is updated */
957       dk = sctx.pv;
958 
959       /* copy data into U(k,:) */
960       ba[bi[k]] = 1.0 / dk;
961       jmin      = bi[k] + 1;
962       nz        = bi[k + 1] - jmin;
963       if (nz) {
964         bcol = bj + jmin;
965         bval = ba + jmin;
966         while (nz--) {
967           *bval++       = rtmp[*bcol];
968           rtmp[*bcol++] = 0.0;
969         }
970         /* add k-th row into il and jl */
971         il[k] = jmin;
972         i     = bj[jmin];
973         jl[k] = jl[i];
974         jl[i] = k;
975       }
976     }
977   } while (sctx.newshift);
978   PetscCall(PetscFree3(rtmp, il, jl));
979 
980   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
981   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
982   C->assembled           = PETSC_TRUE;
983   C->preallocated        = PETSC_TRUE;
984 
985   PetscCall(PetscLogFlops(C->rmap->N));
986   if (sctx.nshift) {
987     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
988       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
989     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
990       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
991     }
992   }
993   PetscFunctionReturn(PETSC_SUCCESS);
994 }
995 
996 #include <petscbt.h>
997 #include <../src/mat/utils/freespace.h>
998 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
999 {
1000   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1001   Mat_SeqSBAIJ      *b;
1002   Mat                B;
1003   PetscBool          perm_identity, missing;
1004   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1005   const PetscInt    *rip;
1006   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1007   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1008   PetscReal          fill = info->fill, levels = info->levels;
1009   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1010   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1011   PetscBT            lnkbt;
1012 
1013   PetscFunctionBegin;
1014   PetscCall(MatMissingDiagonal(A, &missing, &i));
1015   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1016 
1017   if (bs > 1) {
1018     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1019     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1020 
1021     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1022     PetscFunctionReturn(PETSC_SUCCESS);
1023   }
1024 
1025   PetscCall(ISIdentity(perm, &perm_identity));
1026   PetscCall(ISGetIndices(perm, &rip));
1027 
1028   /* special case that simply copies fill pattern */
1029   if (!levels && perm_identity) {
1030     PetscCall(PetscMalloc1(am + 1, &ui));
1031     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1032     B = fact;
1033     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1034 
1035     b  = (Mat_SeqSBAIJ *)B->data;
1036     uj = b->j;
1037     for (i = 0; i < am; i++) {
1038       aj = a->j + a->diag[i];
1039       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1040       b->ilen[i] = ui[i];
1041     }
1042     PetscCall(PetscFree(ui));
1043 
1044     B->factortype = MAT_FACTOR_NONE;
1045 
1046     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1047     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1048     B->factortype = MAT_FACTOR_ICC;
1049 
1050     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1051     PetscFunctionReturn(PETSC_SUCCESS);
1052   }
1053 
1054   /* initialization */
1055   PetscCall(PetscMalloc1(am + 1, &ui));
1056   ui[0] = 0;
1057   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1058 
1059   /* jl: linked list for storing indices of the pivot rows
1060      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1061   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1062   for (i = 0; i < am; i++) {
1063     jl[i] = am;
1064     il[i] = 0;
1065   }
1066 
1067   /* create and initialize a linked list for storing column indices of the active row k */
1068   nlnk = am + 1;
1069   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1070 
1071   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1072   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1073 
1074   current_space = free_space;
1075 
1076   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1077   current_space_lvl = free_space_lvl;
1078 
1079   for (k = 0; k < am; k++) { /* for each active row k */
1080     /* initialize lnk by the column indices of row rip[k] of A */
1081     nzk         = 0;
1082     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1083     ncols_upper = 0;
1084     cols        = cols_lvl + am;
1085     for (j = 0; j < ncols; j++) {
1086       i = rip[*(aj + ai[rip[k]] + j)];
1087       if (i >= k) { /* only take upper triangular entry */
1088         cols[ncols_upper]     = i;
1089         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1090         ncols_upper++;
1091       }
1092     }
1093     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1094     nzk += nlnk;
1095 
1096     /* update lnk by computing fill-in for each pivot row to be merged in */
1097     prow = jl[k]; /* 1st pivot row */
1098 
1099     while (prow < k) {
1100       nextprow = jl[prow];
1101 
1102       /* merge prow into k-th row */
1103       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1104       jmax  = ui[prow + 1];
1105       ncols = jmax - jmin;
1106       i     = jmin - ui[prow];
1107       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1108       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1109       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1110       nzk += nlnk;
1111 
1112       /* update il and jl for prow */
1113       if (jmin < jmax) {
1114         il[prow] = jmin;
1115 
1116         j        = *cols;
1117         jl[prow] = jl[j];
1118         jl[j]    = prow;
1119       }
1120       prow = nextprow;
1121     }
1122 
1123     /* if free space is not available, make more free space */
1124     if (current_space->local_remaining < nzk) {
1125       i = am - k + 1;                                                             /* num of unfactored rows */
1126       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1127       PetscCall(PetscFreeSpaceGet(i, &current_space));
1128       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1129       reallocs++;
1130     }
1131 
1132     /* copy data into free_space and free_space_lvl, then initialize lnk */
1133     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1134 
1135     /* add the k-th row into il and jl */
1136     if (nzk - 1 > 0) {
1137       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1138       jl[k] = jl[i];
1139       jl[i] = k;
1140       il[k] = ui[k] + 1;
1141     }
1142     uj_ptr[k]     = current_space->array;
1143     uj_lvl_ptr[k] = current_space_lvl->array;
1144 
1145     current_space->array += nzk;
1146     current_space->local_used += nzk;
1147     current_space->local_remaining -= nzk;
1148 
1149     current_space_lvl->array += nzk;
1150     current_space_lvl->local_used += nzk;
1151     current_space_lvl->local_remaining -= nzk;
1152 
1153     ui[k + 1] = ui[k] + nzk;
1154   }
1155 
1156   PetscCall(ISRestoreIndices(perm, &rip));
1157   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1158   PetscCall(PetscFree(cols_lvl));
1159 
1160   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1161   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1162   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1163   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1164   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1165 
1166   /* put together the new matrix in MATSEQSBAIJ format */
1167   B = fact;
1168   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1169 
1170   b               = (Mat_SeqSBAIJ *)B->data;
1171   b->singlemalloc = PETSC_FALSE;
1172   b->free_a       = PETSC_TRUE;
1173   b->free_ij      = PETSC_TRUE;
1174 
1175   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1176 
1177   b->j             = uj;
1178   b->i             = ui;
1179   b->diag          = NULL;
1180   b->ilen          = NULL;
1181   b->imax          = NULL;
1182   b->row           = perm;
1183   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1184 
1185   PetscCall(PetscObjectReference((PetscObject)perm));
1186 
1187   b->icol = perm;
1188 
1189   PetscCall(PetscObjectReference((PetscObject)perm));
1190   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1191 
1192   b->maxnz = b->nz = ui[am];
1193 
1194   B->info.factor_mallocs   = reallocs;
1195   B->info.fill_ratio_given = fill;
1196   if (ai[am] != 0.) {
1197     /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1198     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1199   } else {
1200     B->info.fill_ratio_needed = 0.0;
1201   }
1202 #if defined(PETSC_USE_INFO)
1203   if (ai[am] != 0) {
1204     PetscReal af = B->info.fill_ratio_needed;
1205     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1206     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1207     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1208   } else {
1209     PetscCall(PetscInfo(A, "Empty matrix\n"));
1210   }
1211 #endif
1212   if (perm_identity) {
1213     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1214     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1215     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1216   } else {
1217     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1218   }
1219   PetscFunctionReturn(PETSC_SUCCESS);
1220 }
1221 
1222 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1223 {
1224   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1225   Mat_SeqSBAIJ      *b;
1226   Mat                B;
1227   PetscBool          perm_identity, missing;
1228   PetscReal          fill = info->fill;
1229   const PetscInt    *rip;
1230   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1231   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1232   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1233   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1234   PetscBT            lnkbt;
1235 
1236   PetscFunctionBegin;
1237   if (bs > 1) { /* convert to seqsbaij */
1238     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1239     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1240 
1241     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1242     PetscFunctionReturn(PETSC_SUCCESS);
1243   }
1244 
1245   PetscCall(MatMissingDiagonal(A, &missing, &i));
1246   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1247 
1248   /* check whether perm is the identity mapping */
1249   PetscCall(ISIdentity(perm, &perm_identity));
1250   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1251   PetscCall(ISGetIndices(perm, &rip));
1252 
1253   /* initialization */
1254   PetscCall(PetscMalloc1(mbs + 1, &ui));
1255   ui[0] = 0;
1256 
1257   /* jl: linked list for storing indices of the pivot rows
1258      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1259   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1260   for (i = 0; i < mbs; i++) {
1261     jl[i] = mbs;
1262     il[i] = 0;
1263   }
1264 
1265   /* create and initialize a linked list for storing column indices of the active row k */
1266   nlnk = mbs + 1;
1267   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1268 
1269   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1270   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1271 
1272   current_space = free_space;
1273 
1274   for (k = 0; k < mbs; k++) { /* for each active row k */
1275     /* initialize lnk by the column indices of row rip[k] of A */
1276     nzk         = 0;
1277     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1278     ncols_upper = 0;
1279     for (j = 0; j < ncols; j++) {
1280       i = rip[*(aj + ai[rip[k]] + j)];
1281       if (i >= k) { /* only take upper triangular entry */
1282         cols[ncols_upper] = i;
1283         ncols_upper++;
1284       }
1285     }
1286     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1287     nzk += nlnk;
1288 
1289     /* update lnk by computing fill-in for each pivot row to be merged in */
1290     prow = jl[k]; /* 1st pivot row */
1291 
1292     while (prow < k) {
1293       nextprow = jl[prow];
1294       /* merge prow into k-th row */
1295       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1296       jmax   = ui[prow + 1];
1297       ncols  = jmax - jmin;
1298       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1299       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1300       nzk += nlnk;
1301 
1302       /* update il and jl for prow */
1303       if (jmin < jmax) {
1304         il[prow] = jmin;
1305         j        = *uj_ptr;
1306         jl[prow] = jl[j];
1307         jl[j]    = prow;
1308       }
1309       prow = nextprow;
1310     }
1311 
1312     /* if free space is not available, make more free space */
1313     if (current_space->local_remaining < nzk) {
1314       i = mbs - k + 1;                                                            /* num of unfactored rows */
1315       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1316       PetscCall(PetscFreeSpaceGet(i, &current_space));
1317       reallocs++;
1318     }
1319 
1320     /* copy data into free space, then initialize lnk */
1321     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1322 
1323     /* add the k-th row into il and jl */
1324     if (nzk - 1 > 0) {
1325       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1326       jl[k] = jl[i];
1327       jl[i] = k;
1328       il[k] = ui[k] + 1;
1329     }
1330     ui_ptr[k] = current_space->array;
1331     current_space->array += nzk;
1332     current_space->local_used += nzk;
1333     current_space->local_remaining -= nzk;
1334 
1335     ui[k + 1] = ui[k] + nzk;
1336   }
1337 
1338   PetscCall(ISRestoreIndices(perm, &rip));
1339   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1340 
1341   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1342   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1343   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1344   PetscCall(PetscLLDestroy(lnk, lnkbt));
1345 
1346   /* put together the new matrix in MATSEQSBAIJ format */
1347   B = fact;
1348   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1349 
1350   b               = (Mat_SeqSBAIJ *)B->data;
1351   b->singlemalloc = PETSC_FALSE;
1352   b->free_a       = PETSC_TRUE;
1353   b->free_ij      = PETSC_TRUE;
1354 
1355   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1356 
1357   b->j             = uj;
1358   b->i             = ui;
1359   b->diag          = NULL;
1360   b->ilen          = NULL;
1361   b->imax          = NULL;
1362   b->row           = perm;
1363   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1364 
1365   PetscCall(PetscObjectReference((PetscObject)perm));
1366   b->icol = perm;
1367   PetscCall(PetscObjectReference((PetscObject)perm));
1368   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1369   b->maxnz = b->nz = ui[mbs];
1370 
1371   B->info.factor_mallocs   = reallocs;
1372   B->info.fill_ratio_given = fill;
1373   if (ai[mbs] != 0.) {
1374     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1375     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1376   } else {
1377     B->info.fill_ratio_needed = 0.0;
1378   }
1379 #if defined(PETSC_USE_INFO)
1380   if (ai[mbs] != 0.) {
1381     PetscReal af = B->info.fill_ratio_needed;
1382     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1383     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1384     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1385   } else {
1386     PetscCall(PetscInfo(A, "Empty matrix\n"));
1387   }
1388 #endif
1389   if (perm_identity) {
1390     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1391   } else {
1392     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1393   }
1394   PetscFunctionReturn(PETSC_SUCCESS);
1395 }
1396 
1397 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1398 {
1399   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1400   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1401   PetscInt           i, k, n                       = a->mbs;
1402   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1403   const MatScalar   *aa = a->a, *v;
1404   PetscScalar       *x, *s, *t, *ls;
1405   const PetscScalar *b;
1406 
1407   PetscFunctionBegin;
1408   PetscCall(VecGetArrayRead(bb, &b));
1409   PetscCall(VecGetArray(xx, &x));
1410   t = a->solve_work;
1411 
1412   /* forward solve the lower triangular */
1413   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1414 
1415   for (i = 1; i < n; i++) {
1416     v  = aa + bs2 * ai[i];
1417     vi = aj + ai[i];
1418     nz = ai[i + 1] - ai[i];
1419     s  = t + bs * i;
1420     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1421     for (k = 0; k < nz; k++) {
1422       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1423       v += bs2;
1424     }
1425   }
1426 
1427   /* backward solve the upper triangular */
1428   ls = a->solve_work + A->cmap->n;
1429   for (i = n - 1; i >= 0; i--) {
1430     v  = aa + bs2 * (adiag[i + 1] + 1);
1431     vi = aj + adiag[i + 1] + 1;
1432     nz = adiag[i] - adiag[i + 1] - 1;
1433     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1434     for (k = 0; k < nz; k++) {
1435       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1436       v += bs2;
1437     }
1438     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1439     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1440   }
1441 
1442   PetscCall(VecRestoreArrayRead(bb, &b));
1443   PetscCall(VecRestoreArray(xx, &x));
1444   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1445   PetscFunctionReturn(PETSC_SUCCESS);
1446 }
1447 
1448 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1449 {
1450   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1451   IS                 iscol = a->col, isrow = a->row;
1452   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1453   PetscInt           i, m, n = a->mbs;
1454   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1455   const MatScalar   *aa = a->a, *v;
1456   PetscScalar       *x, *s, *t, *ls;
1457   const PetscScalar *b;
1458 
1459   PetscFunctionBegin;
1460   PetscCall(VecGetArrayRead(bb, &b));
1461   PetscCall(VecGetArray(xx, &x));
1462   t = a->solve_work;
1463 
1464   PetscCall(ISGetIndices(isrow, &rout));
1465   r = rout;
1466   PetscCall(ISGetIndices(iscol, &cout));
1467   c = cout;
1468 
1469   /* forward solve the lower triangular */
1470   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1471   for (i = 1; i < n; i++) {
1472     v  = aa + bs2 * ai[i];
1473     vi = aj + ai[i];
1474     nz = ai[i + 1] - ai[i];
1475     s  = t + bs * i;
1476     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1477     for (m = 0; m < nz; m++) {
1478       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1479       v += bs2;
1480     }
1481   }
1482 
1483   /* backward solve the upper triangular */
1484   ls = a->solve_work + A->cmap->n;
1485   for (i = n - 1; i >= 0; i--) {
1486     v  = aa + bs2 * (adiag[i + 1] + 1);
1487     vi = aj + adiag[i + 1] + 1;
1488     nz = adiag[i] - adiag[i + 1] - 1;
1489     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1490     for (m = 0; m < nz; m++) {
1491       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1492       v += bs2;
1493     }
1494     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1495     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1496   }
1497   PetscCall(ISRestoreIndices(isrow, &rout));
1498   PetscCall(ISRestoreIndices(iscol, &cout));
1499   PetscCall(VecRestoreArrayRead(bb, &b));
1500   PetscCall(VecRestoreArray(xx, &x));
1501   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1502   PetscFunctionReturn(PETSC_SUCCESS);
1503 }
1504 
1505 /*
1506     For each block in an block array saves the largest absolute value in the block into another array
1507 */
1508 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray)
1509 {
1510   PetscInt i, j;
1511 
1512   PetscFunctionBegin;
1513   PetscCall(PetscArrayzero(absarray, nbs + 1));
1514   for (i = 0; i < nbs; i++) {
1515     for (j = 0; j < bs2; j++) {
1516       if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1517     }
1518   }
1519   PetscFunctionReturn(PETSC_SUCCESS);
1520 }
1521 
1522 /*
1523      This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1524 */
1525 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact)
1526 {
1527   Mat             B = *fact;
1528   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b;
1529   IS              isicol;
1530   const PetscInt *r, *ic;
1531   PetscInt        i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1532   PetscInt       *bi, *bj, *bdiag;
1533 
1534   PetscInt   row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1535   PetscInt   nlnk, *lnk;
1536   PetscBT    lnkbt;
1537   PetscBool  row_identity, icol_identity;
1538   MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1539   PetscInt   j, nz, *pj, *bjtmp, k, ncut, *jtmp;
1540 
1541   PetscReal  dt = info->dt; /* shift=info->shiftamount; */
1542   PetscInt   nnz_max;
1543   PetscBool  missing;
1544   PetscReal *vtmp_abs;
1545   MatScalar *v_work;
1546   PetscInt  *v_pivots;
1547   PetscBool  allowzeropivot, zeropivotdetected = PETSC_FALSE;
1548 
1549   PetscFunctionBegin;
1550   /* ------- symbolic factorization, can be reused ---------*/
1551   PetscCall(MatMissingDiagonal(A, &missing, &i));
1552   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1553   adiag = a->diag;
1554 
1555   PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
1556 
1557   /* bdiag is location of diagonal in factor */
1558   PetscCall(PetscMalloc1(mbs + 1, &bdiag));
1559 
1560   /* allocate row pointers bi */
1561   PetscCall(PetscMalloc1(2 * mbs + 2, &bi));
1562 
1563   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1564   dtcount = (PetscInt)info->dtcount;
1565   if (dtcount > mbs - 1) dtcount = mbs - 1;
1566   nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1567   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1568   PetscCall(PetscMalloc1(nnz_max, &bj));
1569   nnz_max = nnz_max * bs2;
1570   PetscCall(PetscMalloc1(nnz_max, &ba));
1571 
1572   /* put together the new matrix */
1573   PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1574 
1575   b               = (Mat_SeqBAIJ *)(B)->data;
1576   b->free_a       = PETSC_TRUE;
1577   b->free_ij      = PETSC_TRUE;
1578   b->singlemalloc = PETSC_FALSE;
1579 
1580   b->a    = ba;
1581   b->j    = bj;
1582   b->i    = bi;
1583   b->diag = bdiag;
1584   b->ilen = NULL;
1585   b->imax = NULL;
1586   b->row  = isrow;
1587   b->col  = iscol;
1588 
1589   PetscCall(PetscObjectReference((PetscObject)isrow));
1590   PetscCall(PetscObjectReference((PetscObject)iscol));
1591 
1592   b->icol = isicol;
1593   PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work));
1594   b->maxnz = nnz_max / bs2;
1595 
1596   (B)->factortype            = MAT_FACTOR_ILUDT;
1597   (B)->info.factor_mallocs   = 0;
1598   (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1599   /* ------- end of symbolic factorization ---------*/
1600   PetscCall(ISGetIndices(isrow, &r));
1601   PetscCall(ISGetIndices(isicol, &ic));
1602 
1603   /* linked list for storing column indices of the active row */
1604   nlnk = mbs + 1;
1605   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1606 
1607   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1608   PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp));
1609   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1610   PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp));
1611   PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs));
1612   PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots));
1613 
1614   allowzeropivot  = PetscNot(A->erroriffailure);
1615   bi[0]           = 0;
1616   bdiag[0]        = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1617   bi[2 * mbs + 1] = bdiag[0] + 1;        /* endof bj and ba array */
1618   for (i = 0; i < mbs; i++) {
1619     /* copy initial fill into linked list */
1620     nzi = ai[r[i] + 1] - ai[r[i]];
1621     PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
1622     nzi_al = adiag[r[i]] - ai[r[i]];
1623     nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;
1624 
1625     /* load in initial unfactored row */
1626     ajtmp = aj + ai[r[i]];
1627     PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt));
1628     PetscCall(PetscArrayzero(rtmp, mbs * bs2));
1629     aatmp = a->a + bs2 * ai[r[i]];
1630     for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2));
1631 
1632     /* add pivot rows into linked list */
1633     row = lnk[mbs];
1634     while (row < i) {
1635       nzi_bl = bi[row + 1] - bi[row] + 1;
1636       bjtmp  = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1637       PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im));
1638       nzi += nlnk;
1639       row = lnk[row];
1640     }
1641 
1642     /* copy data from lnk into jtmp, then initialize lnk */
1643     PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt));
1644 
1645     /* numerical factorization */
1646     bjtmp = jtmp;
1647     row   = *bjtmp++; /* 1st pivot row */
1648 
1649     while (row < i) {
1650       pc = rtmp + bs2 * row;
1651       pv = ba + bs2 * bdiag[row];                           /* inv(diag) of the pivot row */
1652       PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1653       PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs));
1654       if (vtmp_abs[0] > dt) {         /* apply tolerance dropping rule */
1655         pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1656         pv = ba + bs2 * (bdiag[row + 1] + 1);
1657         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1658         for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
1659         /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */
1660       }
1661       row = *bjtmp++;
1662     }
1663 
1664     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1665     nzi_bl = 0;
1666     j      = 0;
1667     while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1668       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1669       nzi_bl++;
1670       j++;
1671     }
1672     nzi_bu = nzi - nzi_bl - 1;
1673 
1674     while (j < nzi) { /* U-part */
1675       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1676       j++;
1677     }
1678 
1679     PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs));
1680 
1681     bjtmp = bj + bi[i];
1682     batmp = ba + bs2 * bi[i];
1683     /* apply level dropping rule to L part */
1684     ncut = nzi_al + dtcount;
1685     if (ncut < nzi_bl) {
1686       PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp));
1687       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp));
1688     } else {
1689       ncut = nzi_bl;
1690     }
1691     for (j = 0; j < ncut; j++) {
1692       bjtmp[j] = jtmp[j];
1693       PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2));
1694     }
1695     bi[i + 1] = bi[i] + ncut;
1696     nzi       = ncut + 1;
1697 
1698     /* apply level dropping rule to U part */
1699     ncut = nzi_au + dtcount;
1700     if (ncut < nzi_bu) {
1701       PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1));
1702       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1));
1703     } else {
1704       ncut = nzi_bu;
1705     }
1706     nzi += ncut;
1707 
1708     /* mark bdiagonal */
1709     bdiag[i + 1]    = bdiag[i] - (ncut + 1);
1710     bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);
1711 
1712     bjtmp = bj + bdiag[i];
1713     batmp = ba + bs2 * bdiag[i];
1714     PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2));
1715     *bjtmp = i;
1716 
1717     bjtmp = bj + bdiag[i + 1] + 1;
1718     batmp = ba + (bdiag[i + 1] + 1) * bs2;
1719 
1720     for (k = 0; k < ncut; k++) {
1721       bjtmp[k] = jtmp[nzi_bl + 1 + k];
1722       PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2));
1723     }
1724 
1725     im[i] = nzi; /* used by PetscLLAddSortedLU() */
1726 
1727     /* invert diagonal block for simpler triangular solves - add shift??? */
1728     batmp = ba + bs2 * bdiag[i];
1729 
1730     PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
1731     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1732   } /* for (i=0; i<mbs; i++) */
1733   PetscCall(PetscFree3(v_work, multiplier, v_pivots));
1734 
1735   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1736   PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]);
1737 
1738   PetscCall(ISRestoreIndices(isrow, &r));
1739   PetscCall(ISRestoreIndices(isicol, &ic));
1740 
1741   PetscCall(PetscLLDestroy(lnk, lnkbt));
1742 
1743   PetscCall(PetscFree2(im, jtmp));
1744   PetscCall(PetscFree2(rtmp, vtmp));
1745 
1746   PetscCall(PetscLogFlops(bs2 * B->cmap->n));
1747   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1748 
1749   PetscCall(ISIdentity(isrow, &row_identity));
1750   PetscCall(ISIdentity(isicol, &icol_identity));
1751   if (row_identity && icol_identity) {
1752     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1753   } else {
1754     B->ops->solve = MatSolve_SeqBAIJ_N;
1755   }
1756 
1757   B->ops->solveadd          = NULL;
1758   B->ops->solvetranspose    = NULL;
1759   B->ops->solvetransposeadd = NULL;
1760   B->ops->matsolve          = NULL;
1761   B->assembled              = PETSC_TRUE;
1762   B->preallocated           = PETSC_TRUE;
1763   PetscFunctionReturn(PETSC_SUCCESS);
1764 }
1765