xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision bcee047adeeb73090d7e36cc71e39fc287cdbb97)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <petsc/private/kernels/blockinvert.h>
7 
8 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
9 {
10   Mat             C = B;
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
12   IS              isrow = b->row, isicol = b->icol;
13   const PetscInt *r, *ic;
14   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
15   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
16   MatScalar      *rtmp, *pc, *mwork, *pv;
17   MatScalar      *aa             = a->a, *v;
18   PetscReal       shift          = info->shiftamount;
19   const PetscBool allowzeropivot = PetscNot(A->erroriffailure);
20 
21   PetscFunctionBegin;
22   PetscCall(ISGetIndices(isrow, &r));
23   PetscCall(ISGetIndices(isicol, &ic));
24 
25   /* generate work space needed by the factorization */
26   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
27   PetscCall(PetscArrayzero(rtmp, bs2 * n));
28 
29   for (PetscInt i = 0; i < n; i++) {
30     /* zero rtmp */
31     /* L part */
32     PetscInt *pj;
33     PetscInt  nzL, nz = bi[i + 1] - bi[i];
34     bjtmp = bj + bi[i];
35     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
36 
37     /* U part */
38     nz    = bdiag[i] - bdiag[i + 1];
39     bjtmp = bj + bdiag[i + 1] + 1;
40     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
41 
42     /* load in initial (unfactored row) */
43     nz    = ai[r[i] + 1] - ai[r[i]];
44     ajtmp = aj + ai[r[i]];
45     v     = aa + bs2 * ai[r[i]];
46     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
47 
48     /* elimination */
49     bjtmp = bj + bi[i];
50     nzL   = bi[i + 1] - bi[i];
51     for (PetscInt k = 0; k < nzL; k++) {
52       PetscBool flg = PETSC_FALSE;
53       PetscInt  row = bjtmp[k];
54 
55       pc = rtmp + bs2 * row;
56       for (PetscInt j = 0; j < bs2; j++) {
57         if (pc[j] != (PetscScalar)0.0) {
58           flg = PETSC_TRUE;
59           break;
60         }
61       }
62       if (flg) {
63         pv = b->a + bs2 * bdiag[row];
64         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
65         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
66 
67         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
68         pv = b->a + bs2 * (bdiag[row + 1] + 1);
69         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
70         for (PetscInt j = 0; j < nz; j++) {
71           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
72           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
73           v = rtmp + 4 * pj[j];
74           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
75           pv += 4;
76         }
77         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
78       }
79     }
80 
81     /* finished row so stick it into b->a */
82     /* L part */
83     pv = b->a + bs2 * bi[i];
84     pj = b->j + bi[i];
85     nz = bi[i + 1] - bi[i];
86     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
87 
88     /* Mark diagonal and invert diagonal for simpler triangular solves */
89     pv = b->a + bs2 * bdiag[i];
90     pj = b->j + bdiag[i];
91     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
92     {
93       PetscBool zeropivotdetected;
94 
95       PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
96       if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
97     }
98 
99     /* U part */
100     pv = b->a + bs2 * (bdiag[i + 1] + 1);
101     pj = b->j + bdiag[i + 1] + 1;
102     nz = bdiag[i] - bdiag[i + 1] - 1;
103     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
104   }
105 
106   PetscCall(PetscFree2(rtmp, mwork));
107   PetscCall(ISRestoreIndices(isicol, &ic));
108   PetscCall(ISRestoreIndices(isrow, &r));
109 
110   C->ops->solve          = MatSolve_SeqBAIJ_2;
111   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
112   C->assembled           = PETSC_TRUE;
113 
114   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
115   PetscFunctionReturn(PETSC_SUCCESS);
116 }
117 
118 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
119 {
120   Mat             C = B;
121   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
122   PetscInt        i, j, k, nz, nzL, row, *pj;
123   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
124   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
125   MatScalar      *rtmp, *pc, *mwork, *pv;
126   MatScalar      *aa = a->a, *v;
127   PetscInt        flg;
128   PetscReal       shift = info->shiftamount;
129   PetscBool       allowzeropivot, zeropivotdetected;
130 
131   PetscFunctionBegin;
132   allowzeropivot = PetscNot(A->erroriffailure);
133 
134   /* generate work space needed by the factorization */
135   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
136   PetscCall(PetscArrayzero(rtmp, bs2 * n));
137 
138   for (i = 0; i < n; i++) {
139     /* zero rtmp */
140     /* L part */
141     nz    = bi[i + 1] - bi[i];
142     bjtmp = bj + bi[i];
143     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
144 
145     /* U part */
146     nz    = bdiag[i] - bdiag[i + 1];
147     bjtmp = bj + bdiag[i + 1] + 1;
148     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
149 
150     /* load in initial (unfactored row) */
151     nz    = ai[i + 1] - ai[i];
152     ajtmp = aj + ai[i];
153     v     = aa + bs2 * ai[i];
154     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
155 
156     /* elimination */
157     bjtmp = bj + bi[i];
158     nzL   = bi[i + 1] - bi[i];
159     for (k = 0; k < nzL; k++) {
160       row = bjtmp[k];
161       pc  = rtmp + bs2 * row;
162       for (flg = 0, j = 0; j < bs2; j++) {
163         if (pc[j] != (PetscScalar)0.0) {
164           flg = 1;
165           break;
166         }
167       }
168       if (flg) {
169         pv = b->a + bs2 * bdiag[row];
170         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
171         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
172 
173         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
174         pv = b->a + bs2 * (bdiag[row + 1] + 1);
175         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
176         for (j = 0; j < nz; j++) {
177           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
178           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
179           v = rtmp + 4 * pj[j];
180           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
181           pv += 4;
182         }
183         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
184       }
185     }
186 
187     /* finished row so stick it into b->a */
188     /* L part */
189     pv = b->a + bs2 * bi[i];
190     pj = b->j + bi[i];
191     nz = bi[i + 1] - bi[i];
192     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
193 
194     /* Mark diagonal and invert diagonal for simpler triangular solves */
195     pv = b->a + bs2 * bdiag[i];
196     pj = b->j + bdiag[i];
197     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
198     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
199     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
200 
201     /* U part */
202     /*
203     pv = b->a + bs2*bi[2*n-i];
204     pj = b->j + bi[2*n-i];
205     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
206     */
207     pv = b->a + bs2 * (bdiag[i + 1] + 1);
208     pj = b->j + bdiag[i + 1] + 1;
209     nz = bdiag[i] - bdiag[i + 1] - 1;
210     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
211   }
212   PetscCall(PetscFree2(rtmp, mwork));
213 
214   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
215   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
216   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
217   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
218   C->assembled           = PETSC_TRUE;
219 
220   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
221   PetscFunctionReturn(PETSC_SUCCESS);
222 }
223 
224 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
225 {
226   Mat             C = B;
227   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
228   IS              isrow = b->row, isicol = b->icol;
229   const PetscInt *r, *ic;
230   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
231   PetscInt       *ajtmpold, *ajtmp, nz, row;
232   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
233   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
234   MatScalar       p1, p2, p3, p4;
235   MatScalar      *ba = b->a, *aa = a->a;
236   PetscReal       shift = info->shiftamount;
237   PetscBool       allowzeropivot, zeropivotdetected;
238 
239   PetscFunctionBegin;
240   allowzeropivot = PetscNot(A->erroriffailure);
241   PetscCall(ISGetIndices(isrow, &r));
242   PetscCall(ISGetIndices(isicol, &ic));
243   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
244 
245   for (i = 0; i < n; i++) {
246     nz    = bi[i + 1] - bi[i];
247     ajtmp = bj + bi[i];
248     for (j = 0; j < nz; j++) {
249       x    = rtmp + 4 * ajtmp[j];
250       x[0] = x[1] = x[2] = x[3] = 0.0;
251     }
252     /* load in initial (unfactored row) */
253     idx      = r[i];
254     nz       = ai[idx + 1] - ai[idx];
255     ajtmpold = aj + ai[idx];
256     v        = aa + 4 * ai[idx];
257     for (j = 0; j < nz; j++) {
258       x    = rtmp + 4 * ic[ajtmpold[j]];
259       x[0] = v[0];
260       x[1] = v[1];
261       x[2] = v[2];
262       x[3] = v[3];
263       v += 4;
264     }
265     row = *ajtmp++;
266     while (row < i) {
267       pc = rtmp + 4 * row;
268       p1 = pc[0];
269       p2 = pc[1];
270       p3 = pc[2];
271       p4 = pc[3];
272       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
273         pv    = ba + 4 * diag_offset[row];
274         pj    = bj + diag_offset[row] + 1;
275         x1    = pv[0];
276         x2    = pv[1];
277         x3    = pv[2];
278         x4    = pv[3];
279         pc[0] = m1 = p1 * x1 + p3 * x2;
280         pc[1] = m2 = p2 * x1 + p4 * x2;
281         pc[2] = m3 = p1 * x3 + p3 * x4;
282         pc[3] = m4 = p2 * x3 + p4 * x4;
283         nz         = bi[row + 1] - diag_offset[row] - 1;
284         pv += 4;
285         for (j = 0; j < nz; j++) {
286           x1 = pv[0];
287           x2 = pv[1];
288           x3 = pv[2];
289           x4 = pv[3];
290           x  = rtmp + 4 * pj[j];
291           x[0] -= m1 * x1 + m3 * x2;
292           x[1] -= m2 * x1 + m4 * x2;
293           x[2] -= m1 * x3 + m3 * x4;
294           x[3] -= m2 * x3 + m4 * x4;
295           pv += 4;
296         }
297         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
298       }
299       row = *ajtmp++;
300     }
301     /* finished row so stick it into b->a */
302     pv = ba + 4 * bi[i];
303     pj = bj + bi[i];
304     nz = bi[i + 1] - bi[i];
305     for (j = 0; j < nz; j++) {
306       x     = rtmp + 4 * pj[j];
307       pv[0] = x[0];
308       pv[1] = x[1];
309       pv[2] = x[2];
310       pv[3] = x[3];
311       pv += 4;
312     }
313     /* invert diagonal block */
314     w = ba + 4 * diag_offset[i];
315     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
316     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
317   }
318 
319   PetscCall(PetscFree(rtmp));
320   PetscCall(ISRestoreIndices(isicol, &ic));
321   PetscCall(ISRestoreIndices(isrow, &r));
322 
323   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
324   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
325   C->assembled           = PETSC_TRUE;
326 
327   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
328   PetscFunctionReturn(PETSC_SUCCESS);
329 }
330 /*
331       Version for when blocks are 2 by 2 Using natural ordering
332 */
333 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
334 {
335   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
336   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
337   PetscInt    *ajtmpold, *ajtmp, nz, row;
338   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
339   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
340   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
341   MatScalar   *ba = b->a, *aa = a->a;
342   PetscReal    shift = info->shiftamount;
343   PetscBool    allowzeropivot, zeropivotdetected;
344 
345   PetscFunctionBegin;
346   allowzeropivot = PetscNot(A->erroriffailure);
347   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
348   for (i = 0; i < n; i++) {
349     nz    = bi[i + 1] - bi[i];
350     ajtmp = bj + bi[i];
351     for (j = 0; j < nz; j++) {
352       x    = rtmp + 4 * ajtmp[j];
353       x[0] = x[1] = x[2] = x[3] = 0.0;
354     }
355     /* load in initial (unfactored row) */
356     nz       = ai[i + 1] - ai[i];
357     ajtmpold = aj + ai[i];
358     v        = aa + 4 * ai[i];
359     for (j = 0; j < nz; j++) {
360       x    = rtmp + 4 * ajtmpold[j];
361       x[0] = v[0];
362       x[1] = v[1];
363       x[2] = v[2];
364       x[3] = v[3];
365       v += 4;
366     }
367     row = *ajtmp++;
368     while (row < i) {
369       pc = rtmp + 4 * row;
370       p1 = pc[0];
371       p2 = pc[1];
372       p3 = pc[2];
373       p4 = pc[3];
374       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
375         pv    = ba + 4 * diag_offset[row];
376         pj    = bj + diag_offset[row] + 1;
377         x1    = pv[0];
378         x2    = pv[1];
379         x3    = pv[2];
380         x4    = pv[3];
381         pc[0] = m1 = p1 * x1 + p3 * x2;
382         pc[1] = m2 = p2 * x1 + p4 * x2;
383         pc[2] = m3 = p1 * x3 + p3 * x4;
384         pc[3] = m4 = p2 * x3 + p4 * x4;
385         nz         = bi[row + 1] - diag_offset[row] - 1;
386         pv += 4;
387         for (j = 0; j < nz; j++) {
388           x1 = pv[0];
389           x2 = pv[1];
390           x3 = pv[2];
391           x4 = pv[3];
392           x  = rtmp + 4 * pj[j];
393           x[0] -= m1 * x1 + m3 * x2;
394           x[1] -= m2 * x1 + m4 * x2;
395           x[2] -= m1 * x3 + m3 * x4;
396           x[3] -= m2 * x3 + m4 * x4;
397           pv += 4;
398         }
399         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
400       }
401       row = *ajtmp++;
402     }
403     /* finished row so stick it into b->a */
404     pv = ba + 4 * bi[i];
405     pj = bj + bi[i];
406     nz = bi[i + 1] - bi[i];
407     for (j = 0; j < nz; j++) {
408       x     = rtmp + 4 * pj[j];
409       pv[0] = x[0];
410       pv[1] = x[1];
411       pv[2] = x[2];
412       pv[3] = x[3];
413       /*
414       printf(" col %d:",pj[j]);
415       PetscInt j1;
416       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
417       printf("\n");
418       */
419       pv += 4;
420     }
421     /* invert diagonal block */
422     w = ba + 4 * diag_offset[i];
423     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
424     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
425   }
426 
427   PetscCall(PetscFree(rtmp));
428 
429   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
430   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
431   C->assembled           = PETSC_TRUE;
432 
433   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
434   PetscFunctionReturn(PETSC_SUCCESS);
435 }
436 
437 /*
438      Version for when blocks are 1 by 1.
439 */
440 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
441 {
442   Mat              C = B;
443   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
444   IS               isrow = b->row, isicol = b->icol;
445   const PetscInt  *r, *ic, *ics;
446   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
447   PetscInt         i, j, k, nz, nzL, row, *pj;
448   const PetscInt  *ajtmp, *bjtmp;
449   MatScalar       *rtmp, *pc, multiplier, *pv;
450   const MatScalar *aa = a->a, *v;
451   PetscBool        row_identity, col_identity;
452   FactorShiftCtx   sctx;
453   const PetscInt  *ddiag;
454   PetscReal        rs;
455   MatScalar        d;
456 
457   PetscFunctionBegin;
458   /* MatPivotSetUp(): initialize shift context sctx */
459   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
460 
461   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
462     ddiag          = a->diag;
463     sctx.shift_top = info->zeropivot;
464     for (i = 0; i < n; i++) {
465       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
466       d  = (aa)[ddiag[i]];
467       rs = -PetscAbsScalar(d) - PetscRealPart(d);
468       v  = aa + ai[i];
469       nz = ai[i + 1] - ai[i];
470       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
471       if (rs > sctx.shift_top) sctx.shift_top = rs;
472     }
473     sctx.shift_top *= 1.1;
474     sctx.nshift_max = 5;
475     sctx.shift_lo   = 0.;
476     sctx.shift_hi   = 1.;
477   }
478 
479   PetscCall(ISGetIndices(isrow, &r));
480   PetscCall(ISGetIndices(isicol, &ic));
481   PetscCall(PetscMalloc1(n + 1, &rtmp));
482   ics = ic;
483 
484   do {
485     sctx.newshift = PETSC_FALSE;
486     for (i = 0; i < n; i++) {
487       /* zero rtmp */
488       /* L part */
489       nz    = bi[i + 1] - bi[i];
490       bjtmp = bj + bi[i];
491       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
492 
493       /* U part */
494       nz    = bdiag[i] - bdiag[i + 1];
495       bjtmp = bj + bdiag[i + 1] + 1;
496       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
497 
498       /* load in initial (unfactored row) */
499       nz    = ai[r[i] + 1] - ai[r[i]];
500       ajtmp = aj + ai[r[i]];
501       v     = aa + ai[r[i]];
502       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
503 
504       /* ZeropivotApply() */
505       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
506 
507       /* elimination */
508       bjtmp = bj + bi[i];
509       row   = *bjtmp++;
510       nzL   = bi[i + 1] - bi[i];
511       for (k = 0; k < nzL; k++) {
512         pc = rtmp + row;
513         if (*pc != (PetscScalar)0.0) {
514           pv         = b->a + bdiag[row];
515           multiplier = *pc * (*pv);
516           *pc        = multiplier;
517 
518           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
519           pv = b->a + bdiag[row + 1] + 1;
520           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
521           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
522           PetscCall(PetscLogFlops(2.0 * nz));
523         }
524         row = *bjtmp++;
525       }
526 
527       /* finished row so stick it into b->a */
528       rs = 0.0;
529       /* L part */
530       pv = b->a + bi[i];
531       pj = b->j + bi[i];
532       nz = bi[i + 1] - bi[i];
533       for (j = 0; j < nz; j++) {
534         pv[j] = rtmp[pj[j]];
535         rs += PetscAbsScalar(pv[j]);
536       }
537 
538       /* U part */
539       pv = b->a + bdiag[i + 1] + 1;
540       pj = b->j + bdiag[i + 1] + 1;
541       nz = bdiag[i] - bdiag[i + 1] - 1;
542       for (j = 0; j < nz; j++) {
543         pv[j] = rtmp[pj[j]];
544         rs += PetscAbsScalar(pv[j]);
545       }
546 
547       sctx.rs = rs;
548       sctx.pv = rtmp[i];
549       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
550       if (sctx.newshift) break; /* break for-loop */
551       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
552 
553       /* Mark diagonal and invert diagonal for simpler triangular solves */
554       pv  = b->a + bdiag[i];
555       *pv = (PetscScalar)1.0 / rtmp[i];
556 
557     } /* endof for (i=0; i<n; i++) { */
558 
559     /* MatPivotRefine() */
560     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
561       /*
562        * if no shift in this attempt & shifting & started shifting & can refine,
563        * then try lower shift
564        */
565       sctx.shift_hi       = sctx.shift_fraction;
566       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
567       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
568       sctx.newshift       = PETSC_TRUE;
569       sctx.nshift++;
570     }
571   } while (sctx.newshift);
572 
573   PetscCall(PetscFree(rtmp));
574   PetscCall(ISRestoreIndices(isicol, &ic));
575   PetscCall(ISRestoreIndices(isrow, &r));
576 
577   PetscCall(ISIdentity(isrow, &row_identity));
578   PetscCall(ISIdentity(isicol, &col_identity));
579   if (row_identity && col_identity) {
580     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
581     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
582     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
583     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
584   } else {
585     C->ops->solve          = MatSolve_SeqBAIJ_1;
586     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
587   }
588   C->assembled = PETSC_TRUE;
589   PetscCall(PetscLogFlops(C->cmap->n));
590 
591   /* MatShiftView(A,info,&sctx) */
592   if (sctx.nshift) {
593     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
594       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
595     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
596       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
597     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
598       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
599     }
600   }
601   PetscFunctionReturn(PETSC_SUCCESS);
602 }
603 
604 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
605 {
606   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
607   IS              isrow = b->row, isicol = b->icol;
608   const PetscInt *r, *ic;
609   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
610   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
611   PetscInt       *diag_offset = b->diag, diag, *pj;
612   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
613   MatScalar      *ba = b->a, *aa = a->a;
614   PetscBool       row_identity, col_identity;
615 
616   PetscFunctionBegin;
617   PetscCall(ISGetIndices(isrow, &r));
618   PetscCall(ISGetIndices(isicol, &ic));
619   PetscCall(PetscMalloc1(n + 1, &rtmp));
620 
621   for (i = 0; i < n; i++) {
622     nz    = bi[i + 1] - bi[i];
623     ajtmp = bj + bi[i];
624     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
625 
626     /* load in initial (unfactored row) */
627     nz       = ai[r[i] + 1] - ai[r[i]];
628     ajtmpold = aj + ai[r[i]];
629     v        = aa + ai[r[i]];
630     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
631 
632     row = *ajtmp++;
633     while (row < i) {
634       pc = rtmp + row;
635       if (*pc != 0.0) {
636         pv         = ba + diag_offset[row];
637         pj         = bj + diag_offset[row] + 1;
638         multiplier = *pc * *pv++;
639         *pc        = multiplier;
640         nz         = bi[row + 1] - diag_offset[row] - 1;
641         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
642         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
643       }
644       row = *ajtmp++;
645     }
646     /* finished row so stick it into b->a */
647     pv = ba + bi[i];
648     pj = bj + bi[i];
649     nz = bi[i + 1] - bi[i];
650     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
651     diag = diag_offset[i] - bi[i];
652     /* check pivot entry for current row */
653     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
654     pv[diag] = 1.0 / pv[diag];
655   }
656 
657   PetscCall(PetscFree(rtmp));
658   PetscCall(ISRestoreIndices(isicol, &ic));
659   PetscCall(ISRestoreIndices(isrow, &r));
660   PetscCall(ISIdentity(isrow, &row_identity));
661   PetscCall(ISIdentity(isicol, &col_identity));
662   if (row_identity && col_identity) {
663     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
664     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
665   } else {
666     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
667     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
668   }
669   C->assembled = PETSC_TRUE;
670   PetscCall(PetscLogFlops(C->cmap->n));
671   PetscFunctionReturn(PETSC_SUCCESS);
672 }
673 
674 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
675 {
676   PetscFunctionBegin;
677   *type = MATSOLVERPETSC;
678   PetscFunctionReturn(PETSC_SUCCESS);
679 }
680 
681 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
682 {
683   PetscInt n = A->rmap->n;
684 
685   PetscFunctionBegin;
686 #if defined(PETSC_USE_COMPLEX)
687   PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
688 #endif
689   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
690   PetscCall(MatSetSizes(*B, n, n, n, n));
691   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
692     PetscCall(MatSetType(*B, MATSEQBAIJ));
693 
694     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
695     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
696     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
697     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
698     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
699   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
700     PetscCall(MatSetType(*B, MATSEQSBAIJ));
701     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
702 
703     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
704     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
705     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
706     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
707     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
708   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
709   (*B)->factortype     = ftype;
710   (*B)->canuseordering = PETSC_TRUE;
711 
712   PetscCall(PetscFree((*B)->solvertype));
713   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
714   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
715   PetscFunctionReturn(PETSC_SUCCESS);
716 }
717 
718 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
719 {
720   Mat C;
721 
722   PetscFunctionBegin;
723   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
724   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
725   PetscCall(MatLUFactorNumeric(C, A, info));
726 
727   A->ops->solve          = C->ops->solve;
728   A->ops->solvetranspose = C->ops->solvetranspose;
729 
730   PetscCall(MatHeaderMerge(A, &C));
731   PetscFunctionReturn(PETSC_SUCCESS);
732 }
733 
734 #include <../src/mat/impls/sbaij/seq/sbaij.h>
735 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
736 {
737   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
738   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
739   IS              ip = b->row;
740   const PetscInt *rip;
741   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
742   PetscInt       *ai = a->i, *aj = a->j;
743   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
744   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
745   PetscReal       rs;
746   FactorShiftCtx  sctx;
747 
748   PetscFunctionBegin;
749   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
750     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
751     PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info));
752     PetscCall(MatDestroy(&a->sbaijMat));
753     PetscFunctionReturn(PETSC_SUCCESS);
754   }
755 
756   /* MatPivotSetUp(): initialize shift context sctx */
757   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
758 
759   PetscCall(ISGetIndices(ip, &rip));
760   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
761 
762   sctx.shift_amount = 0.;
763   sctx.nshift       = 0;
764   do {
765     sctx.newshift = PETSC_FALSE;
766     for (i = 0; i < mbs; i++) {
767       rtmp[i] = 0.0;
768       jl[i]   = mbs;
769       il[0]   = 0;
770     }
771 
772     for (k = 0; k < mbs; k++) {
773       bval = ba + bi[k];
774       /* initialize k-th row by the perm[k]-th row of A */
775       jmin = ai[rip[k]];
776       jmax = ai[rip[k] + 1];
777       for (j = jmin; j < jmax; j++) {
778         col = rip[aj[j]];
779         if (col >= k) { /* only take upper triangular entry */
780           rtmp[col] = aa[j];
781           *bval++   = 0.0; /* for in-place factorization */
782         }
783       }
784 
785       /* shift the diagonal of the matrix */
786       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
787 
788       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
789       dk = rtmp[k];
790       i  = jl[k]; /* first row to be added to k_th row  */
791 
792       while (i < k) {
793         nexti = jl[i]; /* next row to be added to k_th row */
794 
795         /* compute multiplier, update diag(k) and U(i,k) */
796         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
797         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
798         dk += uikdi * ba[ili];
799         ba[ili] = uikdi; /* -U(i,k) */
800 
801         /* add multiple of row i to k-th row */
802         jmin = ili + 1;
803         jmax = bi[i + 1];
804         if (jmin < jmax) {
805           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
806           /* update il and jl for row i */
807           il[i] = jmin;
808           j     = bj[jmin];
809           jl[i] = jl[j];
810           jl[j] = i;
811         }
812         i = nexti;
813       }
814 
815       /* shift the diagonals when zero pivot is detected */
816       /* compute rs=sum of abs(off-diagonal) */
817       rs   = 0.0;
818       jmin = bi[k] + 1;
819       nz   = bi[k + 1] - jmin;
820       if (nz) {
821         bcol = bj + jmin;
822         while (nz--) {
823           rs += PetscAbsScalar(rtmp[*bcol]);
824           bcol++;
825         }
826       }
827 
828       sctx.rs = rs;
829       sctx.pv = dk;
830       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
831       if (sctx.newshift) break;
832       dk = sctx.pv;
833 
834       /* copy data into U(k,:) */
835       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
836       jmin      = bi[k] + 1;
837       jmax      = bi[k + 1];
838       if (jmin < jmax) {
839         for (j = jmin; j < jmax; j++) {
840           col       = bj[j];
841           ba[j]     = rtmp[col];
842           rtmp[col] = 0.0;
843         }
844         /* add the k-th row into il and jl */
845         il[k] = jmin;
846         i     = bj[jmin];
847         jl[k] = jl[i];
848         jl[i] = k;
849       }
850     }
851   } while (sctx.newshift);
852   PetscCall(PetscFree3(rtmp, il, jl));
853 
854   PetscCall(ISRestoreIndices(ip, &rip));
855 
856   C->assembled    = PETSC_TRUE;
857   C->preallocated = PETSC_TRUE;
858 
859   PetscCall(PetscLogFlops(C->rmap->N));
860   if (sctx.nshift) {
861     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
862       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
863     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
864       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
865     }
866   }
867   PetscFunctionReturn(PETSC_SUCCESS);
868 }
869 
870 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
871 {
872   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
873   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
874   PetscInt       i, j, am = a->mbs;
875   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
876   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
877   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
878   PetscReal      rs;
879   FactorShiftCtx sctx;
880 
881   PetscFunctionBegin;
882   /* MatPivotSetUp(): initialize shift context sctx */
883   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
884 
885   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
886 
887   do {
888     sctx.newshift = PETSC_FALSE;
889     for (i = 0; i < am; i++) {
890       rtmp[i] = 0.0;
891       jl[i]   = am;
892       il[0]   = 0;
893     }
894 
895     for (k = 0; k < am; k++) {
896       /* initialize k-th row with elements nonzero in row perm(k) of A */
897       nz   = ai[k + 1] - ai[k];
898       acol = aj + ai[k];
899       aval = aa + ai[k];
900       bval = ba + bi[k];
901       while (nz--) {
902         if (*acol < k) { /* skip lower triangular entries */
903           acol++;
904           aval++;
905         } else {
906           rtmp[*acol++] = *aval++;
907           *bval++       = 0.0; /* for in-place factorization */
908         }
909       }
910 
911       /* shift the diagonal of the matrix */
912       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
913 
914       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
915       dk = rtmp[k];
916       i  = jl[k]; /* first row to be added to k_th row  */
917 
918       while (i < k) {
919         nexti = jl[i]; /* next row to be added to k_th row */
920         /* compute multiplier, update D(k) and U(i,k) */
921         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
922         uikdi = -ba[ili] * ba[bi[i]];
923         dk += uikdi * ba[ili];
924         ba[ili] = uikdi; /* -U(i,k) */
925 
926         /* add multiple of row i to k-th row ... */
927         jmin = ili + 1;
928         nz   = bi[i + 1] - jmin;
929         if (nz > 0) {
930           bcol = bj + jmin;
931           bval = ba + jmin;
932           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
933           /* update il and jl for i-th row */
934           il[i] = jmin;
935           j     = bj[jmin];
936           jl[i] = jl[j];
937           jl[j] = i;
938         }
939         i = nexti;
940       }
941 
942       /* shift the diagonals when zero pivot is detected */
943       /* compute rs=sum of abs(off-diagonal) */
944       rs   = 0.0;
945       jmin = bi[k] + 1;
946       nz   = bi[k + 1] - jmin;
947       if (nz) {
948         bcol = bj + jmin;
949         while (nz--) {
950           rs += PetscAbsScalar(rtmp[*bcol]);
951           bcol++;
952         }
953       }
954 
955       sctx.rs = rs;
956       sctx.pv = dk;
957       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
958       if (sctx.newshift) break; /* sctx.shift_amount is updated */
959       dk = sctx.pv;
960 
961       /* copy data into U(k,:) */
962       ba[bi[k]] = 1.0 / dk;
963       jmin      = bi[k] + 1;
964       nz        = bi[k + 1] - jmin;
965       if (nz) {
966         bcol = bj + jmin;
967         bval = ba + jmin;
968         while (nz--) {
969           *bval++       = rtmp[*bcol];
970           rtmp[*bcol++] = 0.0;
971         }
972         /* add k-th row into il and jl */
973         il[k] = jmin;
974         i     = bj[jmin];
975         jl[k] = jl[i];
976         jl[i] = k;
977       }
978     }
979   } while (sctx.newshift);
980   PetscCall(PetscFree3(rtmp, il, jl));
981 
982   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
983   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
984   C->assembled           = PETSC_TRUE;
985   C->preallocated        = PETSC_TRUE;
986 
987   PetscCall(PetscLogFlops(C->rmap->N));
988   if (sctx.nshift) {
989     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
990       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
991     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
992       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
993     }
994   }
995   PetscFunctionReturn(PETSC_SUCCESS);
996 }
997 
998 #include <petscbt.h>
999 #include <../src/mat/utils/freespace.h>
1000 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1001 {
1002   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1003   Mat_SeqSBAIJ      *b;
1004   Mat                B;
1005   PetscBool          perm_identity, missing;
1006   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1007   const PetscInt    *rip;
1008   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1009   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1010   PetscReal          fill = info->fill, levels = info->levels;
1011   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1012   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1013   PetscBT            lnkbt;
1014 
1015   PetscFunctionBegin;
1016   PetscCall(MatMissingDiagonal(A, &missing, &i));
1017   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1018 
1019   if (bs > 1) {
1020     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1021     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1022 
1023     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1024     PetscFunctionReturn(PETSC_SUCCESS);
1025   }
1026 
1027   PetscCall(ISIdentity(perm, &perm_identity));
1028   PetscCall(ISGetIndices(perm, &rip));
1029 
1030   /* special case that simply copies fill pattern */
1031   if (!levels && perm_identity) {
1032     PetscCall(PetscMalloc1(am + 1, &ui));
1033     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1034     B = fact;
1035     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1036 
1037     b  = (Mat_SeqSBAIJ *)B->data;
1038     uj = b->j;
1039     for (i = 0; i < am; i++) {
1040       aj = a->j + a->diag[i];
1041       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1042       b->ilen[i] = ui[i];
1043     }
1044     PetscCall(PetscFree(ui));
1045 
1046     B->factortype = MAT_FACTOR_NONE;
1047 
1048     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1049     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1050     B->factortype = MAT_FACTOR_ICC;
1051 
1052     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1053     PetscFunctionReturn(PETSC_SUCCESS);
1054   }
1055 
1056   /* initialization */
1057   PetscCall(PetscMalloc1(am + 1, &ui));
1058   ui[0] = 0;
1059   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1060 
1061   /* jl: linked list for storing indices of the pivot rows
1062      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1063   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1064   for (i = 0; i < am; i++) {
1065     jl[i] = am;
1066     il[i] = 0;
1067   }
1068 
1069   /* create and initialize a linked list for storing column indices of the active row k */
1070   nlnk = am + 1;
1071   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1072 
1073   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1074   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1075 
1076   current_space = free_space;
1077 
1078   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1079   current_space_lvl = free_space_lvl;
1080 
1081   for (k = 0; k < am; k++) { /* for each active row k */
1082     /* initialize lnk by the column indices of row rip[k] of A */
1083     nzk         = 0;
1084     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1085     ncols_upper = 0;
1086     cols        = cols_lvl + am;
1087     for (j = 0; j < ncols; j++) {
1088       i = rip[*(aj + ai[rip[k]] + j)];
1089       if (i >= k) { /* only take upper triangular entry */
1090         cols[ncols_upper]     = i;
1091         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1092         ncols_upper++;
1093       }
1094     }
1095     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1096     nzk += nlnk;
1097 
1098     /* update lnk by computing fill-in for each pivot row to be merged in */
1099     prow = jl[k]; /* 1st pivot row */
1100 
1101     while (prow < k) {
1102       nextprow = jl[prow];
1103 
1104       /* merge prow into k-th row */
1105       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1106       jmax  = ui[prow + 1];
1107       ncols = jmax - jmin;
1108       i     = jmin - ui[prow];
1109       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1110       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1111       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1112       nzk += nlnk;
1113 
1114       /* update il and jl for prow */
1115       if (jmin < jmax) {
1116         il[prow] = jmin;
1117 
1118         j        = *cols;
1119         jl[prow] = jl[j];
1120         jl[j]    = prow;
1121       }
1122       prow = nextprow;
1123     }
1124 
1125     /* if free space is not available, make more free space */
1126     if (current_space->local_remaining < nzk) {
1127       i = am - k + 1;                                                             /* num of unfactored rows */
1128       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1129       PetscCall(PetscFreeSpaceGet(i, &current_space));
1130       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1131       reallocs++;
1132     }
1133 
1134     /* copy data into free_space and free_space_lvl, then initialize lnk */
1135     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1136 
1137     /* add the k-th row into il and jl */
1138     if (nzk - 1 > 0) {
1139       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1140       jl[k] = jl[i];
1141       jl[i] = k;
1142       il[k] = ui[k] + 1;
1143     }
1144     uj_ptr[k]     = current_space->array;
1145     uj_lvl_ptr[k] = current_space_lvl->array;
1146 
1147     current_space->array += nzk;
1148     current_space->local_used += nzk;
1149     current_space->local_remaining -= nzk;
1150 
1151     current_space_lvl->array += nzk;
1152     current_space_lvl->local_used += nzk;
1153     current_space_lvl->local_remaining -= nzk;
1154 
1155     ui[k + 1] = ui[k] + nzk;
1156   }
1157 
1158   PetscCall(ISRestoreIndices(perm, &rip));
1159   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1160   PetscCall(PetscFree(cols_lvl));
1161 
1162   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1163   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1164   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1165   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1166   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1167 
1168   /* put together the new matrix in MATSEQSBAIJ format */
1169   B = fact;
1170   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1171 
1172   b               = (Mat_SeqSBAIJ *)B->data;
1173   b->singlemalloc = PETSC_FALSE;
1174   b->free_a       = PETSC_TRUE;
1175   b->free_ij      = PETSC_TRUE;
1176 
1177   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1178 
1179   b->j             = uj;
1180   b->i             = ui;
1181   b->diag          = NULL;
1182   b->ilen          = NULL;
1183   b->imax          = NULL;
1184   b->row           = perm;
1185   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1186 
1187   PetscCall(PetscObjectReference((PetscObject)perm));
1188 
1189   b->icol = perm;
1190 
1191   PetscCall(PetscObjectReference((PetscObject)perm));
1192   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1193 
1194   b->maxnz = b->nz = ui[am];
1195 
1196   B->info.factor_mallocs   = reallocs;
1197   B->info.fill_ratio_given = fill;
1198   if (ai[am] != 0.) {
1199     /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1200     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1201   } else {
1202     B->info.fill_ratio_needed = 0.0;
1203   }
1204 #if defined(PETSC_USE_INFO)
1205   if (ai[am] != 0) {
1206     PetscReal af = B->info.fill_ratio_needed;
1207     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1208     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1209     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1210   } else {
1211     PetscCall(PetscInfo(A, "Empty matrix\n"));
1212   }
1213 #endif
1214   if (perm_identity) {
1215     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1216     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1217     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1218   } else {
1219     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1220   }
1221   PetscFunctionReturn(PETSC_SUCCESS);
1222 }
1223 
1224 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1225 {
1226   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1227   Mat_SeqSBAIJ      *b;
1228   Mat                B;
1229   PetscBool          perm_identity, missing;
1230   PetscReal          fill = info->fill;
1231   const PetscInt    *rip;
1232   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1233   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1234   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1235   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1236   PetscBT            lnkbt;
1237 
1238   PetscFunctionBegin;
1239   if (bs > 1) { /* convert to seqsbaij */
1240     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1241     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1242 
1243     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1244     PetscFunctionReturn(PETSC_SUCCESS);
1245   }
1246 
1247   PetscCall(MatMissingDiagonal(A, &missing, &i));
1248   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1249 
1250   /* check whether perm is the identity mapping */
1251   PetscCall(ISIdentity(perm, &perm_identity));
1252   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1253   PetscCall(ISGetIndices(perm, &rip));
1254 
1255   /* initialization */
1256   PetscCall(PetscMalloc1(mbs + 1, &ui));
1257   ui[0] = 0;
1258 
1259   /* jl: linked list for storing indices of the pivot rows
1260      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1261   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1262   for (i = 0; i < mbs; i++) {
1263     jl[i] = mbs;
1264     il[i] = 0;
1265   }
1266 
1267   /* create and initialize a linked list for storing column indices of the active row k */
1268   nlnk = mbs + 1;
1269   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1270 
1271   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1272   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1273 
1274   current_space = free_space;
1275 
1276   for (k = 0; k < mbs; k++) { /* for each active row k */
1277     /* initialize lnk by the column indices of row rip[k] of A */
1278     nzk         = 0;
1279     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1280     ncols_upper = 0;
1281     for (j = 0; j < ncols; j++) {
1282       i = rip[*(aj + ai[rip[k]] + j)];
1283       if (i >= k) { /* only take upper triangular entry */
1284         cols[ncols_upper] = i;
1285         ncols_upper++;
1286       }
1287     }
1288     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1289     nzk += nlnk;
1290 
1291     /* update lnk by computing fill-in for each pivot row to be merged in */
1292     prow = jl[k]; /* 1st pivot row */
1293 
1294     while (prow < k) {
1295       nextprow = jl[prow];
1296       /* merge prow into k-th row */
1297       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1298       jmax   = ui[prow + 1];
1299       ncols  = jmax - jmin;
1300       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1301       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1302       nzk += nlnk;
1303 
1304       /* update il and jl for prow */
1305       if (jmin < jmax) {
1306         il[prow] = jmin;
1307         j        = *uj_ptr;
1308         jl[prow] = jl[j];
1309         jl[j]    = prow;
1310       }
1311       prow = nextprow;
1312     }
1313 
1314     /* if free space is not available, make more free space */
1315     if (current_space->local_remaining < nzk) {
1316       i = mbs - k + 1;                                                            /* num of unfactored rows */
1317       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1318       PetscCall(PetscFreeSpaceGet(i, &current_space));
1319       reallocs++;
1320     }
1321 
1322     /* copy data into free space, then initialize lnk */
1323     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1324 
1325     /* add the k-th row into il and jl */
1326     if (nzk - 1 > 0) {
1327       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1328       jl[k] = jl[i];
1329       jl[i] = k;
1330       il[k] = ui[k] + 1;
1331     }
1332     ui_ptr[k] = current_space->array;
1333     current_space->array += nzk;
1334     current_space->local_used += nzk;
1335     current_space->local_remaining -= nzk;
1336 
1337     ui[k + 1] = ui[k] + nzk;
1338   }
1339 
1340   PetscCall(ISRestoreIndices(perm, &rip));
1341   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1342 
1343   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1344   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1345   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1346   PetscCall(PetscLLDestroy(lnk, lnkbt));
1347 
1348   /* put together the new matrix in MATSEQSBAIJ format */
1349   B = fact;
1350   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1351 
1352   b               = (Mat_SeqSBAIJ *)B->data;
1353   b->singlemalloc = PETSC_FALSE;
1354   b->free_a       = PETSC_TRUE;
1355   b->free_ij      = PETSC_TRUE;
1356 
1357   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1358 
1359   b->j             = uj;
1360   b->i             = ui;
1361   b->diag          = NULL;
1362   b->ilen          = NULL;
1363   b->imax          = NULL;
1364   b->row           = perm;
1365   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1366 
1367   PetscCall(PetscObjectReference((PetscObject)perm));
1368   b->icol = perm;
1369   PetscCall(PetscObjectReference((PetscObject)perm));
1370   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1371   b->maxnz = b->nz = ui[mbs];
1372 
1373   B->info.factor_mallocs   = reallocs;
1374   B->info.fill_ratio_given = fill;
1375   if (ai[mbs] != 0.) {
1376     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1377     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1378   } else {
1379     B->info.fill_ratio_needed = 0.0;
1380   }
1381 #if defined(PETSC_USE_INFO)
1382   if (ai[mbs] != 0.) {
1383     PetscReal af = B->info.fill_ratio_needed;
1384     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1385     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1386     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1387   } else {
1388     PetscCall(PetscInfo(A, "Empty matrix\n"));
1389   }
1390 #endif
1391   if (perm_identity) {
1392     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1393   } else {
1394     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1395   }
1396   PetscFunctionReturn(PETSC_SUCCESS);
1397 }
1398 
1399 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1400 {
1401   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1402   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1403   PetscInt           i, k, n                       = a->mbs;
1404   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1405   const MatScalar   *aa = a->a, *v;
1406   PetscScalar       *x, *s, *t, *ls;
1407   const PetscScalar *b;
1408 
1409   PetscFunctionBegin;
1410   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1411   PetscCall(VecGetArrayRead(bb, &b));
1412   PetscCall(VecGetArray(xx, &x));
1413   t = a->solve_work;
1414 
1415   /* forward solve the lower triangular */
1416   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1417 
1418   for (i = 1; i < n; i++) {
1419     v  = aa + bs2 * ai[i];
1420     vi = aj + ai[i];
1421     nz = ai[i + 1] - ai[i];
1422     s  = t + bs * i;
1423     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1424     for (k = 0; k < nz; k++) {
1425       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1426       v += bs2;
1427     }
1428   }
1429 
1430   /* backward solve the upper triangular */
1431   ls = a->solve_work + A->cmap->n;
1432   for (i = n - 1; i >= 0; i--) {
1433     v  = aa + bs2 * (adiag[i + 1] + 1);
1434     vi = aj + adiag[i + 1] + 1;
1435     nz = adiag[i] - adiag[i + 1] - 1;
1436     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1437     for (k = 0; k < nz; k++) {
1438       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1439       v += bs2;
1440     }
1441     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1442     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1443   }
1444 
1445   PetscCall(VecRestoreArrayRead(bb, &b));
1446   PetscCall(VecRestoreArray(xx, &x));
1447   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1448   PetscFunctionReturn(PETSC_SUCCESS);
1449 }
1450 
1451 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1452 {
1453   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1454   IS                 iscol = a->col, isrow = a->row;
1455   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1456   PetscInt           i, m, n = a->mbs;
1457   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1458   const MatScalar   *aa = a->a, *v;
1459   PetscScalar       *x, *s, *t, *ls;
1460   const PetscScalar *b;
1461 
1462   PetscFunctionBegin;
1463   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1464   PetscCall(VecGetArrayRead(bb, &b));
1465   PetscCall(VecGetArray(xx, &x));
1466   t = a->solve_work;
1467 
1468   PetscCall(ISGetIndices(isrow, &rout));
1469   r = rout;
1470   PetscCall(ISGetIndices(iscol, &cout));
1471   c = cout;
1472 
1473   /* forward solve the lower triangular */
1474   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1475   for (i = 1; i < n; i++) {
1476     v  = aa + bs2 * ai[i];
1477     vi = aj + ai[i];
1478     nz = ai[i + 1] - ai[i];
1479     s  = t + bs * i;
1480     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1481     for (m = 0; m < nz; m++) {
1482       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1483       v += bs2;
1484     }
1485   }
1486 
1487   /* backward solve the upper triangular */
1488   ls = a->solve_work + A->cmap->n;
1489   for (i = n - 1; i >= 0; i--) {
1490     v  = aa + bs2 * (adiag[i + 1] + 1);
1491     vi = aj + adiag[i + 1] + 1;
1492     nz = adiag[i] - adiag[i + 1] - 1;
1493     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1494     for (m = 0; m < nz; m++) {
1495       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1496       v += bs2;
1497     }
1498     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1499     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1500   }
1501   PetscCall(ISRestoreIndices(isrow, &rout));
1502   PetscCall(ISRestoreIndices(iscol, &cout));
1503   PetscCall(VecRestoreArrayRead(bb, &b));
1504   PetscCall(VecRestoreArray(xx, &x));
1505   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1506   PetscFunctionReturn(PETSC_SUCCESS);
1507 }
1508 
1509 /*
1510     For each block in an block array saves the largest absolute value in the block into another array
1511 */
1512 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray)
1513 {
1514   PetscInt i, j;
1515 
1516   PetscFunctionBegin;
1517   PetscCall(PetscArrayzero(absarray, nbs + 1));
1518   for (i = 0; i < nbs; i++) {
1519     for (j = 0; j < bs2; j++) {
1520       if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1521     }
1522   }
1523   PetscFunctionReturn(PETSC_SUCCESS);
1524 }
1525 
1526 /*
1527      This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1528 */
1529 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact)
1530 {
1531   Mat             B = *fact;
1532   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b;
1533   IS              isicol;
1534   const PetscInt *r, *ic;
1535   PetscInt        i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1536   PetscInt       *bi, *bj, *bdiag;
1537 
1538   PetscInt   row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1539   PetscInt   nlnk, *lnk;
1540   PetscBT    lnkbt;
1541   PetscBool  row_identity, icol_identity;
1542   MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1543   PetscInt   j, nz, *pj, *bjtmp, k, ncut, *jtmp;
1544 
1545   PetscReal  dt = info->dt; /* shift=info->shiftamount; */
1546   PetscInt   nnz_max;
1547   PetscBool  missing;
1548   PetscReal *vtmp_abs;
1549   MatScalar *v_work;
1550   PetscInt  *v_pivots;
1551   PetscBool  allowzeropivot, zeropivotdetected = PETSC_FALSE;
1552 
1553   PetscFunctionBegin;
1554   /* symbolic factorization, can be reused */
1555   PetscCall(MatMissingDiagonal(A, &missing, &i));
1556   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1557   adiag = a->diag;
1558 
1559   PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
1560 
1561   /* bdiag is location of diagonal in factor */
1562   PetscCall(PetscMalloc1(mbs + 1, &bdiag));
1563 
1564   /* allocate row pointers bi */
1565   PetscCall(PetscMalloc1(2 * mbs + 2, &bi));
1566 
1567   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1568   dtcount = (PetscInt)info->dtcount;
1569   if (dtcount > mbs - 1) dtcount = mbs - 1;
1570   nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1571   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1572   PetscCall(PetscMalloc1(nnz_max, &bj));
1573   nnz_max = nnz_max * bs2;
1574   PetscCall(PetscMalloc1(nnz_max, &ba));
1575 
1576   /* put together the new matrix */
1577   PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1578 
1579   b               = (Mat_SeqBAIJ *)(B)->data;
1580   b->free_a       = PETSC_TRUE;
1581   b->free_ij      = PETSC_TRUE;
1582   b->singlemalloc = PETSC_FALSE;
1583 
1584   b->a    = ba;
1585   b->j    = bj;
1586   b->i    = bi;
1587   b->diag = bdiag;
1588   b->ilen = NULL;
1589   b->imax = NULL;
1590   b->row  = isrow;
1591   b->col  = iscol;
1592 
1593   PetscCall(PetscObjectReference((PetscObject)isrow));
1594   PetscCall(PetscObjectReference((PetscObject)iscol));
1595 
1596   b->icol = isicol;
1597   PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work));
1598   b->maxnz = nnz_max / bs2;
1599 
1600   (B)->factortype            = MAT_FACTOR_ILUDT;
1601   (B)->info.factor_mallocs   = 0;
1602   (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1603   /* end of symbolic factorization */
1604   PetscCall(ISGetIndices(isrow, &r));
1605   PetscCall(ISGetIndices(isicol, &ic));
1606 
1607   /* linked list for storing column indices of the active row */
1608   nlnk = mbs + 1;
1609   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1610 
1611   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1612   PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp));
1613   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1614   PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp));
1615   PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs));
1616   PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots));
1617 
1618   allowzeropivot  = PetscNot(A->erroriffailure);
1619   bi[0]           = 0;
1620   bdiag[0]        = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1621   bi[2 * mbs + 1] = bdiag[0] + 1;        /* endof bj and ba array */
1622   for (i = 0; i < mbs; i++) {
1623     /* copy initial fill into linked list */
1624     nzi = ai[r[i] + 1] - ai[r[i]];
1625     PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
1626     nzi_al = adiag[r[i]] - ai[r[i]];
1627     nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;
1628 
1629     /* load in initial unfactored row */
1630     ajtmp = aj + ai[r[i]];
1631     PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt));
1632     PetscCall(PetscArrayzero(rtmp, mbs * bs2));
1633     aatmp = a->a + bs2 * ai[r[i]];
1634     for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2));
1635 
1636     /* add pivot rows into linked list */
1637     row = lnk[mbs];
1638     while (row < i) {
1639       nzi_bl = bi[row + 1] - bi[row] + 1;
1640       bjtmp  = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1641       PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im));
1642       nzi += nlnk;
1643       row = lnk[row];
1644     }
1645 
1646     /* copy data from lnk into jtmp, then initialize lnk */
1647     PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt));
1648 
1649     /* numerical factorization */
1650     bjtmp = jtmp;
1651     row   = *bjtmp++; /* 1st pivot row */
1652 
1653     while (row < i) {
1654       pc = rtmp + bs2 * row;
1655       pv = ba + bs2 * bdiag[row];                           /* inv(diag) of the pivot row */
1656       PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1657       PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs));
1658       if (vtmp_abs[0] > dt) {         /* apply tolerance dropping rule */
1659         pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1660         pv = ba + bs2 * (bdiag[row + 1] + 1);
1661         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1662         for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
1663         /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */
1664       }
1665       row = *bjtmp++;
1666     }
1667 
1668     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1669     nzi_bl = 0;
1670     j      = 0;
1671     while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1672       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1673       nzi_bl++;
1674       j++;
1675     }
1676     nzi_bu = nzi - nzi_bl - 1;
1677 
1678     while (j < nzi) { /* U-part */
1679       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1680       j++;
1681     }
1682 
1683     PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs));
1684 
1685     bjtmp = bj + bi[i];
1686     batmp = ba + bs2 * bi[i];
1687     /* apply level dropping rule to L part */
1688     ncut = nzi_al + dtcount;
1689     if (ncut < nzi_bl) {
1690       PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp));
1691       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp));
1692     } else {
1693       ncut = nzi_bl;
1694     }
1695     for (j = 0; j < ncut; j++) {
1696       bjtmp[j] = jtmp[j];
1697       PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2));
1698     }
1699     bi[i + 1] = bi[i] + ncut;
1700     nzi       = ncut + 1;
1701 
1702     /* apply level dropping rule to U part */
1703     ncut = nzi_au + dtcount;
1704     if (ncut < nzi_bu) {
1705       PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1));
1706       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1));
1707     } else {
1708       ncut = nzi_bu;
1709     }
1710     nzi += ncut;
1711 
1712     /* mark bdiagonal */
1713     bdiag[i + 1]    = bdiag[i] - (ncut + 1);
1714     bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);
1715 
1716     bjtmp = bj + bdiag[i];
1717     batmp = ba + bs2 * bdiag[i];
1718     PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2));
1719     *bjtmp = i;
1720 
1721     bjtmp = bj + bdiag[i + 1] + 1;
1722     batmp = ba + (bdiag[i + 1] + 1) * bs2;
1723 
1724     for (k = 0; k < ncut; k++) {
1725       bjtmp[k] = jtmp[nzi_bl + 1 + k];
1726       PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2));
1727     }
1728 
1729     im[i] = nzi; /* used by PetscLLAddSortedLU() */
1730 
1731     /* invert diagonal block for simpler triangular solves - add shift??? */
1732     batmp = ba + bs2 * bdiag[i];
1733 
1734     PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
1735     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1736   } /* for (i=0; i<mbs; i++) */
1737   PetscCall(PetscFree3(v_work, multiplier, v_pivots));
1738 
1739   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1740   PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]);
1741 
1742   PetscCall(ISRestoreIndices(isrow, &r));
1743   PetscCall(ISRestoreIndices(isicol, &ic));
1744 
1745   PetscCall(PetscLLDestroy(lnk, lnkbt));
1746 
1747   PetscCall(PetscFree2(im, jtmp));
1748   PetscCall(PetscFree2(rtmp, vtmp));
1749 
1750   PetscCall(PetscLogFlops(bs2 * B->cmap->n));
1751   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1752 
1753   PetscCall(ISIdentity(isrow, &row_identity));
1754   PetscCall(ISIdentity(isicol, &icol_identity));
1755   if (row_identity && icol_identity) {
1756     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1757   } else {
1758     B->ops->solve = MatSolve_SeqBAIJ_N;
1759   }
1760 
1761   B->ops->solveadd          = NULL;
1762   B->ops->solvetranspose    = NULL;
1763   B->ops->solvetransposeadd = NULL;
1764   B->ops->matsolve          = NULL;
1765   B->assembled              = PETSC_TRUE;
1766   B->preallocated           = PETSC_TRUE;
1767   PetscFunctionReturn(PETSC_SUCCESS);
1768 }
1769