xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision ae1ee55146a7ad071171b861759b85940c7e5c67)
1 /*
2     Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 
MatLUFactorNumeric_SeqBAIJ_2(Mat B,Mat A,const MatFactorInfo * info)7 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
8 {
9   Mat             C = B;
10   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
11   IS              isrow = b->row, isicol = b->icol;
12   const PetscInt *r, *ic;
13   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
14   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
15   MatScalar      *rtmp, *pc, *mwork, *pv;
16   MatScalar      *aa             = a->a, *v;
17   PetscReal       shift          = info->shiftamount;
18   const PetscBool allowzeropivot = PetscNot(A->erroriffailure);
19 
20   PetscFunctionBegin;
21   PetscCall(ISGetIndices(isrow, &r));
22   PetscCall(ISGetIndices(isicol, &ic));
23 
24   /* generate work space needed by the factorization */
25   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
26   PetscCall(PetscArrayzero(rtmp, bs2 * n));
27 
28   for (PetscInt i = 0; i < n; i++) {
29     /* zero rtmp */
30     /* L part */
31     PetscInt *pj;
32     PetscInt  nzL, nz = bi[i + 1] - bi[i];
33     bjtmp = bj + bi[i];
34     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
35 
36     /* U part */
37     nz    = bdiag[i] - bdiag[i + 1];
38     bjtmp = bj + bdiag[i + 1] + 1;
39     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
40 
41     /* load in initial (unfactored row) */
42     nz    = ai[r[i] + 1] - ai[r[i]];
43     ajtmp = aj + ai[r[i]];
44     v     = aa + bs2 * ai[r[i]];
45     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
46 
47     /* elimination */
48     bjtmp = bj + bi[i];
49     nzL   = bi[i + 1] - bi[i];
50     for (PetscInt k = 0; k < nzL; k++) {
51       PetscBool flg = PETSC_FALSE;
52       PetscInt  row = bjtmp[k];
53 
54       pc = rtmp + bs2 * row;
55       for (PetscInt j = 0; j < bs2; j++) {
56         if (pc[j] != (PetscScalar)0.0) {
57           flg = PETSC_TRUE;
58           break;
59         }
60       }
61       if (flg) {
62         pv = b->a + bs2 * bdiag[row];
63         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
64         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
65 
66         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
67         pv = b->a + bs2 * (bdiag[row + 1] + 1);
68         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
69         for (PetscInt j = 0; j < nz; j++) {
70           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
71           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
72           v = rtmp + 4 * pj[j];
73           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
74           pv += 4;
75         }
76         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
77       }
78     }
79 
80     /* finished row so stick it into b->a */
81     /* L part */
82     pv = b->a + bs2 * bi[i];
83     pj = b->j + bi[i];
84     nz = bi[i + 1] - bi[i];
85     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
86 
87     /* Mark diagonal and invert diagonal for simpler triangular solves */
88     pv = b->a + bs2 * bdiag[i];
89     pj = b->j + bdiag[i];
90     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
91     {
92       PetscBool zeropivotdetected;
93 
94       PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
95       if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
96     }
97 
98     /* U part */
99     pv = b->a + bs2 * (bdiag[i + 1] + 1);
100     pj = b->j + bdiag[i + 1] + 1;
101     nz = bdiag[i] - bdiag[i + 1] - 1;
102     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103   }
104 
105   PetscCall(PetscFree2(rtmp, mwork));
106   PetscCall(ISRestoreIndices(isicol, &ic));
107   PetscCall(ISRestoreIndices(isrow, &r));
108 
109   C->ops->solve          = MatSolve_SeqBAIJ_2;
110   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
111   C->assembled           = PETSC_TRUE;
112 
113   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
114   PetscFunctionReturn(PETSC_SUCCESS);
115 }
116 
MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B,Mat A,const MatFactorInfo * info)117 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
118 {
119   Mat             C = B;
120   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
121   PetscInt        i, j, k, nz, nzL, row, *pj;
122   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
123   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
124   MatScalar      *rtmp, *pc, *mwork, *pv;
125   MatScalar      *aa = a->a, *v;
126   PetscInt        flg;
127   PetscReal       shift = info->shiftamount;
128   PetscBool       allowzeropivot, zeropivotdetected;
129 
130   PetscFunctionBegin;
131   allowzeropivot = PetscNot(A->erroriffailure);
132 
133   /* generate work space needed by the factorization */
134   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
135   PetscCall(PetscArrayzero(rtmp, bs2 * n));
136 
137   for (i = 0; i < n; i++) {
138     /* zero rtmp */
139     /* L part */
140     nz    = bi[i + 1] - bi[i];
141     bjtmp = bj + bi[i];
142     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
143 
144     /* U part */
145     nz    = bdiag[i] - bdiag[i + 1];
146     bjtmp = bj + bdiag[i + 1] + 1;
147     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
148 
149     /* load in initial (unfactored row) */
150     nz    = ai[i + 1] - ai[i];
151     ajtmp = aj + ai[i];
152     v     = aa + bs2 * ai[i];
153     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
154 
155     /* elimination */
156     bjtmp = bj + bi[i];
157     nzL   = bi[i + 1] - bi[i];
158     for (k = 0; k < nzL; k++) {
159       row = bjtmp[k];
160       pc  = rtmp + bs2 * row;
161       for (flg = 0, j = 0; j < bs2; j++) {
162         if (pc[j] != (PetscScalar)0.0) {
163           flg = 1;
164           break;
165         }
166       }
167       if (flg) {
168         pv = b->a + bs2 * bdiag[row];
169         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
170         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
171 
172         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
173         pv = b->a + bs2 * (bdiag[row + 1] + 1);
174         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
175         for (j = 0; j < nz; j++) {
176           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
177           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
178           v = rtmp + 4 * pj[j];
179           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
180           pv += 4;
181         }
182         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
183       }
184     }
185 
186     /* finished row so stick it into b->a */
187     /* L part */
188     pv = b->a + bs2 * bi[i];
189     pj = b->j + bi[i];
190     nz = bi[i + 1] - bi[i];
191     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
192 
193     /* Mark diagonal and invert diagonal for simpler triangular solves */
194     pv = b->a + bs2 * bdiag[i];
195     pj = b->j + bdiag[i];
196     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
197     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
198     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
199 
200     /* U part */
201     /*
202     pv = b->a + bs2*bi[2*n-i];
203     pj = b->j + bi[2*n-i];
204     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
205     */
206     pv = b->a + bs2 * (bdiag[i + 1] + 1);
207     pj = b->j + bdiag[i + 1] + 1;
208     nz = bdiag[i] - bdiag[i + 1] - 1;
209     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
210   }
211   PetscCall(PetscFree2(rtmp, mwork));
212 
213   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
214   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
215   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
216   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
217   C->assembled           = PETSC_TRUE;
218 
219   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
220   PetscFunctionReturn(PETSC_SUCCESS);
221 }
222 
MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B,Mat A,const MatFactorInfo * info)223 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
224 {
225   Mat             C = B;
226   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
227   IS              isrow = b->row, isicol = b->icol;
228   const PetscInt *r, *ic;
229   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
230   PetscInt       *ajtmpold, *ajtmp, nz, row;
231   PetscInt        idx, *ai = a->i, *aj = a->j, *pj;
232   const PetscInt *diag_offset;
233   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
234   MatScalar       p1, p2, p3, p4;
235   MatScalar      *ba = b->a, *aa = a->a;
236   PetscReal       shift = info->shiftamount;
237   PetscBool       allowzeropivot, zeropivotdetected;
238 
239   PetscFunctionBegin;
240   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
241   A->factortype = MAT_FACTOR_NONE;
242   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
243   A->factortype  = MAT_FACTOR_ILU;
244   allowzeropivot = PetscNot(A->erroriffailure);
245   PetscCall(ISGetIndices(isrow, &r));
246   PetscCall(ISGetIndices(isicol, &ic));
247   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
248 
249   for (i = 0; i < n; i++) {
250     nz    = bi[i + 1] - bi[i];
251     ajtmp = bj + bi[i];
252     for (j = 0; j < nz; j++) {
253       x    = rtmp + 4 * ajtmp[j];
254       x[0] = x[1] = x[2] = x[3] = 0.0;
255     }
256     /* load in initial (unfactored row) */
257     idx      = r[i];
258     nz       = ai[idx + 1] - ai[idx];
259     ajtmpold = aj + ai[idx];
260     v        = aa + 4 * ai[idx];
261     for (j = 0; j < nz; j++) {
262       x    = rtmp + 4 * ic[ajtmpold[j]];
263       x[0] = v[0];
264       x[1] = v[1];
265       x[2] = v[2];
266       x[3] = v[3];
267       v += 4;
268     }
269     row = *ajtmp++;
270     while (row < i) {
271       pc = rtmp + 4 * row;
272       p1 = pc[0];
273       p2 = pc[1];
274       p3 = pc[2];
275       p4 = pc[3];
276       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
277         pv    = ba + 4 * diag_offset[row];
278         pj    = bj + diag_offset[row] + 1;
279         x1    = pv[0];
280         x2    = pv[1];
281         x3    = pv[2];
282         x4    = pv[3];
283         pc[0] = m1 = p1 * x1 + p3 * x2;
284         pc[1] = m2 = p2 * x1 + p4 * x2;
285         pc[2] = m3 = p1 * x3 + p3 * x4;
286         pc[3] = m4 = p2 * x3 + p4 * x4;
287         nz         = bi[row + 1] - diag_offset[row] - 1;
288         pv += 4;
289         for (j = 0; j < nz; j++) {
290           x1 = pv[0];
291           x2 = pv[1];
292           x3 = pv[2];
293           x4 = pv[3];
294           x  = rtmp + 4 * pj[j];
295           x[0] -= m1 * x1 + m3 * x2;
296           x[1] -= m2 * x1 + m4 * x2;
297           x[2] -= m1 * x3 + m3 * x4;
298           x[3] -= m2 * x3 + m4 * x4;
299           pv += 4;
300         }
301         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
302       }
303       row = *ajtmp++;
304     }
305     /* finished row so stick it into b->a */
306     pv = ba + 4 * bi[i];
307     pj = bj + bi[i];
308     nz = bi[i + 1] - bi[i];
309     for (j = 0; j < nz; j++) {
310       x     = rtmp + 4 * pj[j];
311       pv[0] = x[0];
312       pv[1] = x[1];
313       pv[2] = x[2];
314       pv[3] = x[3];
315       pv += 4;
316     }
317     /* invert diagonal block */
318     w = ba + 4 * diag_offset[i];
319     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
320     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
321   }
322 
323   PetscCall(PetscFree(rtmp));
324   PetscCall(ISRestoreIndices(isicol, &ic));
325   PetscCall(ISRestoreIndices(isrow, &r));
326 
327   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
328   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
329   C->assembled           = PETSC_TRUE;
330 
331   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
332   PetscFunctionReturn(PETSC_SUCCESS);
333 }
334 /*
335       Version for when blocks are 2 by 2 Using natural ordering
336 */
MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo * info)337 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
338 {
339   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
340   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
341   PetscInt       *ajtmpold, *ajtmp, nz, row;
342   PetscInt       *ai = a->i, *aj = a->j, *pj;
343   const PetscInt *diag_offset;
344   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
345   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
346   MatScalar      *ba = b->a, *aa = a->a;
347   PetscReal       shift = info->shiftamount;
348   PetscBool       allowzeropivot, zeropivotdetected;
349 
350   PetscFunctionBegin;
351   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
352   A->factortype = MAT_FACTOR_NONE;
353   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
354   A->factortype  = MAT_FACTOR_ILU;
355   allowzeropivot = PetscNot(A->erroriffailure);
356   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
357   for (i = 0; i < n; i++) {
358     nz    = bi[i + 1] - bi[i];
359     ajtmp = bj + bi[i];
360     for (j = 0; j < nz; j++) {
361       x    = rtmp + 4 * ajtmp[j];
362       x[0] = x[1] = x[2] = x[3] = 0.0;
363     }
364     /* load in initial (unfactored row) */
365     nz       = ai[i + 1] - ai[i];
366     ajtmpold = aj + ai[i];
367     v        = aa + 4 * ai[i];
368     for (j = 0; j < nz; j++) {
369       x    = rtmp + 4 * ajtmpold[j];
370       x[0] = v[0];
371       x[1] = v[1];
372       x[2] = v[2];
373       x[3] = v[3];
374       v += 4;
375     }
376     row = *ajtmp++;
377     while (row < i) {
378       pc = rtmp + 4 * row;
379       p1 = pc[0];
380       p2 = pc[1];
381       p3 = pc[2];
382       p4 = pc[3];
383       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
384         pv    = ba + 4 * diag_offset[row];
385         pj    = bj + diag_offset[row] + 1;
386         x1    = pv[0];
387         x2    = pv[1];
388         x3    = pv[2];
389         x4    = pv[3];
390         pc[0] = m1 = p1 * x1 + p3 * x2;
391         pc[1] = m2 = p2 * x1 + p4 * x2;
392         pc[2] = m3 = p1 * x3 + p3 * x4;
393         pc[3] = m4 = p2 * x3 + p4 * x4;
394         nz         = bi[row + 1] - diag_offset[row] - 1;
395         pv += 4;
396         for (j = 0; j < nz; j++) {
397           x1 = pv[0];
398           x2 = pv[1];
399           x3 = pv[2];
400           x4 = pv[3];
401           x  = rtmp + 4 * pj[j];
402           x[0] -= m1 * x1 + m3 * x2;
403           x[1] -= m2 * x1 + m4 * x2;
404           x[2] -= m1 * x3 + m3 * x4;
405           x[3] -= m2 * x3 + m4 * x4;
406           pv += 4;
407         }
408         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
409       }
410       row = *ajtmp++;
411     }
412     /* finished row so stick it into b->a */
413     pv = ba + 4 * bi[i];
414     pj = bj + bi[i];
415     nz = bi[i + 1] - bi[i];
416     for (j = 0; j < nz; j++) {
417       x     = rtmp + 4 * pj[j];
418       pv[0] = x[0];
419       pv[1] = x[1];
420       pv[2] = x[2];
421       pv[3] = x[3];
422       /*
423       printf(" col %d:",pj[j]);
424       PetscInt j1;
425       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
426       printf("\n");
427       */
428       pv += 4;
429     }
430     /* invert diagonal block */
431     w = ba + 4 * diag_offset[i];
432     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
433     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
434   }
435 
436   PetscCall(PetscFree(rtmp));
437 
438   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
439   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
440   C->assembled           = PETSC_TRUE;
441 
442   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
443   PetscFunctionReturn(PETSC_SUCCESS);
444 }
445 
446 /*
447      Version for when blocks are 1 by 1.
448 */
MatLUFactorNumeric_SeqBAIJ_1(Mat B,Mat A,const MatFactorInfo * info)449 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
450 {
451   Mat              C = B;
452   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
453   IS               isrow = b->row, isicol = b->icol;
454   const PetscInt  *r, *ic, *ics;
455   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
456   PetscInt         i, j, k, nz, nzL, row, *pj;
457   const PetscInt  *ajtmp, *bjtmp;
458   MatScalar       *rtmp, *pc, multiplier, *pv;
459   const MatScalar *aa = a->a, *v;
460   PetscBool        row_identity, col_identity;
461   FactorShiftCtx   sctx;
462   const PetscInt  *ddiag;
463   PetscReal        rs;
464   MatScalar        d;
465 
466   PetscFunctionBegin;
467   /* MatPivotSetUp(): initialize shift context sctx */
468   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
469 
470   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
471     PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &ddiag, NULL));
472     sctx.shift_top = info->zeropivot;
473     for (i = 0; i < n; i++) {
474       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
475       d  = (aa)[ddiag[i]];
476       rs = -PetscAbsScalar(d) - PetscRealPart(d);
477       v  = aa + ai[i];
478       nz = ai[i + 1] - ai[i];
479       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
480       if (rs > sctx.shift_top) sctx.shift_top = rs;
481     }
482     sctx.shift_top *= 1.1;
483     sctx.nshift_max = 5;
484     sctx.shift_lo   = 0.;
485     sctx.shift_hi   = 1.;
486   }
487 
488   PetscCall(ISGetIndices(isrow, &r));
489   PetscCall(ISGetIndices(isicol, &ic));
490   PetscCall(PetscMalloc1(n + 1, &rtmp));
491   ics = ic;
492 
493   do {
494     sctx.newshift = PETSC_FALSE;
495     for (i = 0; i < n; i++) {
496       /* zero rtmp */
497       /* L part */
498       nz    = bi[i + 1] - bi[i];
499       bjtmp = bj + bi[i];
500       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
501 
502       /* U part */
503       nz    = bdiag[i] - bdiag[i + 1];
504       bjtmp = bj + bdiag[i + 1] + 1;
505       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
506 
507       /* load in initial (unfactored row) */
508       nz    = ai[r[i] + 1] - ai[r[i]];
509       ajtmp = aj + ai[r[i]];
510       v     = aa + ai[r[i]];
511       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
512 
513       /* ZeropivotApply() */
514       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
515 
516       /* elimination */
517       bjtmp = bj + bi[i];
518       row   = *bjtmp++;
519       nzL   = bi[i + 1] - bi[i];
520       for (k = 0; k < nzL; k++) {
521         pc = rtmp + row;
522         if (*pc != (PetscScalar)0.0) {
523           pv         = b->a + bdiag[row];
524           multiplier = *pc * (*pv);
525           *pc        = multiplier;
526 
527           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
528           pv = b->a + bdiag[row + 1] + 1;
529           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
530           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
531           PetscCall(PetscLogFlops(2.0 * nz));
532         }
533         row = *bjtmp++;
534       }
535 
536       /* finished row so stick it into b->a */
537       rs = 0.0;
538       /* L part */
539       pv = b->a + bi[i];
540       pj = b->j + bi[i];
541       nz = bi[i + 1] - bi[i];
542       for (j = 0; j < nz; j++) {
543         pv[j] = rtmp[pj[j]];
544         rs += PetscAbsScalar(pv[j]);
545       }
546 
547       /* U part */
548       pv = b->a + bdiag[i + 1] + 1;
549       pj = b->j + bdiag[i + 1] + 1;
550       nz = bdiag[i] - bdiag[i + 1] - 1;
551       for (j = 0; j < nz; j++) {
552         pv[j] = rtmp[pj[j]];
553         rs += PetscAbsScalar(pv[j]);
554       }
555 
556       sctx.rs = rs;
557       sctx.pv = rtmp[i];
558       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
559       if (sctx.newshift) break; /* break for-loop */
560       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
561 
562       /* Mark diagonal and invert diagonal for simpler triangular solves */
563       pv  = b->a + bdiag[i];
564       *pv = (PetscScalar)1.0 / rtmp[i];
565 
566     } /* endof for (i=0; i<n; i++) { */
567 
568     /* MatPivotRefine() */
569     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
570       /*
571        * if no shift in this attempt & shifting & started shifting & can refine,
572        * then try lower shift
573        */
574       sctx.shift_hi       = sctx.shift_fraction;
575       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
576       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
577       sctx.newshift       = PETSC_TRUE;
578       sctx.nshift++;
579     }
580   } while (sctx.newshift);
581 
582   PetscCall(PetscFree(rtmp));
583   PetscCall(ISRestoreIndices(isicol, &ic));
584   PetscCall(ISRestoreIndices(isrow, &r));
585 
586   PetscCall(ISIdentity(isrow, &row_identity));
587   PetscCall(ISIdentity(isicol, &col_identity));
588   if (row_identity && col_identity) {
589     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
590     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
591     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
592     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
593   } else {
594     C->ops->solve          = MatSolve_SeqBAIJ_1;
595     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
596   }
597   C->assembled = PETSC_TRUE;
598   PetscCall(PetscLogFlops(C->cmap->n));
599 
600   /* MatShiftView(A,info,&sctx) */
601   if (sctx.nshift) {
602     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
603       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
604     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
605       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
606     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
607       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
608     }
609   }
610   PetscFunctionReturn(PETSC_SUCCESS);
611 }
612 
MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C,Mat A,const MatFactorInfo * info)613 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
614 {
615   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
616   IS              isrow = b->row, isicol = b->icol;
617   const PetscInt *r, *ic;
618   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
619   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
620   PetscInt        diag, *pj;
621   const PetscInt *diag_offset;
622   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
623   MatScalar      *ba = b->a, *aa = a->a;
624   PetscBool       row_identity, col_identity;
625 
626   PetscFunctionBegin;
627   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
628   A->factortype = MAT_FACTOR_NONE;
629   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
630   A->factortype = MAT_FACTOR_ILU;
631   PetscCall(ISGetIndices(isrow, &r));
632   PetscCall(ISGetIndices(isicol, &ic));
633   PetscCall(PetscMalloc1(n + 1, &rtmp));
634 
635   for (i = 0; i < n; i++) {
636     nz    = bi[i + 1] - bi[i];
637     ajtmp = bj + bi[i];
638     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
639 
640     /* load in initial (unfactored row) */
641     nz       = ai[r[i] + 1] - ai[r[i]];
642     ajtmpold = aj + ai[r[i]];
643     v        = aa + ai[r[i]];
644     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
645 
646     row = *ajtmp++;
647     while (row < i) {
648       pc = rtmp + row;
649       if (*pc != 0.0) {
650         pv         = ba + diag_offset[row];
651         pj         = bj + diag_offset[row] + 1;
652         multiplier = *pc * *pv++;
653         *pc        = multiplier;
654         nz         = bi[row + 1] - diag_offset[row] - 1;
655         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
656         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
657       }
658       row = *ajtmp++;
659     }
660     /* finished row so stick it into b->a */
661     pv = ba + bi[i];
662     pj = bj + bi[i];
663     nz = bi[i + 1] - bi[i];
664     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
665     diag = diag_offset[i] - bi[i];
666     /* check pivot entry for current row */
667     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
668     pv[diag] = 1.0 / pv[diag];
669   }
670 
671   PetscCall(PetscFree(rtmp));
672   PetscCall(ISRestoreIndices(isicol, &ic));
673   PetscCall(ISRestoreIndices(isrow, &r));
674   PetscCall(ISIdentity(isrow, &row_identity));
675   PetscCall(ISIdentity(isicol, &col_identity));
676   if (row_identity && col_identity) {
677     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
678     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
679   } else {
680     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
681     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
682   }
683   C->assembled = PETSC_TRUE;
684   PetscCall(PetscLogFlops(C->cmap->n));
685   PetscFunctionReturn(PETSC_SUCCESS);
686 }
687 
MatFactorGetSolverType_petsc(Mat A,MatSolverType * type)688 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
689 {
690   PetscFunctionBegin;
691   *type = MATSOLVERPETSC;
692   PetscFunctionReturn(PETSC_SUCCESS);
693 }
694 
MatGetFactor_seqbaij_petsc(Mat A,MatFactorType ftype,Mat * B)695 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
696 {
697   PetscInt n = A->rmap->n;
698 
699   PetscFunctionBegin;
700   PetscCheck(!PetscDefined(USE_COMPLEX) || A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
701   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
702   PetscCall(MatSetSizes(*B, n, n, n, n));
703   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
704     PetscCall(MatSetType(*B, MATSEQBAIJ));
705 
706     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
707     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
708     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
709     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
710     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
711   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
712     PetscCall(MatSetType(*B, MATSEQSBAIJ));
713     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
714 
715     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
716     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
717     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
718     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
719     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
720   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
721   (*B)->factortype     = ftype;
722   (*B)->canuseordering = PETSC_TRUE;
723 
724   PetscCall(PetscFree((*B)->solvertype));
725   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
726   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
727   PetscFunctionReturn(PETSC_SUCCESS);
728 }
729 
MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,const MatFactorInfo * info)730 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
731 {
732   Mat C;
733 
734   PetscFunctionBegin;
735   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
736   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
737   PetscCall(MatLUFactorNumeric(C, A, info));
738 
739   A->ops->solve          = C->ops->solve;
740   A->ops->solvetranspose = C->ops->solvetranspose;
741 
742   PetscCall(MatHeaderMerge(A, &C));
743   PetscFunctionReturn(PETSC_SUCCESS);
744 }
745 
746 #include <../src/mat/impls/sbaij/seq/sbaij.h>
MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C,Mat A,const MatFactorInfo * info)747 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
748 {
749   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
750   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
751   IS              ip = b->row;
752   const PetscInt *rip;
753   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
754   PetscInt       *ai = a->i, *aj = a->j;
755   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
756   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
757   PetscReal       rs;
758   FactorShiftCtx  sctx;
759 
760   PetscFunctionBegin;
761   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
762     if (!a->sbaijMat) {
763       A->symmetric = PETSC_BOOL3_TRUE;
764       if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
765       PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
766     }
767     PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info));
768     PetscCall(MatDestroy(&a->sbaijMat));
769     PetscFunctionReturn(PETSC_SUCCESS);
770   }
771 
772   /* MatPivotSetUp(): initialize shift context sctx */
773   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
774 
775   PetscCall(ISGetIndices(ip, &rip));
776   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
777 
778   sctx.shift_amount = 0.;
779   sctx.nshift       = 0;
780   do {
781     sctx.newshift = PETSC_FALSE;
782     for (i = 0; i < mbs; i++) {
783       rtmp[i] = 0.0;
784       jl[i]   = mbs;
785       il[0]   = 0;
786     }
787 
788     for (k = 0; k < mbs; k++) {
789       bval = ba + bi[k];
790       /* initialize k-th row by the perm[k]-th row of A */
791       jmin = ai[rip[k]];
792       jmax = ai[rip[k] + 1];
793       for (j = jmin; j < jmax; j++) {
794         col = rip[aj[j]];
795         if (col >= k) { /* only take upper triangular entry */
796           rtmp[col] = aa[j];
797           *bval++   = 0.0; /* for in-place factorization */
798         }
799       }
800 
801       /* shift the diagonal of the matrix */
802       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
803 
804       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
805       dk = rtmp[k];
806       i  = jl[k]; /* first row to be added to k_th row  */
807 
808       while (i < k) {
809         nexti = jl[i]; /* next row to be added to k_th row */
810 
811         /* compute multiplier, update diag(k) and U(i,k) */
812         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
813         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
814         dk += uikdi * ba[ili];
815         ba[ili] = uikdi; /* -U(i,k) */
816 
817         /* add multiple of row i to k-th row */
818         jmin = ili + 1;
819         jmax = bi[i + 1];
820         if (jmin < jmax) {
821           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
822           /* update il and jl for row i */
823           il[i] = jmin;
824           j     = bj[jmin];
825           jl[i] = jl[j];
826           jl[j] = i;
827         }
828         i = nexti;
829       }
830 
831       /* shift the diagonals when zero pivot is detected */
832       /* compute rs=sum of abs(off-diagonal) */
833       rs   = 0.0;
834       jmin = bi[k] + 1;
835       nz   = bi[k + 1] - jmin;
836       if (nz) {
837         bcol = bj + jmin;
838         while (nz--) {
839           rs += PetscAbsScalar(rtmp[*bcol]);
840           bcol++;
841         }
842       }
843 
844       sctx.rs = rs;
845       sctx.pv = dk;
846       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
847       if (sctx.newshift) break;
848       dk = sctx.pv;
849 
850       /* copy data into U(k,:) */
851       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
852       jmin      = bi[k] + 1;
853       jmax      = bi[k + 1];
854       if (jmin < jmax) {
855         for (j = jmin; j < jmax; j++) {
856           col       = bj[j];
857           ba[j]     = rtmp[col];
858           rtmp[col] = 0.0;
859         }
860         /* add the k-th row into il and jl */
861         il[k] = jmin;
862         i     = bj[jmin];
863         jl[k] = jl[i];
864         jl[i] = k;
865       }
866     }
867   } while (sctx.newshift);
868   PetscCall(PetscFree3(rtmp, il, jl));
869 
870   PetscCall(ISRestoreIndices(ip, &rip));
871 
872   C->assembled    = PETSC_TRUE;
873   C->preallocated = PETSC_TRUE;
874 
875   PetscCall(PetscLogFlops(C->rmap->N));
876   if (sctx.nshift) {
877     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
878       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
879     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
880       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
881     }
882   }
883   PetscFunctionReturn(PETSC_SUCCESS);
884 }
885 
MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo * info)886 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
887 {
888   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
889   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
890   PetscInt       i, j, am = a->mbs;
891   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
892   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
893   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
894   PetscReal      rs;
895   FactorShiftCtx sctx;
896 
897   PetscFunctionBegin;
898   /* MatPivotSetUp(): initialize shift context sctx */
899   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
900 
901   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
902 
903   do {
904     sctx.newshift = PETSC_FALSE;
905     for (i = 0; i < am; i++) {
906       rtmp[i] = 0.0;
907       jl[i]   = am;
908       il[0]   = 0;
909     }
910 
911     for (k = 0; k < am; k++) {
912       /* initialize k-th row with elements nonzero in row perm(k) of A */
913       nz   = ai[k + 1] - ai[k];
914       acol = aj + ai[k];
915       aval = aa + ai[k];
916       bval = ba + bi[k];
917       while (nz--) {
918         if (*acol < k) { /* skip lower triangular entries */
919           acol++;
920           aval++;
921         } else {
922           rtmp[*acol++] = *aval++;
923           *bval++       = 0.0; /* for in-place factorization */
924         }
925       }
926 
927       /* shift the diagonal of the matrix */
928       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
929 
930       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
931       dk = rtmp[k];
932       i  = jl[k]; /* first row to be added to k_th row  */
933 
934       while (i < k) {
935         nexti = jl[i]; /* next row to be added to k_th row */
936         /* compute multiplier, update D(k) and U(i,k) */
937         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
938         uikdi = -ba[ili] * ba[bi[i]];
939         dk += uikdi * ba[ili];
940         ba[ili] = uikdi; /* -U(i,k) */
941 
942         /* add multiple of row i to k-th row ... */
943         jmin = ili + 1;
944         nz   = bi[i + 1] - jmin;
945         if (nz > 0) {
946           bcol = bj + jmin;
947           bval = ba + jmin;
948           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
949           /* update il and jl for i-th row */
950           il[i] = jmin;
951           j     = bj[jmin];
952           jl[i] = jl[j];
953           jl[j] = i;
954         }
955         i = nexti;
956       }
957 
958       /* shift the diagonals when zero pivot is detected */
959       /* compute rs=sum of abs(off-diagonal) */
960       rs   = 0.0;
961       jmin = bi[k] + 1;
962       nz   = bi[k + 1] - jmin;
963       if (nz) {
964         bcol = bj + jmin;
965         while (nz--) {
966           rs += PetscAbsScalar(rtmp[*bcol]);
967           bcol++;
968         }
969       }
970 
971       sctx.rs = rs;
972       sctx.pv = dk;
973       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
974       if (sctx.newshift) break; /* sctx.shift_amount is updated */
975       dk = sctx.pv;
976 
977       /* copy data into U(k,:) */
978       ba[bi[k]] = 1.0 / dk;
979       jmin      = bi[k] + 1;
980       nz        = bi[k + 1] - jmin;
981       if (nz) {
982         bcol = bj + jmin;
983         bval = ba + jmin;
984         while (nz--) {
985           *bval++       = rtmp[*bcol];
986           rtmp[*bcol++] = 0.0;
987         }
988         /* add k-th row into il and jl */
989         il[k] = jmin;
990         i     = bj[jmin];
991         jl[k] = jl[i];
992         jl[i] = k;
993       }
994     }
995   } while (sctx.newshift);
996   PetscCall(PetscFree3(rtmp, il, jl));
997 
998   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
999   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1000   C->assembled           = PETSC_TRUE;
1001   C->preallocated        = PETSC_TRUE;
1002 
1003   PetscCall(PetscLogFlops(C->rmap->N));
1004   if (sctx.nshift) {
1005     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1006       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1007     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1008       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1009     }
1010   }
1011   PetscFunctionReturn(PETSC_SUCCESS);
1012 }
1013 
1014 #include <petscbt.h>
1015 #include <../src/mat/utils/freespace.h>
MatICCFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo * info)1016 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1017 {
1018   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1019   Mat_SeqSBAIJ      *b;
1020   Mat                B;
1021   PetscBool          perm_identity;
1022   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1023   const PetscInt    *rip;
1024   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1025   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1026   PetscReal          fill = info->fill, levels = info->levels;
1027   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1028   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1029   PetscBT            lnkbt;
1030   PetscBool          diagDense;
1031   const PetscInt    *adiag;
1032 
1033   PetscFunctionBegin;
1034   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense));
1035   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1036 
1037   if (bs > 1) {
1038     if (!a->sbaijMat) {
1039       A->symmetric = PETSC_BOOL3_TRUE;
1040       if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
1041       PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1042     }
1043     fact->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1044 
1045     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1046     PetscFunctionReturn(PETSC_SUCCESS);
1047   }
1048 
1049   PetscCall(ISIdentity(perm, &perm_identity));
1050   PetscCall(ISGetIndices(perm, &rip));
1051 
1052   /* special case that simply copies fill pattern */
1053   if (!levels && perm_identity) {
1054     PetscCall(PetscMalloc1(am + 1, &ui));
1055     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - adiag[i]; /* ui: rowlengths - changes when !perm_identity */
1056     B = fact;
1057     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1058 
1059     b  = (Mat_SeqSBAIJ *)B->data;
1060     uj = b->j;
1061     for (i = 0; i < am; i++) {
1062       aj = a->j + adiag[i];
1063       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1064       b->ilen[i] = ui[i];
1065     }
1066     PetscCall(PetscFree(ui));
1067 
1068     B->factortype = MAT_FACTOR_NONE;
1069 
1070     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1071     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1072     B->factortype = MAT_FACTOR_ICC;
1073 
1074     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1075     PetscFunctionReturn(PETSC_SUCCESS);
1076   }
1077 
1078   /* initialization */
1079   PetscCall(PetscMalloc1(am + 1, &ui));
1080   ui[0] = 0;
1081   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1082 
1083   /* jl: linked list for storing indices of the pivot rows
1084      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1085   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1086   for (i = 0; i < am; i++) {
1087     jl[i] = am;
1088     il[i] = 0;
1089   }
1090 
1091   /* create and initialize a linked list for storing column indices of the active row k */
1092   nlnk = am + 1;
1093   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1094 
1095   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1096   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1097 
1098   current_space = free_space;
1099 
1100   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1101   current_space_lvl = free_space_lvl;
1102 
1103   for (k = 0; k < am; k++) { /* for each active row k */
1104     /* initialize lnk by the column indices of row rip[k] of A */
1105     nzk         = 0;
1106     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1107     ncols_upper = 0;
1108     cols        = cols_lvl + am;
1109     for (j = 0; j < ncols; j++) {
1110       i = rip[*(aj + ai[rip[k]] + j)];
1111       if (i >= k) { /* only take upper triangular entry */
1112         cols[ncols_upper]     = i;
1113         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1114         ncols_upper++;
1115       }
1116     }
1117     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1118     nzk += nlnk;
1119 
1120     /* update lnk by computing fill-in for each pivot row to be merged in */
1121     prow = jl[k]; /* 1st pivot row */
1122 
1123     while (prow < k) {
1124       nextprow = jl[prow];
1125 
1126       /* merge prow into k-th row */
1127       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1128       jmax  = ui[prow + 1];
1129       ncols = jmax - jmin;
1130       i     = jmin - ui[prow];
1131       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1132       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1133       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1134       nzk += nlnk;
1135 
1136       /* update il and jl for prow */
1137       if (jmin < jmax) {
1138         il[prow] = jmin;
1139 
1140         j        = *cols;
1141         jl[prow] = jl[j];
1142         jl[j]    = prow;
1143       }
1144       prow = nextprow;
1145     }
1146 
1147     /* if free space is not available, make more free space */
1148     if (current_space->local_remaining < nzk) {
1149       i = am - k + 1;                                                             /* num of unfactored rows */
1150       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1151       PetscCall(PetscFreeSpaceGet(i, &current_space));
1152       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1153       reallocs++;
1154     }
1155 
1156     /* copy data into free_space and free_space_lvl, then initialize lnk */
1157     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1158 
1159     /* add the k-th row into il and jl */
1160     if (nzk - 1 > 0) {
1161       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1162       jl[k] = jl[i];
1163       jl[i] = k;
1164       il[k] = ui[k] + 1;
1165     }
1166     uj_ptr[k]     = current_space->array;
1167     uj_lvl_ptr[k] = current_space_lvl->array;
1168 
1169     current_space->array += nzk;
1170     current_space->local_used += nzk;
1171     current_space->local_remaining -= nzk;
1172 
1173     current_space_lvl->array += nzk;
1174     current_space_lvl->local_used += nzk;
1175     current_space_lvl->local_remaining -= nzk;
1176 
1177     ui[k + 1] = ui[k] + nzk;
1178   }
1179 
1180   PetscCall(ISRestoreIndices(perm, &rip));
1181   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1182   PetscCall(PetscFree(cols_lvl));
1183 
1184   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1185   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1186   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1187   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1188   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1189 
1190   /* put together the new matrix in MATSEQSBAIJ format */
1191   B = fact;
1192   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1193 
1194   b          = (Mat_SeqSBAIJ *)B->data;
1195   b->free_a  = PETSC_TRUE;
1196   b->free_ij = PETSC_TRUE;
1197 
1198   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1199 
1200   b->j             = uj;
1201   b->i             = ui;
1202   b->diag          = NULL;
1203   b->ilen          = NULL;
1204   b->imax          = NULL;
1205   b->row           = perm;
1206   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1207 
1208   PetscCall(PetscObjectReference((PetscObject)perm));
1209 
1210   b->icol = perm;
1211 
1212   PetscCall(PetscObjectReference((PetscObject)perm));
1213   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1214 
1215   b->maxnz = b->nz = ui[am];
1216 
1217   B->info.factor_mallocs   = reallocs;
1218   B->info.fill_ratio_given = fill;
1219   if (ai[am] != 0.) {
1220     /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1221     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1222   } else {
1223     B->info.fill_ratio_needed = 0.0;
1224   }
1225 #if defined(PETSC_USE_INFO)
1226   if (ai[am] != 0) {
1227     PetscReal af = B->info.fill_ratio_needed;
1228     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1229     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1230     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1231   } else {
1232     PetscCall(PetscInfo(A, "Empty matrix\n"));
1233   }
1234 #endif
1235   if (perm_identity) {
1236     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1237     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1238     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1239   } else {
1240     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1241   }
1242   PetscFunctionReturn(PETSC_SUCCESS);
1243 }
1244 
MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo * info)1245 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1246 {
1247   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1248   Mat_SeqSBAIJ      *b;
1249   Mat                B;
1250   PetscBool          perm_identity;
1251   PetscReal          fill = info->fill;
1252   const PetscInt    *rip;
1253   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1254   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1255   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1256   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1257   PetscBT            lnkbt;
1258   PetscBool          diagDense;
1259 
1260   PetscFunctionBegin;
1261   if (bs > 1) { /* convert to seqsbaij */
1262     if (!a->sbaijMat) {
1263       A->symmetric = PETSC_BOOL3_TRUE;
1264       if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
1265       PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1266     }
1267     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1268 
1269     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1270     PetscFunctionReturn(PETSC_SUCCESS);
1271   }
1272   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, NULL, &diagDense));
1273   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1274 
1275   /* check whether perm is the identity mapping */
1276   PetscCall(ISIdentity(perm, &perm_identity));
1277   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1278   PetscCall(ISGetIndices(perm, &rip));
1279 
1280   /* initialization */
1281   PetscCall(PetscMalloc1(mbs + 1, &ui));
1282   ui[0] = 0;
1283 
1284   /* jl: linked list for storing indices of the pivot rows
1285      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1286   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1287   for (i = 0; i < mbs; i++) {
1288     jl[i] = mbs;
1289     il[i] = 0;
1290   }
1291 
1292   /* create and initialize a linked list for storing column indices of the active row k */
1293   nlnk = mbs + 1;
1294   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1295 
1296   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1297   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1298 
1299   current_space = free_space;
1300 
1301   for (k = 0; k < mbs; k++) { /* for each active row k */
1302     /* initialize lnk by the column indices of row rip[k] of A */
1303     nzk         = 0;
1304     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1305     ncols_upper = 0;
1306     for (j = 0; j < ncols; j++) {
1307       i = rip[*(aj + ai[rip[k]] + j)];
1308       if (i >= k) { /* only take upper triangular entry */
1309         cols[ncols_upper] = i;
1310         ncols_upper++;
1311       }
1312     }
1313     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1314     nzk += nlnk;
1315 
1316     /* update lnk by computing fill-in for each pivot row to be merged in */
1317     prow = jl[k]; /* 1st pivot row */
1318 
1319     while (prow < k) {
1320       nextprow = jl[prow];
1321       /* merge prow into k-th row */
1322       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1323       jmax   = ui[prow + 1];
1324       ncols  = jmax - jmin;
1325       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1326       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1327       nzk += nlnk;
1328 
1329       /* update il and jl for prow */
1330       if (jmin < jmax) {
1331         il[prow] = jmin;
1332         j        = *uj_ptr;
1333         jl[prow] = jl[j];
1334         jl[j]    = prow;
1335       }
1336       prow = nextprow;
1337     }
1338 
1339     /* if free space is not available, make more free space */
1340     if (current_space->local_remaining < nzk) {
1341       i = mbs - k + 1;                                                            /* num of unfactored rows */
1342       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1343       PetscCall(PetscFreeSpaceGet(i, &current_space));
1344       reallocs++;
1345     }
1346 
1347     /* copy data into free space, then initialize lnk */
1348     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1349 
1350     /* add the k-th row into il and jl */
1351     if (nzk - 1 > 0) {
1352       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1353       jl[k] = jl[i];
1354       jl[i] = k;
1355       il[k] = ui[k] + 1;
1356     }
1357     ui_ptr[k] = current_space->array;
1358     current_space->array += nzk;
1359     current_space->local_used += nzk;
1360     current_space->local_remaining -= nzk;
1361 
1362     ui[k + 1] = ui[k] + nzk;
1363   }
1364 
1365   PetscCall(ISRestoreIndices(perm, &rip));
1366   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1367 
1368   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1369   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1370   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1371   PetscCall(PetscLLDestroy(lnk, lnkbt));
1372 
1373   /* put together the new matrix in MATSEQSBAIJ format */
1374   B = fact;
1375   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1376 
1377   b          = (Mat_SeqSBAIJ *)B->data;
1378   b->free_a  = PETSC_TRUE;
1379   b->free_ij = PETSC_TRUE;
1380 
1381   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1382 
1383   b->j             = uj;
1384   b->i             = ui;
1385   b->diag          = NULL;
1386   b->ilen          = NULL;
1387   b->imax          = NULL;
1388   b->row           = perm;
1389   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1390 
1391   PetscCall(PetscObjectReference((PetscObject)perm));
1392   b->icol = perm;
1393   PetscCall(PetscObjectReference((PetscObject)perm));
1394   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1395   b->maxnz = b->nz = ui[mbs];
1396 
1397   B->info.factor_mallocs   = reallocs;
1398   B->info.fill_ratio_given = fill;
1399   if (ai[mbs] != 0.) {
1400     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1401     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1402   } else {
1403     B->info.fill_ratio_needed = 0.0;
1404   }
1405 #if defined(PETSC_USE_INFO)
1406   if (ai[mbs] != 0.) {
1407     PetscReal af = B->info.fill_ratio_needed;
1408     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1409     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1410     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1411   } else {
1412     PetscCall(PetscInfo(A, "Empty matrix\n"));
1413   }
1414 #endif
1415   if (perm_identity) {
1416     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1417   } else {
1418     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1419   }
1420   PetscFunctionReturn(PETSC_SUCCESS);
1421 }
1422 
MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)1423 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1424 {
1425   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1426   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1427   PetscInt           i, k, n                       = a->mbs;
1428   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1429   const MatScalar   *aa = a->a, *v;
1430   PetscScalar       *x, *s, *t, *ls;
1431   const PetscScalar *b;
1432 
1433   PetscFunctionBegin;
1434   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1435   PetscCall(VecGetArrayRead(bb, &b));
1436   PetscCall(VecGetArray(xx, &x));
1437   t = a->solve_work;
1438 
1439   /* forward solve the lower triangular */
1440   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1441 
1442   for (i = 1; i < n; i++) {
1443     v  = aa + bs2 * ai[i];
1444     vi = aj + ai[i];
1445     nz = ai[i + 1] - ai[i];
1446     s  = t + bs * i;
1447     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1448     for (k = 0; k < nz; k++) {
1449       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1450       v += bs2;
1451     }
1452   }
1453 
1454   /* backward solve the upper triangular */
1455   ls = a->solve_work + A->cmap->n;
1456   for (i = n - 1; i >= 0; i--) {
1457     v  = aa + bs2 * (adiag[i + 1] + 1);
1458     vi = aj + adiag[i + 1] + 1;
1459     nz = adiag[i] - adiag[i + 1] - 1;
1460     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1461     for (k = 0; k < nz; k++) {
1462       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1463       v += bs2;
1464     }
1465     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1466     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1467   }
1468 
1469   PetscCall(VecRestoreArrayRead(bb, &b));
1470   PetscCall(VecRestoreArray(xx, &x));
1471   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1472   PetscFunctionReturn(PETSC_SUCCESS);
1473 }
1474 
MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)1475 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1476 {
1477   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1478   IS                 iscol = a->col, isrow = a->row;
1479   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1480   PetscInt           i, m, n = a->mbs;
1481   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1482   const MatScalar   *aa = a->a, *v;
1483   PetscScalar       *x, *s, *t, *ls;
1484   const PetscScalar *b;
1485 
1486   PetscFunctionBegin;
1487   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1488   PetscCall(VecGetArrayRead(bb, &b));
1489   PetscCall(VecGetArray(xx, &x));
1490   t = a->solve_work;
1491 
1492   PetscCall(ISGetIndices(isrow, &rout));
1493   r = rout;
1494   PetscCall(ISGetIndices(iscol, &cout));
1495   c = cout;
1496 
1497   /* forward solve the lower triangular */
1498   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1499   for (i = 1; i < n; i++) {
1500     v  = aa + bs2 * ai[i];
1501     vi = aj + ai[i];
1502     nz = ai[i + 1] - ai[i];
1503     s  = t + bs * i;
1504     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1505     for (m = 0; m < nz; m++) {
1506       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1507       v += bs2;
1508     }
1509   }
1510 
1511   /* backward solve the upper triangular */
1512   ls = a->solve_work + A->cmap->n;
1513   for (i = n - 1; i >= 0; i--) {
1514     v  = aa + bs2 * (adiag[i + 1] + 1);
1515     vi = aj + adiag[i + 1] + 1;
1516     nz = adiag[i] - adiag[i + 1] - 1;
1517     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1518     for (m = 0; m < nz; m++) {
1519       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1520       v += bs2;
1521     }
1522     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1523     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1524   }
1525   PetscCall(ISRestoreIndices(isrow, &rout));
1526   PetscCall(ISRestoreIndices(iscol, &cout));
1527   PetscCall(VecRestoreArrayRead(bb, &b));
1528   PetscCall(VecRestoreArray(xx, &x));
1529   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1530   PetscFunctionReturn(PETSC_SUCCESS);
1531 }
1532