xref: /petsc/src/mat/impls/baij/seq/baijfact13.c (revision 58d68138c660dfb4e9f5b03334792cd4f2ffd7cc)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <petsc/private/kernels/blockinvert.h>
7 
8 /*
9       Version for when blocks are 3 by 3
10 */
11 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info) {
12   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13   IS              isrow = b->row, isicol = b->icol;
14   const PetscInt *r, *ic;
15   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
17   PetscInt       *diag_offset = b->diag, idx, *pj;
18   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
19   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
20   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
21   MatScalar      *ba = b->a, *aa = a->a;
22   PetscReal       shift = info->shiftamount;
23   PetscBool       allowzeropivot, zeropivotdetected;
24 
25   PetscFunctionBegin;
26   PetscCall(ISGetIndices(isrow, &r));
27   PetscCall(ISGetIndices(isicol, &ic));
28   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
29   allowzeropivot = PetscNot(A->erroriffailure);
30 
31   for (i = 0; i < n; i++) {
32     nz    = bi[i + 1] - bi[i];
33     ajtmp = bj + bi[i];
34     for (j = 0; j < nz; j++) {
35       x    = rtmp + 9 * ajtmp[j];
36       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
37     }
38     /* load in initial (unfactored row) */
39     idx      = r[i];
40     nz       = ai[idx + 1] - ai[idx];
41     ajtmpold = aj + ai[idx];
42     v        = aa + 9 * ai[idx];
43     for (j = 0; j < nz; j++) {
44       x    = rtmp + 9 * ic[ajtmpold[j]];
45       x[0] = v[0];
46       x[1] = v[1];
47       x[2] = v[2];
48       x[3] = v[3];
49       x[4] = v[4];
50       x[5] = v[5];
51       x[6] = v[6];
52       x[7] = v[7];
53       x[8] = v[8];
54       v += 9;
55     }
56     row = *ajtmp++;
57     while (row < i) {
58       pc = rtmp + 9 * row;
59       p1 = pc[0];
60       p2 = pc[1];
61       p3 = pc[2];
62       p4 = pc[3];
63       p5 = pc[4];
64       p6 = pc[5];
65       p7 = pc[6];
66       p8 = pc[7];
67       p9 = pc[8];
68       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
69         pv    = ba + 9 * diag_offset[row];
70         pj    = bj + diag_offset[row] + 1;
71         x1    = pv[0];
72         x2    = pv[1];
73         x3    = pv[2];
74         x4    = pv[3];
75         x5    = pv[4];
76         x6    = pv[5];
77         x7    = pv[6];
78         x8    = pv[7];
79         x9    = pv[8];
80         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
81         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
82         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
83 
84         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
85         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
86         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
87 
88         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
89         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
90         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
91         nz         = bi[row + 1] - diag_offset[row] - 1;
92         pv += 9;
93         for (j = 0; j < nz; j++) {
94           x1 = pv[0];
95           x2 = pv[1];
96           x3 = pv[2];
97           x4 = pv[3];
98           x5 = pv[4];
99           x6 = pv[5];
100           x7 = pv[6];
101           x8 = pv[7];
102           x9 = pv[8];
103           x  = rtmp + 9 * pj[j];
104           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
105           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
106           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
107 
108           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
109           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
110           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
111 
112           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
113           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
114           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
115           pv += 9;
116         }
117         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
118       }
119       row = *ajtmp++;
120     }
121     /* finished row so stick it into b->a */
122     pv = ba + 9 * bi[i];
123     pj = bj + bi[i];
124     nz = bi[i + 1] - bi[i];
125     for (j = 0; j < nz; j++) {
126       x     = rtmp + 9 * pj[j];
127       pv[0] = x[0];
128       pv[1] = x[1];
129       pv[2] = x[2];
130       pv[3] = x[3];
131       pv[4] = x[4];
132       pv[5] = x[5];
133       pv[6] = x[6];
134       pv[7] = x[7];
135       pv[8] = x[8];
136       pv += 9;
137     }
138     /* invert diagonal block */
139     w = ba + 9 * diag_offset[i];
140     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
141     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
142   }
143 
144   PetscCall(PetscFree(rtmp));
145   PetscCall(ISRestoreIndices(isicol, &ic));
146   PetscCall(ISRestoreIndices(isrow, &r));
147 
148   C->ops->solve          = MatSolve_SeqBAIJ_3_inplace;
149   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace;
150   C->assembled           = PETSC_TRUE;
151 
152   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
153   PetscFunctionReturn(0);
154 }
155 
156 /* MatLUFactorNumeric_SeqBAIJ_3 -
157      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
158        PetscKernel_A_gets_A_times_B()
159        PetscKernel_A_gets_A_minus_B_times_C()
160        PetscKernel_A_gets_inverse_A()
161 */
162 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info) {
163   Mat             C = B;
164   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
165   IS              isrow = b->row, isicol = b->icol;
166   const PetscInt *r, *ic;
167   PetscInt        i, j, k, nz, nzL, row;
168   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
169   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
170   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
171   PetscInt        flg;
172   PetscReal       shift = info->shiftamount;
173   PetscBool       allowzeropivot, zeropivotdetected;
174 
175   PetscFunctionBegin;
176   PetscCall(ISGetIndices(isrow, &r));
177   PetscCall(ISGetIndices(isicol, &ic));
178   allowzeropivot = PetscNot(A->erroriffailure);
179 
180   /* generate work space needed by the factorization */
181   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
182   PetscCall(PetscArrayzero(rtmp, bs2 * n));
183 
184   for (i = 0; i < n; i++) {
185     /* zero rtmp */
186     /* L part */
187     nz    = bi[i + 1] - bi[i];
188     bjtmp = bj + bi[i];
189     for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }
190 
191     /* U part */
192     nz    = bdiag[i] - bdiag[i + 1];
193     bjtmp = bj + bdiag[i + 1] + 1;
194     for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }
195 
196     /* load in initial (unfactored row) */
197     nz    = ai[r[i] + 1] - ai[r[i]];
198     ajtmp = aj + ai[r[i]];
199     v     = aa + bs2 * ai[r[i]];
200     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); }
201 
202     /* elimination */
203     bjtmp = bj + bi[i];
204     nzL   = bi[i + 1] - bi[i];
205     for (k = 0; k < nzL; k++) {
206       row = bjtmp[k];
207       pc  = rtmp + bs2 * row;
208       for (flg = 0, j = 0; j < bs2; j++) {
209         if (pc[j] != 0.0) {
210           flg = 1;
211           break;
212         }
213       }
214       if (flg) {
215         pv = b->a + bs2 * bdiag[row];
216         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
217         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
218 
219         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
220         pv = b->a + bs2 * (bdiag[row + 1] + 1);
221         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
222         for (j = 0; j < nz; j++) {
223           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
224           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
225           v = rtmp + bs2 * pj[j];
226           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
227           pv += bs2;
228         }
229         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
230       }
231     }
232 
233     /* finished row so stick it into b->a */
234     /* L part */
235     pv = b->a + bs2 * bi[i];
236     pj = b->j + bi[i];
237     nz = bi[i + 1] - bi[i];
238     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }
239 
240     /* Mark diagonal and invert diagonal for simpler triangular solves */
241     pv = b->a + bs2 * bdiag[i];
242     pj = b->j + bdiag[i];
243     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
244     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
245     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
246 
247     /* U part */
248     pj = b->j + bdiag[i + 1] + 1;
249     pv = b->a + bs2 * (bdiag[i + 1] + 1);
250     nz = bdiag[i] - bdiag[i + 1] - 1;
251     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }
252   }
253 
254   PetscCall(PetscFree2(rtmp, mwork));
255   PetscCall(ISRestoreIndices(isicol, &ic));
256   PetscCall(ISRestoreIndices(isrow, &r));
257 
258   C->ops->solve          = MatSolve_SeqBAIJ_3;
259   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
260   C->assembled           = PETSC_TRUE;
261 
262   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
263   PetscFunctionReturn(0);
264 }
265 
266 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) {
267   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
268   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
269   PetscInt    *ajtmpold, *ajtmp, nz, row;
270   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
271   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
272   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
273   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
274   MatScalar   *ba = b->a, *aa = a->a;
275   PetscReal    shift = info->shiftamount;
276   PetscBool    allowzeropivot, zeropivotdetected;
277 
278   PetscFunctionBegin;
279   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
280   allowzeropivot = PetscNot(A->erroriffailure);
281 
282   for (i = 0; i < n; i++) {
283     nz    = bi[i + 1] - bi[i];
284     ajtmp = bj + bi[i];
285     for (j = 0; j < nz; j++) {
286       x    = rtmp + 9 * ajtmp[j];
287       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
288     }
289     /* load in initial (unfactored row) */
290     nz       = ai[i + 1] - ai[i];
291     ajtmpold = aj + ai[i];
292     v        = aa + 9 * ai[i];
293     for (j = 0; j < nz; j++) {
294       x    = rtmp + 9 * ajtmpold[j];
295       x[0] = v[0];
296       x[1] = v[1];
297       x[2] = v[2];
298       x[3] = v[3];
299       x[4] = v[4];
300       x[5] = v[5];
301       x[6] = v[6];
302       x[7] = v[7];
303       x[8] = v[8];
304       v += 9;
305     }
306     row = *ajtmp++;
307     while (row < i) {
308       pc = rtmp + 9 * row;
309       p1 = pc[0];
310       p2 = pc[1];
311       p3 = pc[2];
312       p4 = pc[3];
313       p5 = pc[4];
314       p6 = pc[5];
315       p7 = pc[6];
316       p8 = pc[7];
317       p9 = pc[8];
318       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
319         pv    = ba + 9 * diag_offset[row];
320         pj    = bj + diag_offset[row] + 1;
321         x1    = pv[0];
322         x2    = pv[1];
323         x3    = pv[2];
324         x4    = pv[3];
325         x5    = pv[4];
326         x6    = pv[5];
327         x7    = pv[6];
328         x8    = pv[7];
329         x9    = pv[8];
330         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
331         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
332         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
333 
334         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
335         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
336         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
337 
338         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
339         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
340         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
341 
342         nz = bi[row + 1] - diag_offset[row] - 1;
343         pv += 9;
344         for (j = 0; j < nz; j++) {
345           x1 = pv[0];
346           x2 = pv[1];
347           x3 = pv[2];
348           x4 = pv[3];
349           x5 = pv[4];
350           x6 = pv[5];
351           x7 = pv[6];
352           x8 = pv[7];
353           x9 = pv[8];
354           x  = rtmp + 9 * pj[j];
355           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
356           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
357           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
358 
359           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
360           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
361           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
362 
363           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
364           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
365           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
366           pv += 9;
367         }
368         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
369       }
370       row = *ajtmp++;
371     }
372     /* finished row so stick it into b->a */
373     pv = ba + 9 * bi[i];
374     pj = bj + bi[i];
375     nz = bi[i + 1] - bi[i];
376     for (j = 0; j < nz; j++) {
377       x     = rtmp + 9 * pj[j];
378       pv[0] = x[0];
379       pv[1] = x[1];
380       pv[2] = x[2];
381       pv[3] = x[3];
382       pv[4] = x[4];
383       pv[5] = x[5];
384       pv[6] = x[6];
385       pv[7] = x[7];
386       pv[8] = x[8];
387       pv += 9;
388     }
389     /* invert diagonal block */
390     w = ba + 9 * diag_offset[i];
391     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
392     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
393   }
394 
395   PetscCall(PetscFree(rtmp));
396 
397   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace;
398   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace;
399   C->assembled           = PETSC_TRUE;
400 
401   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
402   PetscFunctionReturn(0);
403 }
404 
405 /*
406   MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering -
407     copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace()
408 */
409 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) {
410   Mat             C = B;
411   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
412   PetscInt        i, j, k, nz, nzL, row;
413   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
414   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
415   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
416   PetscInt        flg;
417   PetscReal       shift = info->shiftamount;
418   PetscBool       allowzeropivot, zeropivotdetected;
419 
420   PetscFunctionBegin;
421   allowzeropivot = PetscNot(A->erroriffailure);
422 
423   /* generate work space needed by the factorization */
424   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
425   PetscCall(PetscArrayzero(rtmp, bs2 * n));
426 
427   for (i = 0; i < n; i++) {
428     /* zero rtmp */
429     /* L part */
430     nz    = bi[i + 1] - bi[i];
431     bjtmp = bj + bi[i];
432     for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }
433 
434     /* U part */
435     nz    = bdiag[i] - bdiag[i + 1];
436     bjtmp = bj + bdiag[i + 1] + 1;
437     for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }
438 
439     /* load in initial (unfactored row) */
440     nz    = ai[i + 1] - ai[i];
441     ajtmp = aj + ai[i];
442     v     = aa + bs2 * ai[i];
443     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); }
444 
445     /* elimination */
446     bjtmp = bj + bi[i];
447     nzL   = bi[i + 1] - bi[i];
448     for (k = 0; k < nzL; k++) {
449       row = bjtmp[k];
450       pc  = rtmp + bs2 * row;
451       for (flg = 0, j = 0; j < bs2; j++) {
452         if (pc[j] != 0.0) {
453           flg = 1;
454           break;
455         }
456       }
457       if (flg) {
458         pv = b->a + bs2 * bdiag[row];
459         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
460         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
461 
462         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
463         pv = b->a + bs2 * (bdiag[row + 1] + 1);
464         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
465         for (j = 0; j < nz; j++) {
466           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
467           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
468           v = rtmp + bs2 * pj[j];
469           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
470           pv += bs2;
471         }
472         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
473       }
474     }
475 
476     /* finished row so stick it into b->a */
477     /* L part */
478     pv = b->a + bs2 * bi[i];
479     pj = b->j + bi[i];
480     nz = bi[i + 1] - bi[i];
481     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }
482 
483     /* Mark diagonal and invert diagonal for simpler triangular solves */
484     pv = b->a + bs2 * bdiag[i];
485     pj = b->j + bdiag[i];
486     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
487     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
488     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
489 
490     /* U part */
491     pv = b->a + bs2 * (bdiag[i + 1] + 1);
492     pj = b->j + bdiag[i + 1] + 1;
493     nz = bdiag[i] - bdiag[i + 1] - 1;
494     for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }
495   }
496   PetscCall(PetscFree2(rtmp, mwork));
497 
498   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering;
499   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_3_NaturalOrdering;
500   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering;
501   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
502   C->assembled           = PETSC_TRUE;
503 
504   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
505   PetscFunctionReturn(0);
506 }
507