xref: /petsc/src/mat/impls/baij/seq/baijfact13.c (revision e8c0849ab8fe171bed529bea27238c9b402db591) !
1 /*
2     Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 
7 /*
8       Version for when blocks are 3 by 3
9 */
MatILUFactorNumeric_SeqBAIJ_3_inplace(Mat C,Mat A,const MatFactorInfo * info)10 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info)
11 {
12   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13   IS              isrow = b->row, isicol = b->icol;
14   const PetscInt *r, *ic;
15   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
17   const PetscInt *diag_offset;
18   PetscInt        idx, *pj;
19   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
20   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
21   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
22   MatScalar      *ba = b->a, *aa = a->a;
23   PetscReal       shift = info->shiftamount;
24   PetscBool       allowzeropivot, zeropivotdetected;
25 
26   PetscFunctionBegin;
27   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
28   A->factortype = MAT_FACTOR_NONE;
29   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
30   A->factortype = MAT_FACTOR_ILU;
31   PetscCall(ISGetIndices(isrow, &r));
32   PetscCall(ISGetIndices(isicol, &ic));
33   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
34   allowzeropivot = PetscNot(A->erroriffailure);
35 
36   for (i = 0; i < n; i++) {
37     nz    = bi[i + 1] - bi[i];
38     ajtmp = bj + bi[i];
39     for (j = 0; j < nz; j++) {
40       x    = rtmp + 9 * ajtmp[j];
41       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
42     }
43     /* load in initial (unfactored row) */
44     idx      = r[i];
45     nz       = ai[idx + 1] - ai[idx];
46     ajtmpold = aj + ai[idx];
47     v        = aa + 9 * ai[idx];
48     for (j = 0; j < nz; j++) {
49       x    = rtmp + 9 * ic[ajtmpold[j]];
50       x[0] = v[0];
51       x[1] = v[1];
52       x[2] = v[2];
53       x[3] = v[3];
54       x[4] = v[4];
55       x[5] = v[5];
56       x[6] = v[6];
57       x[7] = v[7];
58       x[8] = v[8];
59       v += 9;
60     }
61     row = *ajtmp++;
62     while (row < i) {
63       pc = rtmp + 9 * row;
64       p1 = pc[0];
65       p2 = pc[1];
66       p3 = pc[2];
67       p4 = pc[3];
68       p5 = pc[4];
69       p6 = pc[5];
70       p7 = pc[6];
71       p8 = pc[7];
72       p9 = pc[8];
73       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
74         pv    = ba + 9 * diag_offset[row];
75         pj    = bj + diag_offset[row] + 1;
76         x1    = pv[0];
77         x2    = pv[1];
78         x3    = pv[2];
79         x4    = pv[3];
80         x5    = pv[4];
81         x6    = pv[5];
82         x7    = pv[6];
83         x8    = pv[7];
84         x9    = pv[8];
85         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
86         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
87         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
88 
89         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
90         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
91         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
92 
93         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
94         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
95         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
96         nz         = bi[row + 1] - diag_offset[row] - 1;
97         pv += 9;
98         for (j = 0; j < nz; j++) {
99           x1 = pv[0];
100           x2 = pv[1];
101           x3 = pv[2];
102           x4 = pv[3];
103           x5 = pv[4];
104           x6 = pv[5];
105           x7 = pv[6];
106           x8 = pv[7];
107           x9 = pv[8];
108           x  = rtmp + 9 * pj[j];
109           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
110           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
111           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
112 
113           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
114           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
115           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
116 
117           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
118           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
119           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
120           pv += 9;
121         }
122         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
123       }
124       row = *ajtmp++;
125     }
126     /* finished row so stick it into b->a */
127     pv = ba + 9 * bi[i];
128     pj = bj + bi[i];
129     nz = bi[i + 1] - bi[i];
130     for (j = 0; j < nz; j++) {
131       x     = rtmp + 9 * pj[j];
132       pv[0] = x[0];
133       pv[1] = x[1];
134       pv[2] = x[2];
135       pv[3] = x[3];
136       pv[4] = x[4];
137       pv[5] = x[5];
138       pv[6] = x[6];
139       pv[7] = x[7];
140       pv[8] = x[8];
141       pv += 9;
142     }
143     /* invert diagonal block */
144     w = ba + 9 * diag_offset[i];
145     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
146     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
147   }
148 
149   PetscCall(PetscFree(rtmp));
150   PetscCall(ISRestoreIndices(isicol, &ic));
151   PetscCall(ISRestoreIndices(isrow, &r));
152 
153   C->ops->solve          = MatSolve_SeqBAIJ_3_inplace;
154   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace;
155   C->assembled           = PETSC_TRUE;
156 
157   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
158   PetscFunctionReturn(PETSC_SUCCESS);
159 }
160 
161 /* MatLUFactorNumeric_SeqBAIJ_3 -
162      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
163        PetscKernel_A_gets_A_times_B()
164        PetscKernel_A_gets_A_minus_B_times_C()
165        PetscKernel_A_gets_inverse_A()
166 */
MatLUFactorNumeric_SeqBAIJ_3(Mat B,Mat A,const MatFactorInfo * info)167 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info)
168 {
169   Mat             C = B;
170   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
171   IS              isrow = b->row, isicol = b->icol;
172   const PetscInt *r, *ic;
173   PetscInt        i, j, k, nz, nzL, row;
174   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
175   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
176   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
177   PetscInt        flg;
178   PetscReal       shift = info->shiftamount;
179   PetscBool       allowzeropivot, zeropivotdetected;
180 
181   PetscFunctionBegin;
182   PetscCall(ISGetIndices(isrow, &r));
183   PetscCall(ISGetIndices(isicol, &ic));
184   allowzeropivot = PetscNot(A->erroriffailure);
185 
186   /* generate work space needed by the factorization */
187   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
188   PetscCall(PetscArrayzero(rtmp, bs2 * n));
189 
190   for (i = 0; i < n; i++) {
191     /* zero rtmp */
192     /* L part */
193     nz    = bi[i + 1] - bi[i];
194     bjtmp = bj + bi[i];
195     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
196 
197     /* U part */
198     nz    = bdiag[i] - bdiag[i + 1];
199     bjtmp = bj + bdiag[i + 1] + 1;
200     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
201 
202     /* load in initial (unfactored row) */
203     nz    = ai[r[i] + 1] - ai[r[i]];
204     ajtmp = aj + ai[r[i]];
205     v     = aa + bs2 * ai[r[i]];
206     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
207 
208     /* elimination */
209     bjtmp = bj + bi[i];
210     nzL   = bi[i + 1] - bi[i];
211     for (k = 0; k < nzL; k++) {
212       row = bjtmp[k];
213       pc  = rtmp + bs2 * row;
214       for (flg = 0, j = 0; j < bs2; j++) {
215         if (pc[j] != 0.0) {
216           flg = 1;
217           break;
218         }
219       }
220       if (flg) {
221         pv = b->a + bs2 * bdiag[row];
222         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
223         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
224 
225         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
226         pv = b->a + bs2 * (bdiag[row + 1] + 1);
227         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
228         for (j = 0; j < nz; j++) {
229           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
230           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
231           v = rtmp + bs2 * pj[j];
232           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
233           pv += bs2;
234         }
235         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
236       }
237     }
238 
239     /* finished row so stick it into b->a */
240     /* L part */
241     pv = b->a + bs2 * bi[i];
242     pj = b->j + bi[i];
243     nz = bi[i + 1] - bi[i];
244     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
245 
246     /* Mark diagonal and invert diagonal for simpler triangular solves */
247     pv = b->a + bs2 * bdiag[i];
248     pj = b->j + bdiag[i];
249     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
250     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
251     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
252 
253     /* U part */
254     pj = b->j + bdiag[i + 1] + 1;
255     pv = b->a + bs2 * (bdiag[i + 1] + 1);
256     nz = bdiag[i] - bdiag[i + 1] - 1;
257     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
258   }
259 
260   PetscCall(PetscFree2(rtmp, mwork));
261   PetscCall(ISRestoreIndices(isicol, &ic));
262   PetscCall(ISRestoreIndices(isrow, &r));
263 
264   C->ops->solve          = MatSolve_SeqBAIJ_3;
265   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
266   C->assembled           = PETSC_TRUE;
267 
268   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
269   PetscFunctionReturn(PETSC_SUCCESS);
270 }
271 
MatILUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo * info)272 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
273 {
274   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
275   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
276   PetscInt       *ajtmpold, *ajtmp, nz, row;
277   const PetscInt *diag_offset;
278   PetscInt       *ai = a->i, *aj = a->j, *pj;
279   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
280   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
281   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
282   MatScalar      *ba = b->a, *aa = a->a;
283   PetscReal       shift = info->shiftamount;
284   PetscBool       allowzeropivot, zeropivotdetected;
285 
286   PetscFunctionBegin;
287   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
288   A->factortype = MAT_FACTOR_NONE;
289   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
290   A->factortype = MAT_FACTOR_ILU;
291   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
292   allowzeropivot = PetscNot(A->erroriffailure);
293 
294   for (i = 0; i < n; i++) {
295     nz    = bi[i + 1] - bi[i];
296     ajtmp = bj + bi[i];
297     for (j = 0; j < nz; j++) {
298       x    = rtmp + 9 * ajtmp[j];
299       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
300     }
301     /* load in initial (unfactored row) */
302     nz       = ai[i + 1] - ai[i];
303     ajtmpold = aj + ai[i];
304     v        = aa + 9 * ai[i];
305     for (j = 0; j < nz; j++) {
306       x    = rtmp + 9 * ajtmpold[j];
307       x[0] = v[0];
308       x[1] = v[1];
309       x[2] = v[2];
310       x[3] = v[3];
311       x[4] = v[4];
312       x[5] = v[5];
313       x[6] = v[6];
314       x[7] = v[7];
315       x[8] = v[8];
316       v += 9;
317     }
318     row = *ajtmp++;
319     while (row < i) {
320       pc = rtmp + 9 * row;
321       p1 = pc[0];
322       p2 = pc[1];
323       p3 = pc[2];
324       p4 = pc[3];
325       p5 = pc[4];
326       p6 = pc[5];
327       p7 = pc[6];
328       p8 = pc[7];
329       p9 = pc[8];
330       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
331         pv    = ba + 9 * diag_offset[row];
332         pj    = bj + diag_offset[row] + 1;
333         x1    = pv[0];
334         x2    = pv[1];
335         x3    = pv[2];
336         x4    = pv[3];
337         x5    = pv[4];
338         x6    = pv[5];
339         x7    = pv[6];
340         x8    = pv[7];
341         x9    = pv[8];
342         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
343         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
344         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
345 
346         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
347         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
348         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
349 
350         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
351         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
352         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
353 
354         nz = bi[row + 1] - diag_offset[row] - 1;
355         pv += 9;
356         for (j = 0; j < nz; j++) {
357           x1 = pv[0];
358           x2 = pv[1];
359           x3 = pv[2];
360           x4 = pv[3];
361           x5 = pv[4];
362           x6 = pv[5];
363           x7 = pv[6];
364           x8 = pv[7];
365           x9 = pv[8];
366           x  = rtmp + 9 * pj[j];
367           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
368           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
369           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
370 
371           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
372           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
373           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
374 
375           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
376           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
377           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
378           pv += 9;
379         }
380         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
381       }
382       row = *ajtmp++;
383     }
384     /* finished row so stick it into b->a */
385     pv = ba + 9 * bi[i];
386     pj = bj + bi[i];
387     nz = bi[i + 1] - bi[i];
388     for (j = 0; j < nz; j++) {
389       x     = rtmp + 9 * pj[j];
390       pv[0] = x[0];
391       pv[1] = x[1];
392       pv[2] = x[2];
393       pv[3] = x[3];
394       pv[4] = x[4];
395       pv[5] = x[5];
396       pv[6] = x[6];
397       pv[7] = x[7];
398       pv[8] = x[8];
399       pv += 9;
400     }
401     /* invert diagonal block */
402     w = ba + 9 * diag_offset[i];
403     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
404     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
405   }
406 
407   PetscCall(PetscFree(rtmp));
408 
409   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace;
410   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace;
411   C->assembled           = PETSC_TRUE;
412 
413   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
414   PetscFunctionReturn(PETSC_SUCCESS);
415 }
416 
417 /*
418   MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering -
419     copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace()
420 */
MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B,Mat A,const MatFactorInfo * info)421 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
422 {
423   Mat             C = B;
424   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
425   PetscInt        i, j, k, nz, nzL, row;
426   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
427   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
428   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
429   PetscInt        flg;
430   PetscReal       shift = info->shiftamount;
431   PetscBool       allowzeropivot, zeropivotdetected;
432 
433   PetscFunctionBegin;
434   allowzeropivot = PetscNot(A->erroriffailure);
435 
436   /* generate work space needed by the factorization */
437   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
438   PetscCall(PetscArrayzero(rtmp, bs2 * n));
439 
440   for (i = 0; i < n; i++) {
441     /* zero rtmp */
442     /* L part */
443     nz    = bi[i + 1] - bi[i];
444     bjtmp = bj + bi[i];
445     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
446 
447     /* U part */
448     nz    = bdiag[i] - bdiag[i + 1];
449     bjtmp = bj + bdiag[i + 1] + 1;
450     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
451 
452     /* load in initial (unfactored row) */
453     nz    = ai[i + 1] - ai[i];
454     ajtmp = aj + ai[i];
455     v     = aa + bs2 * ai[i];
456     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
457 
458     /* elimination */
459     bjtmp = bj + bi[i];
460     nzL   = bi[i + 1] - bi[i];
461     for (k = 0; k < nzL; k++) {
462       row = bjtmp[k];
463       pc  = rtmp + bs2 * row;
464       for (flg = 0, j = 0; j < bs2; j++) {
465         if (pc[j] != 0.0) {
466           flg = 1;
467           break;
468         }
469       }
470       if (flg) {
471         pv = b->a + bs2 * bdiag[row];
472         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
473         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
474 
475         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
476         pv = b->a + bs2 * (bdiag[row + 1] + 1);
477         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
478         for (j = 0; j < nz; j++) {
479           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
480           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
481           v = rtmp + bs2 * pj[j];
482           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
483           pv += bs2;
484         }
485         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
486       }
487     }
488 
489     /* finished row so stick it into b->a */
490     /* L part */
491     pv = b->a + bs2 * bi[i];
492     pj = b->j + bi[i];
493     nz = bi[i + 1] - bi[i];
494     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
495 
496     /* Mark diagonal and invert diagonal for simpler triangular solves */
497     pv = b->a + bs2 * bdiag[i];
498     pj = b->j + bdiag[i];
499     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
500     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
501     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
502 
503     /* U part */
504     pv = b->a + bs2 * (bdiag[i + 1] + 1);
505     pj = b->j + bdiag[i + 1] + 1;
506     nz = bdiag[i] - bdiag[i + 1] - 1;
507     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
508   }
509   PetscCall(PetscFree2(rtmp, mwork));
510 
511   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering;
512   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_3_NaturalOrdering;
513   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering;
514   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
515   C->assembled           = PETSC_TRUE;
516 
517   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
518   PetscFunctionReturn(PETSC_SUCCESS);
519 }
520