xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision b2ccae6bdc8edea944f1c160ca3b2eb32c69ecb2)
1 /*
2     Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 
7 /*
8       Version for when blocks are 4 by 4
9 */
10 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
11 {
12   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13   IS              isrow = b->row, isicol = b->icol;
14   const PetscInt *r, *ic;
15   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16   PetscInt       *ajtmpold, *ajtmp, nz, row;
17   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
18   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
19   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
20   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
21   MatScalar       p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
22   MatScalar       m13, m14, m15, m16;
23   MatScalar      *ba = b->a, *aa = a->a;
24   PetscBool       pivotinblocks = b->pivotinblocks;
25   PetscReal       shift         = info->shiftamount;
26   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
27 
28   PetscFunctionBegin;
29   PetscCall(ISGetIndices(isrow, &r));
30   PetscCall(ISGetIndices(isicol, &ic));
31   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
32   allowzeropivot = PetscNot(A->erroriffailure);
33 
34   for (i = 0; i < n; i++) {
35     nz    = bi[i + 1] - bi[i];
36     ajtmp = bj + bi[i];
37     for (j = 0; j < nz; j++) {
38       x    = rtmp + 16 * ajtmp[j];
39       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
40       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
41     }
42     /* load in initial (unfactored row) */
43     idx      = r[i];
44     nz       = ai[idx + 1] - ai[idx];
45     ajtmpold = aj + ai[idx];
46     v        = aa + 16 * ai[idx];
47     for (j = 0; j < nz; j++) {
48       x     = rtmp + 16 * ic[ajtmpold[j]];
49       x[0]  = v[0];
50       x[1]  = v[1];
51       x[2]  = v[2];
52       x[3]  = v[3];
53       x[4]  = v[4];
54       x[5]  = v[5];
55       x[6]  = v[6];
56       x[7]  = v[7];
57       x[8]  = v[8];
58       x[9]  = v[9];
59       x[10] = v[10];
60       x[11] = v[11];
61       x[12] = v[12];
62       x[13] = v[13];
63       x[14] = v[14];
64       x[15] = v[15];
65       v += 16;
66     }
67     row = *ajtmp++;
68     while (row < i) {
69       pc  = rtmp + 16 * row;
70       p1  = pc[0];
71       p2  = pc[1];
72       p3  = pc[2];
73       p4  = pc[3];
74       p5  = pc[4];
75       p6  = pc[5];
76       p7  = pc[6];
77       p8  = pc[7];
78       p9  = pc[8];
79       p10 = pc[9];
80       p11 = pc[10];
81       p12 = pc[11];
82       p13 = pc[12];
83       p14 = pc[13];
84       p15 = pc[14];
85       p16 = pc[15];
86       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
87         pv    = ba + 16 * diag_offset[row];
88         pj    = bj + diag_offset[row] + 1;
89         x1    = pv[0];
90         x2    = pv[1];
91         x3    = pv[2];
92         x4    = pv[3];
93         x5    = pv[4];
94         x6    = pv[5];
95         x7    = pv[6];
96         x8    = pv[7];
97         x9    = pv[8];
98         x10   = pv[9];
99         x11   = pv[10];
100         x12   = pv[11];
101         x13   = pv[12];
102         x14   = pv[13];
103         x15   = pv[14];
104         x16   = pv[15];
105         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
106         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
107         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
108         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
109 
110         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
111         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
112         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
113         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
114 
115         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
116         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
117         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
118         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
119 
120         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
121         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
122         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
123         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
124 
125         nz = bi[row + 1] - diag_offset[row] - 1;
126         pv += 16;
127         for (j = 0; j < nz; j++) {
128           x1  = pv[0];
129           x2  = pv[1];
130           x3  = pv[2];
131           x4  = pv[3];
132           x5  = pv[4];
133           x6  = pv[5];
134           x7  = pv[6];
135           x8  = pv[7];
136           x9  = pv[8];
137           x10 = pv[9];
138           x11 = pv[10];
139           x12 = pv[11];
140           x13 = pv[12];
141           x14 = pv[13];
142           x15 = pv[14];
143           x16 = pv[15];
144           x   = rtmp + 16 * pj[j];
145           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
146           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
147           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
148           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
149 
150           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
151           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
152           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
153           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
154 
155           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
156           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
157           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
158           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
159 
160           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
161           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
162           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
163           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
164 
165           pv += 16;
166         }
167         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
168       }
169       row = *ajtmp++;
170     }
171     /* finished row so stick it into b->a */
172     pv = ba + 16 * bi[i];
173     pj = bj + bi[i];
174     nz = bi[i + 1] - bi[i];
175     for (j = 0; j < nz; j++) {
176       x      = rtmp + 16 * pj[j];
177       pv[0]  = x[0];
178       pv[1]  = x[1];
179       pv[2]  = x[2];
180       pv[3]  = x[3];
181       pv[4]  = x[4];
182       pv[5]  = x[5];
183       pv[6]  = x[6];
184       pv[7]  = x[7];
185       pv[8]  = x[8];
186       pv[9]  = x[9];
187       pv[10] = x[10];
188       pv[11] = x[11];
189       pv[12] = x[12];
190       pv[13] = x[13];
191       pv[14] = x[14];
192       pv[15] = x[15];
193       pv += 16;
194     }
195     /* invert diagonal block */
196     w = ba + 16 * diag_offset[i];
197     if (pivotinblocks) {
198       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
199       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
200     } else {
201       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
202     }
203   }
204 
205   PetscCall(PetscFree(rtmp));
206   PetscCall(ISRestoreIndices(isicol, &ic));
207   PetscCall(ISRestoreIndices(isrow, &r));
208 
209   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
210   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
211   C->assembled           = PETSC_TRUE;
212 
213   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
214   PetscFunctionReturn(PETSC_SUCCESS);
215 }
216 
217 /* MatLUFactorNumeric_SeqBAIJ_4 -
218      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
219        PetscKernel_A_gets_A_times_B()
220        PetscKernel_A_gets_A_minus_B_times_C()
221        PetscKernel_A_gets_inverse_A()
222 */
223 
224 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
225 {
226   Mat             C = B;
227   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
228   IS              isrow = b->row, isicol = b->icol;
229   const PetscInt *r, *ic;
230   PetscInt        i, j, k, nz, nzL, row;
231   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
232   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
233   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
234   PetscInt        flg;
235   PetscReal       shift;
236   PetscBool       allowzeropivot, zeropivotdetected;
237 
238   PetscFunctionBegin;
239   allowzeropivot = PetscNot(A->erroriffailure);
240   PetscCall(ISGetIndices(isrow, &r));
241   PetscCall(ISGetIndices(isicol, &ic));
242 
243   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
244     shift = 0;
245   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
246     shift = info->shiftamount;
247   }
248 
249   /* generate work space needed by the factorization */
250   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
251   PetscCall(PetscArrayzero(rtmp, bs2 * n));
252 
253   for (i = 0; i < n; i++) {
254     /* zero rtmp */
255     /* L part */
256     nz    = bi[i + 1] - bi[i];
257     bjtmp = bj + bi[i];
258     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
259 
260     /* U part */
261     nz    = bdiag[i] - bdiag[i + 1];
262     bjtmp = bj + bdiag[i + 1] + 1;
263     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
264 
265     /* load in initial (unfactored row) */
266     nz    = ai[r[i] + 1] - ai[r[i]];
267     ajtmp = aj + ai[r[i]];
268     v     = aa + bs2 * ai[r[i]];
269     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
270 
271     /* elimination */
272     bjtmp = bj + bi[i];
273     nzL   = bi[i + 1] - bi[i];
274     for (k = 0; k < nzL; k++) {
275       row = bjtmp[k];
276       pc  = rtmp + bs2 * row;
277       for (flg = 0, j = 0; j < bs2; j++) {
278         if (pc[j] != 0.0) {
279           flg = 1;
280           break;
281         }
282       }
283       if (flg) {
284         pv = b->a + bs2 * bdiag[row];
285         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
286         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
287 
288         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
289         pv = b->a + bs2 * (bdiag[row + 1] + 1);
290         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
291         for (j = 0; j < nz; j++) {
292           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
293           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
294           v = rtmp + bs2 * pj[j];
295           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
296           pv += bs2;
297         }
298         PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
299       }
300     }
301 
302     /* finished row so stick it into b->a */
303     /* L part */
304     pv = b->a + bs2 * bi[i];
305     pj = b->j + bi[i];
306     nz = bi[i + 1] - bi[i];
307     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
308 
309     /* Mark diagonal and invert diagonal for simpler triangular solves */
310     pv = b->a + bs2 * bdiag[i];
311     pj = b->j + bdiag[i];
312     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
313     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
314     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
315 
316     /* U part */
317     pv = b->a + bs2 * (bdiag[i + 1] + 1);
318     pj = b->j + bdiag[i + 1] + 1;
319     nz = bdiag[i] - bdiag[i + 1] - 1;
320     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
321   }
322 
323   PetscCall(PetscFree2(rtmp, mwork));
324   PetscCall(ISRestoreIndices(isicol, &ic));
325   PetscCall(ISRestoreIndices(isrow, &r));
326 
327   C->ops->solve          = MatSolve_SeqBAIJ_4;
328   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
329   C->assembled           = PETSC_TRUE;
330 
331   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
332   PetscFunctionReturn(PETSC_SUCCESS);
333 }
334 
335 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
336 {
337   /*
338     Default Version for when blocks are 4 by 4 Using natural ordering
339 */
340   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
341   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
342   PetscInt    *ajtmpold, *ajtmp, nz, row;
343   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
344   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
345   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
346   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
347   MatScalar    p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
348   MatScalar    m13, m14, m15, m16;
349   MatScalar   *ba = b->a, *aa = a->a;
350   PetscBool    pivotinblocks = b->pivotinblocks;
351   PetscReal    shift         = info->shiftamount;
352   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;
353 
354   PetscFunctionBegin;
355   allowzeropivot = PetscNot(A->erroriffailure);
356   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
357 
358   for (i = 0; i < n; i++) {
359     nz    = bi[i + 1] - bi[i];
360     ajtmp = bj + bi[i];
361     for (j = 0; j < nz; j++) {
362       x    = rtmp + 16 * ajtmp[j];
363       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
364       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
365     }
366     /* load in initial (unfactored row) */
367     nz       = ai[i + 1] - ai[i];
368     ajtmpold = aj + ai[i];
369     v        = aa + 16 * ai[i];
370     for (j = 0; j < nz; j++) {
371       x     = rtmp + 16 * ajtmpold[j];
372       x[0]  = v[0];
373       x[1]  = v[1];
374       x[2]  = v[2];
375       x[3]  = v[3];
376       x[4]  = v[4];
377       x[5]  = v[5];
378       x[6]  = v[6];
379       x[7]  = v[7];
380       x[8]  = v[8];
381       x[9]  = v[9];
382       x[10] = v[10];
383       x[11] = v[11];
384       x[12] = v[12];
385       x[13] = v[13];
386       x[14] = v[14];
387       x[15] = v[15];
388       v += 16;
389     }
390     row = *ajtmp++;
391     while (row < i) {
392       pc  = rtmp + 16 * row;
393       p1  = pc[0];
394       p2  = pc[1];
395       p3  = pc[2];
396       p4  = pc[3];
397       p5  = pc[4];
398       p6  = pc[5];
399       p7  = pc[6];
400       p8  = pc[7];
401       p9  = pc[8];
402       p10 = pc[9];
403       p11 = pc[10];
404       p12 = pc[11];
405       p13 = pc[12];
406       p14 = pc[13];
407       p15 = pc[14];
408       p16 = pc[15];
409       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
410         pv    = ba + 16 * diag_offset[row];
411         pj    = bj + diag_offset[row] + 1;
412         x1    = pv[0];
413         x2    = pv[1];
414         x3    = pv[2];
415         x4    = pv[3];
416         x5    = pv[4];
417         x6    = pv[5];
418         x7    = pv[6];
419         x8    = pv[7];
420         x9    = pv[8];
421         x10   = pv[9];
422         x11   = pv[10];
423         x12   = pv[11];
424         x13   = pv[12];
425         x14   = pv[13];
426         x15   = pv[14];
427         x16   = pv[15];
428         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
429         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
430         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
431         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
432 
433         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
434         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
435         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
436         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
437 
438         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
439         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
440         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
441         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
442 
443         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
444         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
445         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
446         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
447         nz           = bi[row + 1] - diag_offset[row] - 1;
448         pv += 16;
449         for (j = 0; j < nz; j++) {
450           x1  = pv[0];
451           x2  = pv[1];
452           x3  = pv[2];
453           x4  = pv[3];
454           x5  = pv[4];
455           x6  = pv[5];
456           x7  = pv[6];
457           x8  = pv[7];
458           x9  = pv[8];
459           x10 = pv[9];
460           x11 = pv[10];
461           x12 = pv[11];
462           x13 = pv[12];
463           x14 = pv[13];
464           x15 = pv[14];
465           x16 = pv[15];
466           x   = rtmp + 16 * pj[j];
467           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
468           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
469           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
470           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
471 
472           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
473           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
474           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
475           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
476 
477           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
478           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
479           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
480           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
481 
482           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
483           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
484           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
485           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
486 
487           pv += 16;
488         }
489         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
490       }
491       row = *ajtmp++;
492     }
493     /* finished row so stick it into b->a */
494     pv = ba + 16 * bi[i];
495     pj = bj + bi[i];
496     nz = bi[i + 1] - bi[i];
497     for (j = 0; j < nz; j++) {
498       x      = rtmp + 16 * pj[j];
499       pv[0]  = x[0];
500       pv[1]  = x[1];
501       pv[2]  = x[2];
502       pv[3]  = x[3];
503       pv[4]  = x[4];
504       pv[5]  = x[5];
505       pv[6]  = x[6];
506       pv[7]  = x[7];
507       pv[8]  = x[8];
508       pv[9]  = x[9];
509       pv[10] = x[10];
510       pv[11] = x[11];
511       pv[12] = x[12];
512       pv[13] = x[13];
513       pv[14] = x[14];
514       pv[15] = x[15];
515       pv += 16;
516     }
517     /* invert diagonal block */
518     w = ba + 16 * diag_offset[i];
519     if (pivotinblocks) {
520       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
521       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
522     } else {
523       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
524     }
525   }
526 
527   PetscCall(PetscFree(rtmp));
528 
529   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
530   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
531   C->assembled           = PETSC_TRUE;
532 
533   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
534   PetscFunctionReturn(PETSC_SUCCESS);
535 }
536 
537 /*
538   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
539     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
540 */
541 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
542 {
543   Mat             C = B;
544   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
545   PetscInt        i, j, k, nz, nzL, row;
546   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
547   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
548   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
549   PetscInt        flg;
550   PetscReal       shift;
551   PetscBool       allowzeropivot, zeropivotdetected;
552 
553   PetscFunctionBegin;
554   allowzeropivot = PetscNot(A->erroriffailure);
555 
556   /* generate work space needed by the factorization */
557   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
558   PetscCall(PetscArrayzero(rtmp, bs2 * n));
559 
560   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
561     shift = 0;
562   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
563     shift = info->shiftamount;
564   }
565 
566   for (i = 0; i < n; i++) {
567     /* zero rtmp */
568     /* L part */
569     nz    = bi[i + 1] - bi[i];
570     bjtmp = bj + bi[i];
571     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
572 
573     /* U part */
574     nz    = bdiag[i] - bdiag[i + 1];
575     bjtmp = bj + bdiag[i + 1] + 1;
576     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
577 
578     /* load in initial (unfactored row) */
579     nz    = ai[i + 1] - ai[i];
580     ajtmp = aj + ai[i];
581     v     = aa + bs2 * ai[i];
582     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
583 
584     /* elimination */
585     bjtmp = bj + bi[i];
586     nzL   = bi[i + 1] - bi[i];
587     for (k = 0; k < nzL; k++) {
588       row = bjtmp[k];
589       pc  = rtmp + bs2 * row;
590       for (flg = 0, j = 0; j < bs2; j++) {
591         if (pc[j] != 0.0) {
592           flg = 1;
593           break;
594         }
595       }
596       if (flg) {
597         pv = b->a + bs2 * bdiag[row];
598         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
599         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
600 
601         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
602         pv = b->a + bs2 * (bdiag[row + 1] + 1);
603         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
604         for (j = 0; j < nz; j++) {
605           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
606           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
607           v = rtmp + bs2 * pj[j];
608           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
609           pv += bs2;
610         }
611         PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
612       }
613     }
614 
615     /* finished row so stick it into b->a */
616     /* L part */
617     pv = b->a + bs2 * bi[i];
618     pj = b->j + bi[i];
619     nz = bi[i + 1] - bi[i];
620     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
621 
622     /* Mark diagonal and invert diagonal for simpler triangular solves */
623     pv = b->a + bs2 * bdiag[i];
624     pj = b->j + bdiag[i];
625     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
626     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
627     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
628 
629     /* U part */
630     pv = b->a + bs2 * (bdiag[i + 1] + 1);
631     pj = b->j + bdiag[i + 1] + 1;
632     nz = bdiag[i] - bdiag[i + 1] - 1;
633     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
634   }
635   PetscCall(PetscFree2(rtmp, mwork));
636 
637   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
638   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
639   C->assembled           = PETSC_TRUE;
640 
641   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
642   PetscFunctionReturn(PETSC_SUCCESS);
643 }
644