xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision e8c0849ab8fe171bed529bea27238c9b402db591)
1 /*
2     Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 
7 /*
8       Version for when blocks are 4 by 4
9 */
MatILUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo * info)10 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
11 {
12   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13   IS              isrow = b->row, isicol = b->icol;
14   const PetscInt *r, *ic;
15   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16   PetscInt       *ajtmpold, *ajtmp, nz, row;
17   const PetscInt *diag_offset;
18   PetscInt        idx, *ai = a->i, *aj = a->j, *pj;
19   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
20   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
21   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
22   MatScalar       p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
23   MatScalar       m13, m14, m15, m16;
24   MatScalar      *ba = b->a, *aa = a->a;
25   PetscBool       pivotinblocks = b->pivotinblocks;
26   PetscReal       shift         = info->shiftamount;
27   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
28 
29   PetscFunctionBegin;
30   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
31   A->factortype = MAT_FACTOR_NONE;
32   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
33   A->factortype = MAT_FACTOR_ILU;
34   PetscCall(ISGetIndices(isrow, &r));
35   PetscCall(ISGetIndices(isicol, &ic));
36   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
37   allowzeropivot = PetscNot(A->erroriffailure);
38 
39   for (i = 0; i < n; i++) {
40     nz    = bi[i + 1] - bi[i];
41     ajtmp = bj + bi[i];
42     for (j = 0; j < nz; j++) {
43       x    = rtmp + 16 * ajtmp[j];
44       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
45       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
46     }
47     /* load in initial (unfactored row) */
48     idx      = r[i];
49     nz       = ai[idx + 1] - ai[idx];
50     ajtmpold = aj + ai[idx];
51     v        = aa + 16 * ai[idx];
52     for (j = 0; j < nz; j++) {
53       x     = rtmp + 16 * ic[ajtmpold[j]];
54       x[0]  = v[0];
55       x[1]  = v[1];
56       x[2]  = v[2];
57       x[3]  = v[3];
58       x[4]  = v[4];
59       x[5]  = v[5];
60       x[6]  = v[6];
61       x[7]  = v[7];
62       x[8]  = v[8];
63       x[9]  = v[9];
64       x[10] = v[10];
65       x[11] = v[11];
66       x[12] = v[12];
67       x[13] = v[13];
68       x[14] = v[14];
69       x[15] = v[15];
70       v += 16;
71     }
72     row = *ajtmp++;
73     while (row < i) {
74       pc  = rtmp + 16 * row;
75       p1  = pc[0];
76       p2  = pc[1];
77       p3  = pc[2];
78       p4  = pc[3];
79       p5  = pc[4];
80       p6  = pc[5];
81       p7  = pc[6];
82       p8  = pc[7];
83       p9  = pc[8];
84       p10 = pc[9];
85       p11 = pc[10];
86       p12 = pc[11];
87       p13 = pc[12];
88       p14 = pc[13];
89       p15 = pc[14];
90       p16 = pc[15];
91       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
92         pv    = ba + 16 * diag_offset[row];
93         pj    = bj + diag_offset[row] + 1;
94         x1    = pv[0];
95         x2    = pv[1];
96         x3    = pv[2];
97         x4    = pv[3];
98         x5    = pv[4];
99         x6    = pv[5];
100         x7    = pv[6];
101         x8    = pv[7];
102         x9    = pv[8];
103         x10   = pv[9];
104         x11   = pv[10];
105         x12   = pv[11];
106         x13   = pv[12];
107         x14   = pv[13];
108         x15   = pv[14];
109         x16   = pv[15];
110         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
111         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
112         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
113         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
114 
115         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
116         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
117         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
118         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
119 
120         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
121         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
122         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
123         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
124 
125         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
126         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
127         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
128         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
129 
130         nz = bi[row + 1] - diag_offset[row] - 1;
131         pv += 16;
132         for (j = 0; j < nz; j++) {
133           x1  = pv[0];
134           x2  = pv[1];
135           x3  = pv[2];
136           x4  = pv[3];
137           x5  = pv[4];
138           x6  = pv[5];
139           x7  = pv[6];
140           x8  = pv[7];
141           x9  = pv[8];
142           x10 = pv[9];
143           x11 = pv[10];
144           x12 = pv[11];
145           x13 = pv[12];
146           x14 = pv[13];
147           x15 = pv[14];
148           x16 = pv[15];
149           x   = rtmp + 16 * pj[j];
150           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
151           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
152           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
153           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
154 
155           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
156           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
157           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
158           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
159 
160           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
161           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
162           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
163           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
164 
165           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
166           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
167           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
168           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
169 
170           pv += 16;
171         }
172         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
173       }
174       row = *ajtmp++;
175     }
176     /* finished row so stick it into b->a */
177     pv = ba + 16 * bi[i];
178     pj = bj + bi[i];
179     nz = bi[i + 1] - bi[i];
180     for (j = 0; j < nz; j++) {
181       x      = rtmp + 16 * pj[j];
182       pv[0]  = x[0];
183       pv[1]  = x[1];
184       pv[2]  = x[2];
185       pv[3]  = x[3];
186       pv[4]  = x[4];
187       pv[5]  = x[5];
188       pv[6]  = x[6];
189       pv[7]  = x[7];
190       pv[8]  = x[8];
191       pv[9]  = x[9];
192       pv[10] = x[10];
193       pv[11] = x[11];
194       pv[12] = x[12];
195       pv[13] = x[13];
196       pv[14] = x[14];
197       pv[15] = x[15];
198       pv += 16;
199     }
200     /* invert diagonal block */
201     w = ba + 16 * diag_offset[i];
202     if (pivotinblocks) {
203       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
204       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
205     } else {
206       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
207     }
208   }
209 
210   PetscCall(PetscFree(rtmp));
211   PetscCall(ISRestoreIndices(isicol, &ic));
212   PetscCall(ISRestoreIndices(isrow, &r));
213 
214   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
215   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
216   C->assembled           = PETSC_TRUE;
217 
218   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
219   PetscFunctionReturn(PETSC_SUCCESS);
220 }
221 
222 /* MatLUFactorNumeric_SeqBAIJ_4 -
223      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
224        PetscKernel_A_gets_A_times_B()
225        PetscKernel_A_gets_A_minus_B_times_C()
226        PetscKernel_A_gets_inverse_A()
227 */
228 
MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo * info)229 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
230 {
231   Mat             C = B;
232   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
233   IS              isrow = b->row, isicol = b->icol;
234   const PetscInt *r, *ic;
235   PetscInt        i, j, k, nz, nzL, row;
236   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
237   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
238   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
239   PetscInt        flg;
240   PetscReal       shift;
241   PetscBool       allowzeropivot, zeropivotdetected;
242 
243   PetscFunctionBegin;
244   allowzeropivot = PetscNot(A->erroriffailure);
245   PetscCall(ISGetIndices(isrow, &r));
246   PetscCall(ISGetIndices(isicol, &ic));
247 
248   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
249     shift = 0;
250   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
251     shift = info->shiftamount;
252   }
253 
254   /* generate work space needed by the factorization */
255   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
256   PetscCall(PetscArrayzero(rtmp, bs2 * n));
257 
258   for (i = 0; i < n; i++) {
259     /* zero rtmp */
260     /* L part */
261     nz    = bi[i + 1] - bi[i];
262     bjtmp = bj + bi[i];
263     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
264 
265     /* U part */
266     nz    = bdiag[i] - bdiag[i + 1];
267     bjtmp = bj + bdiag[i + 1] + 1;
268     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
269 
270     /* load in initial (unfactored row) */
271     nz    = ai[r[i] + 1] - ai[r[i]];
272     ajtmp = aj + ai[r[i]];
273     v     = aa + bs2 * ai[r[i]];
274     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
275 
276     /* elimination */
277     bjtmp = bj + bi[i];
278     nzL   = bi[i + 1] - bi[i];
279     for (k = 0; k < nzL; k++) {
280       row = bjtmp[k];
281       pc  = rtmp + bs2 * row;
282       for (flg = 0, j = 0; j < bs2; j++) {
283         if (pc[j] != 0.0) {
284           flg = 1;
285           break;
286         }
287       }
288       if (flg) {
289         pv = b->a + bs2 * bdiag[row];
290         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
291         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
292 
293         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
294         pv = b->a + bs2 * (bdiag[row + 1] + 1);
295         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
296         for (j = 0; j < nz; j++) {
297           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
298           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
299           v = rtmp + bs2 * pj[j];
300           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
301           pv += bs2;
302         }
303         PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
304       }
305     }
306 
307     /* finished row so stick it into b->a */
308     /* L part */
309     pv = b->a + bs2 * bi[i];
310     pj = b->j + bi[i];
311     nz = bi[i + 1] - bi[i];
312     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
313 
314     /* Mark diagonal and invert diagonal for simpler triangular solves */
315     pv = b->a + bs2 * bdiag[i];
316     pj = b->j + bdiag[i];
317     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
318     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
319     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
320 
321     /* U part */
322     pv = b->a + bs2 * (bdiag[i + 1] + 1);
323     pj = b->j + bdiag[i + 1] + 1;
324     nz = bdiag[i] - bdiag[i + 1] - 1;
325     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
326   }
327 
328   PetscCall(PetscFree2(rtmp, mwork));
329   PetscCall(ISRestoreIndices(isicol, &ic));
330   PetscCall(ISRestoreIndices(isrow, &r));
331 
332   C->ops->solve          = MatSolve_SeqBAIJ_4;
333   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
334   C->assembled           = PETSC_TRUE;
335 
336   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
337   PetscFunctionReturn(PETSC_SUCCESS);
338 }
339 
MatILUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo * info)340 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
341 {
342   /*
343     Default Version for when blocks are 4 by 4 Using natural ordering
344 */
345   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
346   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
347   PetscInt       *ajtmpold, *ajtmp, nz, row;
348   const PetscInt *diag_offset;
349   PetscInt       *ai = a->i, *aj = a->j, *pj;
350   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
351   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
352   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
353   MatScalar       p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
354   MatScalar       m13, m14, m15, m16;
355   MatScalar      *ba = b->a, *aa = a->a;
356   PetscBool       pivotinblocks = b->pivotinblocks;
357   PetscReal       shift         = info->shiftamount;
358   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
359 
360   PetscFunctionBegin;
361   /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
362   A->factortype = MAT_FACTOR_NONE;
363   PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
364   A->factortype  = MAT_FACTOR_ILU;
365   allowzeropivot = PetscNot(A->erroriffailure);
366   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
367 
368   for (i = 0; i < n; i++) {
369     nz    = bi[i + 1] - bi[i];
370     ajtmp = bj + bi[i];
371     for (j = 0; j < nz; j++) {
372       x    = rtmp + 16 * ajtmp[j];
373       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
374       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
375     }
376     /* load in initial (unfactored row) */
377     nz       = ai[i + 1] - ai[i];
378     ajtmpold = aj + ai[i];
379     v        = aa + 16 * ai[i];
380     for (j = 0; j < nz; j++) {
381       x     = rtmp + 16 * ajtmpold[j];
382       x[0]  = v[0];
383       x[1]  = v[1];
384       x[2]  = v[2];
385       x[3]  = v[3];
386       x[4]  = v[4];
387       x[5]  = v[5];
388       x[6]  = v[6];
389       x[7]  = v[7];
390       x[8]  = v[8];
391       x[9]  = v[9];
392       x[10] = v[10];
393       x[11] = v[11];
394       x[12] = v[12];
395       x[13] = v[13];
396       x[14] = v[14];
397       x[15] = v[15];
398       v += 16;
399     }
400     row = *ajtmp++;
401     while (row < i) {
402       pc  = rtmp + 16 * row;
403       p1  = pc[0];
404       p2  = pc[1];
405       p3  = pc[2];
406       p4  = pc[3];
407       p5  = pc[4];
408       p6  = pc[5];
409       p7  = pc[6];
410       p8  = pc[7];
411       p9  = pc[8];
412       p10 = pc[9];
413       p11 = pc[10];
414       p12 = pc[11];
415       p13 = pc[12];
416       p14 = pc[13];
417       p15 = pc[14];
418       p16 = pc[15];
419       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
420         pv    = ba + 16 * diag_offset[row];
421         pj    = bj + diag_offset[row] + 1;
422         x1    = pv[0];
423         x2    = pv[1];
424         x3    = pv[2];
425         x4    = pv[3];
426         x5    = pv[4];
427         x6    = pv[5];
428         x7    = pv[6];
429         x8    = pv[7];
430         x9    = pv[8];
431         x10   = pv[9];
432         x11   = pv[10];
433         x12   = pv[11];
434         x13   = pv[12];
435         x14   = pv[13];
436         x15   = pv[14];
437         x16   = pv[15];
438         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
439         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
440         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
441         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
442 
443         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
444         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
445         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
446         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
447 
448         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
449         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
450         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
451         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
452 
453         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
454         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
455         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
456         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
457         nz           = bi[row + 1] - diag_offset[row] - 1;
458         pv += 16;
459         for (j = 0; j < nz; j++) {
460           x1  = pv[0];
461           x2  = pv[1];
462           x3  = pv[2];
463           x4  = pv[3];
464           x5  = pv[4];
465           x6  = pv[5];
466           x7  = pv[6];
467           x8  = pv[7];
468           x9  = pv[8];
469           x10 = pv[9];
470           x11 = pv[10];
471           x12 = pv[11];
472           x13 = pv[12];
473           x14 = pv[13];
474           x15 = pv[14];
475           x16 = pv[15];
476           x   = rtmp + 16 * pj[j];
477           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
478           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
479           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
480           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
481 
482           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
483           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
484           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
485           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
486 
487           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
488           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
489           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
490           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
491 
492           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
493           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
494           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
495           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
496 
497           pv += 16;
498         }
499         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
500       }
501       row = *ajtmp++;
502     }
503     /* finished row so stick it into b->a */
504     pv = ba + 16 * bi[i];
505     pj = bj + bi[i];
506     nz = bi[i + 1] - bi[i];
507     for (j = 0; j < nz; j++) {
508       x      = rtmp + 16 * pj[j];
509       pv[0]  = x[0];
510       pv[1]  = x[1];
511       pv[2]  = x[2];
512       pv[3]  = x[3];
513       pv[4]  = x[4];
514       pv[5]  = x[5];
515       pv[6]  = x[6];
516       pv[7]  = x[7];
517       pv[8]  = x[8];
518       pv[9]  = x[9];
519       pv[10] = x[10];
520       pv[11] = x[11];
521       pv[12] = x[12];
522       pv[13] = x[13];
523       pv[14] = x[14];
524       pv[15] = x[15];
525       pv += 16;
526     }
527     /* invert diagonal block */
528     w = ba + 16 * diag_offset[i];
529     if (pivotinblocks) {
530       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
531       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
532     } else {
533       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
534     }
535   }
536 
537   PetscCall(PetscFree(rtmp));
538 
539   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
540   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
541   C->assembled           = PETSC_TRUE;
542 
543   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
544   PetscFunctionReturn(PETSC_SUCCESS);
545 }
546 
547 /*
548   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
549     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
550 */
MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo * info)551 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
552 {
553   Mat             C = B;
554   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
555   PetscInt        i, j, k, nz, nzL, row;
556   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
557   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
558   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
559   PetscInt        flg;
560   PetscReal       shift;
561   PetscBool       allowzeropivot, zeropivotdetected;
562 
563   PetscFunctionBegin;
564   allowzeropivot = PetscNot(A->erroriffailure);
565 
566   /* generate work space needed by the factorization */
567   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
568   PetscCall(PetscArrayzero(rtmp, bs2 * n));
569 
570   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
571     shift = 0;
572   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
573     shift = info->shiftamount;
574   }
575 
576   for (i = 0; i < n; i++) {
577     /* zero rtmp */
578     /* L part */
579     nz    = bi[i + 1] - bi[i];
580     bjtmp = bj + bi[i];
581     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
582 
583     /* U part */
584     nz    = bdiag[i] - bdiag[i + 1];
585     bjtmp = bj + bdiag[i + 1] + 1;
586     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
587 
588     /* load in initial (unfactored row) */
589     nz    = ai[i + 1] - ai[i];
590     ajtmp = aj + ai[i];
591     v     = aa + bs2 * ai[i];
592     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
593 
594     /* elimination */
595     bjtmp = bj + bi[i];
596     nzL   = bi[i + 1] - bi[i];
597     for (k = 0; k < nzL; k++) {
598       row = bjtmp[k];
599       pc  = rtmp + bs2 * row;
600       for (flg = 0, j = 0; j < bs2; j++) {
601         if (pc[j] != 0.0) {
602           flg = 1;
603           break;
604         }
605       }
606       if (flg) {
607         pv = b->a + bs2 * bdiag[row];
608         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
609         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
610 
611         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
612         pv = b->a + bs2 * (bdiag[row + 1] + 1);
613         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
614         for (j = 0; j < nz; j++) {
615           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
616           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
617           v = rtmp + bs2 * pj[j];
618           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
619           pv += bs2;
620         }
621         PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
622       }
623     }
624 
625     /* finished row so stick it into b->a */
626     /* L part */
627     pv = b->a + bs2 * bi[i];
628     pj = b->j + bi[i];
629     nz = bi[i + 1] - bi[i];
630     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
631 
632     /* Mark diagonal and invert diagonal for simpler triangular solves */
633     pv = b->a + bs2 * bdiag[i];
634     pj = b->j + bdiag[i];
635     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
636     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
637     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
638 
639     /* U part */
640     pv = b->a + bs2 * (bdiag[i + 1] + 1);
641     pj = b->j + bdiag[i + 1] + 1;
642     nz = bdiag[i] - bdiag[i + 1] - 1;
643     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
644   }
645   PetscCall(PetscFree2(rtmp, mwork));
646 
647   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
648   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
649   C->assembled           = PETSC_TRUE;
650 
651   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
652   PetscFunctionReturn(PETSC_SUCCESS);
653 }
654