/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/* ------------------------------------------------------------*/
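/*
   These kernels factor one block row at a time: the sparse row is scattered
   into the dense work array rtmp (one 16-entry 4x4 block per block column,
   stored column-major), eliminated against every previously factored row
   appearing in its L part, gathered back into b->a, and its diagonal block
   is inverted in place so the triangular solves reduce to block multiplies.
*/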
/*
      Version for when blocks are 4 by 4
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info) {
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS              isrow = b->row, isicol = b->icol;
  const PetscInt *r, *ic;
  PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
  PetscInt       *ajtmpold, *ajtmp, nz, row;
  PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
  MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
  MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
  MatScalar       p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
  MatScalar       m13, m14, m15, m16;
  MatScalar      *ba = b->a, *aa = a->a;
  PetscBool       pivotinblocks = b->pivotinblocks;
  PetscReal       shift         = info->shiftamount;
  PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  allowzeropivot = PetscNot(A->erroriffailure);

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    ajtmp = bj + bi[i];
    for (j = 0; j < nz; j++) {
      x    = rtmp + 16 * ajtmp[j];
      x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx + 1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16 * ai[idx];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 16 * ic[ajtmpold[j]];
      x[0]  = v[0];
      x[1]  = v[1];
      x[2]  = v[2];
      x[3]  = v[3];
      x[4]  = v[4];
      x[5]  = v[5];
      x[6]  = v[6];
      x[7]  = v[7];
      x[8]  = v[8];
      x[9]  = v[9];
      x[10] = v[10];
      x[11] = v[11];
      x[12] = v[12];
      x[13] = v[13];
      x[14] = v[14];
      x[15] = v[15];
      v += 16;
    }
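    /* eliminate using the already-factored block rows listed in this row's L
       part; ajtmp walks the factored column indices in increasing order, so
       the loop below stops once it reaches the diagonal */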
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16 * row;
      p1  = pc[0];
      p2  = pc[1];
      p3  = pc[2];
      p4  = pc[3];
      p5  = pc[4];
      p6  = pc[5];
      p7  = pc[6];
      p8  = pc[7];
      p9  = pc[8];
      p10 = pc[9];
      p11 = pc[10];
      p12 = pc[11];
      p13 = pc[12];
      p14 = pc[13];
      p15 = pc[14];
      p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
        pv    = ba + 16 * diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];
        x2    = pv[1];
        x3    = pv[2];
        x4    = pv[3];
        x5    = pv[4];
        x6    = pv[5];
        x7    = pv[6];
        x8    = pv[7];
        x9    = pv[8];
        x10   = pv[9];
        x11   = pv[10];
        x12   = pv[11];
        x13   = pv[12];
        x14   = pv[13];
        x15   = pv[14];
        x16   = pv[15];
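        /* form the multiplier L(i,row) = L(i,row) * inv(D(row)); the diagonal
           block D(row) was inverted when row was factored, and blocks are
           stored column-major, so pc[0..3] below is column 0 of the product */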
        pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
        pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
        pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
        pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;

        pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
        pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
        pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
        pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;

        pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
        pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
        pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
        pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;

        pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
        pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
        pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
        pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;

        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          x1  = pv[0];
          x2  = pv[1];
          x3  = pv[2];
          x4  = pv[3];
          x5  = pv[4];
          x6  = pv[5];
          x7  = pv[6];
          x8  = pv[7];
          x9  = pv[8];
          x10 = pv[9];
          x11 = pv[10];
          x12 = pv[11];
          x13 = pv[12];
          x14 = pv[13];
          x15 = pv[14];
          x16 = pv[15];
          x   = rtmp + 16 * pj[j];
          x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
          x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
          x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
          x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;

          x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
          x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
          x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
          x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;

          x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
          x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
          x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
          x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;

          x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
          x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
          x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
          x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;

          pv += 16;
        }
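        /* 112 flops to form the 4x4 multiplier (16 length-4 dot products),
           plus 128 flops (multiply and subtract) for each updated block */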
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) {
      x      = rtmp + 16 * pj[j];
      pv[0]  = x[0];
      pv[1]  = x[1];
      pv[2]  = x[2];
      pv[3]  = x[3];
      pv[4]  = x[4];
      pv[5]  = x[5];
      pv[6]  = x[6];
      pv[7]  = x[7];
      pv[8]  = x[8];
      pv[9]  = x[9];
      pv[10] = x[10];
      pv[11] = x[11];
      pv[12] = x[12];
      pv[13] = x[13];
      pv[14] = x[14];
      pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
  }

  PetscCall(PetscFree(rtmp));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
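
/*
   Usage sketch (illustrative only; nothing below is defined in this file):
   these kernels are reached through the generic PETSc factorization
   interface once a MATSEQBAIJ matrix A with block size 4 is assembled.

     Mat           F;
     IS            row, col;
     MatFactorInfo info;

     PetscCall(MatFactorInfoInitialize(&info));
     PetscCall(MatGetOrdering(A, MATORDERINGNATURAL, &row, &col));
     PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &F));
     PetscCall(MatLUFactorSymbolic(F, A, row, col, &info));
     PetscCall(MatLUFactorNumeric(F, A, &info));

   With the natural ordering the *_NaturalOrdering variants below are
   selected; a general ordering dispatches to the permuted versions.
*/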

/* MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       PetscKernel_A_gets_A_times_B()
       PetscKernel_A_gets_A_minus_B_times_C()
       PetscKernel_A_gets_inverse_A()
*/
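/*
   Note the storage layout used here, unlike the *_inplace version above:
   the L part of block row i sits in b->j/b->a at positions bi[i]..bi[i+1]-1,
   while b->diag runs backwards so that U(i,:) occupies positions
   bdiag[i+1]+1..bdiag[i], with the diagonal block stored at bdiag[i].
*/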

PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info) {
  Mat             C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  IS              isrow = b->row, isicol = b->icol;
  const PetscInt *r, *ic;
  PetscInt        i, j, k, nz, nzL, row;
  const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
  const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
  MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
  PetscInt        flg;
  PetscReal       shift;
  PetscBool       allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
  PetscCall(PetscArrayzero(rtmp, bs2 * n));

  for (i = 0; i < n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }

    /* U part */
    nz    = bdiag[i] - bdiag[i + 1];
    bjtmp = bj + bdiag[i + 1] + 1;
    for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }

    /* load in initial (unfactored row) */
    nz    = ai[r[i] + 1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2 * ai[r[i]];
    for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i + 1] - bi[i];
    for (k = 0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2 * row;
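      /* skip the update when the whole 4x4 multiplier block is zero */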
      for (flg = 0, j = 0; j < bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2 * bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));

        pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
        pv = b->a + bs2 * (bdiag[row + 1] + 1);
        nz = bdiag[row] - bdiag[row + 1] - 1; /* number of entries in U(row,:), excluding the diagonal */
        for (j = 0; j < nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2 * pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
          pv += bs2;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2 * bi[i];
    pj = b->j + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2 * bdiag[i];
    pj = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
    PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2 * (bdiag[i + 1] + 1);
    pj = b->j + bdiag[i + 1] + 1;
    nz = bdiag[i] - bdiag[i + 1] - 1;
    for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }
  }

  PetscCall(PetscFree2(rtmp, mwork));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
    Default version for when blocks are 4 by 4, using the natural ordering
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) {
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
  PetscInt    *ajtmpold, *ajtmp, nz, row;
  PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
  MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
  MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
  MatScalar    p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
  MatScalar    m13, m14, m15, m16;
  MatScalar   *ba = b->a, *aa = a->a;
  PetscBool    pivotinblocks = b->pivotinblocks;
  PetscReal    shift         = info->shiftamount;
  PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    ajtmp = bj + bi[i];
    for (j = 0; j < nz; j++) {
      x    = rtmp + 16 * ajtmp[j];
      x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i + 1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16 * ai[i];
    for (j = 0; j < nz; j++) {
      x     = rtmp + 16 * ajtmpold[j];
      x[0]  = v[0];
      x[1]  = v[1];
      x[2]  = v[2];
      x[3]  = v[3];
      x[4]  = v[4];
      x[5]  = v[5];
      x[6]  = v[6];
      x[7]  = v[7];
      x[8]  = v[8];
      x[9]  = v[9];
      x[10] = v[10];
      x[11] = v[11];
      x[12] = v[12];
      x[13] = v[13];
      x[14] = v[14];
      x[15] = v[15];
      v += 16;
    }
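    /* eliminate using previously factored rows; same as the permuted version
       above, minus the r[]/ic[] index indirection */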
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16 * row;
      p1  = pc[0];
      p2  = pc[1];
      p3  = pc[2];
      p4  = pc[3];
      p5  = pc[4];
      p6  = pc[5];
      p7  = pc[6];
      p8  = pc[7];
      p9  = pc[8];
      p10 = pc[9];
      p11 = pc[10];
      p12 = pc[11];
      p13 = pc[12];
      p14 = pc[13];
      p15 = pc[14];
      p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
        pv    = ba + 16 * diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];
        x2    = pv[1];
        x3    = pv[2];
        x4    = pv[3];
        x5    = pv[4];
        x6    = pv[5];
        x7    = pv[6];
        x8    = pv[7];
        x9    = pv[8];
        x10   = pv[9];
        x11   = pv[10];
        x12   = pv[11];
        x13   = pv[12];
        x14   = pv[13];
        x15   = pv[14];
        x16   = pv[15];
        pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
        pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
        pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
        pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;

        pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
        pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
        pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
        pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;

        pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
        pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
        pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
        pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;

        pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
        pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
        pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
        pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
        nz           = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          x1  = pv[0];
          x2  = pv[1];
          x3  = pv[2];
          x4  = pv[3];
          x5  = pv[4];
          x6  = pv[5];
          x7  = pv[6];
          x8  = pv[7];
          x9  = pv[8];
          x10 = pv[9];
          x11 = pv[10];
          x12 = pv[11];
          x13 = pv[12];
          x14 = pv[13];
          x15 = pv[14];
          x16 = pv[15];
          x   = rtmp + 16 * pj[j];
          x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
          x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
          x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
          x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;

          x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
          x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
          x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
          x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;

          x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
          x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
          x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
          x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;

          x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
          x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
          x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
          x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;

          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) {
      x      = rtmp + 16 * pj[j];
      pv[0]  = x[0];
      pv[1]  = x[1];
      pv[2]  = x[2];
      pv[3]  = x[3];
      pv[4]  = x[4];
      pv[5]  = x[5];
      pv[6]  = x[6];
      pv[7]  = x[7];
      pv[8]  = x[8];
      pv[9]  = x[9];
      pv[10] = x[10];
      pv[11] = x[11];
      pv[12] = x[12];
      pv[13] = x[13];
      pv[14] = x[14];
      pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) {
  Mat             C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  PetscInt        i, j, k, nz, nzL, row;
  const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
  const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
  MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
  PetscInt        flg;
  PetscReal       shift;
  PetscBool       allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
  PetscCall(PetscArrayzero(rtmp, bs2 * n));

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i = 0; i < n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }

    /* U part */
    nz    = bdiag[i] - bdiag[i + 1];
    bjtmp = bj + bdiag[i + 1] + 1;
    for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); }

    /* load in initial (unfactored row) */
    nz    = ai[i + 1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2 * ai[i];
    for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i + 1] - bi[i];
    for (k = 0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2 * row;
      for (flg = 0, j = 0; j < bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2 * bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));

        pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
        pv = b->a + bs2 * (bdiag[row + 1] + 1);
        nz = bdiag[row] - bdiag[row + 1] - 1; /* number of entries in U(row,:), excluding the diagonal */
        for (j = 0; j < nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2 * pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
          pv += bs2;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2 * bi[i];
    pj = b->j + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2 * bdiag[i];
    pj = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
    PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2 * (bdiag[i + 1] + 1);
    pj = b->j + bdiag[i + 1] + 1;
    nz = bdiag[i] - bdiag[i + 1] - 1;
    for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); }
  }
  PetscCall(PetscFree2(rtmp, mwork));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE version for when blocks are 4 by 4, using the natural ordering */
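/*
   Note: this path assumes MatScalar is single precision; each 4x4 block is
   sixteen floats, handled as four XMM registers of four packed singles.
*/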
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat          C = B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  int          i, j, n = a->mbs;
  int         *bj = b->j, *bjtmp, *pj;
  int          row;
  int         *ajtmpold, nz, *bi = b->i;
  int         *diag_offset = b->diag, *ai = a->i, *aj = a->j;
  MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar   *ba = b->a, *aa = a->a;
  int          nonzero       = 0;
  PetscBool    pivotinblocks = b->pivotinblocks;
  PetscReal    shift         = info->shiftamount;
  PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
  PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
  /*    if ((unsigned long)bj==(unsigned long)aj) { */
  /*      colscale = 4; */
  /*    } */
  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7, XMM7);
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * bjtmp[j];
      /*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i + 1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16 * ai[i];
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ajtmpold[j];
      /*        x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v, x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /*      row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16 * row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4, XMM7)
      SSE_CMPNEQ_PS(XMM4, XMM0)

      SSE_COPY_PS(XMM5, XMM7)
      SSE_CMPNEQ_PS(XMM5, XMM1)

      SSE_COPY_PS(XMM6, XMM7)
      SSE_CMPNEQ_PS(XMM6, XMM2)

      SSE_CMPNEQ_PS(XMM7, XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6, XMM7)
      SSE_OR_PS(XMM5, XMM4)
      SSE_OR_PS(XMM5, XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero, XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16 * diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv, pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
        SSE_SHUFFLE(XMM4, XMM4, 0x00)
        SSE_MULT_PS(XMM4, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM1)
        SSE_ADD_PS(XMM4, XMM5)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM2)
        SSE_ADD_PS(XMM4, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM4, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
        SSE_SHUFFLE(XMM5, XMM5, 0x00)
        SSE_MULT_PS(XMM5, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM1)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM5, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM3)
        SSE_ADD_PS(XMM5, XMM6)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
        SSE_SHUFFLE(XMM6, XMM6, 0x00)
        SSE_MULT_PS(XMM6, XMM0)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM1)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM2)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM7, XMM3)
        SSE_ADD_PS(XMM6, XMM7)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM0, XMM7)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM1, XMM7)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
        SSE_SHUFFLE(XMM1, XMM1, 0x00)
        SSE_MULT_PS(XMM1, XMM2)
        SSE_ADD_PS(XMM0, XMM1)

        SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
        SSE_SHUFFLE(XMM7, XMM7, 0x00)
        SSE_MULT_PS(XMM3, XMM7)
        SSE_ADD_PS(XMM0, XMM3)

        SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
        SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3, XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2, XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1, XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0, XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16 * pj[j];
          /*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x, pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM0)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM1)
          SSE_SUB_PS(XMM5, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM2)
          SSE_SUB_PS(XMM5, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM3)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM0)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM1)
          SSE_SUB_PS(XMM6, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM2)
          SSE_SUB_PS(XMM6, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM3)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = *bjtmp++;
      /*        row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];

    /* Copy x block back into pv block */
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * pj[j];
      /*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x, pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
    /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
    /* Note: using Cramer's rule, the flop count below may be unfairly high or low */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

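/*
   The *_usj variants read the factor's block-column indices as unsigned
   short ("usj" presumably abbreviates unsigned-short j), halving the index
   bandwidth at the cost of limiting the factor to 65536 block rows.
*/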
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C, const MatFactorInfo *info) {
  Mat             A = C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  int             i, j, n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
  unsigned short *aj = (unsigned short *)(a->j), *ajtmp;
  unsigned int    row;
  int             nz, *bi = b->i;
  int            *diag_offset = b->diag, *ai = a->i;
  MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar      *ba = b->a, *aa = a->a;
  int             nonzero       = 0;
  PetscBool       pivotinblocks = b->pivotinblocks;
  PetscReal       shift         = info->shiftamount;
1101   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
1102 
1103   PetscFunctionBegin;
1104   allowzeropivot = PetscNot(A->erroriffailure);
1105   SSE_SCOPE_BEGIN;
1106 
1107   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
1108   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
1109   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1110   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1111   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1112   /*      colscale = 4; */
1113   /*    } */
1114 
1115   for (i = 0; i < n; i++) {
1116     nz    = bi[i + 1] - bi[i];
1117     bjtmp = bj + bi[i];
1118     /* zero out the 4x4 block accumulators */
1119     /* zero out one register */
1120     XOR_PS(XMM7, XMM7);
1121     for (j = 0; j < nz; j++) {
1122       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1123       /*        x = rtmp+4*bjtmp[j]; */
1124       SSE_INLINE_BEGIN_1(x)
1125       /* Copy zero register to memory locations */
1126       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1127       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1128       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1129       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1130       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1131       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1132       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1133       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1134       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1135       SSE_INLINE_END_1;
1136     }
1137     /* load in initial (unfactored row) */
1138     nz    = ai[i + 1] - ai[i];
1139     ajtmp = aj + ai[i];
1140     v     = aa + 16 * ai[i];
1141     for (j = 0; j < nz; j++) {
1142       x = rtmp + 16 * ((unsigned int)ajtmp[j]);
1143       /*        x = rtmp+colscale*ajtmp[j]; */
1144       /* Copy v block into x block */
1145       SSE_INLINE_BEGIN_2(v, x)
1146       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1147       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1148       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1149 
1150       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1151       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1152 
1153       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1154       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1155 
1156       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1157       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1158 
1159       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1160       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1161 
1162       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1163       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1164 
1165       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1166       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1167 
1168       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1169       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1170       SSE_INLINE_END_2;
1171 
1172       v += 16;
1173     }
1174     /*      row = (*bjtmp++)/4; */
1175     row = (unsigned int)(*bjtmp++);
1176     while (row < i) {
1177       pc = rtmp + 16 * row;
1178       SSE_INLINE_BEGIN_1(pc)
1179       /* Load block from lower triangle */
1180       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1181       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1182       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1183 
1184       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1185       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1186 
1187       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1188       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1189 
1190       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1191       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1192 
1193       /* Compare block to zero block */
1194 
1195       SSE_COPY_PS(XMM4, XMM7)
1196       SSE_CMPNEQ_PS(XMM4, XMM0)
1197 
1198       SSE_COPY_PS(XMM5, XMM7)
1199       SSE_CMPNEQ_PS(XMM5, XMM1)
1200 
1201       SSE_COPY_PS(XMM6, XMM7)
1202       SSE_CMPNEQ_PS(XMM6, XMM2)
1203 
1204       SSE_CMPNEQ_PS(XMM7, XMM3)
1205 
1206       /* Reduce the comparisons to one SSE register */
1207       SSE_OR_PS(XMM6, XMM7)
1208       SSE_OR_PS(XMM5, XMM4)
1209       SSE_OR_PS(XMM5, XMM6)
1210       SSE_INLINE_END_1;
1211 
1212       /* Reduce the one SSE register to an integer register for branching */
1213       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1214       MOVEMASK(nonzero, XMM5);
1215 
1216       /* If block is nonzero ... */
1217       if (nonzero) {
1218         pv = ba + 16 * diag_offset[row];
1219         PREFETCH_L1(&pv[16]);
1220         pj = bj + diag_offset[row] + 1;
1221 
1222         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1223         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1224         /* but the diagonal was inverted already */
1225         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1226 
1227         SSE_INLINE_BEGIN_2(pv, pc)
1228         /* Column 0, product is accumulated in XMM4 */
1229         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1230         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1231         SSE_MULT_PS(XMM4, XMM0)
1232 
1233         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1234         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1235         SSE_MULT_PS(XMM5, XMM1)
1236         SSE_ADD_PS(XMM4, XMM5)
1237 
1238         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1239         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1240         SSE_MULT_PS(XMM6, XMM2)
1241         SSE_ADD_PS(XMM4, XMM6)
1242 
1243         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1244         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1245         SSE_MULT_PS(XMM7, XMM3)
1246         SSE_ADD_PS(XMM4, XMM7)
1247 
1248         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1249         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1250 
1251         /* Column 1, product is accumulated in XMM5 */
1252         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1253         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1254         SSE_MULT_PS(XMM5, XMM0)
1255 
1256         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1257         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1258         SSE_MULT_PS(XMM6, XMM1)
1259         SSE_ADD_PS(XMM5, XMM6)
1260 
1261         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1262         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1263         SSE_MULT_PS(XMM7, XMM2)
1264         SSE_ADD_PS(XMM5, XMM7)
1265 
1266         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1267         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1268         SSE_MULT_PS(XMM6, XMM3)
1269         SSE_ADD_PS(XMM5, XMM6)
1270 
1271         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1272         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1273 
1274         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1275 
1276         /* Column 2, product is accumulated in XMM6 */
1277         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1278         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1279         SSE_MULT_PS(XMM6, XMM0)
1280 
1281         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1282         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1283         SSE_MULT_PS(XMM7, XMM1)
1284         SSE_ADD_PS(XMM6, XMM7)
1285 
1286         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1287         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1288         SSE_MULT_PS(XMM7, XMM2)
1289         SSE_ADD_PS(XMM6, XMM7)
1290 
1291         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1292         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1293         SSE_MULT_PS(XMM7, XMM3)
1294         SSE_ADD_PS(XMM6, XMM7)
1295 
1296         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1297         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1298 
1299         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1300         /* Column 3, product is accumulated in XMM0 */
1301         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1302         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1303         SSE_MULT_PS(XMM0, XMM7)
1304 
1305         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1306         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1307         SSE_MULT_PS(XMM1, XMM7)
1308         SSE_ADD_PS(XMM0, XMM1)
1309 
1310         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1311         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1312         SSE_MULT_PS(XMM1, XMM2)
1313         SSE_ADD_PS(XMM0, XMM1)
1314 
1315         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1316         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1317         SSE_MULT_PS(XMM3, XMM7)
1318         SSE_ADD_PS(XMM0, XMM3)
1319 
1320         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1321         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1322 
1323         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1324         /* This is code to be maintained and read by humans after all. */
1325         /* Copy Multiplier Col 3 into XMM3 */
1326         SSE_COPY_PS(XMM3, XMM0)
1327         /* Copy Multiplier Col 2 into XMM2 */
1328         SSE_COPY_PS(XMM2, XMM6)
1329         /* Copy Multiplier Col 1 into XMM1 */
1330         SSE_COPY_PS(XMM1, XMM5)
1331         /* Copy Multiplier Col 0 into XMM0 */
1332         SSE_COPY_PS(XMM0, XMM4)
1333         SSE_INLINE_END_2;
1334 
1335         /* Update the row: */
1336         nz = bi[row + 1] - diag_offset[row] - 1;
1337         pv += 16;
1338         for (j = 0; j < nz; j++) {
1339           PREFETCH_L1(&pv[16]);
1340           x = rtmp + 16 * ((unsigned int)pj[j]);
1341           /*            x = rtmp + 4*pj[j]; */
1342 
1343           /* X:=X-M*PV, One column at a time */
1344           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1345           SSE_INLINE_BEGIN_2(x, pv)
1346           /* Load First Column of X*/
1347           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1348           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1349 
1350           /* Matrix-Vector Product: */
1351           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1352           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1353           SSE_MULT_PS(XMM5, XMM0)
1354           SSE_SUB_PS(XMM4, XMM5)
1355 
1356           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1357           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1358           SSE_MULT_PS(XMM6, XMM1)
1359           SSE_SUB_PS(XMM4, XMM6)
1360 
1361           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1362           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1363           SSE_MULT_PS(XMM7, XMM2)
1364           SSE_SUB_PS(XMM4, XMM7)
1365 
1366           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1367           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1368           SSE_MULT_PS(XMM5, XMM3)
1369           SSE_SUB_PS(XMM4, XMM5)
1370 
1371           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1372           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1373 
1374           /* Second Column */
1375           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1376           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1377 
1378           /* Matrix-Vector Product: */
1379           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1380           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1381           SSE_MULT_PS(XMM6, XMM0)
1382           SSE_SUB_PS(XMM5, XMM6)
1383 
1384           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1385           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1386           SSE_MULT_PS(XMM7, XMM1)
1387           SSE_SUB_PS(XMM5, XMM7)
1388 
1389           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1390           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1391           SSE_MULT_PS(XMM4, XMM2)
1392           SSE_SUB_PS(XMM5, XMM4)
1393 
1394           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1395           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1396           SSE_MULT_PS(XMM6, XMM3)
1397           SSE_SUB_PS(XMM5, XMM6)
1398 
1399           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1400           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1401 
1402           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1403 
1404           /* Third Column */
1405           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1406           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1407 
1408           /* Matrix-Vector Product: */
1409           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1410           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1411           SSE_MULT_PS(XMM7, XMM0)
1412           SSE_SUB_PS(XMM6, XMM7)
1413 
1414           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1415           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1416           SSE_MULT_PS(XMM4, XMM1)
1417           SSE_SUB_PS(XMM6, XMM4)
1418 
1419           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1420           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1421           SSE_MULT_PS(XMM5, XMM2)
1422           SSE_SUB_PS(XMM6, XMM5)
1423 
1424           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1425           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1426           SSE_MULT_PS(XMM7, XMM3)
1427           SSE_SUB_PS(XMM6, XMM7)
1428 
1429           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1430           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1431 
1432           /* Fourth Column */
1433           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1434           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1435 
1436           /* Matrix-Vector Product: */
1437           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1438           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1439           SSE_MULT_PS(XMM5, XMM0)
1440           SSE_SUB_PS(XMM4, XMM5)
1441 
1442           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1443           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1444           SSE_MULT_PS(XMM6, XMM1)
1445           SSE_SUB_PS(XMM4, XMM6)
1446 
1447           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1448           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1449           SSE_MULT_PS(XMM7, XMM2)
1450           SSE_SUB_PS(XMM4, XMM7)
1451 
1452           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1453           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1454           SSE_MULT_PS(XMM5, XMM3)
1455           SSE_SUB_PS(XMM4, XMM5)
1456 
1457           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1458           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1459           SSE_INLINE_END_2;
1460           pv += 16;
1461         }
1462         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1463       }
1464       row = (unsigned int)(*bjtmp++);
1465       /*        row = (*bjtmp++)/4; */
1466       /*        bjtmp++; */
1467     }
1468     /* finished row so stick it into b->a */
1469     pv = ba + 16 * bi[i];
1470     pj = bj + bi[i];
1471     nz = bi[i + 1] - bi[i];
1472 
1473     /* Copy x block back into pv block */
1474     for (j = 0; j < nz; j++) {
1475       x = rtmp + 16 * ((unsigned int)pj[j]);
1476       /*        x  = rtmp+4*pj[j]; */
1477 
1478       SSE_INLINE_BEGIN_2(x, pv)
1479       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1480       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1481       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1482 
1483       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1484       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1485 
1486       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1487       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1488 
1489       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1490       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1491 
1492       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1493       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1494 
1495       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1496       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1497 
1498       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1499       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1500 
1501       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1502       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1503       SSE_INLINE_END_2;
1504       pv += 16;
1505     }
1506     /* invert diagonal block */
1507     w = ba + 16 * diag_offset[i];
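    /* The 4x4 diagonal block is replaced by its inverse; `shift` is added to combat
       small pivots, and with allowzeropivot a singular block is recorded in
       C->factorerrortype rather than raised as an error */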
1508     if (pivotinblocks) {
1509       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1510       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1511     } else {
1512       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1513     }
    /* Note: since Cramer's rule is used for the 4x4 inverse, the flop count below may be unfairly high or low */
1515   }
1516 
1517   PetscCall(PetscFree(rtmp));
1518 
1519   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1520   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1521   C->assembled           = PETSC_TRUE;
1522 
  /* Flop count from inverting the diagonal blocks (bs = 4, bs2 = 16 for this kernel) */
  PetscCall(PetscLogFlops(1.333333333333 * 4 * 16 * b->mbs));
1525   SSE_SCOPE_END;
1526   PetscFunctionReturn(0);
1527 }
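/* For reference, a minimal scalar sketch of the block kernel the SSE macros above
   implement: blocks are 4x4, stored column-major in chunks of 16 MatScalars, and
   the update is X := X - M*PV. This helper is hypothetical and illustrative only;
   nothing in the factorization calls it. */
static PETSC_UNUSED void BAIJ4_BlockMultSub_Ref(MatScalar *x, const MatScalar *m, const MatScalar *pv)
{
  int c, r, k;
  for (c = 0; c < 4; c++) {   /* columns of the result block */
    for (r = 0; r < 4; r++) { /* rows of the result block */
      MatScalar s = 0.0;
      for (k = 0; k < 4; k++) s += m[4 * k + r] * pv[4 * c + k]; /* (M*PV)(r,c) */
      x[4 * c + r] -= s;
    }
  }
}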
1528 
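/* "usj" variant: the column indices b->j are reinterpreted as unsigned short,
   presumably to halve the index-storage traffic during the sweep; this is only
   valid while the number of block rows fits in 16 bits */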
1529 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info) {
1530   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1531   int             i, j, n = a->mbs;
1532   unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
1533   unsigned int    row;
1534   int            *ajtmpold, nz, *bi = b->i;
1535   int            *diag_offset = b->diag, *ai = a->i, *aj = a->j;
1536   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1537   MatScalar      *ba = b->a, *aa = a->a;
1538   int             nonzero       = 0;
1539   PetscBool       pivotinblocks = b->pivotinblocks;
1540   PetscReal       shift         = info->shiftamount;
1541   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
1542 
1543   PetscFunctionBegin;
1544   allowzeropivot = PetscNot(A->erroriffailure);
1545   SSE_SCOPE_BEGIN;
1546 
1547   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
1548   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
  /*    if ((unsigned long)bj==(unsigned long)aj) { */
  /*      colscale = 4; */
  /*    } */
  /* Hand off to the in-place variant before allocating rtmp so the work array cannot leak */
  if ((unsigned long)bj == (unsigned long)aj) {
    PetscCall(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
    PetscFunctionReturn(0);
  }
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1555 
1556   for (i = 0; i < n; i++) {
1557     nz    = bi[i + 1] - bi[i];
1558     bjtmp = bj + bi[i];
1559     /* zero out the 4x4 block accumulators */
1560     /* zero out one register */
1561     XOR_PS(XMM7, XMM7);
1562     for (j = 0; j < nz; j++) {
1563       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1564       /*        x = rtmp+4*bjtmp[j]; */
1565       SSE_INLINE_BEGIN_1(x)
1566       /* Copy zero register to memory locations */
1567       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1568       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1569       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1570       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1571       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1572       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1573       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1574       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1575       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1576       SSE_INLINE_END_1;
1577     }
1578     /* load in initial (unfactored row) */
1579     nz       = ai[i + 1] - ai[i];
1580     ajtmpold = aj + ai[i];
1581     v        = aa + 16 * ai[i];
1582     for (j = 0; j < nz; j++) {
1583       x = rtmp + 16 * ajtmpold[j];
1584       /*        x = rtmp+colscale*ajtmpold[j]; */
1585       /* Copy v block into x block */
1586       SSE_INLINE_BEGIN_2(v, x)
1587       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1588       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1589       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1590 
1591       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1592       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1593 
1594       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1595       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1596 
1597       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1598       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1599 
1600       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1601       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1602 
1603       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1604       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1605 
1606       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1607       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1608 
1609       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1610       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1611       SSE_INLINE_END_2;
1612 
1613       v += 16;
1614     }
1615     /*      row = (*bjtmp++)/4; */
1616     row = (unsigned int)(*bjtmp++);
1617     while (row < i) {
1618       pc = rtmp + 16 * row;
1619       SSE_INLINE_BEGIN_1(pc)
1620       /* Load block from lower triangle */
1621       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1622       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1623       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1624 
1625       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1626       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1627 
1628       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1629       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1630 
1631       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1632       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1633 
1634       /* Compare block to zero block */
1635 
1636       SSE_COPY_PS(XMM4, XMM7)
1637       SSE_CMPNEQ_PS(XMM4, XMM0)
1638 
1639       SSE_COPY_PS(XMM5, XMM7)
1640       SSE_CMPNEQ_PS(XMM5, XMM1)
1641 
1642       SSE_COPY_PS(XMM6, XMM7)
1643       SSE_CMPNEQ_PS(XMM6, XMM2)
1644 
1645       SSE_CMPNEQ_PS(XMM7, XMM3)
1646 
1647       /* Reduce the comparisons to one SSE register */
1648       SSE_OR_PS(XMM6, XMM7)
1649       SSE_OR_PS(XMM5, XMM4)
1650       SSE_OR_PS(XMM5, XMM6)
1651       SSE_INLINE_END_1;
1652 
1653       /* Reduce the one SSE register to an integer register for branching */
1654       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1655       MOVEMASK(nonzero, XMM5);
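      /* MOVEMASK packs the sign bit of each comparison lane into an int; any set
         bit means some entry of the 4x4 block is nonzero */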
1656 
1657       /* If block is nonzero ... */
1658       if (nonzero) {
1659         pv = ba + 16 * diag_offset[row];
1660         PREFETCH_L1(&pv[16]);
1661         pj = bj + diag_offset[row] + 1;
1662 
1663         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1664         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1665         /* but the diagonal was inverted already */
1666         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
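        /* Scalar view, one result column at a time: newM[:,c] = sum_k oldM[:,k] * inv(L_jj)[k,c] */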
1667 
1668         SSE_INLINE_BEGIN_2(pv, pc)
1669         /* Column 0, product is accumulated in XMM4 */
1670         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1671         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1672         SSE_MULT_PS(XMM4, XMM0)
1673 
1674         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1675         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1676         SSE_MULT_PS(XMM5, XMM1)
1677         SSE_ADD_PS(XMM4, XMM5)
1678 
1679         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1680         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1681         SSE_MULT_PS(XMM6, XMM2)
1682         SSE_ADD_PS(XMM4, XMM6)
1683 
1684         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1685         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1686         SSE_MULT_PS(XMM7, XMM3)
1687         SSE_ADD_PS(XMM4, XMM7)
1688 
1689         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1690         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1691 
1692         /* Column 1, product is accumulated in XMM5 */
1693         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1694         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1695         SSE_MULT_PS(XMM5, XMM0)
1696 
1697         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1698         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1699         SSE_MULT_PS(XMM6, XMM1)
1700         SSE_ADD_PS(XMM5, XMM6)
1701 
1702         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1703         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1704         SSE_MULT_PS(XMM7, XMM2)
1705         SSE_ADD_PS(XMM5, XMM7)
1706 
1707         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1708         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1709         SSE_MULT_PS(XMM6, XMM3)
1710         SSE_ADD_PS(XMM5, XMM6)
1711 
1712         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1713         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1714 
1715         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1716 
1717         /* Column 2, product is accumulated in XMM6 */
1718         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1719         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1720         SSE_MULT_PS(XMM6, XMM0)
1721 
1722         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1723         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1724         SSE_MULT_PS(XMM7, XMM1)
1725         SSE_ADD_PS(XMM6, XMM7)
1726 
1727         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1728         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1729         SSE_MULT_PS(XMM7, XMM2)
1730         SSE_ADD_PS(XMM6, XMM7)
1731 
1732         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1733         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1734         SSE_MULT_PS(XMM7, XMM3)
1735         SSE_ADD_PS(XMM6, XMM7)
1736 
1737         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1738         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1739 
1740         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1741         /* Column 3, product is accumulated in XMM0 */
1742         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1743         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1744         SSE_MULT_PS(XMM0, XMM7)
1745 
1746         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1747         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1748         SSE_MULT_PS(XMM1, XMM7)
1749         SSE_ADD_PS(XMM0, XMM1)
1750 
1751         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1752         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1753         SSE_MULT_PS(XMM1, XMM2)
1754         SSE_ADD_PS(XMM0, XMM1)
1755 
1756         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1757         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1758         SSE_MULT_PS(XMM3, XMM7)
1759         SSE_ADD_PS(XMM0, XMM3)
1760 
1761         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1762         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1763 
1764         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1765         /* This is code to be maintained and read by humans after all. */
1766         /* Copy Multiplier Col 3 into XMM3 */
1767         SSE_COPY_PS(XMM3, XMM0)
1768         /* Copy Multiplier Col 2 into XMM2 */
1769         SSE_COPY_PS(XMM2, XMM6)
1770         /* Copy Multiplier Col 1 into XMM1 */
1771         SSE_COPY_PS(XMM1, XMM5)
1772         /* Copy Multiplier Col 0 into XMM0 */
1773         SSE_COPY_PS(XMM0, XMM4)
1774         SSE_INLINE_END_2;
1775 
1776         /* Update the row: */
1777         nz = bi[row + 1] - diag_offset[row] - 1;
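        /* nz = number of blocks strictly to the right of the diagonal in pivot row `row` */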
1778         pv += 16;
1779         for (j = 0; j < nz; j++) {
1780           PREFETCH_L1(&pv[16]);
1781           x = rtmp + 16 * ((unsigned int)pj[j]);
1782           /*            x = rtmp + 4*pj[j]; */
1783 
1784           /* X:=X-M*PV, One column at a time */
1785           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1786           SSE_INLINE_BEGIN_2(x, pv)
1787           /* Load First Column of X*/
1788           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1789           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1790 
1791           /* Matrix-Vector Product: */
1792           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1793           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1794           SSE_MULT_PS(XMM5, XMM0)
1795           SSE_SUB_PS(XMM4, XMM5)
1796 
1797           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1798           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1799           SSE_MULT_PS(XMM6, XMM1)
1800           SSE_SUB_PS(XMM4, XMM6)
1801 
1802           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1803           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1804           SSE_MULT_PS(XMM7, XMM2)
1805           SSE_SUB_PS(XMM4, XMM7)
1806 
1807           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1808           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1809           SSE_MULT_PS(XMM5, XMM3)
1810           SSE_SUB_PS(XMM4, XMM5)
1811 
1812           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1813           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1814 
1815           /* Second Column */
1816           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1817           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1818 
1819           /* Matrix-Vector Product: */
1820           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1821           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1822           SSE_MULT_PS(XMM6, XMM0)
1823           SSE_SUB_PS(XMM5, XMM6)
1824 
1825           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1826           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1827           SSE_MULT_PS(XMM7, XMM1)
1828           SSE_SUB_PS(XMM5, XMM7)
1829 
1830           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1831           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1832           SSE_MULT_PS(XMM4, XMM2)
1833           SSE_SUB_PS(XMM5, XMM4)
1834 
1835           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1836           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1837           SSE_MULT_PS(XMM6, XMM3)
1838           SSE_SUB_PS(XMM5, XMM6)
1839 
1840           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1841           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1842 
1843           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1844 
1845           /* Third Column */
1846           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1847           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1848 
1849           /* Matrix-Vector Product: */
1850           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1851           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1852           SSE_MULT_PS(XMM7, XMM0)
1853           SSE_SUB_PS(XMM6, XMM7)
1854 
1855           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1856           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1857           SSE_MULT_PS(XMM4, XMM1)
1858           SSE_SUB_PS(XMM6, XMM4)
1859 
1860           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1861           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1862           SSE_MULT_PS(XMM5, XMM2)
1863           SSE_SUB_PS(XMM6, XMM5)
1864 
1865           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1866           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1867           SSE_MULT_PS(XMM7, XMM3)
1868           SSE_SUB_PS(XMM6, XMM7)
1869 
1870           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1871           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1872 
1873           /* Fourth Column */
1874           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1875           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1876 
1877           /* Matrix-Vector Product: */
1878           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1879           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1880           SSE_MULT_PS(XMM5, XMM0)
1881           SSE_SUB_PS(XMM4, XMM5)
1882 
1883           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1884           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1885           SSE_MULT_PS(XMM6, XMM1)
1886           SSE_SUB_PS(XMM4, XMM6)
1887 
1888           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1889           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1890           SSE_MULT_PS(XMM7, XMM2)
1891           SSE_SUB_PS(XMM4, XMM7)
1892 
1893           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1894           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1895           SSE_MULT_PS(XMM5, XMM3)
1896           SSE_SUB_PS(XMM4, XMM5)
1897 
1898           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1899           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1900           SSE_INLINE_END_2;
1901           pv += 16;
1902         }
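        /* Same flop accounting as the kernel above: 128 flops per updated block plus 112 to form the multiplier */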
1903         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1904       }
1905       row = (unsigned int)(*bjtmp++);
1906       /*        row = (*bjtmp++)/4; */
1907       /*        bjtmp++; */
1908     }
1909     /* finished row so stick it into b->a */
1910     pv = ba + 16 * bi[i];
1911     pj = bj + bi[i];
1912     nz = bi[i + 1] - bi[i];
1913 
1914     /* Copy x block back into pv block */
1915     for (j = 0; j < nz; j++) {
1916       x = rtmp + 16 * ((unsigned int)pj[j]);
1917       /*        x  = rtmp+4*pj[j]; */
1918 
1919       SSE_INLINE_BEGIN_2(x, pv)
1920       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1921       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1922       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1923 
1924       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1925       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1926 
1927       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1928       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1929 
1930       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1931       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1932 
1933       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1934       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1935 
1936       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1937       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1938 
1939       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1940       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1941 
1942       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1943       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1944       SSE_INLINE_END_2;
1945       pv += 16;
1946     }
1947     /* invert diagonal block */
1948     w = ba + 16 * diag_offset[i];
1949     if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1951       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1952     } else {
1953       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1954     }
1955     /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
    /* Note: since Cramer's rule is used for the 4x4 inverse, the flop count below may be unfairly high or low */
1957   }
1958 
1959   PetscCall(PetscFree(rtmp));
1960 
1961   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1962   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1963   C->assembled           = PETSC_TRUE;
1964 
  /* Flop count from inverting the diagonal blocks (bs = 4, bs2 = 16 for this kernel) */
  PetscCall(PetscLogFlops(1.333333333333 * 4 * 16 * b->mbs));
1967   SSE_SCOPE_END;
1968   PetscFunctionReturn(0);
1969 }
1970 
1971 #endif
1972