xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision f13dfd9ea68e0ddeee984e65c377a1819eab8a8a)
1 /*
2     Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 
7 /*
8       Version for when blocks are 4 by 4
9 */
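/*
      Block storage assumed by the hand-unrolled arithmetic in this file (a sketch
      inferred from the indexing, not stated explicitly anywhere here): each 4x4
      block is 16 consecutive MatScalars in column-major order, so for a block
      starting at p, entry (r,c) is p[4*c + r].  For example,

        pc[0] = p1*x1 + p5*x2 + p9*x3 + p13*x4

      in the loop below is exactly (P*X)(0,0) with P = pc and X = pv.  The work
      array rtmp holds one full block row of such 16-entry blocks, indexed by block
      column.
*/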
10 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
11 {
12   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13   IS              isrow = b->row, isicol = b->icol;
14   const PetscInt *r, *ic;
15   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16   PetscInt       *ajtmpold, *ajtmp, nz, row;
17   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
18   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
19   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
20   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
21   MatScalar       p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
22   MatScalar       m13, m14, m15, m16;
23   MatScalar      *ba = b->a, *aa = a->a;
24   PetscBool       pivotinblocks = b->pivotinblocks;
25   PetscReal       shift         = info->shiftamount;
26   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
27 
28   PetscFunctionBegin;
29   PetscCall(ISGetIndices(isrow, &r));
30   PetscCall(ISGetIndices(isicol, &ic));
31   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
32   allowzeropivot = PetscNot(A->erroriffailure);
33 
34   for (i = 0; i < n; i++) {
35     nz    = bi[i + 1] - bi[i];
36     ajtmp = bj + bi[i];
37     for (j = 0; j < nz; j++) {
38       x    = rtmp + 16 * ajtmp[j];
39       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
40       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
41     }
42     /* load in initial (unfactored) row */
43     idx      = r[i];
44     nz       = ai[idx + 1] - ai[idx];
45     ajtmpold = aj + ai[idx];
46     v        = aa + 16 * ai[idx];
47     for (j = 0; j < nz; j++) {
48       x     = rtmp + 16 * ic[ajtmpold[j]];
49       x[0]  = v[0];
50       x[1]  = v[1];
51       x[2]  = v[2];
52       x[3]  = v[3];
53       x[4]  = v[4];
54       x[5]  = v[5];
55       x[6]  = v[6];
56       x[7]  = v[7];
57       x[8]  = v[8];
58       x[9]  = v[9];
59       x[10] = v[10];
60       x[11] = v[11];
61       x[12] = v[12];
62       x[13] = v[13];
63       x[14] = v[14];
64       x[15] = v[15];
65       v += 16;
66     }
67     row = *ajtmp++;
68     while (row < i) {
69       pc  = rtmp + 16 * row;
70       p1  = pc[0];
71       p2  = pc[1];
72       p3  = pc[2];
73       p4  = pc[3];
74       p5  = pc[4];
75       p6  = pc[5];
76       p7  = pc[6];
77       p8  = pc[7];
78       p9  = pc[8];
79       p10 = pc[9];
80       p11 = pc[10];
81       p12 = pc[11];
82       p13 = pc[12];
83       p14 = pc[13];
84       p15 = pc[14];
85       p16 = pc[15];
86       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
87         pv    = ba + 16 * diag_offset[row];
88         pj    = bj + diag_offset[row] + 1;
89         x1    = pv[0];
90         x2    = pv[1];
91         x3    = pv[2];
92         x4    = pv[3];
93         x5    = pv[4];
94         x6    = pv[5];
95         x7    = pv[6];
96         x8    = pv[7];
97         x9    = pv[8];
98         x10   = pv[9];
99         x11   = pv[10];
100         x12   = pv[11];
101         x13   = pv[12];
102         x14   = pv[13];
103         x15   = pv[14];
104         x16   = pv[15];
105         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
106         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
107         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
108         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
109 
110         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
111         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
112         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
113         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
114 
115         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
116         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
117         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
118         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
119 
120         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
121         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
122         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
123         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
124 
125         nz = bi[row + 1] - diag_offset[row] - 1;
126         pv += 16;
127         for (j = 0; j < nz; j++) {
128           x1  = pv[0];
129           x2  = pv[1];
130           x3  = pv[2];
131           x4  = pv[3];
132           x5  = pv[4];
133           x6  = pv[5];
134           x7  = pv[6];
135           x8  = pv[7];
136           x9  = pv[8];
137           x10 = pv[9];
138           x11 = pv[10];
139           x12 = pv[11];
140           x13 = pv[12];
141           x14 = pv[13];
142           x15 = pv[14];
143           x16 = pv[15];
144           x   = rtmp + 16 * pj[j];
145           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
146           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
147           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
148           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
149 
150           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
151           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
152           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
153           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
154 
155           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
156           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
157           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
158           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
159 
160           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
161           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
162           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
163           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
164 
165           pv += 16;
166         }
167         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
168       }
169       row = *ajtmp++;
170     }
171     /* finished row so stick it into b->a */
172     pv = ba + 16 * bi[i];
173     pj = bj + bi[i];
174     nz = bi[i + 1] - bi[i];
175     for (j = 0; j < nz; j++) {
176       x      = rtmp + 16 * pj[j];
177       pv[0]  = x[0];
178       pv[1]  = x[1];
179       pv[2]  = x[2];
180       pv[3]  = x[3];
181       pv[4]  = x[4];
182       pv[5]  = x[5];
183       pv[6]  = x[6];
184       pv[7]  = x[7];
185       pv[8]  = x[8];
186       pv[9]  = x[9];
187       pv[10] = x[10];
188       pv[11] = x[11];
189       pv[12] = x[12];
190       pv[13] = x[13];
191       pv[14] = x[14];
192       pv[15] = x[15];
193       pv += 16;
194     }
195     /* invert diagonal block */
196     w = ba + 16 * diag_offset[i];
197     if (pivotinblocks) {
198       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
199       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
200     } else {
201       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
202     }
203   }
204 
205   PetscCall(PetscFree(rtmp));
206   PetscCall(ISRestoreIndices(isicol, &ic));
207   PetscCall(ISRestoreIndices(isrow, &r));
208 
209   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
210   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
211   C->assembled           = PETSC_TRUE;
212 
213   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
214   PetscFunctionReturn(PETSC_SUCCESS);
215 }
216 
217 /* MatLUFactorNumeric_SeqBAIJ_4 -
218      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
219        PetscKernel_A_gets_A_times_B()
220        PetscKernel_A_gets_A_minus_B_times_C()
221        PetscKernel_A_gets_inverse_A()
222 */
223 
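/*
   What the three 4x4 kernels used below compute on column-major blocks (a rough
   scalar reference sketch; the real versions in <petsc/private/kernels/blockinvert.h>
   are hand-unrolled, and the inverse also handles the zero-pivot shift):

     PetscKernel_A_gets_A_times_B_4(a,b,w):          A <- A*B    (w is a 16-entry scratch copy of A)
     PetscKernel_A_gets_A_minus_B_times_C_4(a,b,c):  A <- A - B*C
     PetscKernel_A_gets_inverse_A_4(a,shift,...):    A <- inv(A)

   The product kernel, for instance, is equivalent to

     for (c = 0; c < 4; c++)
       for (r = 0; r < 4; r++) {
         s = 0.0;
         for (k = 0; k < 4; k++) s += w[4*k + r] * b[4*c + k];
         a[4*c + r] = s;
       }
*/
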
224 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
225 {
226   Mat             C = B;
227   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
228   IS              isrow = b->row, isicol = b->icol;
229   const PetscInt *r, *ic;
230   PetscInt        i, j, k, nz, nzL, row;
231   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
232   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
233   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
234   PetscInt        flg;
235   PetscReal       shift;
236   PetscBool       allowzeropivot, zeropivotdetected;
237 
238   PetscFunctionBegin;
239   allowzeropivot = PetscNot(A->erroriffailure);
240   PetscCall(ISGetIndices(isrow, &r));
241   PetscCall(ISGetIndices(isicol, &ic));
242 
243   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
244     shift = 0;
245   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
246     shift = info->shiftamount;
247   }
248 
249   /* generate work space needed by the factorization */
250   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
251   PetscCall(PetscArrayzero(rtmp, bs2 * n));
252 
253   for (i = 0; i < n; i++) {
254     /* zero rtmp */
255     /* L part */
256     nz    = bi[i + 1] - bi[i];
257     bjtmp = bj + bi[i];
258     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
259 
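    /*
       Layout of the factored matrix used below (inferred from the index arithmetic):
       the L part of block row i occupies positions bi[i] .. bi[i+1]-1 of b->j/b->a,
       while the U part is stored "backwards" with bdiag[i] > bdiag[i+1]; the strictly
       upper blocks of row i sit at positions bdiag[i+1]+1 .. bdiag[i]-1 and the
       diagonal block itself at bdiag[i].  Hence nz = bdiag[i] - bdiag[i+1] below
       counts the diagonal plus the strictly upper blocks of row i.
    */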
260     /* U part */
261     nz    = bdiag[i] - bdiag[i + 1];
262     bjtmp = bj + bdiag[i + 1] + 1;
263     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
264 
265     /* load in initial (unfactored) row */
266     nz    = ai[r[i] + 1] - ai[r[i]];
267     ajtmp = aj + ai[r[i]];
268     v     = aa + bs2 * ai[r[i]];
269     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
270 
271     /* elimination */
272     bjtmp = bj + bi[i];
273     nzL   = bi[i + 1] - bi[i];
274     for (k = 0; k < nzL; k++) {
275       row = bjtmp[k];
276       pc  = rtmp + bs2 * row;
277       for (flg = 0, j = 0; j < bs2; j++) {
278         if (pc[j] != 0.0) {
279           flg = 1;
280           break;
281         }
282       }
283       if (flg) {
284         pv = b->a + bs2 * bdiag[row];
285         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
286         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
287 
288         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
289         pv = b->a + bs2 * (bdiag[row + 1] + 1);
290         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
291         for (j = 0; j < nz; j++) {
292           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
293           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
294           v = rtmp + bs2 * pj[j];
295           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
296           pv += bs2;
297         }
298         PetscCall(PetscLogFlops(128.0 * nz + 112.0)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 = 128*nz + 112 for bs = 4 */
299       }
300     }
301 
302     /* finished row so stick it into b->a */
303     /* L part */
304     pv = b->a + bs2 * bi[i];
305     pj = b->j + bi[i];
306     nz = bi[i + 1] - bi[i];
307     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
308 
309     /* Mark diagonal and invert diagonal for simpler triangular solves */
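    /*
       Storing inv(D_i) instead of D_i lets the triangular solves avoid any per-block
       factorization: roughly, the back-substitution in the companion MatSolve_SeqBAIJ_4
       can compute

         x_i = inv(D_i) * ( y_i - sum_{j > i} U_ij * x_j )

       with one 4x4 matrix-vector product per block row, while the forward sweep with
       the (implicitly unit-diagonal) L part needs no inversion at all.
    */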
310     pv = b->a + bs2 * bdiag[i];
311     pj = b->j + bdiag[i];
312     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
313     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
314     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
315 
316     /* U part */
317     pv = b->a + bs2 * (bdiag[i + 1] + 1);
318     pj = b->j + bdiag[i + 1] + 1;
319     nz = bdiag[i] - bdiag[i + 1] - 1;
320     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
321   }
322 
323   PetscCall(PetscFree2(rtmp, mwork));
324   PetscCall(ISRestoreIndices(isicol, &ic));
325   PetscCall(ISRestoreIndices(isrow, &r));
326 
327   C->ops->solve          = MatSolve_SeqBAIJ_4;
328   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
329   C->assembled           = PETSC_TRUE;
330 
331   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
332   PetscFunctionReturn(PETSC_SUCCESS);
333 }
334 
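/*
   Rough usage sketch (assumptions: A is an assembled MATSEQBAIJ matrix with block
   size 4; PetscCall() error checking omitted).  The routines in this file are not
   called directly; the symbolic factorization installs one of them on the factor
   matrix, and it is reached through MatLUFactorNumeric():

     Mat           F;
     IS            rowperm, colperm;
     MatFactorInfo info;

     MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &F);
     MatGetOrdering(A, MATORDERINGNATURAL, &rowperm, &colperm);
     MatFactorInfoInitialize(&info);
     info.shifttype   = (PetscReal)MAT_SHIFT_INBLOCKS;
     info.shiftamount = 1.e-12;
     MatLUFactorSymbolic(F, A, rowperm, colperm, &info);
     MatLUFactorNumeric(F, A, &info);  /* dispatches to one of the kernels in this file */

   Which kernel ends up installed depends on the ordering (natural vs. general) and on
   whether the in-place code path was selected during the symbolic phase.
*/
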
335 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
336 {
337   /*
338     Default version for when blocks are 4 by 4 using natural ordering
339   */
340   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
341   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
342   PetscInt    *ajtmpold, *ajtmp, nz, row;
343   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
344   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
345   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
346   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
347   MatScalar    p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
348   MatScalar    m13, m14, m15, m16;
349   MatScalar   *ba = b->a, *aa = a->a;
350   PetscBool    pivotinblocks = b->pivotinblocks;
351   PetscReal    shift         = info->shiftamount;
352   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;
353 
354   PetscFunctionBegin;
355   allowzeropivot = PetscNot(A->erroriffailure);
356   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
357 
358   for (i = 0; i < n; i++) {
359     nz    = bi[i + 1] - bi[i];
360     ajtmp = bj + bi[i];
361     for (j = 0; j < nz; j++) {
362       x    = rtmp + 16 * ajtmp[j];
363       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
364       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
365     }
366     /* load in initial (unfactored) row */
367     nz       = ai[i + 1] - ai[i];
368     ajtmpold = aj + ai[i];
369     v        = aa + 16 * ai[i];
370     for (j = 0; j < nz; j++) {
371       x     = rtmp + 16 * ajtmpold[j];
372       x[0]  = v[0];
373       x[1]  = v[1];
374       x[2]  = v[2];
375       x[3]  = v[3];
376       x[4]  = v[4];
377       x[5]  = v[5];
378       x[6]  = v[6];
379       x[7]  = v[7];
380       x[8]  = v[8];
381       x[9]  = v[9];
382       x[10] = v[10];
383       x[11] = v[11];
384       x[12] = v[12];
385       x[13] = v[13];
386       x[14] = v[14];
387       x[15] = v[15];
388       v += 16;
389     }
390     row = *ajtmp++;
391     while (row < i) {
392       pc  = rtmp + 16 * row;
393       p1  = pc[0];
394       p2  = pc[1];
395       p3  = pc[2];
396       p4  = pc[3];
397       p5  = pc[4];
398       p6  = pc[5];
399       p7  = pc[6];
400       p8  = pc[7];
401       p9  = pc[8];
402       p10 = pc[9];
403       p11 = pc[10];
404       p12 = pc[11];
405       p13 = pc[12];
406       p14 = pc[13];
407       p15 = pc[14];
408       p16 = pc[15];
409       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
410         pv    = ba + 16 * diag_offset[row];
411         pj    = bj + diag_offset[row] + 1;
412         x1    = pv[0];
413         x2    = pv[1];
414         x3    = pv[2];
415         x4    = pv[3];
416         x5    = pv[4];
417         x6    = pv[5];
418         x7    = pv[6];
419         x8    = pv[7];
420         x9    = pv[8];
421         x10   = pv[9];
422         x11   = pv[10];
423         x12   = pv[11];
424         x13   = pv[12];
425         x14   = pv[13];
426         x15   = pv[14];
427         x16   = pv[15];
428         pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
429         pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
430         pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
431         pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
432 
433         pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
434         pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
435         pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
436         pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
437 
438         pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
439         pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
440         pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
441         pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
442 
443         pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
444         pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
445         pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
446         pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
447         nz           = bi[row + 1] - diag_offset[row] - 1;
448         pv += 16;
449         for (j = 0; j < nz; j++) {
450           x1  = pv[0];
451           x2  = pv[1];
452           x3  = pv[2];
453           x4  = pv[3];
454           x5  = pv[4];
455           x6  = pv[5];
456           x7  = pv[6];
457           x8  = pv[7];
458           x9  = pv[8];
459           x10 = pv[9];
460           x11 = pv[10];
461           x12 = pv[11];
462           x13 = pv[12];
463           x14 = pv[13];
464           x15 = pv[14];
465           x16 = pv[15];
466           x   = rtmp + 16 * pj[j];
467           x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
468           x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
469           x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
470           x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
471 
472           x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
473           x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
474           x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
475           x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
476 
477           x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
478           x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
479           x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
480           x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
481 
482           x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
483           x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
484           x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
485           x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
486 
487           pv += 16;
488         }
489         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
490       }
491       row = *ajtmp++;
492     }
493     /* finished row so stick it into b->a */
494     pv = ba + 16 * bi[i];
495     pj = bj + bi[i];
496     nz = bi[i + 1] - bi[i];
497     for (j = 0; j < nz; j++) {
498       x      = rtmp + 16 * pj[j];
499       pv[0]  = x[0];
500       pv[1]  = x[1];
501       pv[2]  = x[2];
502       pv[3]  = x[3];
503       pv[4]  = x[4];
504       pv[5]  = x[5];
505       pv[6]  = x[6];
506       pv[7]  = x[7];
507       pv[8]  = x[8];
508       pv[9]  = x[9];
509       pv[10] = x[10];
510       pv[11] = x[11];
511       pv[12] = x[12];
512       pv[13] = x[13];
513       pv[14] = x[14];
514       pv[15] = x[15];
515       pv += 16;
516     }
517     /* invert diagonal block */
518     w = ba + 16 * diag_offset[i];
519     if (pivotinblocks) {
520       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
521       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
522     } else {
523       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
524     }
525   }
526 
527   PetscCall(PetscFree(rtmp));
528 
529   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
530   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
531   C->assembled           = PETSC_TRUE;
532 
533   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
534   PetscFunctionReturn(PETSC_SUCCESS);
535 }
536 
537 /*
538   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
539     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
540 */
541 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
542 {
543   Mat             C = B;
544   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
545   PetscInt        i, j, k, nz, nzL, row;
546   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
547   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
548   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
549   PetscInt        flg;
550   PetscReal       shift;
551   PetscBool       allowzeropivot, zeropivotdetected;
552 
553   PetscFunctionBegin;
554   allowzeropivot = PetscNot(A->erroriffailure);
555 
556   /* generate work space needed by the factorization */
557   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
558   PetscCall(PetscArrayzero(rtmp, bs2 * n));
559 
560   if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
561     shift = 0;
562   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
563     shift = info->shiftamount;
564   }
565 
566   for (i = 0; i < n; i++) {
567     /* zero rtmp */
568     /* L part */
569     nz    = bi[i + 1] - bi[i];
570     bjtmp = bj + bi[i];
571     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
572 
573     /* U part */
574     nz    = bdiag[i] - bdiag[i + 1];
575     bjtmp = bj + bdiag[i + 1] + 1;
576     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
577 
578     /* load in initial (unfactored) row */
579     nz    = ai[i + 1] - ai[i];
580     ajtmp = aj + ai[i];
581     v     = aa + bs2 * ai[i];
582     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
583 
584     /* elimination */
585     bjtmp = bj + bi[i];
586     nzL   = bi[i + 1] - bi[i];
587     for (k = 0; k < nzL; k++) {
588       row = bjtmp[k];
589       pc  = rtmp + bs2 * row;
590       for (flg = 0, j = 0; j < bs2; j++) {
591         if (pc[j] != 0.0) {
592           flg = 1;
593           break;
594         }
595       }
596       if (flg) {
597         pv = b->a + bs2 * bdiag[row];
598         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
599         PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
600 
601         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
602         pv = b->a + bs2 * (bdiag[row + 1] + 1);
603         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
604         for (j = 0; j < nz; j++) {
605           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
606           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
607           v = rtmp + bs2 * pj[j];
608           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
609           pv += bs2;
610         }
611         PetscCall(PetscLogFlops(128.0 * nz + 112.0)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 = 128*nz + 112 for bs = 4 */
612       }
613     }
614 
615     /* finished row so stick it into b->a */
616     /* L part */
617     pv = b->a + bs2 * bi[i];
618     pj = b->j + bi[i];
619     nz = bi[i + 1] - bi[i];
620     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
621 
622     /* Mark diagonal and invert diagonal for simpler triangular solves */
623     pv = b->a + bs2 * bdiag[i];
624     pj = b->j + bdiag[i];
625     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
626     PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
627     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
628 
629     /* U part */
630     pv = b->a + bs2 * (bdiag[i + 1] + 1);
631     pj = b->j + bdiag[i + 1] + 1;
632     nz = bdiag[i] - bdiag[i + 1] - 1;
633     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
634   }
635   PetscCall(PetscFree2(rtmp, mwork));
636 
637   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
638   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
639   C->assembled           = PETSC_TRUE;
640 
641   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
642   PetscFunctionReturn(PETSC_SUCCESS);
643 }
644 
645 #if defined(PETSC_HAVE_SSE)
646 
647   #include PETSC_HAVE_SSE
648 
649 /* SSE Version for when blocks are 4 by 4 Using natural ordering */
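/*
   Note on the SSE variants below (an observation about the code, not documented
   elsewhere): all block arithmetic uses packed single-precision (_PS) instructions,
   so MatScalar is assumed to be a 4-byte float.  A 4x4 block is then 64 bytes, i.e.
   four 16-byte columns, and each LOADL_PS/LOADH_PS pair moves one column into an
   XMM register; this is also why aa, ba and rtmp are required to be 16-byte aligned.
*/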
650 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B, Mat A, const MatFactorInfo *info)
651 {
      Mat          C = B;
652   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
653   int          i, j, n = a->mbs;
654   int         *bj = b->j, *bjtmp, *pj;
655   int          row;
656   int         *ajtmpold, nz, *bi = b->i;
657   int         *diag_offset = b->diag, *ai = a->i, *aj = a->j;
658   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
659   MatScalar   *ba = b->a, *aa = a->a;
660   int          nonzero       = 0;
661   PetscBool    pivotinblocks = b->pivotinblocks;
662   PetscReal    shift         = info->shiftamount;
663   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;
664 
665   PetscFunctionBegin;
666   allowzeropivot = PetscNot(A->erroriffailure);
667   SSE_SCOPE_BEGIN;
668 
669   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
670   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
671   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
672   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
673   /*    if ((unsigned long)bj==(unsigned long)aj) { */
674   /*      colscale = 4; */
675   /*    } */
676   for (i = 0; i < n; i++) {
677     nz    = bi[i + 1] - bi[i];
678     bjtmp = bj + bi[i];
679     /* zero out the 4x4 block accumulators */
680     /* zero out one register */
681     XOR_PS(XMM7, XMM7);
682     for (j = 0; j < nz; j++) {
683       x = rtmp + 16 * bjtmp[j];
684       /*        x = rtmp+4*bjtmp[j]; */
685       SSE_INLINE_BEGIN_1(x)
686       /* Copy zero register to memory locations */
687       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
688       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
689       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
690       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
691       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
692       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
693       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
694       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
695       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
696       SSE_INLINE_END_1;
697     }
698     /* load in initial (unfactored) row */
699     nz       = ai[i + 1] - ai[i];
700     ajtmpold = aj + ai[i];
701     v        = aa + 16 * ai[i];
702     for (j = 0; j < nz; j++) {
703       x = rtmp + 16 * ajtmpold[j];
704       /*        x = rtmp+colscale*ajtmpold[j]; */
705       /* Copy v block into x block */
706       SSE_INLINE_BEGIN_2(v, x)
707       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
708       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
709       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
710 
711       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
712       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
713 
714       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
715       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
716 
717       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
718       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
719 
720       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
721       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
722 
723       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
724       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
725 
726       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
727       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
728 
729       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
730       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
731       SSE_INLINE_END_2;
732 
733       v += 16;
734     }
735     /*      row = (*bjtmp++)/4; */
736     row = *bjtmp++;
737     while (row < i) {
738       pc = rtmp + 16 * row;
739       SSE_INLINE_BEGIN_1(pc)
740       /* Load block from lower triangle */
741       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
742       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
743       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
744 
745       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
746       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
747 
748       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
749       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
750 
751       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
752       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
753 
754       /* Compare block to zero block */
755 
756       SSE_COPY_PS(XMM4, XMM7)
757       SSE_CMPNEQ_PS(XMM4, XMM0)
758 
759       SSE_COPY_PS(XMM5, XMM7)
760       SSE_CMPNEQ_PS(XMM5, XMM1)
761 
762       SSE_COPY_PS(XMM6, XMM7)
763       SSE_CMPNEQ_PS(XMM6, XMM2)
764 
765       SSE_CMPNEQ_PS(XMM7, XMM3)
766 
767       /* Reduce the comparisons to one SSE register */
768       SSE_OR_PS(XMM6, XMM7)
769       SSE_OR_PS(XMM5, XMM4)
770       SSE_OR_PS(XMM5, XMM6)
771       SSE_INLINE_END_1;
772 
773       /* Reduce the one SSE register to an integer register for branching */
774       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
775       MOVEMASK(nonzero, XMM5);
776 
777       /* If block is nonzero ... */
778       if (nonzero) {
779         pv = ba + 16 * diag_offset[row];
780         PREFETCH_L1(&pv[16]);
781         pj = bj + diag_offset[row] + 1;
782 
783         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
784         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
785         /* but the diagonal was inverted already */
786         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
787 
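        /*
           In scalar terms (with pc and pv column-major as elsewhere in this file),
           column c of the new multiplier is

             new_pc(:,c) = sum_k pv[4*c + k] * old_pc(:,k)

           which is what the LOAD_SS / SHUFFLE(0x00) broadcasts followed by MULT_PS and
           ADD_PS below compute, one broadcast scalar per column of the old block.
        */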
788         SSE_INLINE_BEGIN_2(pv, pc)
789         /* Column 0, product is accumulated in XMM4 */
790         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
791         SSE_SHUFFLE(XMM4, XMM4, 0x00)
792         SSE_MULT_PS(XMM4, XMM0)
793 
794         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
795         SSE_SHUFFLE(XMM5, XMM5, 0x00)
796         SSE_MULT_PS(XMM5, XMM1)
797         SSE_ADD_PS(XMM4, XMM5)
798 
799         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
800         SSE_SHUFFLE(XMM6, XMM6, 0x00)
801         SSE_MULT_PS(XMM6, XMM2)
802         SSE_ADD_PS(XMM4, XMM6)
803 
804         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
805         SSE_SHUFFLE(XMM7, XMM7, 0x00)
806         SSE_MULT_PS(XMM7, XMM3)
807         SSE_ADD_PS(XMM4, XMM7)
808 
809         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
810         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
811 
812         /* Column 1, product is accumulated in XMM5 */
813         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
814         SSE_SHUFFLE(XMM5, XMM5, 0x00)
815         SSE_MULT_PS(XMM5, XMM0)
816 
817         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
818         SSE_SHUFFLE(XMM6, XMM6, 0x00)
819         SSE_MULT_PS(XMM6, XMM1)
820         SSE_ADD_PS(XMM5, XMM6)
821 
822         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
823         SSE_SHUFFLE(XMM7, XMM7, 0x00)
824         SSE_MULT_PS(XMM7, XMM2)
825         SSE_ADD_PS(XMM5, XMM7)
826 
827         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
828         SSE_SHUFFLE(XMM6, XMM6, 0x00)
829         SSE_MULT_PS(XMM6, XMM3)
830         SSE_ADD_PS(XMM5, XMM6)
831 
832         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
833         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
834 
835         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
836 
837         /* Column 2, product is accumulated in XMM6 */
838         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
839         SSE_SHUFFLE(XMM6, XMM6, 0x00)
840         SSE_MULT_PS(XMM6, XMM0)
841 
842         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
843         SSE_SHUFFLE(XMM7, XMM7, 0x00)
844         SSE_MULT_PS(XMM7, XMM1)
845         SSE_ADD_PS(XMM6, XMM7)
846 
847         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
848         SSE_SHUFFLE(XMM7, XMM7, 0x00)
849         SSE_MULT_PS(XMM7, XMM2)
850         SSE_ADD_PS(XMM6, XMM7)
851 
852         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
853         SSE_SHUFFLE(XMM7, XMM7, 0x00)
854         SSE_MULT_PS(XMM7, XMM3)
855         SSE_ADD_PS(XMM6, XMM7)
856 
857         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
858         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
859 
860         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
861         /* Column 3, product is accumulated in XMM0 */
862         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
863         SSE_SHUFFLE(XMM7, XMM7, 0x00)
864         SSE_MULT_PS(XMM0, XMM7)
865 
866         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
867         SSE_SHUFFLE(XMM7, XMM7, 0x00)
868         SSE_MULT_PS(XMM1, XMM7)
869         SSE_ADD_PS(XMM0, XMM1)
870 
871         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
872         SSE_SHUFFLE(XMM1, XMM1, 0x00)
873         SSE_MULT_PS(XMM1, XMM2)
874         SSE_ADD_PS(XMM0, XMM1)
875 
876         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
877         SSE_SHUFFLE(XMM7, XMM7, 0x00)
878         SSE_MULT_PS(XMM3, XMM7)
879         SSE_ADD_PS(XMM0, XMM3)
880 
881         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
882         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
883 
884         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
885         /* This is code to be maintained and read by humans after all. */
886         /* Copy Multiplier Col 3 into XMM3 */
887         SSE_COPY_PS(XMM3, XMM0)
888         /* Copy Multiplier Col 2 into XMM2 */
889         SSE_COPY_PS(XMM2, XMM6)
890         /* Copy Multiplier Col 1 into XMM1 */
891         SSE_COPY_PS(XMM1, XMM5)
892         /* Copy Multiplier Col 0 into XMM0 */
893         SSE_COPY_PS(XMM0, XMM4)
894         SSE_INLINE_END_2;
895 
896         /* Update the row: */
897         nz = bi[row + 1] - diag_offset[row] - 1;
898         pv += 16;
899         for (j = 0; j < nz; j++) {
900           PREFETCH_L1(&pv[16]);
901           x = rtmp + 16 * pj[j];
902           /*            x = rtmp + 4*pj[j]; */
903 
904           /* X:=X-M*PV, One column at a time */
905           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
906           SSE_INLINE_BEGIN_2(x, pv)
907           /* Load First Column of X*/
908           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
909           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
910 
911           /* Matrix-Vector Product: */
912           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
913           SSE_SHUFFLE(XMM5, XMM5, 0x00)
914           SSE_MULT_PS(XMM5, XMM0)
915           SSE_SUB_PS(XMM4, XMM5)
916 
917           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
918           SSE_SHUFFLE(XMM6, XMM6, 0x00)
919           SSE_MULT_PS(XMM6, XMM1)
920           SSE_SUB_PS(XMM4, XMM6)
921 
922           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
923           SSE_SHUFFLE(XMM7, XMM7, 0x00)
924           SSE_MULT_PS(XMM7, XMM2)
925           SSE_SUB_PS(XMM4, XMM7)
926 
927           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
928           SSE_SHUFFLE(XMM5, XMM5, 0x00)
929           SSE_MULT_PS(XMM5, XMM3)
930           SSE_SUB_PS(XMM4, XMM5)
931 
932           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
933           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
934 
935           /* Second Column */
936           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
937           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
938 
939           /* Matrix-Vector Product: */
940           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
941           SSE_SHUFFLE(XMM6, XMM6, 0x00)
942           SSE_MULT_PS(XMM6, XMM0)
943           SSE_SUB_PS(XMM5, XMM6)
944 
945           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
946           SSE_SHUFFLE(XMM7, XMM7, 0x00)
947           SSE_MULT_PS(XMM7, XMM1)
948           SSE_SUB_PS(XMM5, XMM7)
949 
950           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
951           SSE_SHUFFLE(XMM4, XMM4, 0x00)
952           SSE_MULT_PS(XMM4, XMM2)
953           SSE_SUB_PS(XMM5, XMM4)
954 
955           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
956           SSE_SHUFFLE(XMM6, XMM6, 0x00)
957           SSE_MULT_PS(XMM6, XMM3)
958           SSE_SUB_PS(XMM5, XMM6)
959 
960           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
961           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
962 
963           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
964 
965           /* Third Column */
966           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
967           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
968 
969           /* Matrix-Vector Product: */
970           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
971           SSE_SHUFFLE(XMM7, XMM7, 0x00)
972           SSE_MULT_PS(XMM7, XMM0)
973           SSE_SUB_PS(XMM6, XMM7)
974 
975           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
976           SSE_SHUFFLE(XMM4, XMM4, 0x00)
977           SSE_MULT_PS(XMM4, XMM1)
978           SSE_SUB_PS(XMM6, XMM4)
979 
980           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
981           SSE_SHUFFLE(XMM5, XMM5, 0x00)
982           SSE_MULT_PS(XMM5, XMM2)
983           SSE_SUB_PS(XMM6, XMM5)
984 
985           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
986           SSE_SHUFFLE(XMM7, XMM7, 0x00)
987           SSE_MULT_PS(XMM7, XMM3)
988           SSE_SUB_PS(XMM6, XMM7)
989 
990           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
991           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
992 
993           /* Fourth Column */
994           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
995           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
996 
997           /* Matrix-Vector Product: */
998           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
999           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1000           SSE_MULT_PS(XMM5, XMM0)
1001           SSE_SUB_PS(XMM4, XMM5)
1002 
1003           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1004           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1005           SSE_MULT_PS(XMM6, XMM1)
1006           SSE_SUB_PS(XMM4, XMM6)
1007 
1008           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1009           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1010           SSE_MULT_PS(XMM7, XMM2)
1011           SSE_SUB_PS(XMM4, XMM7)
1012 
1013           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1014           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1015           SSE_MULT_PS(XMM5, XMM3)
1016           SSE_SUB_PS(XMM4, XMM5)
1017 
1018           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1019           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1020           SSE_INLINE_END_2;
1021           pv += 16;
1022         }
1023         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1024       }
1025       row = *bjtmp++;
1026       /*        row = (*bjtmp++)/4; */
1027     }
1028     /* finished row so stick it into b->a */
1029     pv = ba + 16 * bi[i];
1030     pj = bj + bi[i];
1031     nz = bi[i + 1] - bi[i];
1032 
1033     /* Copy x block back into pv block */
1034     for (j = 0; j < nz; j++) {
1035       x = rtmp + 16 * pj[j];
1036       /*        x  = rtmp+4*pj[j]; */
1037 
1038       SSE_INLINE_BEGIN_2(x, pv)
1039       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1040       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1041       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1042 
1043       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1044       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1045 
1046       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1047       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1048 
1049       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1050       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1051 
1052       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1053       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1054 
1055       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1056       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1057 
1058       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1059       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1060 
1061       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1062       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1063       SSE_INLINE_END_2;
1064       pv += 16;
1065     }
1066     /* invert diagonal block */
1067     w = ba + 16 * diag_offset[i];
1068     if (pivotinblocks) {
1069       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1070       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1071     } else {
1072       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1073     }
1074     /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1075     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1076   }
1077 
1078   PetscCall(PetscFree(rtmp));
1079 
1080   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1081   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1082   C->assembled           = PETSC_TRUE;
1083 
1084   PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
1085   /* Flop Count from inverting diagonal blocks */
1086   SSE_SCOPE_END;
1087   PetscFunctionReturn(PETSC_SUCCESS);
1088 }
1089 
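/*
   The two _usj variants below differ from the routine above mainly in how column
   indices are read: b->j (and, for the in-place variant, a->j) is reinterpreted as
   unsigned short ("usj" presumably meaning unsigned-short j), halving the index
   traffic, and every index is widened on use, e.g.

     row = (unsigned int)(*bjtmp++);

   This restricts the matrix to fewer than 65536 block rows/columns.
*/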
1090 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C, const MatFactorInfo *info)
1091 {
1092   Mat             A = C;
1093   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1094   int             i, j, n = a->mbs;
1095   unsigned short *bj = (unsigned short *)b->j, *bjtmp, *pj;
1096   unsigned short *aj = (unsigned short *)a->j, *ajtmp;
1097   unsigned int    row;
1098   int             nz, *bi = b->i;
1099   int            *diag_offset = b->diag, *ai = a->i;
1100   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1101   MatScalar      *ba = b->a, *aa = a->a;
1102   int             nonzero       = 0;
1103   PetscBool       pivotinblocks = b->pivotinblocks;
1104   PetscReal       shift         = info->shiftamount;
1105   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
1106 
1107   PetscFunctionBegin;
1108   allowzeropivot = PetscNot(A->erroriffailure);
1109   SSE_SCOPE_BEGIN;
1110 
1111   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
1112   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
1113   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1114   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1115   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1116   /*      colscale = 4; */
1117   /*    } */
1118 
1119   for (i = 0; i < n; i++) {
1120     nz    = bi[i + 1] - bi[i];
1121     bjtmp = bj + bi[i];
1122     /* zero out the 4x4 block accumulators */
1123     /* zero out one register */
1124     XOR_PS(XMM7, XMM7);
1125     for (j = 0; j < nz; j++) {
1126       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1127       /*        x = rtmp+4*bjtmp[j]; */
1128       SSE_INLINE_BEGIN_1(x)
1129       /* Copy zero register to memory locations */
1130       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1131       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1132       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1133       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1134       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1135       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1136       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1137       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1138       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1139       SSE_INLINE_END_1;
1140     }
1141     /* load in initial (unfactored) row */
1142     nz    = ai[i + 1] - ai[i];
1143     ajtmp = aj + ai[i];
1144     v     = aa + 16 * ai[i];
1145     for (j = 0; j < nz; j++) {
1146       x = rtmp + 16 * ((unsigned int)ajtmp[j]);
1147       /*        x = rtmp+colscale*ajtmp[j]; */
1148       /* Copy v block into x block */
1149       SSE_INLINE_BEGIN_2(v, x)
1150       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1151       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1152       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1153 
1154       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1155       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1156 
1157       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1158       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1159 
1160       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1161       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1162 
1163       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1164       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1165 
1166       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1167       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1168 
1169       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1170       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1171 
1172       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1173       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1174       SSE_INLINE_END_2;
1175 
1176       v += 16;
1177     }
1178     /*      row = (*bjtmp++)/4; */
1179     row = (unsigned int)(*bjtmp++);
1180     while (row < i) {
1181       pc = rtmp + 16 * row;
1182       SSE_INLINE_BEGIN_1(pc)
1183       /* Load block from lower triangle */
1184       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1185       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1186       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1187 
1188       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1189       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1190 
1191       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1192       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1193 
1194       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1195       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1196 
1197       /* Compare block to zero block */
1198 
1199       SSE_COPY_PS(XMM4, XMM7)
1200       SSE_CMPNEQ_PS(XMM4, XMM0)
1201 
1202       SSE_COPY_PS(XMM5, XMM7)
1203       SSE_CMPNEQ_PS(XMM5, XMM1)
1204 
1205       SSE_COPY_PS(XMM6, XMM7)
1206       SSE_CMPNEQ_PS(XMM6, XMM2)
1207 
1208       SSE_CMPNEQ_PS(XMM7, XMM3)
1209 
1210       /* Reduce the comparisons to one SSE register */
1211       SSE_OR_PS(XMM6, XMM7)
1212       SSE_OR_PS(XMM5, XMM4)
1213       SSE_OR_PS(XMM5, XMM6)
1214       SSE_INLINE_END_1;
1215 
1216       /* Reduce the one SSE register to an integer register for branching */
1217       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1218       MOVEMASK(nonzero, XMM5);
1219 
1220       /* If block is nonzero ... */
1221       if (nonzero) {
1222         pv = ba + 16 * diag_offset[row];
1223         PREFETCH_L1(&pv[16]);
1224         pj = bj + diag_offset[row] + 1;
1225 
1226         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1227         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1228         /* but the diagonal was inverted already */
1229         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1230 
1231         SSE_INLINE_BEGIN_2(pv, pc)
1232         /* Column 0, product is accumulated in XMM4 */
1233         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1234         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1235         SSE_MULT_PS(XMM4, XMM0)
1236 
1237         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1238         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1239         SSE_MULT_PS(XMM5, XMM1)
1240         SSE_ADD_PS(XMM4, XMM5)
1241 
1242         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1243         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1244         SSE_MULT_PS(XMM6, XMM2)
1245         SSE_ADD_PS(XMM4, XMM6)
1246 
1247         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1248         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1249         SSE_MULT_PS(XMM7, XMM3)
1250         SSE_ADD_PS(XMM4, XMM7)
1251 
1252         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1253         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1254 
1255         /* Column 1, product is accumulated in XMM5 */
1256         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1257         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1258         SSE_MULT_PS(XMM5, XMM0)
1259 
1260         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1261         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1262         SSE_MULT_PS(XMM6, XMM1)
1263         SSE_ADD_PS(XMM5, XMM6)
1264 
1265         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1266         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1267         SSE_MULT_PS(XMM7, XMM2)
1268         SSE_ADD_PS(XMM5, XMM7)
1269 
1270         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1271         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1272         SSE_MULT_PS(XMM6, XMM3)
1273         SSE_ADD_PS(XMM5, XMM6)
1274 
1275         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1276         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1277 
1278         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1279 
1280         /* Column 2, product is accumulated in XMM6 */
1281         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1282         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1283         SSE_MULT_PS(XMM6, XMM0)
1284 
1285         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1286         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1287         SSE_MULT_PS(XMM7, XMM1)
1288         SSE_ADD_PS(XMM6, XMM7)
1289 
1290         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1291         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1292         SSE_MULT_PS(XMM7, XMM2)
1293         SSE_ADD_PS(XMM6, XMM7)
1294 
1295         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1296         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1297         SSE_MULT_PS(XMM7, XMM3)
1298         SSE_ADD_PS(XMM6, XMM7)
1299 
1300         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1301         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1302 
1303         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1304         /* Column 3, product is accumulated in XMM0 */
1305         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1306         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1307         SSE_MULT_PS(XMM0, XMM7)
1308 
1309         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1310         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1311         SSE_MULT_PS(XMM1, XMM7)
1312         SSE_ADD_PS(XMM0, XMM1)
1313 
1314         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1315         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1316         SSE_MULT_PS(XMM1, XMM2)
1317         SSE_ADD_PS(XMM0, XMM1)
1318 
1319         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1320         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1321         SSE_MULT_PS(XMM3, XMM7)
1322         SSE_ADD_PS(XMM0, XMM3)
1323 
1324         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1325         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1326 
1327         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1328         /* This is code to be maintained and read by humans after all. */
1329         /* Copy Multiplier Col 3 into XMM3 */
1330         SSE_COPY_PS(XMM3, XMM0)
1331         /* Copy Multiplier Col 2 into XMM2 */
1332         SSE_COPY_PS(XMM2, XMM6)
1333         /* Copy Multiplier Col 1 into XMM1 */
1334         SSE_COPY_PS(XMM1, XMM5)
1335         /* Copy Multiplier Col 0 into XMM0 */
1336         SSE_COPY_PS(XMM0, XMM4)
1337         SSE_INLINE_END_2;
1338 
1339         /* Update the row: */
1340         nz = bi[row + 1] - diag_offset[row] - 1;
1341         pv += 16;
1342         for (j = 0; j < nz; j++) {
1343           PREFETCH_L1(&pv[16]);
1344           x = rtmp + 16 * ((unsigned int)pj[j]);
1345           /*            x = rtmp + 4*pj[j]; */
1346 
1347           /* X:=X-M*PV, One column at a time */
1348           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1349           SSE_INLINE_BEGIN_2(x, pv)
1350           /* Load First Column of X*/
1351           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1352           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1353 
1354           /* Matrix-Vector Product: */
1355           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1356           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1357           SSE_MULT_PS(XMM5, XMM0)
1358           SSE_SUB_PS(XMM4, XMM5)
1359 
1360           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1361           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1362           SSE_MULT_PS(XMM6, XMM1)
1363           SSE_SUB_PS(XMM4, XMM6)
1364 
1365           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1366           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1367           SSE_MULT_PS(XMM7, XMM2)
1368           SSE_SUB_PS(XMM4, XMM7)
1369 
1370           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1371           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1372           SSE_MULT_PS(XMM5, XMM3)
1373           SSE_SUB_PS(XMM4, XMM5)
1374 
1375           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1376           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1377 
1378           /* Second Column */
1379           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1380           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1381 
1382           /* Matrix-Vector Product: */
1383           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1384           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1385           SSE_MULT_PS(XMM6, XMM0)
1386           SSE_SUB_PS(XMM5, XMM6)
1387 
1388           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1389           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1390           SSE_MULT_PS(XMM7, XMM1)
1391           SSE_SUB_PS(XMM5, XMM7)
1392 
1393           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1394           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1395           SSE_MULT_PS(XMM4, XMM2)
1396           SSE_SUB_PS(XMM5, XMM4)
1397 
1398           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1399           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1400           SSE_MULT_PS(XMM6, XMM3)
1401           SSE_SUB_PS(XMM5, XMM6)
1402 
1403           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1404           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1405 
1406           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1407 
1408           /* Third Column */
1409           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1410           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1411 
1412           /* Matrix-Vector Product: */
1413           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1414           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1415           SSE_MULT_PS(XMM7, XMM0)
1416           SSE_SUB_PS(XMM6, XMM7)
1417 
1418           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1419           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1420           SSE_MULT_PS(XMM4, XMM1)
1421           SSE_SUB_PS(XMM6, XMM4)
1422 
1423           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1424           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1425           SSE_MULT_PS(XMM5, XMM2)
1426           SSE_SUB_PS(XMM6, XMM5)
1427 
1428           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1429           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1430           SSE_MULT_PS(XMM7, XMM3)
1431           SSE_SUB_PS(XMM6, XMM7)
1432 
1433           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1434           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1435 
1436           /* Fourth Column */
1437           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1438           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1439 
1440           /* Matrix-Vector Product: */
1441           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1442           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1443           SSE_MULT_PS(XMM5, XMM0)
1444           SSE_SUB_PS(XMM4, XMM5)
1445 
1446           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1447           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1448           SSE_MULT_PS(XMM6, XMM1)
1449           SSE_SUB_PS(XMM4, XMM6)
1450 
1451           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1452           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1453           SSE_MULT_PS(XMM7, XMM2)
1454           SSE_SUB_PS(XMM4, XMM7)
1455 
1456           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1457           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1458           SSE_MULT_PS(XMM5, XMM3)
1459           SSE_SUB_PS(XMM4, XMM5)
1460 
1461           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1462           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1463           SSE_INLINE_END_2;
1464           pv += 16;
1465         }
1466         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1467       }
1468       row = (unsigned int)(*bjtmp++);
1469       /*        row = (*bjtmp++)/4; */
1470       /*        bjtmp++; */
1471     }
1472     /* finished row so stick it into b->a */
1473     pv = ba + 16 * bi[i];
1474     pj = bj + bi[i];
1475     nz = bi[i + 1] - bi[i];
1476 
1477     /* Copy x block back into pv block */
1478     for (j = 0; j < nz; j++) {
1479       x = rtmp + 16 * ((unsigned int)pj[j]);
1480       /*        x  = rtmp+4*pj[j]; */
1481 
1482       SSE_INLINE_BEGIN_2(x, pv)
1483       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1484       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1485       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1486 
1487       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1488       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1489 
1490       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1491       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1492 
1493       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1494       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1495 
1496       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1497       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1498 
1499       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1500       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1501 
1502       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1503       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1504 
1505       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1506       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1507       SSE_INLINE_END_2;
1508       pv += 16;
1509     }
1510     /* invert diagonal block */
1511     w = ba + 16 * diag_offset[i];
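    /* The diagonal block is inverted in place, so the solve phase can apply inv(D_i) with plain
       block multiplies; with allowzeropivot set, a zero pivot only flags the factorization as
       failed instead of raising an error. */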
1512     if (pivotinblocks) {
1513       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1514       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1515     } else {
1516       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1517     }
1518     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1519   }
1520 
1521   PetscCall(PetscFree(rtmp));
1522 
1523   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1524   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1525   C->assembled           = PETSC_TRUE;
1526 
1527   PetscCall(PetscLogFlops(1.333333333333 * 4.0 * 16.0 * b->mbs)); /* bs = 4, bs2 = 16 for these 4x4 blocks */
1528   /* Flop Count from inverting diagonal blocks */
1529   SSE_SCOPE_END;
1530   PetscFunctionReturn(PETSC_SUCCESS);
1531 }
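
/*
   For reference, a plain-C sketch of the single-block operation the SSE macros in the kernels
   above and below implement: x := x - m*pv on 4x4 blocks stored contiguously with entry (r,c)
   at index r + 4*c (the layout the register comments call "columnwise").  The helper name is
   illustrative only and the block is not compiled.
*/
#if 0
static void BlockMultSub_4x4(MatScalar x[16], const MatScalar m[16], const MatScalar pv[16])
{
  PetscInt r, c, k;
  for (c = 0; c < 4; c++) { /* one column of the block at a time, as in the SSE code */
    for (r = 0; r < 4; r++) {
      MatScalar s = 0.0;
      for (k = 0; k < 4; k++) s += m[r + 4 * k] * pv[k + 4 * c];
      x[r + 4 * c] -= s; /* 64 multiplies + 48 adds + 16 subtractions = 128 flops per block */
    }
  }
}
#endif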
1532 
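/*
      Version for when blocks are 4 by 4, natural ordering, with the factor's block column
      indices stored as unsigned short ("usj"), presumably to halve the index traffic
*/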
1533 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info)
1534 {
1535   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1536   int             i, j, n = a->mbs;
1537   unsigned short *bj = (unsigned short *)b->j, *bjtmp, *pj;
1538   unsigned int    row;
1539   int            *ajtmpold, nz, *bi = b->i;
1540   int            *diag_offset = b->diag, *ai = a->i, *aj = a->j;
1541   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1542   MatScalar      *ba = b->a, *aa = a->a;
1543   int             nonzero       = 0;
1544   PetscBool       pivotinblocks = b->pivotinblocks;
1545   PetscReal       shift         = info->shiftamount;
1546   PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;
1547 
1548   PetscFunctionBegin;
1549   allowzeropivot = PetscNot(A->erroriffailure);
1550   SSE_SCOPE_BEGIN;
1551 
1552   PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned.  SSE will not work.");
1553   PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned.  SSE will not work.");
1554   PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
1555   PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.  SSE will not work.");
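  /* The kernels below assume every 4x4 block starts on a 16-byte boundary; the checks above
     enforce that assumption up front. */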
1556   /*    if ((unsigned long)bj==(unsigned long)aj) { */
1557   /*      colscale = 4; */
1558   /*    } */
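  /* If the factor shares its block column index array with A (i.e. an in-place factorization on
     the same nonzero pattern), hand the work off to the dedicated in-place kernel. */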
1559   if ((unsigned long)bj == (unsigned long)aj) {
         PetscCall(PetscFree(rtmp)); /* the in-place kernel does not use the scratch row */
         PetscCall(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
         PetscFunctionReturn(PETSC_SUCCESS);
       }
1560 
1561   for (i = 0; i < n; i++) {
1562     nz    = bi[i + 1] - bi[i];
1563     bjtmp = bj + bi[i];
1564     /* zero out the 4x4 block accumulators */
1565     /* zero out one register */
1566     XOR_PS(XMM7, XMM7);
1567     for (j = 0; j < nz; j++) {
1568       x = rtmp + 16 * ((unsigned int)bjtmp[j]);
1569       /*        x = rtmp+4*bjtmp[j]; */
1570       SSE_INLINE_BEGIN_1(x)
1571       /* Copy zero register to memory locations */
1572       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1573       SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
1574       SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
1575       SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
1576       SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
1577       SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
1578       SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
1579       SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1580       SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
1581       SSE_INLINE_END_1;
1582     }
1583     /* load in the initial (unfactored) row */
1584     nz       = ai[i + 1] - ai[i];
1585     ajtmpold = aj + ai[i];
1586     v        = aa + 16 * ai[i];
1587     for (j = 0; j < nz; j++) {
1588       x = rtmp + 16 * ajtmpold[j];
1589       /*        x = rtmp+colscale*ajtmpold[j]; */
1590       /* Copy v block into x block */
1591       SSE_INLINE_BEGIN_2(v, x)
1592       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1593       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1594       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)
1595 
1596       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
1597       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)
1598 
1599       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
1600       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)
1601 
1602       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
1603       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)
1604 
1605       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
1606       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)
1607 
1608       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
1609       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)
1610 
1611       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
1612       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)
1613 
1614       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1615       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1616       SSE_INLINE_END_2;
1617 
1618       v += 16;
1619     }
1620     /*      row = (*bjtmp++)/4; */
1621     row = (unsigned int)(*bjtmp++);
1622     while (row < i) {
1623       pc = rtmp + 16 * row;
1624       SSE_INLINE_BEGIN_1(pc)
1625       /* Load block from lower triangle */
1626       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1627       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
1628       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)
1629 
1630       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
1631       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1)
1632 
1633       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2)
1634       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2)
1635 
1636       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3)
1637       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3)
1638 
1639       /* Compare block to zero block */
1640 
1641       SSE_COPY_PS(XMM4, XMM7)
1642       SSE_CMPNEQ_PS(XMM4, XMM0)
1643 
1644       SSE_COPY_PS(XMM5, XMM7)
1645       SSE_CMPNEQ_PS(XMM5, XMM1)
1646 
1647       SSE_COPY_PS(XMM6, XMM7)
1648       SSE_CMPNEQ_PS(XMM6, XMM2)
1649 
1650       SSE_CMPNEQ_PS(XMM7, XMM3)
1651 
1652       /* Reduce the comparisons to one SSE register */
1653       SSE_OR_PS(XMM6, XMM7)
1654       SSE_OR_PS(XMM5, XMM4)
1655       SSE_OR_PS(XMM5, XMM6)
1656       SSE_INLINE_END_1;
1657 
1658       /* Reduce the one SSE register to an integer register for branching */
1659       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1660       MOVEMASK(nonzero, XMM5);
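      /* XMM5 now flags, lane by lane, whether any entry of the multiplier block differs from
         zero; MOVEMASK gathers the four sign bits into `nonzero`, so an all-zero block can skip
         the update below entirely. */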
1661 
1662       /* If block is nonzero ... */
1663       if (nonzero) {
1664         pv = ba + 16 * diag_offset[row];
1665         PREFETCH_L1(&pv[16]);
1666         pj = bj + diag_offset[row] + 1;
1667 
1668         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1669         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1670         /* but the diagonal was inverted already */
1671         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
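        /* Column c of the product is accumulated as pv[4c]*XMM0 + pv[4c+1]*XMM1 + pv[4c+2]*XMM2 +
           pv[4c+3]*XMM3: each scalar of the inverted diagonal block is broadcast and applied to a
           whole column of L_ij at once. */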
1672 
1673         SSE_INLINE_BEGIN_2(pv, pc)
1674         /* Column 0, product is accumulated in XMM4 */
1675         SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4)
1676         SSE_SHUFFLE(XMM4, XMM4, 0x00)
1677         SSE_MULT_PS(XMM4, XMM0)
1678 
1679         SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5)
1680         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1681         SSE_MULT_PS(XMM5, XMM1)
1682         SSE_ADD_PS(XMM4, XMM5)
1683 
1684         SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6)
1685         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1686         SSE_MULT_PS(XMM6, XMM2)
1687         SSE_ADD_PS(XMM4, XMM6)
1688 
1689         SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7)
1690         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1691         SSE_MULT_PS(XMM7, XMM3)
1692         SSE_ADD_PS(XMM4, XMM7)
1693 
1694         SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4)
1695         SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4)
1696 
1697         /* Column 1, product is accumulated in XMM5 */
1698         SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5)
1699         SSE_SHUFFLE(XMM5, XMM5, 0x00)
1700         SSE_MULT_PS(XMM5, XMM0)
1701 
1702         SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6)
1703         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1704         SSE_MULT_PS(XMM6, XMM1)
1705         SSE_ADD_PS(XMM5, XMM6)
1706 
1707         SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7)
1708         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1709         SSE_MULT_PS(XMM7, XMM2)
1710         SSE_ADD_PS(XMM5, XMM7)
1711 
1712         SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6)
1713         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1714         SSE_MULT_PS(XMM6, XMM3)
1715         SSE_ADD_PS(XMM5, XMM6)
1716 
1717         SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5)
1718         SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5)
1719 
1720         SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24)
1721 
1722         /* Column 2, product is accumulated in XMM6 */
1723         SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6)
1724         SSE_SHUFFLE(XMM6, XMM6, 0x00)
1725         SSE_MULT_PS(XMM6, XMM0)
1726 
1727         SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7)
1728         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1729         SSE_MULT_PS(XMM7, XMM1)
1730         SSE_ADD_PS(XMM6, XMM7)
1731 
1732         SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7)
1733         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1734         SSE_MULT_PS(XMM7, XMM2)
1735         SSE_ADD_PS(XMM6, XMM7)
1736 
1737         SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7)
1738         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1739         SSE_MULT_PS(XMM7, XMM3)
1740         SSE_ADD_PS(XMM6, XMM7)
1741 
1742         SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6)
1743         SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1744 
1745         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1746         /* Column 3, product is accumulated in XMM0 */
1747         SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7)
1748         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1749         SSE_MULT_PS(XMM0, XMM7)
1750 
1751         SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7)
1752         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1753         SSE_MULT_PS(XMM1, XMM7)
1754         SSE_ADD_PS(XMM0, XMM1)
1755 
1756         SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1)
1757         SSE_SHUFFLE(XMM1, XMM1, 0x00)
1758         SSE_MULT_PS(XMM1, XMM2)
1759         SSE_ADD_PS(XMM0, XMM1)
1760 
1761         SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7)
1762         SSE_SHUFFLE(XMM7, XMM7, 0x00)
1763         SSE_MULT_PS(XMM3, XMM7)
1764         SSE_ADD_PS(XMM0, XMM3)
1765 
1766         SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0)
1767         SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1768 
1769         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1770         /* This is code to be maintained and read by humans after all. */
1771         /* Copy Multiplier Col 3 into XMM3 */
1772         SSE_COPY_PS(XMM3, XMM0)
1773         /* Copy Multiplier Col 2 into XMM2 */
1774         SSE_COPY_PS(XMM2, XMM6)
1775         /* Copy Multiplier Col 1 into XMM1 */
1776         SSE_COPY_PS(XMM1, XMM5)
1777         /* Copy Multiplier Col 0 into XMM0 */
1778         SSE_COPY_PS(XMM0, XMM4)
1779         SSE_INLINE_END_2;
1780 
1781         /* Update the row: */
1782         nz = bi[row + 1] - diag_offset[row] - 1;
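        /* nz = number of blocks to the right of the diagonal in block row `row` of the factor;
           row i is updated against each of them below. */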
1783         pv += 16;
1784         for (j = 0; j < nz; j++) {
1785           PREFETCH_L1(&pv[16]);
1786           x = rtmp + 16 * ((unsigned int)pj[j]);
1787           /*            x = rtmp + 4*pj[j]; */
1788 
1789           /* X:=X-M*PV, One column at a time */
1790           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
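          /* Per column: x(:,c) -= pv[4c]*M(:,0) + pv[4c+1]*M(:,1) + pv[4c+2]*M(:,2) + pv[4c+3]*M(:,3) */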
1791           SSE_INLINE_BEGIN_2(x, pv)
1792           /* Load First Column of X*/
1793           SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1794           SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1795 
1796           /* Matrix-Vector Product: */
1797           SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
1798           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1799           SSE_MULT_PS(XMM5, XMM0)
1800           SSE_SUB_PS(XMM4, XMM5)
1801 
1802           SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
1803           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1804           SSE_MULT_PS(XMM6, XMM1)
1805           SSE_SUB_PS(XMM4, XMM6)
1806 
1807           SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
1808           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1809           SSE_MULT_PS(XMM7, XMM2)
1810           SSE_SUB_PS(XMM4, XMM7)
1811 
1812           SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
1813           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1814           SSE_MULT_PS(XMM5, XMM3)
1815           SSE_SUB_PS(XMM4, XMM5)
1816 
1817           SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
1818           SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)
1819 
1820           /* Second Column */
1821           SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1822           SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1823 
1824           /* Matrix-Vector Product: */
1825           SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
1826           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1827           SSE_MULT_PS(XMM6, XMM0)
1828           SSE_SUB_PS(XMM5, XMM6)
1829 
1830           SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
1831           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1832           SSE_MULT_PS(XMM7, XMM1)
1833           SSE_SUB_PS(XMM5, XMM7)
1834 
1835           SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
1836           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1837           SSE_MULT_PS(XMM4, XMM2)
1838           SSE_SUB_PS(XMM5, XMM4)
1839 
1840           SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
1841           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1842           SSE_MULT_PS(XMM6, XMM3)
1843           SSE_SUB_PS(XMM5, XMM6)
1844 
1845           SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
1846           SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)
1847 
1848           SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)
1849 
1850           /* Third Column */
1851           SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1852           SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1853 
1854           /* Matrix-Vector Product: */
1855           SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
1856           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1857           SSE_MULT_PS(XMM7, XMM0)
1858           SSE_SUB_PS(XMM6, XMM7)
1859 
1860           SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
1861           SSE_SHUFFLE(XMM4, XMM4, 0x00)
1862           SSE_MULT_PS(XMM4, XMM1)
1863           SSE_SUB_PS(XMM6, XMM4)
1864 
1865           SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
1866           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1867           SSE_MULT_PS(XMM5, XMM2)
1868           SSE_SUB_PS(XMM6, XMM5)
1869 
1870           SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
1871           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1872           SSE_MULT_PS(XMM7, XMM3)
1873           SSE_SUB_PS(XMM6, XMM7)
1874 
1875           SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
1876           SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1877 
1878           /* Fourth Column */
1879           SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1880           SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1881 
1882           /* Matrix-Vector Product: */
1883           SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
1884           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1885           SSE_MULT_PS(XMM5, XMM0)
1886           SSE_SUB_PS(XMM4, XMM5)
1887 
1888           SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
1889           SSE_SHUFFLE(XMM6, XMM6, 0x00)
1890           SSE_MULT_PS(XMM6, XMM1)
1891           SSE_SUB_PS(XMM4, XMM6)
1892 
1893           SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
1894           SSE_SHUFFLE(XMM7, XMM7, 0x00)
1895           SSE_MULT_PS(XMM7, XMM2)
1896           SSE_SUB_PS(XMM4, XMM7)
1897 
1898           SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
1899           SSE_SHUFFLE(XMM5, XMM5, 0x00)
1900           SSE_MULT_PS(XMM5, XMM3)
1901           SSE_SUB_PS(XMM4, XMM5)
1902 
1903           SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
1904           SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
1905           SSE_INLINE_END_2;
1906           pv += 16;
1907         }
1908         PetscCall(PetscLogFlops(128.0 * nz + 112.0));
1909       }
1910       row = (unsigned int)(*bjtmp++);
1911       /*        row = (*bjtmp++)/4; */
1912       /*        bjtmp++; */
1913     }
1914     /* finished row so stick it into b->a */
1915     pv = ba + 16 * bi[i];
1916     pj = bj + bi[i];
1917     nz = bi[i + 1] - bi[i];
1918 
1919     /* Copy x block back into pv block */
1920     for (j = 0; j < nz; j++) {
1921       x = rtmp + 16 * ((unsigned int)pj[j]);
1922       /*        x  = rtmp+4*pj[j]; */
1923 
1924       SSE_INLINE_BEGIN_2(x, pv)
1925       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1926       SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
1927       SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)
1928 
1929       SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
1930       SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)
1931 
1932       SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
1933       SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)
1934 
1935       SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
1936       SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)
1937 
1938       SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
1939       SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)
1940 
1941       SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
1942       SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)
1943 
1944       SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
1945       SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)
1946 
1947       SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
1948       SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
1949       SSE_INLINE_END_2;
1950       pv += 16;
1951     }
1952     /* invert diagonal block */
1953     w = ba + 16 * diag_offset[i];
1954     if (pivotinblocks) {
1955       PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
1956       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1957     } else {
1958       PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
1959     }
1960     /*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
1961     /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1962   }
1963 
1964   PetscCall(PetscFree(rtmp));
1965 
1966   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1967   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1968   C->assembled           = PETSC_TRUE;
1969 
1970   PetscCall(PetscLogFlops(1.333333333333 * 4.0 * 16.0 * b->mbs)); /* bs = 4, bs2 = 16 for these 4x4 blocks */
1971   /* Flop Count from inverting diagonal blocks */
1972   SSE_SCOPE_END;
1973   PetscFunctionReturn(PETSC_SUCCESS);
1974 }
1975 
1976 #endif
1977