
/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

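/*
   Layout note (illustrative only; the unrolled kernels below hard-code these
   offsets): each 4x4 block occupies 16 consecutive entries of the value array,
   stored column-major, so entry (r,c) of block k of an array ba lives at
   ba[16*k + 4*c + r]. A hedged helper expressing that assumption:
*/
static inline MatScalar BAIJ4EntrySketch(const MatScalar *ba,PetscInt k,PetscInt r,PetscInt c)
{
  return ba[16*k + 4*c + r]; /* block k, row r, column c (column-major block) */
}
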
/* ------------------------------------------------------------*/
/*
      Version for when blocks are 4 by 4
*/
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba           = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift         = info->shiftamount;
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(ISGetIndices(isrow,&r));
  PetscCall(ISGetIndices(isicol,&ic));
  PetscCall(PetscMalloc1(16*(n+1),&rtmp));
  allowzeropivot = PetscNot(A->erroriffailure);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored) row */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ic[ajtmpold[j]];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv    = ba + 16*diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15   = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;

        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
          x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
          x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
          x     = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0*nz+112.0));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
  }

  PetscCall(PetscFree(rtmp));
  PetscCall(ISRestoreIndices(isicol,&ic));
  PetscCall(ISRestoreIndices(isrow,&r));

  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333*4*4*4*b->mbs)); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

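/*
   For orientation, a hedged plain-C sketch of what PetscKernel_A_gets_inverse_A_4()
   does to a column-major 4x4 block: Gauss-Jordan elimination with partial
   pivoting, substituting `shift` for an exactly-zero pivot (assumed
   MAT_SHIFT_INBLOCKS-style semantics). The real kernel in blockinvert.h is
   fully unrolled; this sketch is only a reading aid. Returns PETSC_TRUE if a
   zero pivot was met and no shift was available.
*/
static inline PetscBool BAIJ4InvertSketch(MatScalar *a,PetscReal shift)
{
  MatScalar aug[4][8];
  PetscInt  r,c,k,piv;

  for (r=0; r<4; r++) {
    for (c=0; c<4; c++) {
      aug[r][c]   = a[4*c+r];               /* left half: the block itself */
      aug[r][c+4] = (r == c) ? 1.0 : 0.0;   /* right half: identity */
    }
  }
  for (k=0; k<4; k++) {
    piv = k;                                /* partial pivoting within the block */
    for (r=k+1; r<4; r++) if (PetscAbsScalar(aug[r][k]) > PetscAbsScalar(aug[piv][k])) piv = r;
    if (piv != k) for (c=0; c<8; c++) { MatScalar t = aug[k][c]; aug[k][c] = aug[piv][c]; aug[piv][c] = t; }
    if (aug[k][k] == 0.0) {
      if (shift == 0.0) return PETSC_TRUE;  /* zero pivot detected */
      aug[k][k] = shift;
    }
    {
      MatScalar d = aug[k][k];
      for (c=0; c<8; c++) aug[k][c] /= d;
      for (r=0; r<4; r++) {
        MatScalar f;
        if (r == k) continue;
        f = aug[r][k];
        for (c=0; c<8; c++) aug[r][c] -= f*aug[k][c];
      }
    }
  }
  for (r=0; r<4; r++) for (c=0; c<4; c++) a[4*c+r] = aug[r][c+4];
  return PETSC_FALSE;
}
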
/* MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       PetscKernel_A_gets_A_times_B()
       PetscKernel_A_gets_A_minus_B_times_C()
       PetscKernel_A_gets_inverse_A()
*/

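/*
   Hedged plain-C sketches of the semantics of the two unrolled multiply kernels
   named above, operating on column-major 4x4 blocks (the real versions in
   blockinvert.h are fully unrolled; these are only for reading the algorithm):
*/
static inline void BAIJ4AgetsAtimesBSketch(MatScalar *A,const MatScalar *B,MatScalar *W)
{
  PetscInt r,c,k;
  for (k=0; k<16; k++) W[k] = A[k];                          /* W = A */
  for (c=0; c<4; c++) {
    for (r=0; r<4; r++) {
      MatScalar s = 0.0;
      for (k=0; k<4; k++) s += W[4*k+r]*B[4*c+k];
      A[4*c+r] = s;                                          /* A = W*B */
    }
  }
}

static inline void BAIJ4AgetsAminusBtimesCSketch(MatScalar *A,const MatScalar *B,const MatScalar *C)
{
  PetscInt r,c,k;
  for (c=0; c<4; c++)
    for (r=0; r<4; r++)
      for (k=0; k<4; k++) A[4*c+r] -= B[4*k+r]*C[4*c+k];     /* A -= B*C */
}
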
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C     = B;
  Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  const PetscInt *r,*ic;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift;
  PetscBool      allowzeropivot,zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  PetscCall(ISGetIndices(isrow,&r));
  PetscCall(ISGetIndices(isicol,&ic));

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2*n,&rtmp,bs2,&mwork));
  PetscCall(PetscArrayzero(rtmp,bs2*n));

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2));
    }

    /* U part */
    nz    = bdiag[i]-bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2));
    }

    /* load in initial (unfactored) row */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      PetscCall(PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2));
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j]!=0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_4(pc,pv,mwork));

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv));
          pv  += bs2;
        }
        PetscCall(PetscLogFlops(128.0*nz+112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 = 128*nz + 112 for bs = 4 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2));
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv,rtmp+bs2*pj[0],bs2));
    PetscCall(PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2));
    }
  }

  PetscCall(PetscFree2(rtmp,mwork));
  PetscCall(ISRestoreIndices(isicol,&ic));
  PetscCall(ISRestoreIndices(isrow,&r));

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333*4*4*4*n)); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
/*
    Default Version for when blocks are 4 by 4 Using natural ordering
*/
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba           = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift         = info->shiftamount;
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  PetscCall(PetscMalloc1(16*(n+1),&rtmp));

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored) row */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmpold[j];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv    = ba + 16*diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15   = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
        nz     = bi[row+1] - diag_offset[row] - 1;
        pv    += 16;
        for (j=0; j<nz; j++) {
          x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
          x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
          x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
          x     = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0*nz+112.0));
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333*4*4*4*b->mbs)); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
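/*
   Storage note for the non-inplace factor used below (inferred from the index
   arithmetic in this file, so treat it as an assumption): b->diag (bdiag)
   decreases with i; the diagonal block of row i sits at position bdiag[i] of
   b->j/b->a, and the strictly-upper part U(i,:) occupies positions
   bdiag[i+1]+1 .. bdiag[i]-1. Illustrative helpers:
*/
static inline PetscInt BAIJ4URowStartSketch(const PetscInt *bdiag,PetscInt i)
{
  return bdiag[i+1] + 1;            /* first off-diagonal entry of U(i,:) */
}
static inline PetscInt BAIJ4URowLenSketch(const PetscInt *bdiag,PetscInt i)
{
  return bdiag[i] - bdiag[i+1] - 1; /* number of off-diagonal entries in U(i,:) */
}
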
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift;
  PetscBool      allowzeropivot,zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2*n,&rtmp,bs2,&mwork));
  PetscCall(PetscArrayzero(rtmp,bs2*n));

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2));
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2));
    }

    /* load in initial (unfactored) row */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      PetscCall(PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2));
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j]!=0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_4(pc,pv,mwork));

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv));
          pv  += bs2;
        }
        PetscCall(PetscLogFlops(128.0*nz+112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 = 128*nz + 112 for bs = 4 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2));
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv,rtmp+bs2*pj[0],bs2));
    PetscCall(PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2));
    }
  }
  PetscCall(PetscFree2(rtmp,mwork));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333*4*4*4*n)); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

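#include <xmmintrin.h> /* standard SSE intrinsics, used only by the illustrative sketch below */

/*
   Hedged sketch of the zero-block test the PETSc SSE macro blocks below
   implement: compare all 16 entries of a 4x4 block against zero and reduce the
   result to one integer bitmask suitable for branching, as MOVEMASK does.
   Assumes the single-precision scalars this SSE path requires.
*/
static inline int BAIJ4BlockNonzeroSketch(const float *x)
{
  __m128 zero = _mm_setzero_ps();
  __m128 m    = _mm_cmpneq_ps(_mm_loadu_ps(x),zero);
  m = _mm_or_ps(m,_mm_cmpneq_ps(_mm_loadu_ps(x+4),zero));
  m = _mm_or_ps(m,_mm_cmpneq_ps(_mm_loadu_ps(x+8),zero));
  m = _mm_or_ps(m,_mm_cmpneq_ps(_mm_loadu_ps(x+12),zero));
  return _mm_movemask_ps(m); /* nonzero iff some entry of the block is nonzero */
}
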
/* SSE Version for when blocks are 4 by 4 Using natural ordering */
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C  = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba    = b->a,*aa = a->a;
  int            nonzero=0;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift         = info->shiftamount;
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  PetscCheck((unsigned long)ba%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
  PetscCall(PetscMalloc1(16*(n+1),&rtmp));
  PetscCheck((unsigned long)rtmp%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */
  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*bjtmp[j];
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored) row */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
/*        x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
/*      row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*pj[j];
/*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0*nz+112.0));
      }
      row = *bjtmp++;
/*        row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
/*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
/*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333*4*4*4*b->mbs));
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

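/*
   Hedged sketch of the column-at-a-time 4x4 product the SSE blocks above form:
   with the multiplier's columns already in registers, each result column is a
   sum of scalar-broadcast multiplies, i.e. M*p for one column p of PV.
   (Uses standard intrinsics rather than PETSc's macro layer.)
*/
static inline __m128 BAIJ4MatColSketch(__m128 m0,__m128 m1,__m128 m2,__m128 m3,const float *p)
{
  __m128 r = _mm_mul_ps(m0,_mm_set1_ps(p[0]));      /* broadcast p[0], scale column 0 */
  r = _mm_add_ps(r,_mm_mul_ps(m1,_mm_set1_ps(p[1])));
  r = _mm_add_ps(r,_mm_mul_ps(m2,_mm_set1_ps(p[2])));
  r = _mm_add_ps(r,_mm_mul_ps(m3,_mm_set1_ps(p[3])));
  return r;                                          /* = m0*p0 + m1*p1 + m2*p2 + m3*p3 */
}
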
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
{
  Mat            A  = C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba    = b->a,*aa = a->a;
  int            nonzero=0;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  PetscCheck((unsigned long)ba%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
  PetscCall(PetscMalloc1(16*(n+1),&rtmp));
  PetscCheck((unsigned long)rtmp%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored) row */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)ajtmp[j]);
/*        x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
/*      row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
/*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0*nz+112.0));
      }
      row = (unsigned int)(*bjtmp++);
/*        row = (*bjtmp++)/4; */
/*        bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
/*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333*4*4*4*b->mbs));
1436   /* Flop Count from inverting diagonal blocks */
1437   SSE_SCOPE_END;
1438   PetscFunctionReturn(0);
1439 }
1440 
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
  unsigned int   row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba    = b->a,*aa = a->a;
  int            nonzero=0;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  PetscCheck((unsigned long)ba%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
  PetscCall(PetscMalloc1(16*(n+1),&rtmp));
  PetscCheck((unsigned long)rtmp%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
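  /* These kernels require 16-byte alignment of aa, ba, and rtmp; the checks above
     fail cleanly at setup instead of letting misaligned SSE accesses fault later. */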
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */
  if ((unsigned long)bj==(unsigned long)aj) {
    PetscCall(PetscFree(rtmp)); /* rtmp is not needed on the in-place path */
    PetscCall(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
    PetscFunctionReturn(0);
  }

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for  (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
/*        x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
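    /* Eliminate with each previously factored block row appearing in this row's
       pattern: form the multiplier from the (already inverted) diagonal block,
       then subtract the multiplier times that pivot row from row i. */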
/*      row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, LOAD might be more efficient than LOADL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);
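      /* MOVEMASK packs the sign bit of each comparison lane into an int, so
         nonzero is 0 exactly when the whole 4x4 block is zero and the update
         for this pivot row can be skipped. */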

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;
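        /* The multiplier now sits columnwise in XMM0-XMM3 and stays resident in
           registers throughout the trailing-row update below. */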

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
/*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
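        /* 128 flops per updated block (4 columns x (16 mults + 16 subs)) plus
           112 to form the multiplier (4 columns x (16 mults + 12 adds)) */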
        PetscCall(PetscLogFlops(128.0*nz+112.0));
      }
      row = (unsigned int)(*bjtmp++);
/*        row = (*bjtmp++)/4; */
/*        bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
/*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
/*      PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
    /* Note: Using Cramer's rule here, the flop count below might be unfairly high or low */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  /* Flop count from inverting the diagonal blocks: (4/3)*bs^3 per block with bs = 4 */
  PetscCall(PetscLogFlops(1.333333333333*4.0*16.0*b->mbs));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif