xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision 2205254efee3a00a594e5e2a3a70f74dcb40bc03)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
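
/*
    Data layout used throughout this file (SeqBAIJ, block size bs = 4):
      a->i, a->j   - CSR row pointers and block column indices of A, in block units
      a->a         - the nonzero blocks, each bs*bs = 16 scalars stored column major,
                     so entry (r,c) of a block sits at offset 4*c + r
      b->i, b->j, b->a, b->diag - the same information for the factor, with b->diag
                     locating the diagonal block of each block row
    Expressions such as aa + 16*ai[i] therefore point at the first block of block row i.
*/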
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <../src/mat/blockinvert.h>
7 
8 /* ------------------------------------------------------------*/
9 /*
10       Version for when blocks are 4 by 4
11 */
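
/*
    Outline (a restatement of what the loop below does, for readability):
    for each block row i the working row rtmp is loaded from A with the row
    permutation r applied and the column indices mapped through ic; then for
    every already-factored block row k that appears in row i the usual block
    elimination step is performed,

        L(i,k)  = rtmp(k) * inv(U(k,k))        (diagonal blocks are stored
                                                 already inverted)
        rtmp(j) = rtmp(j) - L(i,k) * U(k,j)    for the remaining blocks j > k,

    and finally the finished row is copied back into b->a and its diagonal
    block is inverted for use by later rows and by the triangular solves.
*/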
12 #undef __FUNCT__
13 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
14 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
15 {
16   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
17   IS             isrow = b->row,isicol = b->icol;
18   PetscErrorCode ierr;
19   const PetscInt *r,*ic;
20   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
21   PetscInt       *ajtmpold,*ajtmp,nz,row;
22   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
23   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
24   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
25   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
26   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
27   MatScalar      m13,m14,m15,m16;
28   MatScalar      *ba = b->a,*aa = a->a;
29   PetscBool      pivotinblocks = b->pivotinblocks;
30   PetscReal      shift = info->shiftamount;
31 
32   PetscFunctionBegin;
33   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
34   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
35   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
36 
37   for (i=0; i<n; i++) {
38     nz    = bi[i+1] - bi[i];
39     ajtmp = bj + bi[i];
40     for  (j=0; j<nz; j++) {
41       x = rtmp+16*ajtmp[j];
42       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
43       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
44     }
    /* load in the initial (unfactored) row */
46     idx      = r[i];
47     nz       = ai[idx+1] - ai[idx];
48     ajtmpold = aj + ai[idx];
49     v        = aa + 16*ai[idx];
50     for (j=0; j<nz; j++) {
51       x    = rtmp+16*ic[ajtmpold[j]];
52       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
53       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
54       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
55       x[14] = v[14]; x[15] = v[15];
56       v    += 16;
57     }
58     row = *ajtmp++;
59     while (row < i) {
60       pc  = rtmp + 16*row;
61       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
62       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
63       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
64       p15 = pc[14]; p16 = pc[15];
65       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
66           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
67           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
68           || p16 != 0.0) {
69         pv = ba + 16*diag_offset[row];
70         pj = bj + diag_offset[row] + 1;
71         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
72         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
73         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
74         x15 = pv[14]; x16 = pv[15];
75         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
76         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
77         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
78         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
79 
80         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
81         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
82         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
83         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
84 
85         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
86         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
87         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
88         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
89 
90         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
91         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
92         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
93         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
94 
95         nz = bi[row+1] - diag_offset[row] - 1;
96         pv += 16;
97         for (j=0; j<nz; j++) {
98           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
99           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
100           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
102           x    = rtmp + 16*pj[j];
103           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
104           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
105           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
106           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
107 
108           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
109           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
110           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
111           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
112 
113           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
114           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
117 
118           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
119           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
120           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
121           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
122 
123           pv   += 16;
124         }
125         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
126       }
127       row = *ajtmp++;
128     }
129     /* finished row so stick it into b->a */
130     pv = ba + 16*bi[i];
131     pj = bj + bi[i];
132     nz = bi[i+1] - bi[i];
133     for (j=0; j<nz; j++) {
134       x      = rtmp+16*pj[j];
135       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
136       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
137       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139       pv   += 16;
140     }
141     /* invert diagonal block */
142     w    = ba + 16*diag_offset[i];
143     if (pivotinblocks) {
144       ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
145     } else {
146       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
147     }
148   }
149 
150   ierr = PetscFree(rtmp);CHKERRQ(ierr);
151   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
152   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
153   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
154   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
155   C->assembled = PETSC_TRUE;
156   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
157   PetscFunctionReturn(0);
158 }
159 
160 /* MatLUFactorNumeric_SeqBAIJ_4 -
161      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
162        PetscKernel_A_gets_A_times_B()
163        PetscKernel_A_gets_A_minus_B_times_C()
164        PetscKernel_A_gets_inverse_A()
165 */
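
/*
   For reference, a rough sketch of what the two 4x4 kernels used below are
   assumed to compute on column-major blocks (illustration only -- the actual
   kernels are declared via ../src/mat/blockinvert.h and may be implemented
   differently):

     PetscKernel_A_gets_A_times_B_4(A,B,W)          A = A*B, W is a 16-entry scratch array
       PetscMemcpy(W,A,16*sizeof(MatScalar));
       for (c=0; c<4; c++)
         for (r=0; r<4; r++)
           A[4*c+r] = W[4*0+r]*B[4*c+0] + W[4*1+r]*B[4*c+1]
                    + W[4*2+r]*B[4*c+2] + W[4*3+r]*B[4*c+3];

     PetscKernel_A_gets_A_minus_B_times_C_4(A,B,C)  A = A - B*C
       for (c=0; c<4; c++)
         for (r=0; r<4; r++)
           A[4*c+r] -= B[4*0+r]*C[4*c+0] + B[4*1+r]*C[4*c+1]
                     + B[4*2+r]*C[4*c+2] + B[4*3+r]*C[4*c+3];
*/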
166 
167 #undef __FUNCT__
168 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
169 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
170 {
171   Mat            C=B;
172   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
173   IS             isrow = b->row,isicol = b->icol;
174   PetscErrorCode ierr;
175   const PetscInt *r,*ic;
176   PetscInt       i,j,k,nz,nzL,row;
177   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
178   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
179   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
180   PetscInt       flg;
181   PetscReal      shift;
182 
183   PetscFunctionBegin;
184   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
185   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
186 
187   if (info->shifttype == MAT_SHIFT_NONE) {
188     shift = 0;
189   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
190     shift = info->shiftamount;
191   }
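
  /* shift is forwarded to the 4x4 inversion kernel so that, with MAT_SHIFT_INBLOCKS,
     a zero (or very small) pivot encountered while inverting a diagonal block can be
     perturbed instead of aborting the factorization */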
192 
193   /* generate work space needed by the factorization */
194   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
195   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
196 
197   for (i=0; i<n; i++) {
198     /* zero rtmp */
199     /* L part */
200     nz    = bi[i+1] - bi[i];
201     bjtmp = bj + bi[i];
202     for  (j=0; j<nz; j++) {
203       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
204     }
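
    /* Storage of the factor in this (non in-place) layout: the L blocks of
       block row i occupy bj[bi[i]..bi[i+1]), while the U blocks are stored in
       reverse order, with bdiag[i] indexing the diagonal block and
       bdiag[i+1]+1..bdiag[i]-1 holding the strictly upper blocks, so
       bdiag[i]-bdiag[i+1]-1 is the number of off-diagonal U blocks in row i. */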
205 
206     /* U part */
207     nz = bdiag[i]-bdiag[i+1];
208     bjtmp = bj + bdiag[i+1]+1;
209     for  (j=0; j<nz; j++) {
210       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
211     }
212 
    /* load in the initial (unfactored) row */
214     nz    = ai[r[i]+1] - ai[r[i]];
215     ajtmp = aj + ai[r[i]];
216     v     = aa + bs2*ai[r[i]];
217     for (j=0; j<nz; j++) {
218       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
219     }
220 
221     /* elimination */
222     bjtmp = bj + bi[i];
223     nzL   = bi[i+1] - bi[i];
224     for (k=0;k < nzL;k++) {
225       row = bjtmp[k];
226       pc = rtmp + bs2*row;
227       for (flg=0,j=0; j<bs2; j++) {
228         if (pc[j]!=0.0) {
229           flg = 1;
230           break;
231         }
232       }
233       if (flg) {
234         pv = b->a + bs2*bdiag[row];
235         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
236         ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
237 
        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
239         pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
241         for (j=0; j<nz; j++) {
242           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
243           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
244           v    = rtmp + bs2*pj[j];
245           ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
246           pv  += bs2;
247         }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
249       }
250     }
251 
252     /* finished row so stick it into b->a */
253     /* L part */
254     pv   = b->a + bs2*bi[i] ;
255     pj   = b->j + bi[i] ;
256     nz   = bi[i+1] - bi[i];
257     for (j=0; j<nz; j++) {
258       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
259     }
260 
    /* Mark diagonal and invert diagonal for simpler triangular solves */
262     pv   = b->a + bs2*bdiag[i];
263     pj   = b->j + bdiag[i];
264     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
265     ierr = PetscKernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
266 
267     /* U part */
268     pv = b->a + bs2*(bdiag[i+1]+1);
269     pj = b->j + bdiag[i+1]+1;
270     nz = bdiag[i] - bdiag[i+1] - 1;
271     for (j=0; j<nz; j++) {
272       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
273     }
274   }
275 
276   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
277   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
278   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
279   C->ops->solve          = MatSolve_SeqBAIJ_4;
280   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
281   C->assembled = PETSC_TRUE;
282   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
283   PetscFunctionReturn(0);
284 }
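
/*
   For context, a minimal sketch of how this routine is normally reached through
   the public interface (assuming a SeqBAIJ matrix A with block size 4 and vectors
   b and x; standard PETSc calls, error checking omitted):

     Mat           F;
     IS            row,col;
     MatFactorInfo info;
     MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
     MatGetOrdering(A,MATORDERINGND,&row,&col);
     MatFactorInfoInitialize(&info);
     MatLUFactorSymbolic(F,A,row,col,&info);
     MatLUFactorNumeric(F,A,&info);
     MatSolve(F,b,x);

   MatLUFactorNumeric() dispatches to one of the block-size-4 kernels in this file.
*/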
285 
286 #undef __FUNCT__
287 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
288 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
289 {
290 /*
291     Default Version for when blocks are 4 by 4 Using natural ordering
292 */
293   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
294   PetscErrorCode ierr;
295   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
296   PetscInt       *ajtmpold,*ajtmp,nz,row;
297   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
298   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
299   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
300   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
301   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
302   MatScalar      m13,m14,m15,m16;
303   MatScalar      *ba = b->a,*aa = a->a;
304   PetscBool      pivotinblocks = b->pivotinblocks;
305   PetscReal      shift = info->shiftamount;
306 
307   PetscFunctionBegin;
308   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
309 
310   for (i=0; i<n; i++) {
311     nz    = bi[i+1] - bi[i];
312     ajtmp = bj + bi[i];
313     for  (j=0; j<nz; j++) {
314       x = rtmp+16*ajtmp[j];
315       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
316       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
317     }
    /* load in the initial (unfactored) row */
319     nz       = ai[i+1] - ai[i];
320     ajtmpold = aj + ai[i];
321     v        = aa + 16*ai[i];
322     for (j=0; j<nz; j++) {
323       x    = rtmp+16*ajtmpold[j];
324       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
325       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
326       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
327       x[14] = v[14]; x[15] = v[15];
328       v    += 16;
329     }
330     row = *ajtmp++;
331     while (row < i) {
332       pc  = rtmp + 16*row;
333       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
334       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
335       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
336       p15 = pc[14]; p16 = pc[15];
337       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
338           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
339           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
340           || p16 != 0.0) {
341         pv = ba + 16*diag_offset[row];
342         pj = bj + diag_offset[row] + 1;
343         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
344         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
345         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
346         x15 = pv[14]; x16 = pv[15];
347         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
348         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
349         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
350         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
351 
352         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
353         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
354         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
355         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
356 
357         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
358         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
359         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
360         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
361 
362         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
363         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
364         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
365         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
366         nz = bi[row+1] - diag_offset[row] - 1;
367         pv += 16;
368         for (j=0; j<nz; j++) {
369           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
370           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
371           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
372           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
373           x    = rtmp + 16*pj[j];
374           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
375           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
376           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
377           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
378 
379           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
380           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
381           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
382           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
383 
384           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
385           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
386           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
387           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
388 
389           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
390           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
391           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
392           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
393 
394           pv   += 16;
395         }
396         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
397       }
398       row = *ajtmp++;
399     }
400     /* finished row so stick it into b->a */
401     pv = ba + 16*bi[i];
402     pj = bj + bi[i];
403     nz = bi[i+1] - bi[i];
404     for (j=0; j<nz; j++) {
405       x      = rtmp+16*pj[j];
406       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
407       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
408       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
409       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
410       pv   += 16;
411     }
412     /* invert diagonal block */
413     w = ba + 16*diag_offset[i];
414     if (pivotinblocks) {
415       ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
416     } else {
417       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
418     }
419   }
420 
421   ierr = PetscFree(rtmp);CHKERRQ(ierr);
422   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
423   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
424   C->assembled = PETSC_TRUE;
425   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
426   PetscFunctionReturn(0);
427 }
428 
429 /*
430   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
431     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
432 */
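
/*
   The natural-ordering variant below is the same as MatLUFactorNumeric_SeqBAIJ_4()
   except that no row/column permutation is applied, so the isrow/isicol index
   arrays are not needed and each block row is loaded directly from A.
*/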
433 #undef __FUNCT__
434 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
435 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
436 {
437   Mat            C=B;
438   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
439   PetscErrorCode ierr;
440   PetscInt       i,j,k,nz,nzL,row;
441   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
442   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
443   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
444   PetscInt       flg;
445   PetscReal      shift;
446 
447   PetscFunctionBegin;
448   /* generate work space needed by the factorization */
449   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
450   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
451 
452   if (info->shifttype == MAT_SHIFT_NONE) {
453     shift = 0;
454   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
455     shift = info->shiftamount;
456   }
457 
458   for (i=0; i<n; i++) {
459     /* zero rtmp */
460     /* L part */
461     nz    = bi[i+1] - bi[i];
462     bjtmp = bj + bi[i];
463     for  (j=0; j<nz; j++) {
464       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
465     }
466 
467     /* U part */
468     nz = bdiag[i] - bdiag[i+1];
469     bjtmp = bj + bdiag[i+1]+1;
470     for  (j=0; j<nz; j++) {
471       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
472     }
473 
    /* load in the initial (unfactored) row */
475     nz    = ai[i+1] - ai[i];
476     ajtmp = aj + ai[i];
477     v     = aa + bs2*ai[i];
478     for (j=0; j<nz; j++) {
479       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
480     }
481 
482     /* elimination */
483     bjtmp = bj + bi[i];
484     nzL   = bi[i+1] - bi[i];
485     for (k=0;k < nzL;k++) {
486       row = bjtmp[k];
487       pc = rtmp + bs2*row;
488       for (flg=0,j=0; j<bs2; j++) {
489         if (pc[j]!=0.0) {
490           flg = 1;
491           break;
492         }
493       }
494       if (flg) {
495         pv = b->a + bs2*bdiag[row];
496         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
497         ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
498 
        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
500         pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
502         for (j=0; j<nz; j++) {
503           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
504           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
505           v    = rtmp + bs2*pj[j];
506           ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
507           pv  += bs2;
508         }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
510       }
511     }
512 
513     /* finished row so stick it into b->a */
514     /* L part */
515     pv   = b->a + bs2*bi[i] ;
516     pj   = b->j + bi[i] ;
517     nz   = bi[i+1] - bi[i];
518     for (j=0; j<nz; j++) {
519       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
520     }
521 
    /* Mark diagonal and invert diagonal for simpler triangular solves */
523     pv   = b->a + bs2*bdiag[i];
524     pj   = b->j + bdiag[i];
525     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
526     ierr = PetscKernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
527 
528     /* U part */
529     pv = b->a + bs2*(bdiag[i+1]+1);
530     pj = b->j + bdiag[i+1]+1;
531     nz = bdiag[i] - bdiag[i+1] - 1;
532     for (j=0; j<nz; j++) {
533       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
534     }
535   }
536   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
537   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
538   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
539   C->assembled = PETSC_TRUE;
540   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
541   PetscFunctionReturn(0);
542 }
543 
544 #if defined(PETSC_HAVE_SSE)
545 
546 #include PETSC_HAVE_SSE
547 
548 /* SSE Version for when blocks are 4 by 4 Using natural ordering */
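
/*
   The routines in this #if block use packed single-precision SSE operations
   (the *_PS macros act on four floats at a time), so they assume MatScalar is
   single precision, and they require the block arrays aa, ba and the work
   array rtmp to be 16-byte aligned, which is checked at the start of each
   routine below.
*/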
549 #undef __FUNCT__
550 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
551 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
552 {
  Mat            C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
/*    int            nonzero=0,colscale = 16; */
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;
566 
567   PetscFunctionBegin;
568   SSE_SCOPE_BEGIN;
569 
570   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
571   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
572   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
573   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
574 /*    if ((unsigned long)bj==(unsigned long)aj) { */
575 /*      colscale = 4; */
576 /*    } */
577   for (i=0; i<n; i++) {
578     nz    = bi[i+1] - bi[i];
579     bjtmp = bj + bi[i];
580     /* zero out the 4x4 block accumulators */
581     /* zero out one register */
582     XOR_PS(XMM7,XMM7);
583     for  (j=0; j<nz; j++) {
584       x = rtmp+16*bjtmp[j];
585 /*        x = rtmp+4*bjtmp[j]; */
586       SSE_INLINE_BEGIN_1(x)
587         /* Copy zero register to memory locations */
588         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
589         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
590         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
591         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
592         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
593         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
594         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
595         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
596         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
597       SSE_INLINE_END_1;
598     }
    /* load in the initial (unfactored) row */
600     nz       = ai[i+1] - ai[i];
601     ajtmpold = aj + ai[i];
602     v        = aa + 16*ai[i];
603     for (j=0; j<nz; j++) {
604       x = rtmp+16*ajtmpold[j];
605 /*        x = rtmp+colscale*ajtmpold[j]; */
606       /* Copy v block into x block */
607       SSE_INLINE_BEGIN_2(v,x)
608         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
609         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
610         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
611 
612         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
613         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
614 
615         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
616         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
617 
618         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
619         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
620 
621         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
622         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
623 
624         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
625         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
626 
627         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
628         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
629 
630         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
631         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
632       SSE_INLINE_END_2;
633 
634       v += 16;
635     }
636 /*      row = (*bjtmp++)/4; */
637     row = *bjtmp++;
638     while (row < i) {
639       pc  = rtmp + 16*row;
640       SSE_INLINE_BEGIN_1(pc)
641         /* Load block from lower triangle */
642         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
643         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
644         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
645 
646         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
647         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
648 
649         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
650         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
651 
652         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
653         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
654 
655         /* Compare block to zero block */
656 
657         SSE_COPY_PS(XMM4,XMM7)
658         SSE_CMPNEQ_PS(XMM4,XMM0)
659 
660         SSE_COPY_PS(XMM5,XMM7)
661         SSE_CMPNEQ_PS(XMM5,XMM1)
662 
663         SSE_COPY_PS(XMM6,XMM7)
664         SSE_CMPNEQ_PS(XMM6,XMM2)
665 
666         SSE_CMPNEQ_PS(XMM7,XMM3)
667 
668         /* Reduce the comparisons to one SSE register */
669         SSE_OR_PS(XMM6,XMM7)
670         SSE_OR_PS(XMM5,XMM4)
671         SSE_OR_PS(XMM5,XMM6)
672       SSE_INLINE_END_1;
673 
674       /* Reduce the one SSE register to an integer register for branching */
675       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
676       MOVEMASK(nonzero,XMM5);
677 
678       /* If block is nonzero ... */
679       if (nonzero) {
680         pv = ba + 16*diag_offset[row];
681         PREFETCH_L1(&pv[16]);
682         pj = bj + diag_offset[row] + 1;
683 
684         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
685         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
686         /* but the diagonal was inverted already */
687         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
688 
689         SSE_INLINE_BEGIN_2(pv,pc)
690           /* Column 0, product is accumulated in XMM4 */
691           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
692           SSE_SHUFFLE(XMM4,XMM4,0x00)
693           SSE_MULT_PS(XMM4,XMM0)
694 
695           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
696           SSE_SHUFFLE(XMM5,XMM5,0x00)
697           SSE_MULT_PS(XMM5,XMM1)
698           SSE_ADD_PS(XMM4,XMM5)
699 
700           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
701           SSE_SHUFFLE(XMM6,XMM6,0x00)
702           SSE_MULT_PS(XMM6,XMM2)
703           SSE_ADD_PS(XMM4,XMM6)
704 
705           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
706           SSE_SHUFFLE(XMM7,XMM7,0x00)
707           SSE_MULT_PS(XMM7,XMM3)
708           SSE_ADD_PS(XMM4,XMM7)
709 
710           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
711           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
712 
713           /* Column 1, product is accumulated in XMM5 */
714           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
715           SSE_SHUFFLE(XMM5,XMM5,0x00)
716           SSE_MULT_PS(XMM5,XMM0)
717 
718           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
719           SSE_SHUFFLE(XMM6,XMM6,0x00)
720           SSE_MULT_PS(XMM6,XMM1)
721           SSE_ADD_PS(XMM5,XMM6)
722 
723           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
724           SSE_SHUFFLE(XMM7,XMM7,0x00)
725           SSE_MULT_PS(XMM7,XMM2)
726           SSE_ADD_PS(XMM5,XMM7)
727 
728           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
729           SSE_SHUFFLE(XMM6,XMM6,0x00)
730           SSE_MULT_PS(XMM6,XMM3)
731           SSE_ADD_PS(XMM5,XMM6)
732 
733           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
734           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
735 
736           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
737 
738           /* Column 2, product is accumulated in XMM6 */
739           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
740           SSE_SHUFFLE(XMM6,XMM6,0x00)
741           SSE_MULT_PS(XMM6,XMM0)
742 
743           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
744           SSE_SHUFFLE(XMM7,XMM7,0x00)
745           SSE_MULT_PS(XMM7,XMM1)
746           SSE_ADD_PS(XMM6,XMM7)
747 
748           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
749           SSE_SHUFFLE(XMM7,XMM7,0x00)
750           SSE_MULT_PS(XMM7,XMM2)
751           SSE_ADD_PS(XMM6,XMM7)
752 
753           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
754           SSE_SHUFFLE(XMM7,XMM7,0x00)
755           SSE_MULT_PS(XMM7,XMM3)
756           SSE_ADD_PS(XMM6,XMM7)
757 
758           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
759           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
760 
761           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
762           /* Column 3, product is accumulated in XMM0 */
763           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
764           SSE_SHUFFLE(XMM7,XMM7,0x00)
765           SSE_MULT_PS(XMM0,XMM7)
766 
767           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
768           SSE_SHUFFLE(XMM7,XMM7,0x00)
769           SSE_MULT_PS(XMM1,XMM7)
770           SSE_ADD_PS(XMM0,XMM1)
771 
772           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
773           SSE_SHUFFLE(XMM1,XMM1,0x00)
774           SSE_MULT_PS(XMM1,XMM2)
775           SSE_ADD_PS(XMM0,XMM1)
776 
777           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
778           SSE_SHUFFLE(XMM7,XMM7,0x00)
779           SSE_MULT_PS(XMM3,XMM7)
780           SSE_ADD_PS(XMM0,XMM3)
781 
782           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
783           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
784 
785           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
787           /* Copy Multiplier Col 3 into XMM3 */
788           SSE_COPY_PS(XMM3,XMM0)
789           /* Copy Multiplier Col 2 into XMM2 */
790           SSE_COPY_PS(XMM2,XMM6)
791           /* Copy Multiplier Col 1 into XMM1 */
792           SSE_COPY_PS(XMM1,XMM5)
793           /* Copy Multiplier Col 0 into XMM0 */
794           SSE_COPY_PS(XMM0,XMM4)
795         SSE_INLINE_END_2;
796 
797         /* Update the row: */
798         nz = bi[row+1] - diag_offset[row] - 1;
799         pv += 16;
800         for (j=0; j<nz; j++) {
801           PREFETCH_L1(&pv[16]);
802           x = rtmp + 16*pj[j];
803 /*            x = rtmp + 4*pj[j]; */
804 
805           /* X:=X-M*PV, One column at a time */
806           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
807           SSE_INLINE_BEGIN_2(x,pv)
808             /* Load First Column of X*/
809             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
810             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
811 
812             /* Matrix-Vector Product: */
813             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
814             SSE_SHUFFLE(XMM5,XMM5,0x00)
815             SSE_MULT_PS(XMM5,XMM0)
816             SSE_SUB_PS(XMM4,XMM5)
817 
818             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
819             SSE_SHUFFLE(XMM6,XMM6,0x00)
820             SSE_MULT_PS(XMM6,XMM1)
821             SSE_SUB_PS(XMM4,XMM6)
822 
823             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
824             SSE_SHUFFLE(XMM7,XMM7,0x00)
825             SSE_MULT_PS(XMM7,XMM2)
826             SSE_SUB_PS(XMM4,XMM7)
827 
828             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
829             SSE_SHUFFLE(XMM5,XMM5,0x00)
830             SSE_MULT_PS(XMM5,XMM3)
831             SSE_SUB_PS(XMM4,XMM5)
832 
833             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
834             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
835 
836             /* Second Column */
837             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
838             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
839 
840             /* Matrix-Vector Product: */
841             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
842             SSE_SHUFFLE(XMM6,XMM6,0x00)
843             SSE_MULT_PS(XMM6,XMM0)
844             SSE_SUB_PS(XMM5,XMM6)
845 
846             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
847             SSE_SHUFFLE(XMM7,XMM7,0x00)
848             SSE_MULT_PS(XMM7,XMM1)
849             SSE_SUB_PS(XMM5,XMM7)
850 
851             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
852             SSE_SHUFFLE(XMM4,XMM4,0x00)
853             SSE_MULT_PS(XMM4,XMM2)
854             SSE_SUB_PS(XMM5,XMM4)
855 
856             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
857             SSE_SHUFFLE(XMM6,XMM6,0x00)
858             SSE_MULT_PS(XMM6,XMM3)
859             SSE_SUB_PS(XMM5,XMM6)
860 
861             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
862             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
863 
864             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
865 
866             /* Third Column */
867             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
868             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
869 
870             /* Matrix-Vector Product: */
871             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
872             SSE_SHUFFLE(XMM7,XMM7,0x00)
873             SSE_MULT_PS(XMM7,XMM0)
874             SSE_SUB_PS(XMM6,XMM7)
875 
876             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
877             SSE_SHUFFLE(XMM4,XMM4,0x00)
878             SSE_MULT_PS(XMM4,XMM1)
879             SSE_SUB_PS(XMM6,XMM4)
880 
881             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
882             SSE_SHUFFLE(XMM5,XMM5,0x00)
883             SSE_MULT_PS(XMM5,XMM2)
884             SSE_SUB_PS(XMM6,XMM5)
885 
886             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
887             SSE_SHUFFLE(XMM7,XMM7,0x00)
888             SSE_MULT_PS(XMM7,XMM3)
889             SSE_SUB_PS(XMM6,XMM7)
890 
891             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
892             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
893 
894             /* Fourth Column */
895             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
896             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
897 
898             /* Matrix-Vector Product: */
899             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
900             SSE_SHUFFLE(XMM5,XMM5,0x00)
901             SSE_MULT_PS(XMM5,XMM0)
902             SSE_SUB_PS(XMM4,XMM5)
903 
904             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
905             SSE_SHUFFLE(XMM6,XMM6,0x00)
906             SSE_MULT_PS(XMM6,XMM1)
907             SSE_SUB_PS(XMM4,XMM6)
908 
909             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
910             SSE_SHUFFLE(XMM7,XMM7,0x00)
911             SSE_MULT_PS(XMM7,XMM2)
912             SSE_SUB_PS(XMM4,XMM7)
913 
914             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
915             SSE_SHUFFLE(XMM5,XMM5,0x00)
916             SSE_MULT_PS(XMM5,XMM3)
917             SSE_SUB_PS(XMM4,XMM5)
918 
919             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
920             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
921           SSE_INLINE_END_2;
922           pv   += 16;
923         }
924         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
925       }
926       row = *bjtmp++;
927 /*        row = (*bjtmp++)/4; */
928     }
929     /* finished row so stick it into b->a */
930     pv = ba + 16*bi[i];
931     pj = bj + bi[i];
932     nz = bi[i+1] - bi[i];
933 
934     /* Copy x block back into pv block */
935     for (j=0; j<nz; j++) {
936       x  = rtmp+16*pj[j];
937 /*        x  = rtmp+4*pj[j]; */
938 
939       SSE_INLINE_BEGIN_2(x,pv)
940         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
941         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
942         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
943 
944         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
945         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
946 
947         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
948         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
949 
950         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
951         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
952 
953         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
954         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
955 
956         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
957         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
958 
959         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
960         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
961 
962         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
963         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
964       SSE_INLINE_END_2;
965       pv += 16;
966     }
967     /* invert diagonal block */
968     w = ba + 16*diag_offset[i];
969     if (pivotinblocks) {
970       ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
971     } else {
972       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
973     }
974 /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
976   }
977 
978   ierr = PetscFree(rtmp);CHKERRQ(ierr);
979   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
980   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
981   C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
983   /* Flop Count from inverting diagonal blocks */
984   SSE_SCOPE_END;
985   PetscFunctionReturn(0);
986 }
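
/*
   The _usj variants below perform the same factorization but read the block
   column indices through unsigned short pointers (hence the casts of a->j and
   b->j), which assumes the number of block columns fits in 16 bits.
*/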
987 
988 #undef __FUNCT__
989 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
991 {
992   Mat            A=C;
993   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
994   PetscErrorCode ierr;
995   int i,j,n = a->mbs;
996   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
997   unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
998   unsigned int   row;
999   int            nz,*bi=b->i;
1000   int            *diag_offset = b->diag,*ai=a->i;
1001   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1002   MatScalar      *ba = b->a,*aa = a->a;
1003   int            nonzero=0;
1004 /*    int            nonzero=0,colscale = 16; */
1005   PetscBool      pivotinblocks = b->pivotinblocks;
1006   PetscReal      shift = info->shiftamount;
1007 
1008   PetscFunctionBegin;
1009   SSE_SCOPE_BEGIN;
1010 
1011   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1012   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1013   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1014   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1015 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1016 /*      colscale = 4; */
1017 /*    } */
1018 
1019   for (i=0; i<n; i++) {
1020     nz    = bi[i+1] - bi[i];
1021     bjtmp = bj + bi[i];
1022     /* zero out the 4x4 block accumulators */
1023     /* zero out one register */
1024     XOR_PS(XMM7,XMM7);
1025     for  (j=0; j<nz; j++) {
1026       x = rtmp+16*((unsigned int)bjtmp[j]);
1027 /*        x = rtmp+4*bjtmp[j]; */
1028       SSE_INLINE_BEGIN_1(x)
1029         /* Copy zero register to memory locations */
1030         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1031         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1032         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1033         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1034         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1035         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1036         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1037         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1038         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1039       SSE_INLINE_END_1;
1040     }
    /* load in the initial (unfactored) row */
1042     nz    = ai[i+1] - ai[i];
1043     ajtmp = aj + ai[i];
1044     v     = aa + 16*ai[i];
1045     for (j=0; j<nz; j++) {
1046       x = rtmp+16*((unsigned int)ajtmp[j]);
1047 /*        x = rtmp+colscale*ajtmp[j]; */
1048       /* Copy v block into x block */
1049       SSE_INLINE_BEGIN_2(v,x)
1050         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1051         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1052         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1053 
1054         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1055         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1056 
1057         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1058         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1059 
1060         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1061         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1062 
1063         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1064         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1065 
1066         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1067         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1068 
1069         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1070         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1071 
1072         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1073         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1074       SSE_INLINE_END_2;
1075 
1076       v += 16;
1077     }
1078 /*      row = (*bjtmp++)/4; */
1079     row = (unsigned int)(*bjtmp++);
1080     while (row < i) {
1081       pc  = rtmp + 16*row;
1082       SSE_INLINE_BEGIN_1(pc)
1083         /* Load block from lower triangle */
1084         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1085         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1086         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1087 
1088         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1089         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1090 
1091         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1092         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1093 
1094         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1095         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1096 
1097         /* Compare block to zero block */
1098 
1099         SSE_COPY_PS(XMM4,XMM7)
1100         SSE_CMPNEQ_PS(XMM4,XMM0)
1101 
1102         SSE_COPY_PS(XMM5,XMM7)
1103         SSE_CMPNEQ_PS(XMM5,XMM1)
1104 
1105         SSE_COPY_PS(XMM6,XMM7)
1106         SSE_CMPNEQ_PS(XMM6,XMM2)
1107 
1108         SSE_CMPNEQ_PS(XMM7,XMM3)
1109 
1110         /* Reduce the comparisons to one SSE register */
1111         SSE_OR_PS(XMM6,XMM7)
1112         SSE_OR_PS(XMM5,XMM4)
1113         SSE_OR_PS(XMM5,XMM6)
1114       SSE_INLINE_END_1;
1115 
1116       /* Reduce the one SSE register to an integer register for branching */
1117       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1118       MOVEMASK(nonzero,XMM5);
1119 
1120       /* If block is nonzero ... */
1121       if (nonzero) {
1122         pv = ba + 16*diag_offset[row];
1123         PREFETCH_L1(&pv[16]);
1124         pj = bj + diag_offset[row] + 1;
1125 
1126         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1127         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1128         /* but the diagonal was inverted already */
1129         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1130 
1131         SSE_INLINE_BEGIN_2(pv,pc)
1132           /* Column 0, product is accumulated in XMM4 */
1133           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1134           SSE_SHUFFLE(XMM4,XMM4,0x00)
1135           SSE_MULT_PS(XMM4,XMM0)
1136 
1137           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1138           SSE_SHUFFLE(XMM5,XMM5,0x00)
1139           SSE_MULT_PS(XMM5,XMM1)
1140           SSE_ADD_PS(XMM4,XMM5)
1141 
1142           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1143           SSE_SHUFFLE(XMM6,XMM6,0x00)
1144           SSE_MULT_PS(XMM6,XMM2)
1145           SSE_ADD_PS(XMM4,XMM6)
1146 
1147           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1148           SSE_SHUFFLE(XMM7,XMM7,0x00)
1149           SSE_MULT_PS(XMM7,XMM3)
1150           SSE_ADD_PS(XMM4,XMM7)
1151 
1152           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1153           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1154 
1155           /* Column 1, product is accumulated in XMM5 */
1156           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1157           SSE_SHUFFLE(XMM5,XMM5,0x00)
1158           SSE_MULT_PS(XMM5,XMM0)
1159 
1160           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1161           SSE_SHUFFLE(XMM6,XMM6,0x00)
1162           SSE_MULT_PS(XMM6,XMM1)
1163           SSE_ADD_PS(XMM5,XMM6)
1164 
1165           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1166           SSE_SHUFFLE(XMM7,XMM7,0x00)
1167           SSE_MULT_PS(XMM7,XMM2)
1168           SSE_ADD_PS(XMM5,XMM7)
1169 
1170           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1171           SSE_SHUFFLE(XMM6,XMM6,0x00)
1172           SSE_MULT_PS(XMM6,XMM3)
1173           SSE_ADD_PS(XMM5,XMM6)
1174 
1175           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1176           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1177 
1178           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1179 
1180           /* Column 2, product is accumulated in XMM6 */
1181           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1182           SSE_SHUFFLE(XMM6,XMM6,0x00)
1183           SSE_MULT_PS(XMM6,XMM0)
1184 
1185           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1186           SSE_SHUFFLE(XMM7,XMM7,0x00)
1187           SSE_MULT_PS(XMM7,XMM1)
1188           SSE_ADD_PS(XMM6,XMM7)
1189 
1190           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1191           SSE_SHUFFLE(XMM7,XMM7,0x00)
1192           SSE_MULT_PS(XMM7,XMM2)
1193           SSE_ADD_PS(XMM6,XMM7)
1194 
1195           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1196           SSE_SHUFFLE(XMM7,XMM7,0x00)
1197           SSE_MULT_PS(XMM7,XMM3)
1198           SSE_ADD_PS(XMM6,XMM7)
1199 
1200           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1201           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1202 
1203           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1204           /* Column 3, product is accumulated in XMM0 */
1205           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1206           SSE_SHUFFLE(XMM7,XMM7,0x00)
1207           SSE_MULT_PS(XMM0,XMM7)
1208 
1209           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1210           SSE_SHUFFLE(XMM7,XMM7,0x00)
1211           SSE_MULT_PS(XMM1,XMM7)
1212           SSE_ADD_PS(XMM0,XMM1)
1213 
1214           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1215           SSE_SHUFFLE(XMM1,XMM1,0x00)
1216           SSE_MULT_PS(XMM1,XMM2)
1217           SSE_ADD_PS(XMM0,XMM1)
1218 
1219           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1220           SSE_SHUFFLE(XMM7,XMM7,0x00)
1221           SSE_MULT_PS(XMM3,XMM7)
1222           SSE_ADD_PS(XMM0,XMM3)
1223 
1224           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1225           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1226 
1227           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
          /* This is code to be maintained and read by humans after all. */
1229           /* Copy Multiplier Col 3 into XMM3 */
1230           SSE_COPY_PS(XMM3,XMM0)
1231           /* Copy Multiplier Col 2 into XMM2 */
1232           SSE_COPY_PS(XMM2,XMM6)
1233           /* Copy Multiplier Col 1 into XMM1 */
1234           SSE_COPY_PS(XMM1,XMM5)
1235           /* Copy Multiplier Col 0 into XMM0 */
1236           SSE_COPY_PS(XMM0,XMM4)
1237         SSE_INLINE_END_2;
1238 
1239         /* Update the row: */
1240         nz = bi[row+1] - diag_offset[row] - 1;
1241         pv += 16;
1242         for (j=0; j<nz; j++) {
1243           PREFETCH_L1(&pv[16]);
1244           x = rtmp + 16*((unsigned int)pj[j]);
1245 /*            x = rtmp + 4*pj[j]; */
1246 
1247           /* X:=X-M*PV, One column at a time */
1248           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1249           SSE_INLINE_BEGIN_2(x,pv)
1250             /* Load First Column of X*/
1251             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1252             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1253 
1254             /* Matrix-Vector Product: */
1255             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1256             SSE_SHUFFLE(XMM5,XMM5,0x00)
1257             SSE_MULT_PS(XMM5,XMM0)
1258             SSE_SUB_PS(XMM4,XMM5)
1259 
1260             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1261             SSE_SHUFFLE(XMM6,XMM6,0x00)
1262             SSE_MULT_PS(XMM6,XMM1)
1263             SSE_SUB_PS(XMM4,XMM6)
1264 
1265             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1266             SSE_SHUFFLE(XMM7,XMM7,0x00)
1267             SSE_MULT_PS(XMM7,XMM2)
1268             SSE_SUB_PS(XMM4,XMM7)
1269 
1270             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1271             SSE_SHUFFLE(XMM5,XMM5,0x00)
1272             SSE_MULT_PS(XMM5,XMM3)
1273             SSE_SUB_PS(XMM4,XMM5)
1274 
1275             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1276             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1277 
1278             /* Second Column */
1279             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1280             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1281 
1282             /* Matrix-Vector Product: */
1283             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1284             SSE_SHUFFLE(XMM6,XMM6,0x00)
1285             SSE_MULT_PS(XMM6,XMM0)
1286             SSE_SUB_PS(XMM5,XMM6)
1287 
1288             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1289             SSE_SHUFFLE(XMM7,XMM7,0x00)
1290             SSE_MULT_PS(XMM7,XMM1)
1291             SSE_SUB_PS(XMM5,XMM7)
1292 
1293             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1294             SSE_SHUFFLE(XMM4,XMM4,0x00)
1295             SSE_MULT_PS(XMM4,XMM2)
1296             SSE_SUB_PS(XMM5,XMM4)
1297 
1298             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1299             SSE_SHUFFLE(XMM6,XMM6,0x00)
1300             SSE_MULT_PS(XMM6,XMM3)
1301             SSE_SUB_PS(XMM5,XMM6)
1302 
1303             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1304             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1305 
1306             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1307 
1308             /* Third Column */
1309             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1310             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1311 
1312             /* Matrix-Vector Product: */
1313             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1314             SSE_SHUFFLE(XMM7,XMM7,0x00)
1315             SSE_MULT_PS(XMM7,XMM0)
1316             SSE_SUB_PS(XMM6,XMM7)
1317 
1318             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1319             SSE_SHUFFLE(XMM4,XMM4,0x00)
1320             SSE_MULT_PS(XMM4,XMM1)
1321             SSE_SUB_PS(XMM6,XMM4)
1322 
1323             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1324             SSE_SHUFFLE(XMM5,XMM5,0x00)
1325             SSE_MULT_PS(XMM5,XMM2)
1326             SSE_SUB_PS(XMM6,XMM5)
1327 
1328             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1329             SSE_SHUFFLE(XMM7,XMM7,0x00)
1330             SSE_MULT_PS(XMM7,XMM3)
1331             SSE_SUB_PS(XMM6,XMM7)
1332 
1333             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1334             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1335 
1336             /* Fourth Column */
1337             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1338             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1339 
1340             /* Matrix-Vector Product: */
1341             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1342             SSE_SHUFFLE(XMM5,XMM5,0x00)
1343             SSE_MULT_PS(XMM5,XMM0)
1344             SSE_SUB_PS(XMM4,XMM5)
1345 
1346             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1347             SSE_SHUFFLE(XMM6,XMM6,0x00)
1348             SSE_MULT_PS(XMM6,XMM1)
1349             SSE_SUB_PS(XMM4,XMM6)
1350 
1351             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1352             SSE_SHUFFLE(XMM7,XMM7,0x00)
1353             SSE_MULT_PS(XMM7,XMM2)
1354             SSE_SUB_PS(XMM4,XMM7)
1355 
1356             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1357             SSE_SHUFFLE(XMM5,XMM5,0x00)
1358             SSE_MULT_PS(XMM5,XMM3)
1359             SSE_SUB_PS(XMM4,XMM5)
1360 
1361             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1362             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1363           SSE_INLINE_END_2;
1364           pv   += 16;
1365         }
1366         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1367       }
1368       row = (unsigned int)(*bjtmp++);
1369 /*        row = (*bjtmp++)/4; */
1370 /*        bjtmp++; */
1371     }
1372     /* finished row so stick it into b->a */
1373     pv = ba + 16*bi[i];
1374     pj = bj + bi[i];
1375     nz = bi[i+1] - bi[i];
1376 
1377     /* Copy x block back into pv block */
1378     for (j=0; j<nz; j++) {
1379       x  = rtmp+16*((unsigned int)pj[j]);
1380 /*        x  = rtmp+4*pj[j]; */
1381 
1382       SSE_INLINE_BEGIN_2(x,pv)
1383         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1384         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1385         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1386 
1387         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1388         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1389 
1390         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1391         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1392 
1393         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1394         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1395 
1396         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1397         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1398 
1399         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1400         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1401 
1402         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1403         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1404 
1405         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1406         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1407       SSE_INLINE_END_2;
1408       pv += 16;
1409     }
1410     /* invert diagonal block */
1411     w = ba + 16*diag_offset[i];
1412     if (pivotinblocks) {
1413       ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1414     } else {
1415       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1416     }
1417 /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
1419   }
1420 
1421   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1422   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1423   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1424   C->assembled = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
1426   /* Flop Count from inverting diagonal blocks */
1427   SSE_SCOPE_END;
1428   PetscFunctionReturn(0);
1429 }
1430 
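     /*
        Variant of the natural-ordering SSE factorization above in which the factor's block
        column indices b->j are read as unsigned short ("usj" presumably = unsigned short j),
        which assumes they were packed that way during symbolic factorization and limits the
        number of block columns to what fits in 16 bits.
     */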
1431 #undef __FUNCT__
1432 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1433 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1434 {
1435   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1436   PetscErrorCode ierr;
1437   int  i,j,n = a->mbs;
1438   unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
1439   unsigned int   row;
1440   int            *ajtmpold,nz,*bi=b->i;
1441   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1442   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1443   MatScalar      *ba = b->a,*aa = a->a;
1444   int            nonzero=0;
1445 /*    int            nonzero=0,colscale = 16; */
1446   PetscBool      pivotinblocks = b->pivotinblocks;
1447   PetscReal      shift = info->shiftamount;
1448 
1449   PetscFunctionBegin;
1450   SSE_SCOPE_BEGIN;
1451 
1452   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1453   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
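       /* if the factor shares its column-index array with A, defer to the in-place variant
          (before allocating work space) */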
1454 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1455 /*      colscale = 4; */
1456 /*    } */
1457   if ((unsigned long)bj==(unsigned long)aj) {
1458     PetscFunctionReturn(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1459   }
1460   ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1461   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1462 
1463   for (i=0; i<n; i++) {
1464     nz    = bi[i+1] - bi[i];
1465     bjtmp = bj + bi[i];
1466     /* zero out the 4x4 block accumulators */
1467     /* zero out one register */
1468     XOR_PS(XMM7,XMM7);
1469     for  (j=0; j<nz; j++) {
1470       x = rtmp+16*((unsigned int)bjtmp[j]);
1471 /*        x = rtmp+4*bjtmp[j]; */
1472       SSE_INLINE_BEGIN_1(x)
1473         /* Copy zero register to memory locations */
1474         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1475         SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1476         SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1477         SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1478         SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1479         SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1480         SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1481         SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1482         SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1483       SSE_INLINE_END_1;
1484     }
1485     /* load in the initial (unfactored) row */
1486     nz       = ai[i+1] - ai[i];
1487     ajtmpold = aj + ai[i];
1488     v        = aa + 16*ai[i];
1489     for (j=0; j<nz; j++) {
1490       x = rtmp+16*ajtmpold[j];
1491 /*        x = rtmp+colscale*ajtmpold[j]; */
1492       /* Copy v block into x block */
1493       SSE_INLINE_BEGIN_2(v,x)
1494         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1495         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1496         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1497 
1498         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1499         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1500 
1501         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1502         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1503 
1504         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1505         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1506 
1507         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1508         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1509 
1510         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1511         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1512 
1513         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1514         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1515 
1516         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1517         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1518       SSE_INLINE_END_2;
1519 
1520       v += 16;
1521     }
1522 /*      row = (*bjtmp++)/4; */
1523     row = (unsigned int)(*bjtmp++);
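         /* loop over the block columns of row i before the diagonal, eliminating with the
            corresponding (already factored) pivot rows */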
1524     while (row < i) {
1525       pc  = rtmp + 16*row;
1526       SSE_INLINE_BEGIN_1(pc)
1527         /* Load block from lower triangle */
1528         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1529         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1530         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1531 
1532         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1533         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1534 
1535         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1536         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1537 
1538         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1539         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1540 
1541         /* Compare block to zero block */
1542 
1543         SSE_COPY_PS(XMM4,XMM7)
1544         SSE_CMPNEQ_PS(XMM4,XMM0)
1545 
1546         SSE_COPY_PS(XMM5,XMM7)
1547         SSE_CMPNEQ_PS(XMM5,XMM1)
1548 
1549         SSE_COPY_PS(XMM6,XMM7)
1550         SSE_CMPNEQ_PS(XMM6,XMM2)
1551 
1552         SSE_CMPNEQ_PS(XMM7,XMM3)
1553 
1554         /* Reduce the comparisons to one SSE register */
1555         SSE_OR_PS(XMM6,XMM7)
1556         SSE_OR_PS(XMM5,XMM4)
1557         SSE_OR_PS(XMM5,XMM6)
1558       SSE_INLINE_END_1;
1559 
1560       /* Reduce the one SSE register to an integer register for branching */
1561       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
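           /* MOVEMASK packs the sign bit of each comparison lane into an int; 'nonzero' is 0
              only if every entry of the 4x4 block is zero, so the update can be skipped */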
1562       MOVEMASK(nonzero,XMM5);
1563 
1564       /* If block is nonzero ... */
1565       if (nonzero) {
1566         pv = ba + 16*diag_offset[row];
1567         PREFETCH_L1(&pv[16]);
1568         pj = bj + diag_offset[row] + 1;
1569 
1570         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1571         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1572         /* but the diagonal was inverted already */
1573         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
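             /* For reference, a roughly equivalent scalar form (4x4 blocks stored column-major,
                pv pointing at the already-inverted diagonal block; illustration only):
                  for (c=0; c<4; c++)
                    for (r=0; r<4; r++)
                      mult[r+4*c] = pc[r]*pv[4*c] + pc[r+4]*pv[1+4*c] + pc[r+8]*pv[2+4*c] + pc[r+12]*pv[3+4*c];
                the result overwrites pc and ends up columnwise in XMM0-XMM3 for the row update below */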
1574 
1575         SSE_INLINE_BEGIN_2(pv,pc)
1576           /* Column 0, product is accumulated in XMM4 */
1577           SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1578           SSE_SHUFFLE(XMM4,XMM4,0x00)
1579           SSE_MULT_PS(XMM4,XMM0)
1580 
1581           SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1582           SSE_SHUFFLE(XMM5,XMM5,0x00)
1583           SSE_MULT_PS(XMM5,XMM1)
1584           SSE_ADD_PS(XMM4,XMM5)
1585 
1586           SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1587           SSE_SHUFFLE(XMM6,XMM6,0x00)
1588           SSE_MULT_PS(XMM6,XMM2)
1589           SSE_ADD_PS(XMM4,XMM6)
1590 
1591           SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1592           SSE_SHUFFLE(XMM7,XMM7,0x00)
1593           SSE_MULT_PS(XMM7,XMM3)
1594           SSE_ADD_PS(XMM4,XMM7)
1595 
1596           SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1597           SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1598 
1599           /* Column 1, product is accumulated in XMM5 */
1600           SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1601           SSE_SHUFFLE(XMM5,XMM5,0x00)
1602           SSE_MULT_PS(XMM5,XMM0)
1603 
1604           SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1605           SSE_SHUFFLE(XMM6,XMM6,0x00)
1606           SSE_MULT_PS(XMM6,XMM1)
1607           SSE_ADD_PS(XMM5,XMM6)
1608 
1609           SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1610           SSE_SHUFFLE(XMM7,XMM7,0x00)
1611           SSE_MULT_PS(XMM7,XMM2)
1612           SSE_ADD_PS(XMM5,XMM7)
1613 
1614           SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1615           SSE_SHUFFLE(XMM6,XMM6,0x00)
1616           SSE_MULT_PS(XMM6,XMM3)
1617           SSE_ADD_PS(XMM5,XMM6)
1618 
1619           SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1620           SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1621 
1622           SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1623 
1624           /* Column 2, product is accumulated in XMM6 */
1625           SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1626           SSE_SHUFFLE(XMM6,XMM6,0x00)
1627           SSE_MULT_PS(XMM6,XMM0)
1628 
1629           SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1630           SSE_SHUFFLE(XMM7,XMM7,0x00)
1631           SSE_MULT_PS(XMM7,XMM1)
1632           SSE_ADD_PS(XMM6,XMM7)
1633 
1634           SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1635           SSE_SHUFFLE(XMM7,XMM7,0x00)
1636           SSE_MULT_PS(XMM7,XMM2)
1637           SSE_ADD_PS(XMM6,XMM7)
1638 
1639           SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1640           SSE_SHUFFLE(XMM7,XMM7,0x00)
1641           SSE_MULT_PS(XMM7,XMM3)
1642           SSE_ADD_PS(XMM6,XMM7)
1643 
1644           SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1645           SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1646 
1647           /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1648           /* Column 3, product is accumulated in XMM0 */
1649           SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1650           SSE_SHUFFLE(XMM7,XMM7,0x00)
1651           SSE_MULT_PS(XMM0,XMM7)
1652 
1653           SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1654           SSE_SHUFFLE(XMM7,XMM7,0x00)
1655           SSE_MULT_PS(XMM1,XMM7)
1656           SSE_ADD_PS(XMM0,XMM1)
1657 
1658           SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1659           SSE_SHUFFLE(XMM1,XMM1,0x00)
1660           SSE_MULT_PS(XMM1,XMM2)
1661           SSE_ADD_PS(XMM0,XMM1)
1662 
1663           SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1664           SSE_SHUFFLE(XMM7,XMM7,0x00)
1665           SSE_MULT_PS(XMM3,XMM7)
1666           SSE_ADD_PS(XMM0,XMM3)
1667 
1668           SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1669           SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1670 
1671           /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1672           /* This is code to be maintained and read by humans, after all. */
1673           /* Copy Multiplier Col 3 into XMM3 */
1674           SSE_COPY_PS(XMM3,XMM0)
1675           /* Copy Multiplier Col 2 into XMM2 */
1676           SSE_COPY_PS(XMM2,XMM6)
1677           /* Copy Multiplier Col 1 into XMM1 */
1678           SSE_COPY_PS(XMM1,XMM5)
1679           /* Copy Multiplier Col 0 into XMM0 */
1680           SSE_COPY_PS(XMM0,XMM4)
1681         SSE_INLINE_END_2;
1682 
1683         /* Update the row: */
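             /* nz = number of blocks to the right of the diagonal in pivot row 'row' */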
1684         nz = bi[row+1] - diag_offset[row] - 1;
1685         pv += 16;
1686         for (j=0; j<nz; j++) {
1687           PREFETCH_L1(&pv[16]);
1688           x = rtmp + 16*((unsigned int)pj[j]);
1689 /*            x = rtmp + 4*pj[j]; */
1690 
1691           /* X:=X-M*PV, One column at a time */
1692           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
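               /* Roughly equivalent scalar form of the block update (illustration only):
                    for (c=0; c<4; c++)
                      for (r=0; r<4; r++)
                        x[r+4*c] -= m[r]*pv[4*c] + m[r+4]*pv[1+4*c] + m[r+8]*pv[2+4*c] + m[r+12]*pv[3+4*c];
                  where m denotes the multiplier block held columnwise in XMM0-XMM3 */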
1693           SSE_INLINE_BEGIN_2(x,pv)
1694             /* Load First Column of X */
1695             SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1696             SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1697 
1698             /* Matrix-Vector Product: */
1699             SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1700             SSE_SHUFFLE(XMM5,XMM5,0x00)
1701             SSE_MULT_PS(XMM5,XMM0)
1702             SSE_SUB_PS(XMM4,XMM5)
1703 
1704             SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1705             SSE_SHUFFLE(XMM6,XMM6,0x00)
1706             SSE_MULT_PS(XMM6,XMM1)
1707             SSE_SUB_PS(XMM4,XMM6)
1708 
1709             SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1710             SSE_SHUFFLE(XMM7,XMM7,0x00)
1711             SSE_MULT_PS(XMM7,XMM2)
1712             SSE_SUB_PS(XMM4,XMM7)
1713 
1714             SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1715             SSE_SHUFFLE(XMM5,XMM5,0x00)
1716             SSE_MULT_PS(XMM5,XMM3)
1717             SSE_SUB_PS(XMM4,XMM5)
1718 
1719             SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1720             SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1721 
1722             /* Second Column */
1723             SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1724             SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1725 
1726             /* Matrix-Vector Product: */
1727             SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1728             SSE_SHUFFLE(XMM6,XMM6,0x00)
1729             SSE_MULT_PS(XMM6,XMM0)
1730             SSE_SUB_PS(XMM5,XMM6)
1731 
1732             SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1733             SSE_SHUFFLE(XMM7,XMM7,0x00)
1734             SSE_MULT_PS(XMM7,XMM1)
1735             SSE_SUB_PS(XMM5,XMM7)
1736 
1737             SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1738             SSE_SHUFFLE(XMM4,XMM4,0x00)
1739             SSE_MULT_PS(XMM4,XMM2)
1740             SSE_SUB_PS(XMM5,XMM4)
1741 
1742             SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1743             SSE_SHUFFLE(XMM6,XMM6,0x00)
1744             SSE_MULT_PS(XMM6,XMM3)
1745             SSE_SUB_PS(XMM5,XMM6)
1746 
1747             SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1748             SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1749 
1750             SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1751 
1752             /* Third Column */
1753             SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1754             SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1755 
1756             /* Matrix-Vector Product: */
1757             SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1758             SSE_SHUFFLE(XMM7,XMM7,0x00)
1759             SSE_MULT_PS(XMM7,XMM0)
1760             SSE_SUB_PS(XMM6,XMM7)
1761 
1762             SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1763             SSE_SHUFFLE(XMM4,XMM4,0x00)
1764             SSE_MULT_PS(XMM4,XMM1)
1765             SSE_SUB_PS(XMM6,XMM4)
1766 
1767             SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1768             SSE_SHUFFLE(XMM5,XMM5,0x00)
1769             SSE_MULT_PS(XMM5,XMM2)
1770             SSE_SUB_PS(XMM6,XMM5)
1771 
1772             SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1773             SSE_SHUFFLE(XMM7,XMM7,0x00)
1774             SSE_MULT_PS(XMM7,XMM3)
1775             SSE_SUB_PS(XMM6,XMM7)
1776 
1777             SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1778             SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1779 
1780             /* Fourth Column */
1781             SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1782             SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1783 
1784             /* Matrix-Vector Product: */
1785             SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1786             SSE_SHUFFLE(XMM5,XMM5,0x00)
1787             SSE_MULT_PS(XMM5,XMM0)
1788             SSE_SUB_PS(XMM4,XMM5)
1789 
1790             SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1791             SSE_SHUFFLE(XMM6,XMM6,0x00)
1792             SSE_MULT_PS(XMM6,XMM1)
1793             SSE_SUB_PS(XMM4,XMM6)
1794 
1795             SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1796             SSE_SHUFFLE(XMM7,XMM7,0x00)
1797             SSE_MULT_PS(XMM7,XMM2)
1798             SSE_SUB_PS(XMM4,XMM7)
1799 
1800             SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1801             SSE_SHUFFLE(XMM5,XMM5,0x00)
1802             SSE_MULT_PS(XMM5,XMM3)
1803             SSE_SUB_PS(XMM4,XMM5)
1804 
1805             SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1806             SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1807           SSE_INLINE_END_2;
1808           pv   += 16;
1809         }
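             /* 112 flops to form the multiplier block plus 128 flops for each of the nz updated blocks */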
1810         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1811       }
1812       row = (unsigned int)(*bjtmp++);
1813 /*        row = (*bjtmp++)/4; */
1814 /*        bjtmp++; */
1815     }
1816     /* finished row so stick it into b->a */
1817     pv = ba + 16*bi[i];
1818     pj = bj + bi[i];
1819     nz = bi[i+1] - bi[i];
1820 
1821     /* Copy x block back into pv block */
1822     for (j=0; j<nz; j++) {
1823       x  = rtmp+16*((unsigned int)pj[j]);
1824 /*        x  = rtmp+4*pj[j]; */
1825 
1826       SSE_INLINE_BEGIN_2(x,pv)
1827         /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1828         SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1829         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1830 
1831         SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1832         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1833 
1834         SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1835         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1836 
1837         SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1838         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1839 
1840         SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1841         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1842 
1843         SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1844         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1845 
1846         SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1847         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1848 
1849         SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1850         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1851       SSE_INLINE_END_2;
1852       pv += 16;
1853     }
1854     /* invert diagonal block */
1855     w = ba + 16*diag_offset[i];
1856     if (pivotinblocks) {
1857       ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1858     } else {
1859       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1860     }
1861 /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1862     /* Note: using Cramer's rule, the flop count below may be unfairly high or low */
1863   }
1864 
1865   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1866   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1867   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1868   C->assembled = PETSC_TRUE;
1869   ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr); /* bs = 4, bs2 = 16 for the 4x4 blocks */
1870   /* Flop Count from inverting diagonal blocks */
1871   SSE_SCOPE_END;
1872   PetscFunctionReturn(0);
1873 }
1874 
1875 #endif
1876