xref: /petsc/src/mat/impls/baij/seq/baijfact11.c (revision bebe2cf65d55febe21a5af8db2bd2e168caaa2e7)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <petsc/private/kernels/blockinvert.h>
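/*
   Layout used throughout this file (bs = 4): every block nonzero is stored as
   16 contiguous MatScalars, so the values of block row i of A occupy
   aa + 16*ai[i] .. aa + 16*ai[i+1] - 1, and the j-th block of that row
   (block column aj[ai[i]+j]) starts at aa + 16*(ai[i]+j).  The rtmp work
   arrays below mirror this: rtmp + 16*col addresses the 4x4 accumulator block
   for block column 'col' of the row currently being factored.
*/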
7 
8 /* ------------------------------------------------------------*/
9 /*
10       Version for when blocks are 4 by 4
11 */
12 #undef __FUNCT__
13 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
14 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
15 {
16   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
17   IS             isrow = b->row,isicol = b->icol;
18   PetscErrorCode ierr;
19   const PetscInt *r,*ic;
20   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
21   PetscInt       *ajtmpold,*ajtmp,nz,row;
22   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
23   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
24   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
25   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
26   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
27   MatScalar      m13,m14,m15,m16;
28   MatScalar      *ba           = b->a,*aa = a->a;
29   PetscBool      pivotinblocks = b->pivotinblocks;
30   PetscReal      shift         = info->shiftamount;
31 
32   PetscFunctionBegin;
33   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
34   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
35   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
36 
37   for (i=0; i<n; i++) {
38     nz    = bi[i+1] - bi[i];
39     ajtmp = bj + bi[i];
40     for  (j=0; j<nz; j++) {
41       x     = rtmp+16*ajtmp[j];
42       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
43       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
44     }
45     /* load in the initial (unfactored) row */
46     idx      = r[i];
47     nz       = ai[idx+1] - ai[idx];
48     ajtmpold = aj + ai[idx];
49     v        = aa + 16*ai[idx];
50     for (j=0; j<nz; j++) {
51       x     = rtmp+16*ic[ajtmpold[j]];
52       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
53       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
54       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
55       x[14] = v[14]; x[15] = v[15];
56       v    += 16;
57     }
58     row = *ajtmp++;
59     while (row < i) {
60       pc  = rtmp + 16*row;
61       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
62       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
63       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
64       p15 = pc[14]; p16 = pc[15];
65       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
66           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
67           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
68           || p16 != 0.0) {
69         pv    = ba + 16*diag_offset[row];
70         pj    = bj + diag_offset[row] + 1;
71         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
72         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
73         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
74         x15   = pv[14]; x16 = pv[15];
75         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
76         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
77         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
78         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
79 
80         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
81         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
82         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
83         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
84 
85         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
86         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
87         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
88         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
89 
90         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
91         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
92         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
93         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
94 
95         nz  = bi[row+1] - diag_offset[row] - 1;
96         pv += 16;
97         for (j=0; j<nz; j++) {
98           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
99           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
100           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
102           x     = rtmp + 16*pj[j];
103           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
104           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
105           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
106           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
107 
108           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
109           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
110           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
111           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
112 
113           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
114           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
117 
118           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
119           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
120           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
121           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
122 
123           pv += 16;
124         }
125         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
126       }
127       row = *ajtmp++;
128     }
129     /* finished row so stick it into b->a */
130     pv = ba + 16*bi[i];
131     pj = bj + bi[i];
132     nz = bi[i+1] - bi[i];
133     for (j=0; j<nz; j++) {
134       x      = rtmp+16*pj[j];
135       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
136       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
137       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139       pv    += 16;
140     }
141     /* invert diagonal block */
142     w = ba + 16*diag_offset[i];
143     if (pivotinblocks) {
144       ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
145     } else {
146       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
147     }
148   }
149 
150   ierr = PetscFree(rtmp);CHKERRQ(ierr);
151   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
152   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
153 
154   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
155   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
156   C->assembled           = PETSC_TRUE;
157 
158   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
159   PetscFunctionReturn(0);
160 }
161 
162 /* MatLUFactorNumeric_SeqBAIJ_4 -
163      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
164        PetscKernel_A_gets_A_times_B()
165        PetscKernel_A_gets_A_minus_B_times_C()
166        PetscKernel_A_gets_inverse_A()
167 */
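/*
   For reference, a plain-C sketch of the effect of the two 4x4 multiply
   kernels invoked below (see the commented calls inside the elimination loop:
   *pc = *pc * (*pv) and rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j)).
   The Sketch* names are illustrative only, and the column-by-column block
   layout is assumed from the unrolled code earlier in this file; the real
   kernels are PetscKernel_A_gets_A_times_B_4() and
   PetscKernel_A_gets_A_minus_B_times_C_4().
*/
#if 0
static void SketchKernel_A_gets_A_times_B_4(MatScalar *A,const MatScalar *B,MatScalar *W)
{
  PetscInt r,c;
  for (c=0; c<4; c++) {      /* column c of the 4x4 product */
    for (r=0; r<4; r++) {
      W[4*c+r] = A[r]*B[4*c] + A[4+r]*B[4*c+1] + A[8+r]*B[4*c+2] + A[12+r]*B[4*c+3];
    }
  }
  for (c=0; c<16; c++) A[c] = W[c];   /* A <- A*B, W is scratch */
}

static void SketchKernel_A_gets_A_minus_B_times_C_4(MatScalar *A,const MatScalar *B,const MatScalar *C)
{
  PetscInt r,c;
  for (c=0; c<4; c++) {      /* A <- A - B*C, one column at a time */
    for (r=0; r<4; r++) {
      A[4*c+r] -= B[r]*C[4*c] + B[4+r]*C[4*c+1] + B[8+r]*C[4*c+2] + B[12+r]*C[4*c+3];
    }
  }
}
#endif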
168 
169 #undef __FUNCT__
170 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
171 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
172 {
173   Mat            C     = B;
174   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
175   IS             isrow = b->row,isicol = b->icol;
176   PetscErrorCode ierr;
177   const PetscInt *r,*ic;
178   PetscInt       i,j,k,nz,nzL,row;
179   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
180   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
181   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
182   PetscInt       flg;
183   PetscReal      shift;
184 
185   PetscFunctionBegin;
186   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
187   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
188 
189   if (info->shifttype == MAT_SHIFT_NONE) {
190     shift = 0;
191   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
192     shift = info->shiftamount;
193   }
194 
195   /* generate work space needed by the factorization */
196   ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
197   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
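  /* Factored-matrix layout used below: for block row i the L part occupies
     positions bi[i] .. bi[i+1]-1 of b->j/b->a, the (inverted) diagonal block
     sits at bdiag[i], and the strictly upper triangular part occupies
     bdiag[i+1]+1 .. bdiag[i]-1, i.e. bdiag[i]-bdiag[i+1]-1 blocks (bdiag
     decreases with i, so U is stored from the end of the arrays). */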
198 
199   for (i=0; i<n; i++) {
200     /* zero rtmp */
201     /* L part */
202     nz    = bi[i+1] - bi[i];
203     bjtmp = bj + bi[i];
204     for  (j=0; j<nz; j++) {
205       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
206     }
207 
208     /* U part */
209     nz    = bdiag[i]-bdiag[i+1];
210     bjtmp = bj + bdiag[i+1]+1;
211     for  (j=0; j<nz; j++) {
212       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
213     }
214 
215     /* load in the initial (unfactored) row */
216     nz    = ai[r[i]+1] - ai[r[i]];
217     ajtmp = aj + ai[r[i]];
218     v     = aa + bs2*ai[r[i]];
219     for (j=0; j<nz; j++) {
220       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
221     }
222 
223     /* elimination */
224     bjtmp = bj + bi[i];
225     nzL   = bi[i+1] - bi[i];
226     for (k=0; k < nzL; k++) {
227       row = bjtmp[k];
228       pc  = rtmp + bs2*row;
229       for (flg=0,j=0; j<bs2; j++) {
230         if (pc[j]!=0.0) {
231           flg = 1;
232           break;
233         }
234       }
235       if (flg) {
236         pv = b->a + bs2*bdiag[row];
237         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
238         ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
239 
240         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
241         pv = b->a + bs2*(bdiag[row+1]+1);
242         nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
243         for (j=0; j<nz; j++) {
244           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
245           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
246           v    = rtmp + bs2*pj[j];
247           ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
248           pv  += bs2;
249         }
250         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
251       }
252     }
253 
254     /* finished row so stick it into b->a */
255     /* L part */
256     pv = b->a + bs2*bi[i];
257     pj = b->j + bi[i];
258     nz = bi[i+1] - bi[i];
259     for (j=0; j<nz; j++) {
260       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
261     }
262 
263     /* Mark diagonal and invert diagonal for simpler triangular solves */
264     pv   = b->a + bs2*bdiag[i];
265     pj   = b->j + bdiag[i];
266     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
267     ierr = PetscKernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
268 
269     /* U part */
270     pv = b->a + bs2*(bdiag[i+1]+1);
271     pj = b->j + bdiag[i+1]+1;
272     nz = bdiag[i] - bdiag[i+1] - 1;
273     for (j=0; j<nz; j++) {
274       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
275     }
276   }
277 
278   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
279   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
280   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
281 
282   C->ops->solve          = MatSolve_SeqBAIJ_4;
283   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
284   C->assembled           = PETSC_TRUE;
285 
286   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
287   PetscFunctionReturn(0);
288 }
289 
290 #undef __FUNCT__
291 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
292 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
293 {
294 /*
295     Default version for when blocks are 4 by 4, using natural ordering
296 */
297   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
298   PetscErrorCode ierr;
299   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
300   PetscInt       *ajtmpold,*ajtmp,nz,row;
301   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
302   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
303   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
304   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
305   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
306   MatScalar      m13,m14,m15,m16;
307   MatScalar      *ba           = b->a,*aa = a->a;
308   PetscBool      pivotinblocks = b->pivotinblocks;
309   PetscReal      shift         = info->shiftamount;
310 
311   PetscFunctionBegin;
312   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
313 
314   for (i=0; i<n; i++) {
315     nz    = bi[i+1] - bi[i];
316     ajtmp = bj + bi[i];
317     for  (j=0; j<nz; j++) {
318       x     = rtmp+16*ajtmp[j];
319       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
320       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
321     }
322     /* load in the initial (unfactored) row */
323     nz       = ai[i+1] - ai[i];
324     ajtmpold = aj + ai[i];
325     v        = aa + 16*ai[i];
326     for (j=0; j<nz; j++) {
327       x     = rtmp+16*ajtmpold[j];
328       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
329       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
330       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
331       x[14] = v[14]; x[15] = v[15];
332       v    += 16;
333     }
334     row = *ajtmp++;
335     while (row < i) {
336       pc  = rtmp + 16*row;
337       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
338       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
339       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
340       p15 = pc[14]; p16 = pc[15];
341       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
342           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
343           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
344           || p16 != 0.0) {
345         pv    = ba + 16*diag_offset[row];
346         pj    = bj + diag_offset[row] + 1;
347         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
348         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
349         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
350         x15   = pv[14]; x16 = pv[15];
351         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
352         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
353         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
354         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
355 
356         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
357         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
358         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
359         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
360 
361         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
362         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
363         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
364         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
365 
366         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
367         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
368         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
369         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
370         nz     = bi[row+1] - diag_offset[row] - 1;
371         pv    += 16;
372         for (j=0; j<nz; j++) {
373           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
374           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
375           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
376           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
377           x     = rtmp + 16*pj[j];
378           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
379           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
380           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
381           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
382 
383           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
384           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
385           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
386           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
387 
388           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
389           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
390           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
391           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
392 
393           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
394           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
395           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
396           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
397 
398           pv += 16;
399         }
400         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
401       }
402       row = *ajtmp++;
403     }
404     /* finished row so stick it into b->a */
405     pv = ba + 16*bi[i];
406     pj = bj + bi[i];
407     nz = bi[i+1] - bi[i];
408     for (j=0; j<nz; j++) {
409       x      = rtmp+16*pj[j];
410       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
411       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
412       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
413       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
414       pv    += 16;
415     }
416     /* invert diagonal block */
417     w = ba + 16*diag_offset[i];
418     if (pivotinblocks) {
419       ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
420     } else {
421       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
422     }
423   }
424 
425   ierr = PetscFree(rtmp);CHKERRQ(ierr);
426 
427   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
428   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
429   C->assembled           = PETSC_TRUE;
430 
431   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
432   PetscFunctionReturn(0);
433 }
434 
435 /*
436   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
437     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
438 */
439 #undef __FUNCT__
440 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
441 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
442 {
443   Mat            C =B;
444   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
445   PetscErrorCode ierr;
446   PetscInt       i,j,k,nz,nzL,row;
447   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
448   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
449   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
450   PetscInt       flg;
451   PetscReal      shift;
452 
453   PetscFunctionBegin;
454   /* generate work space needed by the factorization */
455   ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
456   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
457 
458   if (info->shifttype == MAT_SHIFT_NONE) {
459     shift = 0;
460   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
461     shift = info->shiftamount;
462   }
463 
464   for (i=0; i<n; i++) {
465     /* zero rtmp */
466     /* L part */
467     nz    = bi[i+1] - bi[i];
468     bjtmp = bj + bi[i];
469     for  (j=0; j<nz; j++) {
470       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
471     }
472 
473     /* U part */
474     nz    = bdiag[i] - bdiag[i+1];
475     bjtmp = bj + bdiag[i+1]+1;
476     for  (j=0; j<nz; j++) {
477       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
478     }
479 
480     /* load in the initial (unfactored) row */
481     nz    = ai[i+1] - ai[i];
482     ajtmp = aj + ai[i];
483     v     = aa + bs2*ai[i];
484     for (j=0; j<nz; j++) {
485       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
486     }
487 
488     /* elimination */
489     bjtmp = bj + bi[i];
490     nzL   = bi[i+1] - bi[i];
491     for (k=0; k < nzL; k++) {
492       row = bjtmp[k];
493       pc  = rtmp + bs2*row;
494       for (flg=0,j=0; j<bs2; j++) {
495         if (pc[j]!=0.0) {
496           flg = 1;
497           break;
498         }
499       }
500       if (flg) {
501         pv = b->a + bs2*bdiag[row];
502         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
503         ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
504 
505         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
506         pv = b->a + bs2*(bdiag[row+1]+1);
507         nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
508         for (j=0; j<nz; j++) {
509           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
510           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
511           v    = rtmp + bs2*pj[j];
512           ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
513           pv  += bs2;
514         }
515         ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
516       }
517     }
518 
519     /* finished row so stick it into b->a */
520     /* L part */
521     pv = b->a + bs2*bi[i];
522     pj = b->j + bi[i];
523     nz = bi[i+1] - bi[i];
524     for (j=0; j<nz; j++) {
525       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
526     }
527 
528     /* Mark diagonal and invert diagonal for simpler triangular solves */
529     pv   = b->a + bs2*bdiag[i];
530     pj   = b->j + bdiag[i];
531     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
532     ierr = PetscKernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);
533 
534     /* U part */
535     pv = b->a + bs2*(bdiag[i+1]+1);
536     pj = b->j + bdiag[i+1]+1;
537     nz = bdiag[i] - bdiag[i+1] - 1;
538     for (j=0; j<nz; j++) {
539       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
540     }
541   }
542   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
543 
544   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
545   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
546   C->assembled           = PETSC_TRUE;
547 
548   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
549   PetscFunctionReturn(0);
550 }
551 
552 #if defined(PETSC_HAVE_SSE)
553 
554 #include PETSC_HAVE_SSE
555 
556 /* SSE version for when blocks are 4 by 4, using natural ordering */
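/*
   Note: the packed-single (_PS) instructions below handle four floats per
   XMM register, so this path presumes a single-precision build in which
   MatScalar is a 4-byte real and each 16-entry block is 16-byte aligned.
*/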
557 #undef __FUNCT__
558 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
559 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
560 {
561   Mat            C  = B;
      Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
562   PetscErrorCode ierr;
563   int            i,j,n = a->mbs;
564   int            *bj = b->j,*bjtmp,*pj;
565   int            row;
566   int            *ajtmpold,nz,*bi=b->i;
567   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
568   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
569   MatScalar      *ba    = b->a,*aa = a->a;
570   int            nonzero=0;
571 /*    int            nonzero=0,colscale = 16; */
572   PetscBool pivotinblocks = b->pivotinblocks;
573   PetscReal shift         = info->shiftamount;
574 
575   PetscFunctionBegin;
576   SSE_SCOPE_BEGIN;
577 
578   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
579   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
580   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
581   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
582 /*    if ((unsigned long)bj==(unsigned long)aj) { */
583 /*      colscale = 4; */
584 /*    } */
585   for (i=0; i<n; i++) {
586     nz    = bi[i+1] - bi[i];
587     bjtmp = bj + bi[i];
588     /* zero out the 4x4 block accumulators */
589     /* zero out one register */
590     XOR_PS(XMM7,XMM7);
591     for  (j=0; j<nz; j++) {
592       x = rtmp+16*bjtmp[j];
593 /*        x = rtmp+4*bjtmp[j]; */
594       SSE_INLINE_BEGIN_1(x)
595       /* Copy zero register to memory locations */
596       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
597       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
598       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
599       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
600       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
601       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
602       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
603       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
604       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
605       SSE_INLINE_END_1;
606     }
607     /* load in the initial (unfactored) row */
608     nz       = ai[i+1] - ai[i];
609     ajtmpold = aj + ai[i];
610     v        = aa + 16*ai[i];
611     for (j=0; j<nz; j++) {
612       x = rtmp+16*ajtmpold[j];
613 /*        x = rtmp+colscale*ajtmpold[j]; */
614       /* Copy v block into x block */
615       SSE_INLINE_BEGIN_2(v,x)
616       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
617       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
618       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
619 
620       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
621       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
622 
623       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
624       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
625 
626       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
627       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
628 
629       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
630       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
631 
632       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
633       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
634 
635       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
636       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
637 
638       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
639       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
640       SSE_INLINE_END_2;
641 
642       v += 16;
643     }
644 /*      row = (*bjtmp++)/4; */
645     row = *bjtmp++;
646     while (row < i) {
647       pc = rtmp + 16*row;
648       SSE_INLINE_BEGIN_1(pc)
649       /* Load block from lower triangle */
650       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
651       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
652       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
653 
654       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
655       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
656 
657       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
658       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
659 
660       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
661       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
662 
663       /* Compare block to zero block */
664 
665       SSE_COPY_PS(XMM4,XMM7)
666       SSE_CMPNEQ_PS(XMM4,XMM0)
667 
668       SSE_COPY_PS(XMM5,XMM7)
669       SSE_CMPNEQ_PS(XMM5,XMM1)
670 
671       SSE_COPY_PS(XMM6,XMM7)
672       SSE_CMPNEQ_PS(XMM6,XMM2)
673 
674       SSE_CMPNEQ_PS(XMM7,XMM3)
675 
676       /* Reduce the comparisons to one SSE register */
677       SSE_OR_PS(XMM6,XMM7)
678       SSE_OR_PS(XMM5,XMM4)
679       SSE_OR_PS(XMM5,XMM6)
680       SSE_INLINE_END_1;
681 
682       /* Reduce the one SSE register to an integer register for branching */
683       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
684       MOVEMASK(nonzero,XMM5);
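      /* Scalar equivalent of the zero-block test above (for reference):
           nonzero = 0;
           for (j=0; j<16; j++) if (pc[j] != 0.0) {nonzero = 1; break;} */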
685 
686       /* If block is nonzero ... */
687       if (nonzero) {
688         pv = ba + 16*diag_offset[row];
689         PREFETCH_L1(&pv[16]);
690         pj = bj + diag_offset[row] + 1;
691 
692         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
693         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
694         /* but the diagonal was inverted already */
695         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
696 
697         SSE_INLINE_BEGIN_2(pv,pc)
698         /* Column 0, product is accumulated in XMM4 */
699         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
700         SSE_SHUFFLE(XMM4,XMM4,0x00)
701         SSE_MULT_PS(XMM4,XMM0)
702 
703         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
704         SSE_SHUFFLE(XMM5,XMM5,0x00)
705         SSE_MULT_PS(XMM5,XMM1)
706         SSE_ADD_PS(XMM4,XMM5)
707 
708         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
709         SSE_SHUFFLE(XMM6,XMM6,0x00)
710         SSE_MULT_PS(XMM6,XMM2)
711         SSE_ADD_PS(XMM4,XMM6)
712 
713         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
714         SSE_SHUFFLE(XMM7,XMM7,0x00)
715         SSE_MULT_PS(XMM7,XMM3)
716         SSE_ADD_PS(XMM4,XMM7)
717 
718         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
719         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
720 
721         /* Column 1, product is accumulated in XMM5 */
722         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
723         SSE_SHUFFLE(XMM5,XMM5,0x00)
724         SSE_MULT_PS(XMM5,XMM0)
725 
726         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
727         SSE_SHUFFLE(XMM6,XMM6,0x00)
728         SSE_MULT_PS(XMM6,XMM1)
729         SSE_ADD_PS(XMM5,XMM6)
730 
731         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
732         SSE_SHUFFLE(XMM7,XMM7,0x00)
733         SSE_MULT_PS(XMM7,XMM2)
734         SSE_ADD_PS(XMM5,XMM7)
735 
736         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
737         SSE_SHUFFLE(XMM6,XMM6,0x00)
738         SSE_MULT_PS(XMM6,XMM3)
739         SSE_ADD_PS(XMM5,XMM6)
740 
741         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
742         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
743 
744         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
745 
746         /* Column 2, product is accumulated in XMM6 */
747         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
748         SSE_SHUFFLE(XMM6,XMM6,0x00)
749         SSE_MULT_PS(XMM6,XMM0)
750 
751         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
752         SSE_SHUFFLE(XMM7,XMM7,0x00)
753         SSE_MULT_PS(XMM7,XMM1)
754         SSE_ADD_PS(XMM6,XMM7)
755 
756         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
757         SSE_SHUFFLE(XMM7,XMM7,0x00)
758         SSE_MULT_PS(XMM7,XMM2)
759         SSE_ADD_PS(XMM6,XMM7)
760 
761         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
762         SSE_SHUFFLE(XMM7,XMM7,0x00)
763         SSE_MULT_PS(XMM7,XMM3)
764         SSE_ADD_PS(XMM6,XMM7)
765 
766         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
767         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
768 
769         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
770         /* Column 3, product is accumulated in XMM0 */
771         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
772         SSE_SHUFFLE(XMM7,XMM7,0x00)
773         SSE_MULT_PS(XMM0,XMM7)
774 
775         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
776         SSE_SHUFFLE(XMM7,XMM7,0x00)
777         SSE_MULT_PS(XMM1,XMM7)
778         SSE_ADD_PS(XMM0,XMM1)
779 
780         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
781         SSE_SHUFFLE(XMM1,XMM1,0x00)
782         SSE_MULT_PS(XMM1,XMM2)
783         SSE_ADD_PS(XMM0,XMM1)
784 
785         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
786         SSE_SHUFFLE(XMM7,XMM7,0x00)
787         SSE_MULT_PS(XMM3,XMM7)
788         SSE_ADD_PS(XMM0,XMM3)
789 
790         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
791         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
792 
793         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
794         /* This is code to be maintained and read by humans after all. */
795         /* Copy Multiplier Col 3 into XMM3 */
796         SSE_COPY_PS(XMM3,XMM0)
797         /* Copy Multiplier Col 2 into XMM2 */
798         SSE_COPY_PS(XMM2,XMM6)
799         /* Copy Multiplier Col 1 into XMM1 */
800         SSE_COPY_PS(XMM1,XMM5)
801         /* Copy Multiplier Col 0 into XMM0 */
802         SSE_COPY_PS(XMM0,XMM4)
803         SSE_INLINE_END_2;
804 
805         /* Update the row: */
806         nz  = bi[row+1] - diag_offset[row] - 1;
807         pv += 16;
808         for (j=0; j<nz; j++) {
809           PREFETCH_L1(&pv[16]);
810           x = rtmp + 16*pj[j];
811 /*            x = rtmp + 4*pj[j]; */
812 
813           /* X:=X-M*PV, One column at a time */
814           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
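          /* Scalar picture of the update performed below, with m[] denoting the
             multiplier block formed above (its columns sit in XMM0-XMM3):
               for (c=0; c<4; c++)
                 for (r=0; r<4; r++)
                   x[4*c+r] -= m[r]*pv[4*c]     + m[4+r]*pv[4*c+1]
                             + m[8+r]*pv[4*c+2] + m[12+r]*pv[4*c+3];  */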
815           SSE_INLINE_BEGIN_2(x,pv)
816           /* Load First Column of X */
817           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
818           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
819 
820           /* Matrix-Vector Product: */
821           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
822           SSE_SHUFFLE(XMM5,XMM5,0x00)
823           SSE_MULT_PS(XMM5,XMM0)
824           SSE_SUB_PS(XMM4,XMM5)
825 
826           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
827           SSE_SHUFFLE(XMM6,XMM6,0x00)
828           SSE_MULT_PS(XMM6,XMM1)
829           SSE_SUB_PS(XMM4,XMM6)
830 
831           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
832           SSE_SHUFFLE(XMM7,XMM7,0x00)
833           SSE_MULT_PS(XMM7,XMM2)
834           SSE_SUB_PS(XMM4,XMM7)
835 
836           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
837           SSE_SHUFFLE(XMM5,XMM5,0x00)
838           SSE_MULT_PS(XMM5,XMM3)
839           SSE_SUB_PS(XMM4,XMM5)
840 
841           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
842           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
843 
844           /* Second Column */
845           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
846           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
847 
848           /* Matrix-Vector Product: */
849           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
850           SSE_SHUFFLE(XMM6,XMM6,0x00)
851           SSE_MULT_PS(XMM6,XMM0)
852           SSE_SUB_PS(XMM5,XMM6)
853 
854           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
855           SSE_SHUFFLE(XMM7,XMM7,0x00)
856           SSE_MULT_PS(XMM7,XMM1)
857           SSE_SUB_PS(XMM5,XMM7)
858 
859           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
860           SSE_SHUFFLE(XMM4,XMM4,0x00)
861           SSE_MULT_PS(XMM4,XMM2)
862           SSE_SUB_PS(XMM5,XMM4)
863 
864           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
865           SSE_SHUFFLE(XMM6,XMM6,0x00)
866           SSE_MULT_PS(XMM6,XMM3)
867           SSE_SUB_PS(XMM5,XMM6)
868 
869           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
870           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
871 
872           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
873 
874           /* Third Column */
875           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
876           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
877 
878           /* Matrix-Vector Product: */
879           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
880           SSE_SHUFFLE(XMM7,XMM7,0x00)
881           SSE_MULT_PS(XMM7,XMM0)
882           SSE_SUB_PS(XMM6,XMM7)
883 
884           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
885           SSE_SHUFFLE(XMM4,XMM4,0x00)
886           SSE_MULT_PS(XMM4,XMM1)
887           SSE_SUB_PS(XMM6,XMM4)
888 
889           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
890           SSE_SHUFFLE(XMM5,XMM5,0x00)
891           SSE_MULT_PS(XMM5,XMM2)
892           SSE_SUB_PS(XMM6,XMM5)
893 
894           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
895           SSE_SHUFFLE(XMM7,XMM7,0x00)
896           SSE_MULT_PS(XMM7,XMM3)
897           SSE_SUB_PS(XMM6,XMM7)
898 
899           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
900           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
901 
902           /* Fourth Column */
903           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
904           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
905 
906           /* Matrix-Vector Product: */
907           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
908           SSE_SHUFFLE(XMM5,XMM5,0x00)
909           SSE_MULT_PS(XMM5,XMM0)
910           SSE_SUB_PS(XMM4,XMM5)
911 
912           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
913           SSE_SHUFFLE(XMM6,XMM6,0x00)
914           SSE_MULT_PS(XMM6,XMM1)
915           SSE_SUB_PS(XMM4,XMM6)
916 
917           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
918           SSE_SHUFFLE(XMM7,XMM7,0x00)
919           SSE_MULT_PS(XMM7,XMM2)
920           SSE_SUB_PS(XMM4,XMM7)
921 
922           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
923           SSE_SHUFFLE(XMM5,XMM5,0x00)
924           SSE_MULT_PS(XMM5,XMM3)
925           SSE_SUB_PS(XMM4,XMM5)
926 
927           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
928           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
929           SSE_INLINE_END_2;
930           pv += 16;
931         }
932         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
933       }
934       row = *bjtmp++;
935 /*        row = (*bjtmp++)/4; */
936     }
937     /* finished row so stick it into b->a */
938     pv = ba + 16*bi[i];
939     pj = bj + bi[i];
940     nz = bi[i+1] - bi[i];
941 
942     /* Copy x block back into pv block */
943     for (j=0; j<nz; j++) {
944       x = rtmp+16*pj[j];
945 /*        x  = rtmp+4*pj[j]; */
946 
947       SSE_INLINE_BEGIN_2(x,pv)
948       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
949       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
950       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
951 
952       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
953       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
954 
955       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
956       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
957 
958       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
959       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
960 
961       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
962       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
963 
964       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
965       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
966 
967       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
968       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
969 
970       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
971       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
972       SSE_INLINE_END_2;
973       pv += 16;
974     }
975     /* invert diagonal block */
976     w = ba + 16*diag_offset[i];
977     if (pivotinblocks) {
978       ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
979     } else {
980       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
981     }
982 /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
983     /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
984   }
985 
986   ierr = PetscFree(rtmp);CHKERRQ(ierr);
987 
988   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
989   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
990   C->assembled           = PETSC_TRUE;
991 
992   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
993   /* Flop Count from inverting diagonal blocks */
994   SSE_SCOPE_END;
995   PetscFunctionReturn(0);
996 }
997 
998 #undef __FUNCT__
999 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
1000 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
1001 {
1002   Mat            A  =C;
1003   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1004   PetscErrorCode ierr;
1005   int            i,j,n = a->mbs;
1006   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1007   unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1008   unsigned int   row;
1009   int            nz,*bi=b->i;
1010   int            *diag_offset = b->diag,*ai=a->i;
1011   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1012   MatScalar      *ba    = b->a,*aa = a->a;
1013   int            nonzero=0;
1014 /*    int            nonzero=0,colscale = 16; */
1015   PetscBool pivotinblocks = b->pivotinblocks;
1016   PetscReal shift         = info->shiftamount;
1017 
1018   PetscFunctionBegin;
1019   SSE_SCOPE_BEGIN;
1020 
1021   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1022   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1023   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1024   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1025 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1026 /*      colscale = 4; */
1027 /*    } */
1028 
1029   for (i=0; i<n; i++) {
1030     nz    = bi[i+1] - bi[i];
1031     bjtmp = bj + bi[i];
1032     /* zero out the 4x4 block accumulators */
1033     /* zero out one register */
1034     XOR_PS(XMM7,XMM7);
1035     for  (j=0; j<nz; j++) {
1036       x = rtmp+16*((unsigned int)bjtmp[j]);
1037 /*        x = rtmp+4*bjtmp[j]; */
1038       SSE_INLINE_BEGIN_1(x)
1039       /* Copy zero register to memory locations */
1040       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1041       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1042       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1043       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1044       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1045       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1046       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1047       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1048       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1049       SSE_INLINE_END_1;
1050     }
1051     /* load in the initial (unfactored) row */
1052     nz    = ai[i+1] - ai[i];
1053     ajtmp = aj + ai[i];
1054     v     = aa + 16*ai[i];
1055     for (j=0; j<nz; j++) {
1056       x = rtmp+16*((unsigned int)ajtmp[j]);
1057 /*        x = rtmp+colscale*ajtmp[j]; */
1058       /* Copy v block into x block */
1059       SSE_INLINE_BEGIN_2(v,x)
1060       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1061       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1062       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1063 
1064       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1065       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1066 
1067       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1068       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1069 
1070       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1071       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1072 
1073       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1074       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1075 
1076       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1077       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1078 
1079       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1080       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1081 
1082       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1083       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1084       SSE_INLINE_END_2;
1085 
1086       v += 16;
1087     }
1088 /*      row = (*bjtmp++)/4; */
1089     row = (unsigned int)(*bjtmp++);
1090     while (row < i) {
1091       pc = rtmp + 16*row;
1092       SSE_INLINE_BEGIN_1(pc)
1093       /* Load block from lower triangle */
1094       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1095       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1096       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1097 
1098       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1099       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1100 
1101       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1102       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1103 
1104       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1105       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1106 
1107       /* Compare block to zero block */
1108 
1109       SSE_COPY_PS(XMM4,XMM7)
1110       SSE_CMPNEQ_PS(XMM4,XMM0)
1111 
1112       SSE_COPY_PS(XMM5,XMM7)
1113       SSE_CMPNEQ_PS(XMM5,XMM1)
1114 
1115       SSE_COPY_PS(XMM6,XMM7)
1116       SSE_CMPNEQ_PS(XMM6,XMM2)
1117 
1118       SSE_CMPNEQ_PS(XMM7,XMM3)
1119 
1120       /* Reduce the comparisons to one SSE register */
1121       SSE_OR_PS(XMM6,XMM7)
1122       SSE_OR_PS(XMM5,XMM4)
1123       SSE_OR_PS(XMM5,XMM6)
1124       SSE_INLINE_END_1;
1125 
1126       /* Reduce the one SSE register to an integer register for branching */
1127       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1128       MOVEMASK(nonzero,XMM5);
1129 
1130       /* If block is nonzero ... */
1131       if (nonzero) {
1132         pv = ba + 16*diag_offset[row];
1133         PREFETCH_L1(&pv[16]);
1134         pj = bj + diag_offset[row] + 1;
1135 
1136         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1137         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1138         /* but the diagonal was inverted already */
1139         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1140 
1141         SSE_INLINE_BEGIN_2(pv,pc)
1142         /* Column 0, product is accumulated in XMM4 */
1143         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1144         SSE_SHUFFLE(XMM4,XMM4,0x00)
1145         SSE_MULT_PS(XMM4,XMM0)
1146 
1147         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1148         SSE_SHUFFLE(XMM5,XMM5,0x00)
1149         SSE_MULT_PS(XMM5,XMM1)
1150         SSE_ADD_PS(XMM4,XMM5)
1151 
1152         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1153         SSE_SHUFFLE(XMM6,XMM6,0x00)
1154         SSE_MULT_PS(XMM6,XMM2)
1155         SSE_ADD_PS(XMM4,XMM6)
1156 
1157         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1158         SSE_SHUFFLE(XMM7,XMM7,0x00)
1159         SSE_MULT_PS(XMM7,XMM3)
1160         SSE_ADD_PS(XMM4,XMM7)
1161 
1162         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1163         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1164 
1165         /* Column 1, product is accumulated in XMM5 */
1166         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1167         SSE_SHUFFLE(XMM5,XMM5,0x00)
1168         SSE_MULT_PS(XMM5,XMM0)
1169 
1170         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1171         SSE_SHUFFLE(XMM6,XMM6,0x00)
1172         SSE_MULT_PS(XMM6,XMM1)
1173         SSE_ADD_PS(XMM5,XMM6)
1174 
1175         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1176         SSE_SHUFFLE(XMM7,XMM7,0x00)
1177         SSE_MULT_PS(XMM7,XMM2)
1178         SSE_ADD_PS(XMM5,XMM7)
1179 
1180         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1181         SSE_SHUFFLE(XMM6,XMM6,0x00)
1182         SSE_MULT_PS(XMM6,XMM3)
1183         SSE_ADD_PS(XMM5,XMM6)
1184 
1185         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1186         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1187 
1188         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1189 
1190         /* Column 2, product is accumulated in XMM6 */
1191         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1192         SSE_SHUFFLE(XMM6,XMM6,0x00)
1193         SSE_MULT_PS(XMM6,XMM0)
1194 
1195         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1196         SSE_SHUFFLE(XMM7,XMM7,0x00)
1197         SSE_MULT_PS(XMM7,XMM1)
1198         SSE_ADD_PS(XMM6,XMM7)
1199 
1200         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1201         SSE_SHUFFLE(XMM7,XMM7,0x00)
1202         SSE_MULT_PS(XMM7,XMM2)
1203         SSE_ADD_PS(XMM6,XMM7)
1204 
1205         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1206         SSE_SHUFFLE(XMM7,XMM7,0x00)
1207         SSE_MULT_PS(XMM7,XMM3)
1208         SSE_ADD_PS(XMM6,XMM7)
1209 
1210         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1211         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1212 
1213         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1214         /* Column 3, product is accumulated in XMM0 */
1215         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1216         SSE_SHUFFLE(XMM7,XMM7,0x00)
1217         SSE_MULT_PS(XMM0,XMM7)
1218 
1219         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1220         SSE_SHUFFLE(XMM7,XMM7,0x00)
1221         SSE_MULT_PS(XMM1,XMM7)
1222         SSE_ADD_PS(XMM0,XMM1)
1223 
1224         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1225         SSE_SHUFFLE(XMM1,XMM1,0x00)
1226         SSE_MULT_PS(XMM1,XMM2)
1227         SSE_ADD_PS(XMM0,XMM1)
1228 
1229         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1230         SSE_SHUFFLE(XMM7,XMM7,0x00)
1231         SSE_MULT_PS(XMM3,XMM7)
1232         SSE_ADD_PS(XMM0,XMM3)
1233 
1234         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1235         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1236 
1237         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1238         /* This is code to be maintained and read by humans after all. */
1239         /* Copy Multiplier Col 3 into XMM3 */
1240         SSE_COPY_PS(XMM3,XMM0)
1241         /* Copy Multiplier Col 2 into XMM2 */
1242         SSE_COPY_PS(XMM2,XMM6)
1243         /* Copy Multiplier Col 1 into XMM1 */
1244         SSE_COPY_PS(XMM1,XMM5)
1245         /* Copy Multiplier Col 0 into XMM0 */
1246         SSE_COPY_PS(XMM0,XMM4)
1247         SSE_INLINE_END_2;
1248 
1249         /* Update the row: */
1250         nz  = bi[row+1] - diag_offset[row] - 1;
1251         pv += 16;
1252         for (j=0; j<nz; j++) {
1253           PREFETCH_L1(&pv[16]);
1254           x = rtmp + 16*((unsigned int)pj[j]);
1255 /*            x = rtmp + 4*pj[j]; */
1256 
1257           /* X:=X-M*PV, One column at a time */
1258           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1259           SSE_INLINE_BEGIN_2(x,pv)
1260           /* Load First Column of X */
1261           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1262           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1263 
1264           /* Matrix-Vector Product: */
1265           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1266           SSE_SHUFFLE(XMM5,XMM5,0x00)
1267           SSE_MULT_PS(XMM5,XMM0)
1268           SSE_SUB_PS(XMM4,XMM5)
1269 
1270           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1271           SSE_SHUFFLE(XMM6,XMM6,0x00)
1272           SSE_MULT_PS(XMM6,XMM1)
1273           SSE_SUB_PS(XMM4,XMM6)
1274 
1275           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1276           SSE_SHUFFLE(XMM7,XMM7,0x00)
1277           SSE_MULT_PS(XMM7,XMM2)
1278           SSE_SUB_PS(XMM4,XMM7)
1279 
1280           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1281           SSE_SHUFFLE(XMM5,XMM5,0x00)
1282           SSE_MULT_PS(XMM5,XMM3)
1283           SSE_SUB_PS(XMM4,XMM5)
1284 
1285           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1286           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1287 
1288           /* Second Column */
1289           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1290           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1291 
1292           /* Matrix-Vector Product: */
1293           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1294           SSE_SHUFFLE(XMM6,XMM6,0x00)
1295           SSE_MULT_PS(XMM6,XMM0)
1296           SSE_SUB_PS(XMM5,XMM6)
1297 
1298           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1299           SSE_SHUFFLE(XMM7,XMM7,0x00)
1300           SSE_MULT_PS(XMM7,XMM1)
1301           SSE_SUB_PS(XMM5,XMM7)
1302 
1303           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1304           SSE_SHUFFLE(XMM4,XMM4,0x00)
1305           SSE_MULT_PS(XMM4,XMM2)
1306           SSE_SUB_PS(XMM5,XMM4)
1307 
1308           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1309           SSE_SHUFFLE(XMM6,XMM6,0x00)
1310           SSE_MULT_PS(XMM6,XMM3)
1311           SSE_SUB_PS(XMM5,XMM6)
1312 
1313           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1314           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1315 
1316           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1317 
1318           /* Third Column */
1319           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1320           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1321 
1322           /* Matrix-Vector Product: */
1323           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1324           SSE_SHUFFLE(XMM7,XMM7,0x00)
1325           SSE_MULT_PS(XMM7,XMM0)
1326           SSE_SUB_PS(XMM6,XMM7)
1327 
1328           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1329           SSE_SHUFFLE(XMM4,XMM4,0x00)
1330           SSE_MULT_PS(XMM4,XMM1)
1331           SSE_SUB_PS(XMM6,XMM4)
1332 
1333           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1334           SSE_SHUFFLE(XMM5,XMM5,0x00)
1335           SSE_MULT_PS(XMM5,XMM2)
1336           SSE_SUB_PS(XMM6,XMM5)
1337 
1338           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1339           SSE_SHUFFLE(XMM7,XMM7,0x00)
1340           SSE_MULT_PS(XMM7,XMM3)
1341           SSE_SUB_PS(XMM6,XMM7)
1342 
1343           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1344           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1345 
1346           /* Fourth Column */
1347           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1348           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1349 
1350           /* Matrix-Vector Product: */
1351           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1352           SSE_SHUFFLE(XMM5,XMM5,0x00)
1353           SSE_MULT_PS(XMM5,XMM0)
1354           SSE_SUB_PS(XMM4,XMM5)
1355 
1356           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1357           SSE_SHUFFLE(XMM6,XMM6,0x00)
1358           SSE_MULT_PS(XMM6,XMM1)
1359           SSE_SUB_PS(XMM4,XMM6)
1360 
1361           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1362           SSE_SHUFFLE(XMM7,XMM7,0x00)
1363           SSE_MULT_PS(XMM7,XMM2)
1364           SSE_SUB_PS(XMM4,XMM7)
1365 
1366           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1367           SSE_SHUFFLE(XMM5,XMM5,0x00)
1368           SSE_MULT_PS(XMM5,XMM3)
1369           SSE_SUB_PS(XMM4,XMM5)
1370 
1371           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1372           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1373           SSE_INLINE_END_2;
1374           pv += 16;
1375         }
1376         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1377       }
1378       row = (unsigned int)(*bjtmp++);
1379 /*        row = (*bjtmp++)/4; */
1380 /*        bjtmp++; */
1381     }
1382     /* finished row so stick it into b->a */
1383     pv = ba + 16*bi[i];
1384     pj = bj + bi[i];
1385     nz = bi[i+1] - bi[i];
1386 
1387     /* Copy x block back into pv block */
1388     for (j=0; j<nz; j++) {
1389       x = rtmp+16*((unsigned int)pj[j]);
1390 /*        x  = rtmp+4*pj[j]; */
1391 
1392       SSE_INLINE_BEGIN_2(x,pv)
1393       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1394       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1395       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1396 
1397       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1398       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1399 
1400       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1401       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1402 
1403       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1404       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1405 
1406       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1407       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1408 
1409       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1410       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1411 
1412       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1413       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1414 
1415       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1416       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1417       SSE_INLINE_END_2;
1418       pv += 16;
1419     }
1420     /* invert diagonal block */
1421     w = ba + 16*diag_offset[i];
1422     if (pivotinblocks) {
1423       ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1424     } else {
1425       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1426     }
1427 /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1428     /* Note: using Cramer's rule, the flop count below might be unfairly high or low */
1429   }
1430 
1431   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1432 
1433   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1434   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1435   C->assembled           = PETSC_TRUE;
1436 
1437   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
1438   /* Flop Count from inverting diagonal blocks */
1439   SSE_SCOPE_END;
1440   PetscFunctionReturn(0);
1441 }
1442 
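/*
   The _usj variants below read the factor's block-column indices through
   unsigned short pointers; this presumes the symbolic factorization packed
   b->j as 16-bit values (so mbs must stay below 65536), the intent being to
   halve the index traffic in these SSE kernels.
*/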
1443 #undef __FUNCT__
1444 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
1445 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1446 {
1447   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1448   PetscErrorCode ierr;
1449   int            i,j,n = a->mbs;
1450   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1451   unsigned int   row;
1452   int            *ajtmpold,nz,*bi=b->i;
1453   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1454   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1455   MatScalar      *ba    = b->a,*aa = a->a;
1456   int            nonzero=0;
1457 /*    int            nonzero=0,colscale = 16; */
1458   PetscBool pivotinblocks = b->pivotinblocks;
1459   PetscReal shift         = info->shiftamount;
1460 
1461   PetscFunctionBegin;
1462   SSE_SCOPE_BEGIN;
1463 
1464   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16-byte aligned. SSE will not work.");
1465   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16-byte aligned. SSE will not work.");
1466   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1467   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16-byte aligned. SSE will not work.");
1468 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1469 /*      colscale = 4; */
1470 /*    } */
1471   if ((unsigned long)bj==(unsigned long)aj) {
1472     return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
1473   }
1474 
1475   for (i=0; i<n; i++) {
1476     nz    = bi[i+1] - bi[i];
1477     bjtmp = bj + bi[i];
1478     /* zero out the 4x4 block accumulators */
1479     /* zero out one register */
1480     XOR_PS(XMM7,XMM7);
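    /* Streaming the zeroed register into each block below is roughly
       PetscMemzero(x,16*sizeof(MatScalar)) per block. */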
1481     for  (j=0; j<nz; j++) {
1482       x = rtmp+16*((unsigned int)bjtmp[j]);
1483 /*        x = rtmp+4*bjtmp[j]; */
1484       SSE_INLINE_BEGIN_1(x)
1485       /* Copy zero register to memory locations */
1486       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1487       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1488       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1489       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1490       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1491       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1492       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1493       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1494       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1495       SSE_INLINE_END_1;
1496     }
1497     /* load in initial (unfactored row) */
1498     nz       = ai[i+1] - ai[i];
1499     ajtmpold = aj + ai[i];
1500     v        = aa + 16*ai[i];
1501     for (j=0; j<nz; j++) {
1502       x = rtmp+16*ajtmpold[j];
1503 /*        x = rtmp+colscale*ajtmpold[j]; */
1504       /* Copy v block into x block */
1505       SSE_INLINE_BEGIN_2(v,x)
1506       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1507       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1508       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1509 
1510       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1511       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1512 
1513       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1514       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1515 
1516       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1517       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1518 
1519       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1520       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1521 
1522       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1523       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1524 
1525       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1526       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1527 
1528       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1529       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1530       SSE_INLINE_END_2;
1531 
1532       v += 16;
1533     }
1534 /*      row = (*bjtmp++)/4; */
1535     row = (unsigned int)(*bjtmp++);
1536     while (row < i) {
1537       pc = rtmp + 16*row;
1538       SSE_INLINE_BEGIN_1(pc)
1539       /* Load block from lower triangle */
1540       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1541       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1542       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1543 
1544       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1545       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1546 
1547       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1548       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1549 
1550       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1551       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1552 
1553       /* Compare block to zero block */
1554 
1555       SSE_COPY_PS(XMM4,XMM7)
1556       SSE_CMPNEQ_PS(XMM4,XMM0)
1557 
1558       SSE_COPY_PS(XMM5,XMM7)
1559       SSE_CMPNEQ_PS(XMM5,XMM1)
1560 
1561       SSE_COPY_PS(XMM6,XMM7)
1562       SSE_CMPNEQ_PS(XMM6,XMM2)
1563 
1564       SSE_CMPNEQ_PS(XMM7,XMM3)
1565 
1566       /* Reduce the comparisons to one SSE register */
1567       SSE_OR_PS(XMM6,XMM7)
1568       SSE_OR_PS(XMM5,XMM4)
1569       SSE_OR_PS(XMM5,XMM6)
1570       SSE_INLINE_END_1;
1571 
1572       /* Reduce the one SSE register to an integer register for branching */
1573       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1574       MOVEMASK(nonzero,XMM5);
1575 
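      /* Roughly the scalar test
             nonzero = 0;
             for (k=0; k<16; k++) if (pc[k] != 0.0) nonzero = 1;
         so the elimination below is skipped when the whole 4x4 block is still zero. */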
1576       /* If block is nonzero ... */
1577       if (nonzero) {
1578         pv = ba + 16*diag_offset[row];
1579         PREFETCH_L1(&pv[16]);
1580         pj = bj + diag_offset[row] + 1;
1581 
1582         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1583         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1584         /* but the diagonal was inverted already */
1585         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1586 
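        /* Scalar sketch of the product formed below (blocks are stored column-major;
           m is an illustrative temporary that ends up overwriting pc):
             for (col=0; col<4; col++)
               for (k=0; k<4; k++)
                 m[4*col+k] = pc[k]   *pv[4*col]   + pc[4+k] *pv[4*col+1]
                            + pc[8+k] *pv[4*col+2] + pc[12+k]*pv[4*col+3];
        */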
1587         SSE_INLINE_BEGIN_2(pv,pc)
1588         /* Column 0, product is accumulated in XMM4 */
1589         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1590         SSE_SHUFFLE(XMM4,XMM4,0x00)
1591         SSE_MULT_PS(XMM4,XMM0)
1592 
1593         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1594         SSE_SHUFFLE(XMM5,XMM5,0x00)
1595         SSE_MULT_PS(XMM5,XMM1)
1596         SSE_ADD_PS(XMM4,XMM5)
1597 
1598         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1599         SSE_SHUFFLE(XMM6,XMM6,0x00)
1600         SSE_MULT_PS(XMM6,XMM2)
1601         SSE_ADD_PS(XMM4,XMM6)
1602 
1603         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1604         SSE_SHUFFLE(XMM7,XMM7,0x00)
1605         SSE_MULT_PS(XMM7,XMM3)
1606         SSE_ADD_PS(XMM4,XMM7)
1607 
1608         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1609         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1610 
1611         /* Column 1, product is accumulated in XMM5 */
1612         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1613         SSE_SHUFFLE(XMM5,XMM5,0x00)
1614         SSE_MULT_PS(XMM5,XMM0)
1615 
1616         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1617         SSE_SHUFFLE(XMM6,XMM6,0x00)
1618         SSE_MULT_PS(XMM6,XMM1)
1619         SSE_ADD_PS(XMM5,XMM6)
1620 
1621         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1622         SSE_SHUFFLE(XMM7,XMM7,0x00)
1623         SSE_MULT_PS(XMM7,XMM2)
1624         SSE_ADD_PS(XMM5,XMM7)
1625 
1626         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1627         SSE_SHUFFLE(XMM6,XMM6,0x00)
1628         SSE_MULT_PS(XMM6,XMM3)
1629         SSE_ADD_PS(XMM5,XMM6)
1630 
1631         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1632         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1633 
1634         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1635 
1636         /* Column 2, product is accumulated in XMM6 */
1637         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1638         SSE_SHUFFLE(XMM6,XMM6,0x00)
1639         SSE_MULT_PS(XMM6,XMM0)
1640 
1641         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1642         SSE_SHUFFLE(XMM7,XMM7,0x00)
1643         SSE_MULT_PS(XMM7,XMM1)
1644         SSE_ADD_PS(XMM6,XMM7)
1645 
1646         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1647         SSE_SHUFFLE(XMM7,XMM7,0x00)
1648         SSE_MULT_PS(XMM7,XMM2)
1649         SSE_ADD_PS(XMM6,XMM7)
1650 
1651         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1652         SSE_SHUFFLE(XMM7,XMM7,0x00)
1653         SSE_MULT_PS(XMM7,XMM3)
1654         SSE_ADD_PS(XMM6,XMM7)
1655 
1656         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1657         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1658 
1659         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1660         /* Column 3, product is accumulated in XMM0 */
1661         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1662         SSE_SHUFFLE(XMM7,XMM7,0x00)
1663         SSE_MULT_PS(XMM0,XMM7)
1664 
1665         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1666         SSE_SHUFFLE(XMM7,XMM7,0x00)
1667         SSE_MULT_PS(XMM1,XMM7)
1668         SSE_ADD_PS(XMM0,XMM1)
1669 
1670         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1671         SSE_SHUFFLE(XMM1,XMM1,0x00)
1672         SSE_MULT_PS(XMM1,XMM2)
1673         SSE_ADD_PS(XMM0,XMM1)
1674 
1675         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1676         SSE_SHUFFLE(XMM7,XMM7,0x00)
1677         SSE_MULT_PS(XMM3,XMM7)
1678         SSE_ADD_PS(XMM0,XMM3)
1679 
1680         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1681         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1682 
1683         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1684         /* This is code to be maintained and read by humans, after all. */
1685         /* Copy Multiplier Col 3 into XMM3 */
1686         SSE_COPY_PS(XMM3,XMM0)
1687         /* Copy Multiplier Col 2 into XMM2 */
1688         SSE_COPY_PS(XMM2,XMM6)
1689         /* Copy Multiplier Col 1 into XMM1 */
1690         SSE_COPY_PS(XMM1,XMM5)
1691         /* Copy Multiplier Col 0 into XMM0 */
1692         SSE_COPY_PS(XMM0,XMM4)
1693         SSE_INLINE_END_2;
1694 
1695         /* Update the row: */
1696         nz  = bi[row+1] - diag_offset[row] - 1;
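        /* nz = number of blocks strictly to the right of the diagonal in pivot row `row' */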
1697         pv += 16;
1698         for (j=0; j<nz; j++) {
1699           PREFETCH_L1(&pv[16]);
1700           x = rtmp + 16*((unsigned int)pj[j]);
1701 /*            x = rtmp + 4*pj[j]; */
1702 
1703           /* X:=X-M*PV, One column at a time */
1704           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
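          /* Scalar sketch of the update performed below (column-major blocks;
             m denotes the multiplier block held in XMM0-XMM3):
               for (col=0; col<4; col++)
                 for (k=0; k<4; k++)
                   x[4*col+k] -= m[k]   *pv[4*col]   + m[4+k] *pv[4*col+1]
                               + m[8+k] *pv[4*col+2] + m[12+k]*pv[4*col+3];
          */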
1705           SSE_INLINE_BEGIN_2(x,pv)
1706           /* Load First Column of X*/
1707           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1708           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1709 
1710           /* Matrix-Vector Product: */
1711           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1712           SSE_SHUFFLE(XMM5,XMM5,0x00)
1713           SSE_MULT_PS(XMM5,XMM0)
1714           SSE_SUB_PS(XMM4,XMM5)
1715 
1716           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1717           SSE_SHUFFLE(XMM6,XMM6,0x00)
1718           SSE_MULT_PS(XMM6,XMM1)
1719           SSE_SUB_PS(XMM4,XMM6)
1720 
1721           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1722           SSE_SHUFFLE(XMM7,XMM7,0x00)
1723           SSE_MULT_PS(XMM7,XMM2)
1724           SSE_SUB_PS(XMM4,XMM7)
1725 
1726           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1727           SSE_SHUFFLE(XMM5,XMM5,0x00)
1728           SSE_MULT_PS(XMM5,XMM3)
1729           SSE_SUB_PS(XMM4,XMM5)
1730 
1731           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1732           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1733 
1734           /* Second Column */
1735           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1736           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1737 
1738           /* Matrix-Vector Product: */
1739           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1740           SSE_SHUFFLE(XMM6,XMM6,0x00)
1741           SSE_MULT_PS(XMM6,XMM0)
1742           SSE_SUB_PS(XMM5,XMM6)
1743 
1744           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1745           SSE_SHUFFLE(XMM7,XMM7,0x00)
1746           SSE_MULT_PS(XMM7,XMM1)
1747           SSE_SUB_PS(XMM5,XMM7)
1748 
1749           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1750           SSE_SHUFFLE(XMM4,XMM4,0x00)
1751           SSE_MULT_PS(XMM4,XMM2)
1752           SSE_SUB_PS(XMM5,XMM4)
1753 
1754           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1755           SSE_SHUFFLE(XMM6,XMM6,0x00)
1756           SSE_MULT_PS(XMM6,XMM3)
1757           SSE_SUB_PS(XMM5,XMM6)
1758 
1759           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1760           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1761 
1762           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1763 
1764           /* Third Column */
1765           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1766           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1767 
1768           /* Matrix-Vector Product: */
1769           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1770           SSE_SHUFFLE(XMM7,XMM7,0x00)
1771           SSE_MULT_PS(XMM7,XMM0)
1772           SSE_SUB_PS(XMM6,XMM7)
1773 
1774           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1775           SSE_SHUFFLE(XMM4,XMM4,0x00)
1776           SSE_MULT_PS(XMM4,XMM1)
1777           SSE_SUB_PS(XMM6,XMM4)
1778 
1779           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1780           SSE_SHUFFLE(XMM5,XMM5,0x00)
1781           SSE_MULT_PS(XMM5,XMM2)
1782           SSE_SUB_PS(XMM6,XMM5)
1783 
1784           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1785           SSE_SHUFFLE(XMM7,XMM7,0x00)
1786           SSE_MULT_PS(XMM7,XMM3)
1787           SSE_SUB_PS(XMM6,XMM7)
1788 
1789           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1790           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1791 
1792           /* Fourth Column */
1793           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1794           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1795 
1796           /* Matrix-Vector Product: */
1797           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1798           SSE_SHUFFLE(XMM5,XMM5,0x00)
1799           SSE_MULT_PS(XMM5,XMM0)
1800           SSE_SUB_PS(XMM4,XMM5)
1801 
1802           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1803           SSE_SHUFFLE(XMM6,XMM6,0x00)
1804           SSE_MULT_PS(XMM6,XMM1)
1805           SSE_SUB_PS(XMM4,XMM6)
1806 
1807           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1808           SSE_SHUFFLE(XMM7,XMM7,0x00)
1809           SSE_MULT_PS(XMM7,XMM2)
1810           SSE_SUB_PS(XMM4,XMM7)
1811 
1812           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1813           SSE_SHUFFLE(XMM5,XMM5,0x00)
1814           SSE_MULT_PS(XMM5,XMM3)
1815           SSE_SUB_PS(XMM4,XMM5)
1816 
1817           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1818           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1819           SSE_INLINE_END_2;
1820           pv += 16;
1821         }
1822         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1823       }
1824       row = (unsigned int)(*bjtmp++);
1825 /*        row = (*bjtmp++)/4; */
1826 /*        bjtmp++; */
1827     }
1828     /* finished row so stick it into b->a */
1829     pv = ba + 16*bi[i];
1830     pj = bj + bi[i];
1831     nz = bi[i+1] - bi[i];
1832 
1833     /* Copy x block back into pv block */
1834     for (j=0; j<nz; j++) {
1835       x = rtmp+16*((unsigned int)pj[j]);
1836 /*        x  = rtmp+4*pj[j]; */
1837 
1838       SSE_INLINE_BEGIN_2(x,pv)
1839       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1840       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1841       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1842 
1843       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1844       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1845 
1846       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1847       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1848 
1849       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1850       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1851 
1852       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1853       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1854 
1855       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1856       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1857 
1858       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1859       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1860 
1861       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1862       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1863       SSE_INLINE_END_2;
1864       pv += 16;
1865     }
1866     /* invert diagonal block */
1867     w = ba + 16*diag_offset[i];
1868     if (pivotinblocks) {
1869       ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
1870     } else {
1871       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1872     }
1873 /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
1874     /* Note: Using Cramer's rule, the flop count below may be unfairly high or low */
1875   }
1876 
1877   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1878 
1879   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1880   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1881   C->assembled           = PETSC_TRUE;
1882 
1883   ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr); /* bs = 4, bs2 = 16 for this 4x4 kernel */
1884   /* Flop Count from inverting diagonal blocks */
1885   SSE_SCOPE_END;
1886   PetscFunctionReturn(0);
1887 }
1888 
1889 #endif
1890