
/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/* ------------------------------------------------------------*/
/*
      Version for when blocks are 4 by 4
*/
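/*
   Sketch of the algorithm used by the kernels below: each block row i is
   scattered into the dense work array rtmp (16 scalars per 4x4 block,
   stored column-major, as can be read off the multiplier formation below),
   eliminated against every previously factored row it couples to, and then
   gathered back into the factor b->a, after which the diagonal block is
   inverted in place.
*/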
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba           = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift         = info->shiftamount;
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  allowzeropivot = PetscNot(A->erroriffailure);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for  (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ic[ajtmpold[j]];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
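        /* Form the multiplier: pc <- pc * inv(U(row,row)) as a dense 4x4
           product; the diagonal block of "row" was already inverted in
           place when that row was finished below. */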
        pv    = ba + 16*diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15   = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;

        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
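        /* Block update of the working row: for each block to the right of
           the diagonal in row "row", rtmp(pj[j]) -= multiplier * U(row,pj[j]). */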
        for (j=0; j<nz; j++) {
          x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
          x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
          x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
          x     = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

          pv += 16;
        }
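        /* Flop count: forming the multiplier costs 16*(4 mults + 3 adds) =
           112 flops, and each of the nz block updates costs 16*(4 mults +
           3 adds + 1 subtract) = 128 flops, giving 128*nz + 112. */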
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/* MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       PetscKernel_A_gets_A_times_B()
       PetscKernel_A_gets_A_minus_B_times_C()
       PetscKernel_A_gets_inverse_A()
*/
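
/*
   Factor storage layout assumed below (deduced from the index arithmetic):
   the L part of block row i occupies bj[bi[i]..bi[i+1]-1], the inverted
   diagonal block sits at bdiag[i], and the strict upper part of row i
   occupies bj[bdiag[i+1]+1..bdiag[i]-1], i.e. U is stored from the tail of
   the arrays with bdiag[] decreasing.
*/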

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C     = B;
  Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift;
  PetscBool      allowzeropivot,zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }
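  /* The shift is handed to the 4x4 inversion kernel below, which (for
     MAT_SHIFT_INBLOCKS) can use it to perturb otherwise-zero pivots met
     while inverting a diagonal block; MAT_SHIFT_NONE passes 0. */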

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for  (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i]-bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for  (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j]!=0.0) {
          flg = 1;
          break;
        }
      }
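      /* flg is set when the 4x4 multiplier block has at least one nonzero
         entry; elimination with a numerically zero block is skipped even
         if the block is structurally present. */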
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
    if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
/*
    Default Version for when blocks are 4 by 4 Using natural ordering
*/
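/*
   Identical in structure to MatLUFactorNumeric_SeqBAIJ_4_inplace() above,
   except that with the natural ordering the row/column permutations are
   the identity, so the r[] and ic[] index arrays (and the IS calls) drop
   out.
*/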
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba           = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift         = info->shiftamount;
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for  (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmpold[j];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv    = ba + 16*diag_offset[row];
        pj    = bj + diag_offset[row] + 1;
        x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
        x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15   = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
        nz     = bi[row+1] - diag_offset[row] - 1;
        pv    += 16;
        for (j=0; j<nz; j++) {
          x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
          x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
          x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
          x     = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt       flg;
  PetscReal      shift;
  PetscBool      allowzeropivot,zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for  (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for  (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j]!=0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
    if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }
  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
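/*
   Note: the packed-single (_PS) intrinsics below process four floats per
   XMM register (a 4x4 block in four registers), so this path presumably
   requires PETSc to be built with single-precision MatScalar.
*/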
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C  = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba    = b->a,*aa = a->a;
  int            nonzero=0;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift         = info->shiftamount;
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */
  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for  (j=0; j<nz; j++) {
      x = rtmp+16*bjtmp[j];
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
/*        x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
/*      row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*pj[j];
/*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
/*        row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
/*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
/*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

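/*
   The _usj variants read the factor's block-column indices through
   unsigned short pointers, halving the index bandwidth; this presumably
   restricts the number of block columns to what 16 bits can address.  The
   _Inplace version below covers the case where the factor shares its index
   arrays with the original matrix (bj == aj).
*/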
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
{
  Mat            A  = C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba    = b->a,*aa = a->a;
  int            nonzero=0;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for  (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)ajtmp[j]);
/*        x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
/*      row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
/*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
/*        row = (*bjtmp++)/4; */
/*        bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
/*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
1445   }
1446 
1447   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1448 
1449   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1450   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1451   C->assembled           = PETSC_TRUE;
1452 
  /* Flop count from inverting the diagonal blocks: (4/3)*bs^3 per block, written out for bs = 4, bs2 = 16 */
  ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr);
1455   SSE_SCOPE_END;
1456   PetscFunctionReturn(0);
1457 }
1458 
1459 #undef __FUNCT__
1460 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
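/*
   The "usj" variant reads the factor's column indices through an unsigned short
   pointer (bj below), presumably to halve index storage and memory traffic; it
   therefore assumes every block-column index fits in 16 bits.
*/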
1461 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1462 {
1463   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1464   PetscErrorCode ierr;
1465   int            i,j,n = a->mbs;
1466   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1467   unsigned int   row;
1468   int            *ajtmpold,nz,*bi=b->i;
1469   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1470   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1471   MatScalar      *ba    = b->a,*aa = a->a;
1472   int            nonzero=0;
1473   PetscBool      pivotinblocks = b->pivotinblocks;
1474   PetscReal      shift = info->shiftamount;
1475   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;
1476 
1477   PetscFunctionBegin;
1478   allowzeropivot = PetscNot(A->erroriffailure);
1479   SSE_SCOPE_BEGIN;
1480 
1481   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1482   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1483   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1484   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
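  /* The SSE kernels below require 16-byte aligned data, hence the explicit
     alignment checks on aa, ba, and rtmp above. */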
  if ((unsigned long)bj==(unsigned long)aj) {
    /* the factor shares the matrix's column index array, so use the in-place variant */
    ierr = PetscFree(rtmp);CHKERRQ(ierr);
    ierr = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
1491 
1492   for (i=0; i<n; i++) {
1493     nz    = bi[i+1] - bi[i];
1494     bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators: clear one register, then stream it to each block */
1497     XOR_PS(XMM7,XMM7);
1498     for  (j=0; j<nz; j++) {
1499       x = rtmp+16*((unsigned int)bjtmp[j]);
1501       SSE_INLINE_BEGIN_1(x)
1502       /* Copy zero register to memory locations */
1503       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1504       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1505       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1506       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1507       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1508       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1509       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1510       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1511       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1512       SSE_INLINE_END_1;
1513     }
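    /* rtmp now holds a zeroed dense working copy of block row i: one 16-entry
       (4x4) block for every block column present in the factored row */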
1514     /* load in initial (unfactored row) */
1515     nz       = ai[i+1] - ai[i];
1516     ajtmpold = aj + ai[i];
1517     v        = aa + 16*ai[i];
1518     for (j=0; j<nz; j++) {
1519       x = rtmp+16*ajtmpold[j];
1521       /* Copy v block into x block */
1522       SSE_INLINE_BEGIN_2(v,x)
1523       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1524       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1525       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1526 
1527       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1528       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1529 
1530       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1531       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1532 
1533       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1534       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1535 
1536       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1537       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1538 
1539       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1540       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1541 
1542       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1543       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1544 
1545       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1546       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1547       SSE_INLINE_END_2;
1548 
1549       v += 16;
1550     }
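    /* Eliminate against every pivot row that appears before the diagonal of
       block row i, walking the lower-triangular column indices in bjtmp */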
1552     row = (unsigned int)(*bjtmp++);
1553     while (row < i) {
1554       pc = rtmp + 16*row;
1555       SSE_INLINE_BEGIN_1(pc)
1556       /* Load block from lower triangle */
1557       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1558       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1559       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1560 
1561       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1562       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1563 
1564       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1565       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1566 
1567       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1568       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1569 
1570       /* Compare block to zero block */
1571 
1572       SSE_COPY_PS(XMM4,XMM7)
1573       SSE_CMPNEQ_PS(XMM4,XMM0)
1574 
1575       SSE_COPY_PS(XMM5,XMM7)
1576       SSE_CMPNEQ_PS(XMM5,XMM1)
1577 
1578       SSE_COPY_PS(XMM6,XMM7)
1579       SSE_CMPNEQ_PS(XMM6,XMM2)
1580 
1581       SSE_CMPNEQ_PS(XMM7,XMM3)
1582 
1583       /* Reduce the comparisons to one SSE register */
1584       SSE_OR_PS(XMM6,XMM7)
1585       SSE_OR_PS(XMM5,XMM4)
1586       SSE_OR_PS(XMM5,XMM6)
1587       SSE_INLINE_END_1;
1588 
1589       /* Reduce the one SSE register to an integer register for branching */
1590       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1591       MOVEMASK(nonzero,XMM5);
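      /* MOVEMASK collects the four sign bits of the comparison mask, so nonzero
         is 0 exactly when the whole 4x4 block in pc equals zero and the update
         for this pivot row can be skipped */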
1592 
1593       /* If block is nonzero ... */
1594       if (nonzero) {
1595         pv = ba + 16*diag_offset[row];
1596         PREFETCH_L1(&pv[16]);
1597         pj = bj + diag_offset[row] + 1;
1598 
1599         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1600         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1601         /* but the diagonal was inverted already */
1602         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
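        /* A scalar reference sketch of what the SSE block below computes, with
           4x4 blocks stored column-major (the result overwrites pc):
             for (c=0; c<4; c++)
               for (r=0; r<4; r++)
                 pc[4*c+r] = pc_old[r+0]*pv[4*c+0] + pc_old[r+4]*pv[4*c+1]
                           + pc_old[r+8]*pv[4*c+2] + pc_old[r+12]*pv[4*c+3];
        */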
1603 
1604         SSE_INLINE_BEGIN_2(pv,pc)
1605         /* Column 0, product is accumulated in XMM4 */
1606         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1607         SSE_SHUFFLE(XMM4,XMM4,0x00)
1608         SSE_MULT_PS(XMM4,XMM0)
1609 
1610         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1611         SSE_SHUFFLE(XMM5,XMM5,0x00)
1612         SSE_MULT_PS(XMM5,XMM1)
1613         SSE_ADD_PS(XMM4,XMM5)
1614 
1615         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1616         SSE_SHUFFLE(XMM6,XMM6,0x00)
1617         SSE_MULT_PS(XMM6,XMM2)
1618         SSE_ADD_PS(XMM4,XMM6)
1619 
1620         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1621         SSE_SHUFFLE(XMM7,XMM7,0x00)
1622         SSE_MULT_PS(XMM7,XMM3)
1623         SSE_ADD_PS(XMM4,XMM7)
1624 
1625         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1626         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1627 
1628         /* Column 1, product is accumulated in XMM5 */
1629         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1630         SSE_SHUFFLE(XMM5,XMM5,0x00)
1631         SSE_MULT_PS(XMM5,XMM0)
1632 
1633         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1634         SSE_SHUFFLE(XMM6,XMM6,0x00)
1635         SSE_MULT_PS(XMM6,XMM1)
1636         SSE_ADD_PS(XMM5,XMM6)
1637 
1638         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1639         SSE_SHUFFLE(XMM7,XMM7,0x00)
1640         SSE_MULT_PS(XMM7,XMM2)
1641         SSE_ADD_PS(XMM5,XMM7)
1642 
1643         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1644         SSE_SHUFFLE(XMM6,XMM6,0x00)
1645         SSE_MULT_PS(XMM6,XMM3)
1646         SSE_ADD_PS(XMM5,XMM6)
1647 
1648         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1649         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1650 
1651         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1652 
1653         /* Column 2, product is accumulated in XMM6 */
1654         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1655         SSE_SHUFFLE(XMM6,XMM6,0x00)
1656         SSE_MULT_PS(XMM6,XMM0)
1657 
1658         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1659         SSE_SHUFFLE(XMM7,XMM7,0x00)
1660         SSE_MULT_PS(XMM7,XMM1)
1661         SSE_ADD_PS(XMM6,XMM7)
1662 
1663         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1664         SSE_SHUFFLE(XMM7,XMM7,0x00)
1665         SSE_MULT_PS(XMM7,XMM2)
1666         SSE_ADD_PS(XMM6,XMM7)
1667 
1668         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1669         SSE_SHUFFLE(XMM7,XMM7,0x00)
1670         SSE_MULT_PS(XMM7,XMM3)
1671         SSE_ADD_PS(XMM6,XMM7)
1672 
1673         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1674         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1675 
1676         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1677         /* Column 3, product is accumulated in XMM0 */
1678         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1679         SSE_SHUFFLE(XMM7,XMM7,0x00)
1680         SSE_MULT_PS(XMM0,XMM7)
1681 
1682         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1683         SSE_SHUFFLE(XMM7,XMM7,0x00)
1684         SSE_MULT_PS(XMM1,XMM7)
1685         SSE_ADD_PS(XMM0,XMM1)
1686 
1687         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1688         SSE_SHUFFLE(XMM1,XMM1,0x00)
1689         SSE_MULT_PS(XMM1,XMM2)
1690         SSE_ADD_PS(XMM0,XMM1)
1691 
1692         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1693         SSE_SHUFFLE(XMM7,XMM7,0x00)
1694         SSE_MULT_PS(XMM3,XMM7)
1695         SSE_ADD_PS(XMM0,XMM3)
1696 
1697         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1698         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1699 
1700         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans, after all. */
1702         /* Copy Multiplier Col 3 into XMM3 */
1703         SSE_COPY_PS(XMM3,XMM0)
1704         /* Copy Multiplier Col 2 into XMM2 */
1705         SSE_COPY_PS(XMM2,XMM6)
1706         /* Copy Multiplier Col 1 into XMM1 */
1707         SSE_COPY_PS(XMM1,XMM5)
1708         /* Copy Multiplier Col 0 into XMM0 */
1709         SSE_COPY_PS(XMM0,XMM4)
1710         SSE_INLINE_END_2;
1711 
1712         /* Update the row: */
1713         nz  = bi[row+1] - diag_offset[row] - 1;
1714         pv += 16;
1715         for (j=0; j<nz; j++) {
1716           PREFETCH_L1(&pv[16]);
1717           x = rtmp + 16*((unsigned int)pj[j]);
1719 
1720           /* X:=X-M*PV, One column at a time */
1721           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
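          /* Scalar reference sketch, column-major 4x4 blocks:
               x[4*c+r] -= M[r+0]*pv[4*c+0] + M[r+4]*pv[4*c+1]
                         + M[r+8]*pv[4*c+2] + M[r+12]*pv[4*c+3]   for r,c = 0..3 */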
1722           SSE_INLINE_BEGIN_2(x,pv)
1723           /* Load First Column of X*/
1724           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1725           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1726 
1727           /* Matrix-Vector Product: */
1728           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1729           SSE_SHUFFLE(XMM5,XMM5,0x00)
1730           SSE_MULT_PS(XMM5,XMM0)
1731           SSE_SUB_PS(XMM4,XMM5)
1732 
1733           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1734           SSE_SHUFFLE(XMM6,XMM6,0x00)
1735           SSE_MULT_PS(XMM6,XMM1)
1736           SSE_SUB_PS(XMM4,XMM6)
1737 
1738           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1739           SSE_SHUFFLE(XMM7,XMM7,0x00)
1740           SSE_MULT_PS(XMM7,XMM2)
1741           SSE_SUB_PS(XMM4,XMM7)
1742 
1743           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1744           SSE_SHUFFLE(XMM5,XMM5,0x00)
1745           SSE_MULT_PS(XMM5,XMM3)
1746           SSE_SUB_PS(XMM4,XMM5)
1747 
1748           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1749           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1750 
1751           /* Second Column */
1752           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1753           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1754 
1755           /* Matrix-Vector Product: */
1756           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1757           SSE_SHUFFLE(XMM6,XMM6,0x00)
1758           SSE_MULT_PS(XMM6,XMM0)
1759           SSE_SUB_PS(XMM5,XMM6)
1760 
1761           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1762           SSE_SHUFFLE(XMM7,XMM7,0x00)
1763           SSE_MULT_PS(XMM7,XMM1)
1764           SSE_SUB_PS(XMM5,XMM7)
1765 
1766           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1767           SSE_SHUFFLE(XMM4,XMM4,0x00)
1768           SSE_MULT_PS(XMM4,XMM2)
1769           SSE_SUB_PS(XMM5,XMM4)
1770 
1771           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1772           SSE_SHUFFLE(XMM6,XMM6,0x00)
1773           SSE_MULT_PS(XMM6,XMM3)
1774           SSE_SUB_PS(XMM5,XMM6)
1775 
1776           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1777           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1778 
1779           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1780 
1781           /* Third Column */
1782           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1783           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1784 
1785           /* Matrix-Vector Product: */
1786           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1787           SSE_SHUFFLE(XMM7,XMM7,0x00)
1788           SSE_MULT_PS(XMM7,XMM0)
1789           SSE_SUB_PS(XMM6,XMM7)
1790 
1791           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1792           SSE_SHUFFLE(XMM4,XMM4,0x00)
1793           SSE_MULT_PS(XMM4,XMM1)
1794           SSE_SUB_PS(XMM6,XMM4)
1795 
1796           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1797           SSE_SHUFFLE(XMM5,XMM5,0x00)
1798           SSE_MULT_PS(XMM5,XMM2)
1799           SSE_SUB_PS(XMM6,XMM5)
1800 
1801           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1802           SSE_SHUFFLE(XMM7,XMM7,0x00)
1803           SSE_MULT_PS(XMM7,XMM3)
1804           SSE_SUB_PS(XMM6,XMM7)
1805 
1806           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1807           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1808 
1809           /* Fourth Column */
1810           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1811           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1812 
1813           /* Matrix-Vector Product: */
1814           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1815           SSE_SHUFFLE(XMM5,XMM5,0x00)
1816           SSE_MULT_PS(XMM5,XMM0)
1817           SSE_SUB_PS(XMM4,XMM5)
1818 
1819           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1820           SSE_SHUFFLE(XMM6,XMM6,0x00)
1821           SSE_MULT_PS(XMM6,XMM1)
1822           SSE_SUB_PS(XMM4,XMM6)
1823 
1824           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1825           SSE_SHUFFLE(XMM7,XMM7,0x00)
1826           SSE_MULT_PS(XMM7,XMM2)
1827           SSE_SUB_PS(XMM4,XMM7)
1828 
1829           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1830           SSE_SHUFFLE(XMM5,XMM5,0x00)
1831           SSE_MULT_PS(XMM5,XMM3)
1832           SSE_SUB_PS(XMM4,XMM5)
1833 
1834           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1835           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1836           SSE_INLINE_END_2;
1837           pv += 16;
1838         }
1839         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1840       }
1841       row = (unsigned int)(*bjtmp++);
1844     }
1845     /* finished row so stick it into b->a */
1846     pv = ba + 16*bi[i];
1847     pj = bj + bi[i];
1848     nz = bi[i+1] - bi[i];
1849 
1850     /* Copy x block back into pv block */
1851     for (j=0; j<nz; j++) {
1852       x = rtmp+16*((unsigned int)pj[j]);
1854 
1855       SSE_INLINE_BEGIN_2(x,pv)
1856       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1857       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1858       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1859 
1860       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1861       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1862 
1863       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1864       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1865 
1866       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1867       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1868 
1869       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1870       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1871 
1872       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1873       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1874 
1875       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1876       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1877 
1878       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1879       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1880       SSE_INLINE_END_2;
1881       pv += 16;
1882     }
1883     /* invert diagonal block */
1884     w = ba + 16*diag_offset[i];
1885     if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1887       if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1888     } else {
1889       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1890     }
    /* Note: Using Cramer's rule, the flop count below may be unfairly high or low */
1893   }
1894 
1895   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1896 
1897   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1898   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1899   C->assembled           = PETSC_TRUE;
1900 
  /* Flop count from inverting the diagonal blocks: (4/3)*bs^3 per block, written out for bs = 4, bs2 = 16
     (bs and bs2 are not declared in this routine) */
  ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr);
1903   SSE_SCOPE_END;
1904   PetscFunctionReturn(0);
1905 }
1906 
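/*
   A minimal usage sketch (hypothetical driver code, assuming a PETSC_HAVE_SSE
   build, a SeqBAIJ matrix A with block size 4, and existing Vecs b and x):

     Mat           F;
     IS            row,col;
     MatFactorInfo info;

     ierr = MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);CHKERRQ(ierr);
     ierr = MatGetOrdering(A,MATORDERINGNATURAL,&row,&col);CHKERRQ(ierr);
     ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
     ierr = MatLUFactorSymbolic(F,A,row,col,&info);CHKERRQ(ierr);
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);
     ierr = MatSolve(F,b,x);CHKERRQ(ierr);

   With a natural ordering and block size 4, MatLUFactorNumeric() is expected to
   dispatch to one of the SSE kernels in this file, after which MatSolve() uses
   the MatSolve_SeqBAIJ_4_NaturalOrdering_SSE routine installed above.
*/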
1907 #endif
1908