#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/*
      Special case where the matrix was ILU(0) factored in the natural
   ordering. This eliminates the need for the column and row permutations.
*/
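
/*
   Each nonzero block of the factor is a dense 4x4 block stored in
   column-major order: for a block pointer v, v[0..3] is column 0, v[4..7]
   column 1, and so on.  Every unrolled kernel in this file computes
   s -= B*x for one such block.  A minimal scalar sketch of that kernel,
   for reference only (the solvers below use the fully unrolled form and
   never call this helper):
*/
PETSC_UNUSED static void MatSolveSub4_Sketch(const MatScalar v[16],const PetscScalar xb[4],PetscScalar s[4])
{
  PetscInt r,c;
  for (c=0; c<4; c++) {                 /* columns of the 4x4 block */
    for (r=0; r<4; r++) s[r] -= v[r+4*c]*xb[c];
  }
}
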
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  PetscInt          n  = a->mbs;
  const PetscInt    *ai=a->i,*aj=a->j;
  PetscErrorCode    ierr;
  const PetscInt    *diag = a->diag;
  const MatScalar   *aa   = a->a;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    static PetscScalar w[2000]; /* very BAD: fixed-size static work array, not thread-safe; need to fix */
    fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
#else
  {
    PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
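    /* block row 0 of L carries no off-diagonal blocks (L has a unit block
       diagonal), so the first block of x is a straight copy of b */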
    idx  = 0;
    x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
    for (i=1; i<n; i++) {
      v    =  aa      + 16*ai[i];
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx +=  4;
      s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      x[idx]   = s1;
      x[1+idx] = s2;
      x[2+idx] = s3;
      x[3+idx] = s4;
    }
    /* backward solve the upper triangular */
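    /* in the inplace factor, block row i keeps its L blocks at
       ai[i]..diag[i]-1, the already inverted diagonal block at diag[i],
       and its U blocks at diag[i]+1..ai[i+1]-1, so applying the diagonal
       below is a plain 4x4 matrix-vector product rather than a solve */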
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;
      s1   = x[idt];  s2 = x[1+idt];
      s3   = x[2+idt];s4 = x[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        x1  = x[idx];x2 = x[1+idx];x3 = x[2+idx];x4 = x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      v        = aa + ai16;
      x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
      x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
      x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
      x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
      idt     -= 4;
    }
  }
#endif

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
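  /* roughly one multiply and one add per stored scalar entry of the factor
     over the two triangular sweeps */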
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

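/*
   Unlike the inplace variant above, this kernel uses the current SeqBAIJ
   factor layout: ai/aj cover only the strictly lower triangular blocks of
   block row i (positions ai[i]..ai[i+1]-1), while U is stored row by row
   from the end of aa, with adiag[i] pointing at the (inverted) diagonal
   block of row i and adiag[i+1]+1..adiag[i]-1 holding its off-diagonal
   blocks.
*/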
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
  PetscInt          i,k,nz,idx,jdx,idt;
  PetscErrorCode    ierr;
  const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = bs*i;
    s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
    for (k=0; k<nz; k++) {
      jdx = bs*vi[k];
      x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

      v += bs2;
    }

    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + bs2*(adiag[i+1]+1);
    vi  = aj + adiag[i+1] + 1;
    nz  = adiag[i] - adiag[i+1] - 1;
    idt = bs*i;
    s1  = x[idt];s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];

    for (k=0; k<nz; k++) {
      idx = bs*vi[k];
      x1  = x[idx];x2 = x[1+idx];x3 = x[2+idx];x4 = x[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

      v += bs2;
    }
    /* x = inv_diagonal*x: the k loop has advanced v to aa + bs2*adiag[i],
       which is the inverted diagonal block of row i */
    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

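/*
   Mixed-precision variant, meaningful when PETSc is built with MatScalar
   narrower than PetscScalar (e.g. single precision matrices with double
   precision vectors).  The RHS is demoted block by block into the low
   bytes of the x array (t aliases x), both sweeps run in MatScalar
   precision, and the backward solve emits each finished block of x in
   full PetscScalar precision.  Reading x[idx] rather than t[idx] in the
   backward inner loop is therefore correct: those blocks belong to rows
   greater than i, which have already been promoted.
*/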
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  const PetscInt    n  = a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
  PetscErrorCode    ierr;
  const MatScalar   *aa=a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  {
    MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    MatScalar       *t=(MatScalar*)x;
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i=1; i<n; i++) {
      v    =  aa      + 16*ai[i];
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx +=  4;
      s1   = (MatScalar)b[idx];
      s2   = (MatScalar)b[1+idx];
      s3   = (MatScalar)b[2+idx];
      s4   = (MatScalar)b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        x1  = t[jdx];
        x2  = t[1+jdx];
        x3  = t[2+jdx];
        x4  = t[3+jdx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      t[idx]   = s1;
      t[1+idx] = s2;
      t[2+idx] = s3;
      t[3+idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;
      s1   = t[idt];
      s2   = t[1+idt];
      s3   = t[2+idt];
      s4   = t[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1+idx];
        x3  = (MatScalar)x[2+idx];
        x4  = (MatScalar)x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      v        = aa + ai16;
      x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
      x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
      x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
      x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
      idt     -= 4;
    }
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

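/* PETSC_HAVE_SSE is defined by configure to the name of the SSE wrapper
   header, hence this #include of a macro */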
#include PETSC_HAVE_SSE
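
/*
   The _usj variant reads the column indices as unsigned short, halving
   the index memory traffic; it assumes a->j has been repacked elsewhere
   into 16-bit values, which is only possible while the block indices fit
   in 16 bits.
*/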
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
  unsigned short *aj=(unsigned short*)a->j;
  PetscErrorCode ierr;
  int            *ai=a->i,n=a->mbs,*diag=a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v,*t=(MatScalar*)x;
    int            nz,i,idt,ai16;
    unsigned int   jdx,idx;
    unsigned short *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v   = aa + 16*((unsigned int)ai[1]);

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx +=  4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v = aa + ai16;
      /* guard the lookahead: the original diag[--i] read diag[-1] on the last iteration */
      if (--i >= 0) ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]) cannot be used here: it
         converts entries in ascending order, which for the lowest blocks
         would overwrite float entries of t before they are read, so the
         in-place promotion is written out by hand in descending order. */
      PetscScalar *xtemp=&x[idt];
      MatScalar   *ttemp=&t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt     -= 4;
    }

  } /* End of artificial scope. */
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

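/* Same algorithm as the _usj variant above, but reading ordinary int column indices. */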
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
  int            *aj=a->j;
  PetscErrorCode ierr;
  int            *ai=a->i,n=a->mbs,*diag=a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v,*t=(MatScalar*)x;
    int       nz,i,idt,ai16;
    int       jdx,idx;
    int       *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v   = aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx +=  4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*(*vi++);
/*          jdx = *vi++; */

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);
/*          idx = *vi++; */

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v = aa + ai16;
      /* guard the lookahead: the original diag[--i] read diag[-1] on the last iteration */
      if (--i >= 0) ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]) cannot be used here: it
         converts entries in ascending order, which for the lowest blocks
         would overwrite float entries of t before they are read, so the
         in-place promotion is written out by hand in descending order. */
      PetscScalar *xtemp=&x[idt];
      MatScalar   *ttemp=&t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt     -= 4;
    }

  } /* End of artificial scope. */
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif