/* xref: /petsc/src/mat/impls/baij/seq/baijsolvnat4.c (revision daa037dfd3c3bec8dc8659548d2b20b07c1dc6de) */
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 /*
5       Special case where the matrix was ILU(0) factored in the natural
6    ordering. This eliminates the need for the column and row permutation.
7 */
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_inplace - Solves A*x = b for a 4x4-block
   ILU(0) factor computed in the natural ordering (no row/column permutation).
   Uses the "inplace" factor layout: aa holds 16-entry column-major blocks,
   ai/aj are the block-CSR row pointers/column indices, and diag[i] is the
   position of the (already inverted) diagonal block of block row i.

   Input Parameters:
     A  - the factored matrix (Mat_SeqBAIJ, block size 4)
     bb - right-hand-side vector
   Output Parameter:
     xx - solution vector
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  PetscInt          n  =a->mbs;          /* number of 4x4 block rows */
  const PetscInt    *ai=a->i,*aj=a->j;   /* block-CSR row offsets and block column indices */
  const PetscInt    *diag = a->diag;     /* diag[i] = index of diagonal block in row i */
  const MatScalar   *aa   =a->a;         /* factor values, 16 scalars per block */
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
#else
  {
    PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    /* L has a unit diagonal, so the first block row of the solution is a copy of b */
    x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
    for (i=1; i<n; i++) {
      v    =  aa      + 16*ai[i];   /* first 4x4 block of row i */
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];      /* number of blocks strictly left of the diagonal */
      idx +=  4;
      s1   =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        x1  = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
        /* s -= B*x for the 4x4 block B stored column-major in v[0..15] */
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      x[idx]   = s1;
      x[1+idx] = s2;
      x[2+idx] = s3;
      x[3+idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;        /* first block to the right of the diagonal */
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1; /* number of blocks strictly right of the diagonal */
      s1   = x[idt];  s2 = x[1+idt];
      s3   = x[2+idt];s4 = x[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        x1  = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
        v  += 16;
      }
      /* apply the inverted diagonal block (inversion was done during factorization) */
      v        = aa + ai16;
      x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
      x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
      x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
      x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
      idt     -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  PetscFunctionReturn(0);
}
92 
93 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
94 {
95   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
96   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
97   PetscInt          i,k,nz,idx,jdx,idt;
98   const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
99   const MatScalar   *aa=a->a,*v;
100   PetscScalar       *x;
101   const PetscScalar *b;
102   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
103 
104   PetscFunctionBegin;
105   PetscCall(VecGetArrayRead(bb,&b));
106   PetscCall(VecGetArray(xx,&x));
107   /* forward solve the lower triangular */
108   idx  = 0;
109   x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
110   for (i=1; i<n; i++) {
111     v   = aa + bs2*ai[i];
112     vi  = aj + ai[i];
113     nz  = ai[i+1] - ai[i];
114     idx = bs*i;
115     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
116     for (k=0; k<nz; k++) {
117       jdx = bs*vi[k];
118       x1  = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
119       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
120       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
121       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
122       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
123 
124       v +=  bs2;
125     }
126 
127     x[idx]   = s1;
128     x[1+idx] = s2;
129     x[2+idx] = s3;
130     x[3+idx] = s4;
131   }
132 
133   /* backward solve the upper triangular */
134   for (i=n-1; i>=0; i--) {
135     v   = aa + bs2*(adiag[i+1]+1);
136     vi  = aj + adiag[i+1]+1;
137     nz  = adiag[i] - adiag[i+1]-1;
138     idt = bs*i;
139     s1  = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
140 
141     for (k=0; k<nz; k++) {
142       idx = bs*vi[k];
143       x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
144       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
145       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
146       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
147       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
148 
149       v +=  bs2;
150     }
151     /* x = inv_diagonal*x */
152     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
153     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;
154     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
155     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
156 
157   }
158 
159   PetscCall(VecRestoreArrayRead(bb,&b));
160   PetscCall(VecRestoreArray(xx,&x));
161   PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n));
162   PetscFunctionReturn(0);
163 }
164 
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - Same triangular solve as
   MatSolve_SeqBAIJ_4_NaturalOrdering_inplace, but performs the arithmetic in
   MatScalar precision (possibly single) while bb/xx hold PetscScalar
   (possibly double).  The storage of x is reused, via the alias t, as the
   MatScalar scratch array for demoted intermediate values; the backward
   sweep promotes results back to PetscScalar in place.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  const PetscInt    n  =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
  const MatScalar   *aa=a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));

  {
    MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    MatScalar       *t=(MatScalar*)x;  /* alias: x's storage holds demoted (MatScalar) values during the solve */
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    /* demote the first block of the RHS; L has a unit diagonal */
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i=1; i<n; i++) {
      v    =  aa      + 16*ai[i];
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];     /* blocks strictly left of the diagonal */
      idx +=  4;
      s1   = (MatScalar)b[idx];
      s2   = (MatScalar)b[1+idx];
      s3   = (MatScalar)b[2+idx];
      s4   = (MatScalar)b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        /* earlier rows are still stored demoted, so read through t */
        x1  = t[jdx];
        x2  = t[1+jdx];
        x3  = t[2+jdx];
        x4  = t[3+jdx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      t[idx]   = s1;
      t[1+idx] = s2;
      t[2+idx] = s3;
      t[3+idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;         /* first block right of the diagonal */
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;
      s1   = t[idt];
      s2   = t[1+idt];
      s3   = t[2+idt];
      s4   = t[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        /* columns right of the diagonal refer to rows already finished by this
           backward sweep; those entries were promoted back to PetscScalar below,
           so they must be read through x (not t) and demoted again */
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1+idx];
        x3  = (MatScalar)x[2+idx];
        x4  = (MatScalar)x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      /* apply the inverted diagonal block and promote the result to PetscScalar;
         writing x[idt..idt+3] only overwrites t entries at offsets >= idt, which
         are no longer needed since the sweep proceeds toward smaller idt */
      v        = aa + ai16;
      x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
      x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
      x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
      x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
      idt     -= 4;
    }
  }

  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  PetscFunctionReturn(0);
}
253 
254 #if defined(PETSC_HAVE_SSE)
255 
256 #include PETSC_HAVE_SSE
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj - SSE single-precision
   triangular solve for the 4x4-block natural-ordering factor.  The "usj"
   variant reads the block column indices through an unsigned short alias of
   a->j; this assumes every block column index fits in 16 bits — presumably
   guaranteed by whatever selects this kernel, TODO confirm.  Doubles are
   demoted to floats for the SSE arithmetic and promoted back at the end.

   NOTE(review): bb is only read here, so VecGetArrayRead(bb,...) would be the
   const-correct call — confirm against the PETSc version in use.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
  unsigned short *aj=(unsigned short*)a->j;  /* 16-bit view of the block column indices */
  int            *ai=a->i,n=a->mbs,*diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  PetscCall(VecGetArray(bb,&b));
  PetscCall(VecGetArray(xx,&x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v,*t=(MatScalar*)x;
    int            nz,i,idt,ai16;
    unsigned int   jdx,idx;
    unsigned short *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v =  aa + 16*((unsigned int)ai[1]);

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];   /* blocks strictly left of the diagonal */
      idx +=  4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);     /* XMM7 accumulates the 4-vector for this block row */

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v =  aa + 16*ai[++i];  /* ai has n+1 entries, so ai[n] on the last pass is in bounds */
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;  /* blocks strictly right of the diagonal */

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      /* NOTE(review): on the final pass (i==0) this reads diag[-1]; the resulting
         ai16 appears to feed only a prefetch and dead pointer arithmetic before
         the loop exits, but it is still an out-of-bounds read — confirm/fix */
      ai16 = 16*diag[--i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace)*/
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      /* Promote high-to-low within each block so the widening writes never
         clobber float entries that are still unread */
      PetscScalar *xtemp=&x[idt];
      MatScalar   *ttemp=&t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt     -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
450 
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - SSE single-precision
   triangular solve for the 4x4-block natural-ordering factor, using the
   full-width int column indices of a->j (unlike the _usj variant).  Doubles
   are demoted to floats for the SSE arithmetic and promoted back at the end.

   NOTE(review): bb is only read here, so VecGetArrayRead(bb,...) would be the
   const-correct call — confirm against the PETSc version in use.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
  int            *aj=a->j;
  int            *ai=a->i,n=a->mbs,*diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  PetscCall(VecGetArray(bb,&b));
  PetscCall(VecGetArray(xx,&x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v,*t=(MatScalar*)x;
    int       nz,i,idt,ai16;
    int       jdx,idx;
    int       *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v =  aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];   /* blocks strictly left of the diagonal */
      idx +=  4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);     /* XMM7 accumulates the 4-vector for this block row */

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*(*vi++);
/*          jdx = *vi++; */

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v =  aa + 16*ai[++i];  /* ai has n+1 entries, so ai[n] on the last pass is in bounds */
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;  /* blocks strictly right of the diagonal */

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);
/*          idx = *vi++; */

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      /* NOTE(review): on the final pass (i==0) this reads diag[-1]; the resulting
         ai16 appears to feed only a prefetch and dead pointer arithmetic before
         the loop exits, but it is still an out-of-bounds read — confirm/fix */
      ai16 = 16*diag[--i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace)*/
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      /* Promote high-to-low within each block so the widening writes never
         clobber float entries that are still unread */
      PetscScalar *xtemp=&x[idt];
      MatScalar   *ttemp=&t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt     -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
646 
647 #endif
648