xref: /petsc/src/mat/impls/baij/seq/baijsolvnat4.c (revision 58d68138c660dfb4e9f5b03334792cd4f2ffd7cc)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 /*
5       Special case where the matrix was ILU(0) factored in the natural
6    ordering. This eliminates the need for the column and row permutation.
7 */
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
  Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
  PetscInt           n  = a->mbs;  /* number of 4x4 block rows */
  const PetscInt    *ai = a->i, *aj = a->j;
  const PetscInt    *diag = a->diag; /* diag[i]: index of row i's diagonal block in aj/aa */
  const MatScalar   *aa   = a->a;    /* block entries, each 4x4 block stored column-major */
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    /* Fixed-size scratch shared by all calls; overflows for n > 500 block rows
       and is not thread-safe. */
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
#else
  {
    PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    /* Block row 0 of the unit lower factor is the identity, so x_0 = b_0. */
    idx  = 0;
    x[0] = b[0];
    x[1] = b[1];
    x[2] = b[2];
    x[3] = b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];       /* first off-diagonal 4x4 block of row i */
      vi = aj + ai[i];            /* block-column indices of row i */
      nz = diag[i] - ai[i];       /* number of blocks strictly left of the diagonal */
      idx += 4;
      s1 = b[idx];
      s2 = b[1 + idx];
      s3 = b[2 + idx];
      s4 = b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = x[jdx];
        x2  = x[1 + jdx];
        x3  = x[2 + jdx];
        x4  = x[3 + jdx];
        /* s -= B * y for the column-major 4x4 block B at v */
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      x[idx]     = s1;
      x[1 + idx] = s2;
      x[2 + idx] = s3;
      x[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;              /* first block right of the diagonal */
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;     /* blocks strictly right of the diagonal */
      s1   = x[idt];
      s2   = x[1 + idt];
      s3   = x[2 + idt];
      s4   = x[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = x[idx];
        x2  = x[1 + idx];
        x3  = x[2 + idx];
        x4  = x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      /* Diagonal block was inverted during factorization: x_i = D_i^{-1} s */
      v          = aa + ai16;
      x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
      x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
      x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
      x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
      idt -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(0);
}
105 
106 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx) {
107   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
108   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
109   PetscInt           i, k, nz, idx, jdx, idt;
110   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
111   const MatScalar   *aa = a->a, *v;
112   PetscScalar       *x;
113   const PetscScalar *b;
114   PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;
115 
116   PetscFunctionBegin;
117   PetscCall(VecGetArrayRead(bb, &b));
118   PetscCall(VecGetArray(xx, &x));
119   /* forward solve the lower triangular */
120   idx  = 0;
121   x[0] = b[idx];
122   x[1] = b[1 + idx];
123   x[2] = b[2 + idx];
124   x[3] = b[3 + idx];
125   for (i = 1; i < n; i++) {
126     v   = aa + bs2 * ai[i];
127     vi  = aj + ai[i];
128     nz  = ai[i + 1] - ai[i];
129     idx = bs * i;
130     s1  = b[idx];
131     s2  = b[1 + idx];
132     s3  = b[2 + idx];
133     s4  = b[3 + idx];
134     for (k = 0; k < nz; k++) {
135       jdx = bs * vi[k];
136       x1  = x[jdx];
137       x2  = x[1 + jdx];
138       x3  = x[2 + jdx];
139       x4  = x[3 + jdx];
140       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
141       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
142       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
143       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
144 
145       v += bs2;
146     }
147 
148     x[idx]     = s1;
149     x[1 + idx] = s2;
150     x[2 + idx] = s3;
151     x[3 + idx] = s4;
152   }
153 
154   /* backward solve the upper triangular */
155   for (i = n - 1; i >= 0; i--) {
156     v   = aa + bs2 * (adiag[i + 1] + 1);
157     vi  = aj + adiag[i + 1] + 1;
158     nz  = adiag[i] - adiag[i + 1] - 1;
159     idt = bs * i;
160     s1  = x[idt];
161     s2  = x[1 + idt];
162     s3  = x[2 + idt];
163     s4  = x[3 + idt];
164 
165     for (k = 0; k < nz; k++) {
166       idx = bs * vi[k];
167       x1  = x[idx];
168       x2  = x[1 + idx];
169       x3  = x[2 + idx];
170       x4  = x[3 + idx];
171       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
172       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
173       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
174       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
175 
176       v += bs2;
177     }
178     /* x = inv_diagonal*x */
179     x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
180     x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
181     x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
182     x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
183   }
184 
185   PetscCall(VecRestoreArrayRead(bb, &b));
186   PetscCall(VecRestoreArray(xx, &x));
187   PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
188   PetscFunctionReturn(0);
189 }
190 
/*
   Mixed-precision variant: the forward solve is carried out in MatScalar
   precision inside the PetscScalar buffer of xx (a "demotion" of the RHS),
   and the backward solve promotes the result back to PetscScalar in place.
   NOTE(review): this only makes sense when sizeof(MatScalar) <=
   sizeof(PetscScalar) (e.g. single-precision MatScalar with double
   PetscScalar) — confirm against the build configuration.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A, Vec bb, Vec xx) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *diag = a->diag;
  const MatScalar   *aa = a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

  {
    MatScalar        s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    /* t aliases x's buffer reinterpreted as MatScalar: the demoted forward
       solution is packed into the low part of the buffer. */
    MatScalar       *t = (MatScalar *)x;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    /* Block row 0 of the unit lower factor is the identity. */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];   /* 4x4 blocks of row i, column-major */
      vi = aj + ai[i];
      nz = diag[i] - ai[i];   /* blocks strictly left of the diagonal */
      idx += 4;
      /* Demote the RHS entries for this block row. */
      s1 = (MatScalar)b[idx];
      s2 = (MatScalar)b[1 + idx];
      s3 = (MatScalar)b[2 + idx];
      s4 = (MatScalar)b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = t[jdx];
        x2  = t[1 + jdx];
        x3  = t[2 + jdx];
        x4  = t[3 + jdx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      t[idx]     = s1;
      t[1 + idx] = s2;
      t[2 + idx] = s3;
      t[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    /* Rows are processed from the bottom up; finished rows are stored back
       into x[] at full PetscScalar width.  This is safe despite t and x
       sharing storage: the MatScalar values at block row i live at byte
       offsets below the PetscScalar slots of row i (sizeof(MatScalar)*idt <=
       sizeof(PetscScalar)*idt), and only rows below the current one are
       re-read through x[]. */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;          /* first block right of the diagonal */
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;
      s1   = t[idt];
      s2   = t[1 + idt];
      s3   = t[2 + idt];
      s4   = t[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        /* These rows were already promoted; read them as PetscScalar. */
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1 + idx];
        x3  = (MatScalar)x[2 + idx];
        x4  = (MatScalar)x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      /* Apply the pre-inverted diagonal block and promote the result. */
      v          = aa + ai16;
      x[idt]     = (PetscScalar)(v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4);
      x[1 + idt] = (PetscScalar)(v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4);
      x[2 + idt] = (PetscScalar)(v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4);
      x[3 + idt] = (PetscScalar)(v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4);
      idt -= 4;
    }
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(0);
}
278 
279 #if defined(PETSC_HAVE_SSE)
280 
281 #include PETSC_HAVE_SSE
282 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx) {
283   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
284   unsigned short *aj = (unsigned short *)a->j;
285   int            *ai = a->i, n = a->mbs, *diag = a->diag;
286   MatScalar      *aa = a->a;
287   PetscScalar    *x, *b;
288 
289   PetscFunctionBegin;
290   SSE_SCOPE_BEGIN;
291   /*
292      Note: This code currently uses demotion of double
293      to float when performing the mixed-mode computation.
294      This may not be numerically reasonable for all applications.
295   */
296   PREFETCH_NTA(aa + 16 * ai[1]);
297 
298   PetscCall(VecGetArray(bb, &b));
299   PetscCall(VecGetArray(xx, &x));
300   {
301     /* x will first be computed in single precision then promoted inplace to double */
302     MatScalar      *v, *t = (MatScalar *)x;
303     int             nz, i, idt, ai16;
304     unsigned int    jdx, idx;
305     unsigned short *vi;
306     /* Forward solve the lower triangular factor. */
307 
308     /* First block is the identity. */
309     idx = 0;
310     CONVERT_DOUBLE4_FLOAT4(t, b);
311     v = aa + 16 * ((unsigned int)ai[1]);
312 
313     for (i = 1; i < n;) {
314       PREFETCH_NTA(&v[8]);
315       vi = aj + ai[i];
316       nz = diag[i] - ai[i];
317       idx += 4;
318 
319       /* Demote RHS from double to float. */
320       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
321       LOAD_PS(&t[idx], XMM7);
322 
323       while (nz--) {
324         PREFETCH_NTA(&v[16]);
325         jdx = 4 * ((unsigned int)(*vi++));
326 
327         /* 4x4 Matrix-Vector product with negative accumulation: */
328         SSE_INLINE_BEGIN_2(&t[jdx], v)
329         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
330 
331         /* First Column */
332         SSE_COPY_PS(XMM0, XMM6)
333         SSE_SHUFFLE(XMM0, XMM0, 0x00)
334         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
335         SSE_SUB_PS(XMM7, XMM0)
336 
337         /* Second Column */
338         SSE_COPY_PS(XMM1, XMM6)
339         SSE_SHUFFLE(XMM1, XMM1, 0x55)
340         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
341         SSE_SUB_PS(XMM7, XMM1)
342 
343         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
344 
345         /* Third Column */
346         SSE_COPY_PS(XMM2, XMM6)
347         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
348         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
349         SSE_SUB_PS(XMM7, XMM2)
350 
351         /* Fourth Column */
352         SSE_COPY_PS(XMM3, XMM6)
353         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
354         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
355         SSE_SUB_PS(XMM7, XMM3)
356         SSE_INLINE_END_2
357 
358         v += 16;
359       }
360       v = aa + 16 * ai[++i];
361       PREFETCH_NTA(v);
362       STORE_PS(&t[idx], XMM7);
363     }
364 
365     /* Backward solve the upper triangular factor.*/
366 
367     idt  = 4 * (n - 1);
368     ai16 = 16 * diag[n - 1];
369     v    = aa + ai16 + 16;
370     for (i = n - 1; i >= 0;) {
371       PREFETCH_NTA(&v[8]);
372       vi = aj + diag[i] + 1;
373       nz = ai[i + 1] - diag[i] - 1;
374 
375       LOAD_PS(&t[idt], XMM7);
376 
377       while (nz--) {
378         PREFETCH_NTA(&v[16]);
379         idx = 4 * ((unsigned int)(*vi++));
380 
381         /* 4x4 Matrix-Vector Product with negative accumulation: */
382         SSE_INLINE_BEGIN_2(&t[idx], v)
383         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
384 
385         /* First Column */
386         SSE_COPY_PS(XMM0, XMM6)
387         SSE_SHUFFLE(XMM0, XMM0, 0x00)
388         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
389         SSE_SUB_PS(XMM7, XMM0)
390 
391         /* Second Column */
392         SSE_COPY_PS(XMM1, XMM6)
393         SSE_SHUFFLE(XMM1, XMM1, 0x55)
394         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
395         SSE_SUB_PS(XMM7, XMM1)
396 
397         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
398 
399         /* Third Column */
400         SSE_COPY_PS(XMM2, XMM6)
401         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
402         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
403         SSE_SUB_PS(XMM7, XMM2)
404 
405         /* Fourth Column */
406         SSE_COPY_PS(XMM3, XMM6)
407         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
408         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
409         SSE_SUB_PS(XMM7, XMM3)
410         SSE_INLINE_END_2
411         v += 16;
412       }
413       v    = aa + ai16;
414       ai16 = 16 * diag[--i];
415       PREFETCH_NTA(aa + ai16 + 16);
416       /*
417          Scale the result by the diagonal 4x4 block,
418          which was inverted as part of the factorization
419       */
420       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
421       /* First Column */
422       SSE_COPY_PS(XMM0, XMM7)
423       SSE_SHUFFLE(XMM0, XMM0, 0x00)
424       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
425 
426       /* Second Column */
427       SSE_COPY_PS(XMM1, XMM7)
428       SSE_SHUFFLE(XMM1, XMM1, 0x55)
429       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
430       SSE_ADD_PS(XMM0, XMM1)
431 
432       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
433 
434       /* Third Column */
435       SSE_COPY_PS(XMM2, XMM7)
436       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
437       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
438       SSE_ADD_PS(XMM0, XMM2)
439 
440       /* Fourth Column */
441       SSE_COPY_PS(XMM3, XMM7)
442       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
443       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
444       SSE_ADD_PS(XMM0, XMM3)
445 
446       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
447       SSE_INLINE_END_3
448 
449       v = aa + ai16 + 16;
450       idt -= 4;
451     }
452 
453     /* Convert t from single precision back to double precision (inplace)*/
454     idt = 4 * (n - 1);
455     for (i = n - 1; i >= 0; i--) {
456       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
457       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
458       PetscScalar *xtemp = &x[idt];
459       MatScalar   *ttemp = &t[idt];
460       xtemp[3]           = (PetscScalar)ttemp[3];
461       xtemp[2]           = (PetscScalar)ttemp[2];
462       xtemp[1]           = (PetscScalar)ttemp[1];
463       xtemp[0]           = (PetscScalar)ttemp[0];
464       idt -= 4;
465     }
466 
467   } /* End of artificial scope. */
468   PetscCall(VecRestoreArray(bb, &b));
469   PetscCall(VecRestoreArray(xx, &x));
470   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
471   SSE_SCOPE_END;
472   PetscFunctionReturn(0);
473 }
474 
475 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx) {
476   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
477   int         *aj = a->j;
478   int         *ai = a->i, n = a->mbs, *diag = a->diag;
479   MatScalar   *aa = a->a;
480   PetscScalar *x, *b;
481 
482   PetscFunctionBegin;
483   SSE_SCOPE_BEGIN;
484   /*
485      Note: This code currently uses demotion of double
486      to float when performing the mixed-mode computation.
487      This may not be numerically reasonable for all applications.
488   */
489   PREFETCH_NTA(aa + 16 * ai[1]);
490 
491   PetscCall(VecGetArray(bb, &b));
492   PetscCall(VecGetArray(xx, &x));
493   {
494     /* x will first be computed in single precision then promoted inplace to double */
495     MatScalar *v, *t = (MatScalar *)x;
496     int        nz, i, idt, ai16;
497     int        jdx, idx;
498     int       *vi;
499     /* Forward solve the lower triangular factor. */
500 
501     /* First block is the identity. */
502     idx = 0;
503     CONVERT_DOUBLE4_FLOAT4(t, b);
504     v = aa + 16 * ai[1];
505 
506     for (i = 1; i < n;) {
507       PREFETCH_NTA(&v[8]);
508       vi = aj + ai[i];
509       nz = diag[i] - ai[i];
510       idx += 4;
511 
512       /* Demote RHS from double to float. */
513       CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
514       LOAD_PS(&t[idx], XMM7);
515 
516       while (nz--) {
517         PREFETCH_NTA(&v[16]);
518         jdx = 4 * (*vi++);
519         /*          jdx = *vi++; */
520 
521         /* 4x4 Matrix-Vector product with negative accumulation: */
522         SSE_INLINE_BEGIN_2(&t[jdx], v)
523         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
524 
525         /* First Column */
526         SSE_COPY_PS(XMM0, XMM6)
527         SSE_SHUFFLE(XMM0, XMM0, 0x00)
528         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
529         SSE_SUB_PS(XMM7, XMM0)
530 
531         /* Second Column */
532         SSE_COPY_PS(XMM1, XMM6)
533         SSE_SHUFFLE(XMM1, XMM1, 0x55)
534         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
535         SSE_SUB_PS(XMM7, XMM1)
536 
537         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
538 
539         /* Third Column */
540         SSE_COPY_PS(XMM2, XMM6)
541         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
542         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
543         SSE_SUB_PS(XMM7, XMM2)
544 
545         /* Fourth Column */
546         SSE_COPY_PS(XMM3, XMM6)
547         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
548         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
549         SSE_SUB_PS(XMM7, XMM3)
550         SSE_INLINE_END_2
551 
552         v += 16;
553       }
554       v = aa + 16 * ai[++i];
555       PREFETCH_NTA(v);
556       STORE_PS(&t[idx], XMM7);
557     }
558 
559     /* Backward solve the upper triangular factor.*/
560 
561     idt  = 4 * (n - 1);
562     ai16 = 16 * diag[n - 1];
563     v    = aa + ai16 + 16;
564     for (i = n - 1; i >= 0;) {
565       PREFETCH_NTA(&v[8]);
566       vi = aj + diag[i] + 1;
567       nz = ai[i + 1] - diag[i] - 1;
568 
569       LOAD_PS(&t[idt], XMM7);
570 
571       while (nz--) {
572         PREFETCH_NTA(&v[16]);
573         idx = 4 * (*vi++);
574         /*          idx = *vi++; */
575 
576         /* 4x4 Matrix-Vector Product with negative accumulation: */
577         SSE_INLINE_BEGIN_2(&t[idx], v)
578         SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
579 
580         /* First Column */
581         SSE_COPY_PS(XMM0, XMM6)
582         SSE_SHUFFLE(XMM0, XMM0, 0x00)
583         SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
584         SSE_SUB_PS(XMM7, XMM0)
585 
586         /* Second Column */
587         SSE_COPY_PS(XMM1, XMM6)
588         SSE_SHUFFLE(XMM1, XMM1, 0x55)
589         SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
590         SSE_SUB_PS(XMM7, XMM1)
591 
592         SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
593 
594         /* Third Column */
595         SSE_COPY_PS(XMM2, XMM6)
596         SSE_SHUFFLE(XMM2, XMM2, 0xAA)
597         SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
598         SSE_SUB_PS(XMM7, XMM2)
599 
600         /* Fourth Column */
601         SSE_COPY_PS(XMM3, XMM6)
602         SSE_SHUFFLE(XMM3, XMM3, 0xFF)
603         SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
604         SSE_SUB_PS(XMM7, XMM3)
605         SSE_INLINE_END_2
606         v += 16;
607       }
608       v    = aa + ai16;
609       ai16 = 16 * diag[--i];
610       PREFETCH_NTA(aa + ai16 + 16);
611       /*
612          Scale the result by the diagonal 4x4 block,
613          which was inverted as part of the factorization
614       */
615       SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
616       /* First Column */
617       SSE_COPY_PS(XMM0, XMM7)
618       SSE_SHUFFLE(XMM0, XMM0, 0x00)
619       SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
620 
621       /* Second Column */
622       SSE_COPY_PS(XMM1, XMM7)
623       SSE_SHUFFLE(XMM1, XMM1, 0x55)
624       SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
625       SSE_ADD_PS(XMM0, XMM1)
626 
627       SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
628 
629       /* Third Column */
630       SSE_COPY_PS(XMM2, XMM7)
631       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
632       SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
633       SSE_ADD_PS(XMM0, XMM2)
634 
635       /* Fourth Column */
636       SSE_COPY_PS(XMM3, XMM7)
637       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
638       SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
639       SSE_ADD_PS(XMM0, XMM3)
640 
641       SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
642       SSE_INLINE_END_3
643 
644       v = aa + ai16 + 16;
645       idt -= 4;
646     }
647 
648     /* Convert t from single precision back to double precision (inplace)*/
649     idt = 4 * (n - 1);
650     for (i = n - 1; i >= 0; i--) {
651       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
652       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
653       PetscScalar *xtemp = &x[idt];
654       MatScalar   *ttemp = &t[idt];
655       xtemp[3]           = (PetscScalar)ttemp[3];
656       xtemp[2]           = (PetscScalar)ttemp[2];
657       xtemp[1]           = (PetscScalar)ttemp[1];
658       xtemp[0]           = (PetscScalar)ttemp[0];
659       idt -= 4;
660     }
661 
662   } /* End of artificial scope. */
663   PetscCall(VecRestoreArray(bb, &b));
664   PetscCall(VecRestoreArray(xx, &x));
665   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
666   SSE_SCOPE_END;
667   PetscFunctionReturn(0);
668 }
669 
670 #endif
671