xref: /petsc/src/mat/impls/baij/seq/baijsolvnat4.c (revision d9acb416d05abeed0a33bde3a81aeb2ea0364f6a)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 /*
5       Special case where the matrix was ILU(0) factored in the natural
6    ordering. This eliminates the need for the column and row permutation.
7 */
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
  PetscInt           n  = a->mbs;           /* number of 4x4 block rows */
  const PetscInt    *ai = a->i, *aj = a->j; /* block-row offsets / block-column indices */
  const PetscInt    *diag = a->diag;        /* position of the diagonal block within each block row */
  const MatScalar   *aa   = a->a;           /* factored values: 16 scalars per block, column-major */
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    /* Fixed-size scratch array: overflows silently for n > 500 block rows. */
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
#else
  {
    PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    /* L has a unit diagonal, so the first block row of x is just b. */
    idx  = 0;
    x[0] = b[0];
    x[1] = b[1];
    x[2] = b[2];
    x[3] = b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];      /* first off-diagonal block of this row */
      vi = aj + ai[i];
      nz = diag[i] - ai[i];      /* number of blocks strictly below the diagonal */
      idx += 4;
      s1 = b[idx];
      s2 = b[1 + idx];
      s3 = b[2 + idx];
      s4 = b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = x[jdx];
        x2  = x[1 + jdx];
        x3  = x[2 + jdx];
        x4  = x[3 + jdx];
        /* s -= B * xj for the 4x4 block B stored column-major in v[0..15] */
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      x[idx]     = s1;
      x[1 + idx] = s2;
      x[2 + idx] = s3;
      x[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;           /* first block strictly above the diagonal */
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;  /* number of blocks strictly above the diagonal */
      s1   = x[idt];
      s2   = x[1 + idt];
      s3   = x[2 + idt];
      s4   = x[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = x[idx];
        x2  = x[1 + idx];
        x3  = x[2 + idx];
        x4  = x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      /* The diagonal block is stored already inverted by the factorization,
         so finishing the row is a plain 4x4 multiply, not a solve. */
      v          = aa + ai16;
      x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
      x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
      x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
      x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
      idt -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
106 
107 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
108 {
109   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
110   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
111   PetscInt           i, k, nz, idx, jdx, idt;
112   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
113   const MatScalar   *aa = a->a, *v;
114   PetscScalar       *x;
115   const PetscScalar *b;
116   PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;
117 
118   PetscFunctionBegin;
119   PetscCall(VecGetArrayRead(bb, &b));
120   PetscCall(VecGetArray(xx, &x));
121   /* forward solve the lower triangular */
122   idx  = 0;
123   x[0] = b[idx];
124   x[1] = b[1 + idx];
125   x[2] = b[2 + idx];
126   x[3] = b[3 + idx];
127   for (i = 1; i < n; i++) {
128     v   = aa + bs2 * ai[i];
129     vi  = aj + ai[i];
130     nz  = ai[i + 1] - ai[i];
131     idx = bs * i;
132     s1  = b[idx];
133     s2  = b[1 + idx];
134     s3  = b[2 + idx];
135     s4  = b[3 + idx];
136     for (k = 0; k < nz; k++) {
137       jdx = bs * vi[k];
138       x1  = x[jdx];
139       x2  = x[1 + jdx];
140       x3  = x[2 + jdx];
141       x4  = x[3 + jdx];
142       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
143       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
144       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
145       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
146 
147       v += bs2;
148     }
149 
150     x[idx]     = s1;
151     x[1 + idx] = s2;
152     x[2 + idx] = s3;
153     x[3 + idx] = s4;
154   }
155 
156   /* backward solve the upper triangular */
157   for (i = n - 1; i >= 0; i--) {
158     v   = aa + bs2 * (adiag[i + 1] + 1);
159     vi  = aj + adiag[i + 1] + 1;
160     nz  = adiag[i] - adiag[i + 1] - 1;
161     idt = bs * i;
162     s1  = x[idt];
163     s2  = x[1 + idt];
164     s3  = x[2 + idt];
165     s4  = x[3 + idt];
166 
167     for (k = 0; k < nz; k++) {
168       idx = bs * vi[k];
169       x1  = x[idx];
170       x2  = x[1 + idx];
171       x3  = x[2 + idx];
172       x4  = x[3 + idx];
173       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
174       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
175       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
176       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
177 
178       v += bs2;
179     }
180     /* x = inv_diagonal*x */
181     x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
182     x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
183     x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
184     x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
185   }
186 
187   PetscCall(VecRestoreArrayRead(bb, &b));
188   PetscCall(VecRestoreArray(xx, &x));
189   PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
190   PetscFunctionReturn(PETSC_SUCCESS);
191 }
192 
193 #if defined(PETSC_HAVE_SSE)
194 
195   #include PETSC_HAVE_SSE
/*
   SSE mixed-precision variant of the bs=4 natural-ordering solve.  The "usj"
   suffix presumably refers to the unsigned-short reinterpretation of the
   column-index array a->j below -- TODO confirm against the factorization code
   that produces this index layout.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
  unsigned short *aj = (unsigned short *)a->j; /* column indices reinterpreted as 16-bit */
  int            *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar      *aa = a->a;
  /* NOTE(review): bb is the right-hand side and is only read; VecGetArrayRead
     would be more appropriate than the writable VecGetArray used below. */
  PetscScalar    *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v, *t = (MatScalar *)x;
    int             nz, i, idt, ai16;
    unsigned int    jdx, idx;
    unsigned short *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ((unsigned int)ai[1]);

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7); /* XMM7 accumulates the 4-vector for this block row */

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      /* NOTE(review): on the final iteration (i == 0) this reads diag[-1],
         one element before the diag array; the resulting ai16 is used only to
         form a prefetch address below, but the load itself is out of bounds --
         should be guarded. */
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace)*/
    /* Walk backward so the widened doubles never overwrite floats not yet read. */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3]           = (PetscScalar)ttemp[3];
      xtemp[2]           = (PetscScalar)ttemp[2];
      xtemp[1]           = (PetscScalar)ttemp[1];
      xtemp[0]           = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}
389 
/*
   SSE mixed-precision variant of the bs=4 natural-ordering solve using the
   standard int column-index array (cf. the _usj variant above, which uses
   unsigned short indices).
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
  int         *aj = a->j;
  int         *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar   *aa = a->a;
  /* NOTE(review): bb is the right-hand side and is only read; VecGetArrayRead
     would be more appropriate than the writable VecGetArray used below. */
  PetscScalar *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v, *t = (MatScalar *)x;
    int        nz, i, idt, ai16;
    int        jdx, idx;
    int       *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ai[1];

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7); /* XMM7 accumulates the 4-vector for this block row */

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * (*vi++);
        /*          jdx = *vi++; */

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * (*vi++);
        /*          idx = *vi++; */

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      /* NOTE(review): on the final iteration (i == 0) this reads diag[-1],
         one element before the diag array; the resulting ai16 is used only to
         form a prefetch address below, but the load itself is out of bounds --
         should be guarded. */
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace)*/
    /* Walk backward so the widened doubles never overwrite floats not yet read. */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3]           = (PetscScalar)ttemp[3];
      xtemp[2]           = (PetscScalar)ttemp[2];
      xtemp[1]           = (PetscScalar)ttemp[1];
      xtemp[0]           = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}
585 
586 #endif
587