xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision 58d68138c660dfb4e9f5b03334792cd4f2ffd7cc)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx) {
5   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
6   IS                 iscol = a->col, isrow = a->row;
7   const PetscInt    *r, *c, *rout, *cout;
8   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
9   PetscInt           i, nz;
10   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
11   const MatScalar   *aa = a->a, *v;
12   PetscScalar       *x, *s, *t, *ls;
13   const PetscScalar *b;
14 
15   PetscFunctionBegin;
16   PetscCall(VecGetArrayRead(bb, &b));
17   PetscCall(VecGetArray(xx, &x));
18   t = a->solve_work;
19 
20   PetscCall(ISGetIndices(isrow, &rout));
21   r = rout;
22   PetscCall(ISGetIndices(iscol, &cout));
23   c = cout + (n - 1);
24 
25   /* forward solve the lower triangular */
26   PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
27   for (i = 1; i < n; i++) {
28     v  = aa + bs2 * ai[i];
29     vi = aj + ai[i];
30     nz = a->diag[i] - ai[i];
31     s  = t + bs * i;
32     PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
33     while (nz--) {
34       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
35       v += bs2;
36     }
37   }
38   /* backward solve the upper triangular */
39   ls = a->solve_work + A->cmap->n;
40   for (i = n - 1; i >= 0; i--) {
41     v  = aa + bs2 * (a->diag[i] + 1);
42     vi = aj + a->diag[i] + 1;
43     nz = ai[i + 1] - a->diag[i] - 1;
44     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
45     while (nz--) {
46       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
47       v += bs2;
48     }
49     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
50     PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
51   }
52 
53   PetscCall(ISRestoreIndices(isrow, &rout));
54   PetscCall(ISRestoreIndices(iscol, &cout));
55   PetscCall(VecRestoreArrayRead(bb, &b));
56   PetscCall(VecRestoreArray(xx, &x));
57   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
58   PetscFunctionReturn(0);
59 }
60 
61 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx) {
62   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
63   IS                 iscol = a->col, isrow = a->row;
64   const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
65   const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
66   PetscInt           i, nz, idx, idt, idc;
67   const MatScalar   *aa = a->a, *v;
68   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
69   const PetscScalar *b;
70 
71   PetscFunctionBegin;
72   PetscCall(VecGetArrayRead(bb, &b));
73   PetscCall(VecGetArray(xx, &x));
74   t = a->solve_work;
75 
76   PetscCall(ISGetIndices(isrow, &rout));
77   r = rout;
78   PetscCall(ISGetIndices(iscol, &cout));
79   c = cout + (n - 1);
80 
81   /* forward solve the lower triangular */
82   idx  = 7 * (*r++);
83   t[0] = b[idx];
84   t[1] = b[1 + idx];
85   t[2] = b[2 + idx];
86   t[3] = b[3 + idx];
87   t[4] = b[4 + idx];
88   t[5] = b[5 + idx];
89   t[6] = b[6 + idx];
90 
91   for (i = 1; i < n; i++) {
92     v   = aa + 49 * ai[i];
93     vi  = aj + ai[i];
94     nz  = diag[i] - ai[i];
95     idx = 7 * (*r++);
96     s1  = b[idx];
97     s2  = b[1 + idx];
98     s3  = b[2 + idx];
99     s4  = b[3 + idx];
100     s5  = b[4 + idx];
101     s6  = b[5 + idx];
102     s7  = b[6 + idx];
103     while (nz--) {
104       idx = 7 * (*vi++);
105       x1  = t[idx];
106       x2  = t[1 + idx];
107       x3  = t[2 + idx];
108       x4  = t[3 + idx];
109       x5  = t[4 + idx];
110       x6  = t[5 + idx];
111       x7  = t[6 + idx];
112       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
113       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
114       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
115       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
116       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
117       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
118       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
119       v += 49;
120     }
121     idx        = 7 * i;
122     t[idx]     = s1;
123     t[1 + idx] = s2;
124     t[2 + idx] = s3;
125     t[3 + idx] = s4;
126     t[4 + idx] = s5;
127     t[5 + idx] = s6;
128     t[6 + idx] = s7;
129   }
130   /* backward solve the upper triangular */
131   for (i = n - 1; i >= 0; i--) {
132     v   = aa + 49 * diag[i] + 49;
133     vi  = aj + diag[i] + 1;
134     nz  = ai[i + 1] - diag[i] - 1;
135     idt = 7 * i;
136     s1  = t[idt];
137     s2  = t[1 + idt];
138     s3  = t[2 + idt];
139     s4  = t[3 + idt];
140     s5  = t[4 + idt];
141     s6  = t[5 + idt];
142     s7  = t[6 + idt];
143     while (nz--) {
144       idx = 7 * (*vi++);
145       x1  = t[idx];
146       x2  = t[1 + idx];
147       x3  = t[2 + idx];
148       x4  = t[3 + idx];
149       x5  = t[4 + idx];
150       x6  = t[5 + idx];
151       x7  = t[6 + idx];
152       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
153       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
154       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
155       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
156       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
157       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
158       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
159       v += 49;
160     }
161     idc    = 7 * (*c--);
162     v      = aa + 49 * diag[i];
163     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
164     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
165     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
166     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
167     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
168     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
169     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
170   }
171 
172   PetscCall(ISRestoreIndices(isrow, &rout));
173   PetscCall(ISRestoreIndices(iscol, &cout));
174   PetscCall(VecRestoreArrayRead(bb, &b));
175   PetscCall(VecRestoreArray(xx, &x));
176   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
177   PetscFunctionReturn(0);
178 }
179 
180 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx) {
181   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
182   IS                 iscol = a->col, isrow = a->row;
183   const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
184   const PetscInt     n = a->mbs, *rout, *cout, *vi;
185   PetscInt           i, nz, idx, idt, idc, m;
186   const MatScalar   *aa = a->a, *v;
187   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
188   const PetscScalar *b;
189 
190   PetscFunctionBegin;
191   PetscCall(VecGetArrayRead(bb, &b));
192   PetscCall(VecGetArray(xx, &x));
193   t = a->solve_work;
194 
195   PetscCall(ISGetIndices(isrow, &rout));
196   r = rout;
197   PetscCall(ISGetIndices(iscol, &cout));
198   c = cout;
199 
200   /* forward solve the lower triangular */
201   idx  = 7 * r[0];
202   t[0] = b[idx];
203   t[1] = b[1 + idx];
204   t[2] = b[2 + idx];
205   t[3] = b[3 + idx];
206   t[4] = b[4 + idx];
207   t[5] = b[5 + idx];
208   t[6] = b[6 + idx];
209 
210   for (i = 1; i < n; i++) {
211     v   = aa + 49 * ai[i];
212     vi  = aj + ai[i];
213     nz  = ai[i + 1] - ai[i];
214     idx = 7 * r[i];
215     s1  = b[idx];
216     s2  = b[1 + idx];
217     s3  = b[2 + idx];
218     s4  = b[3 + idx];
219     s5  = b[4 + idx];
220     s6  = b[5 + idx];
221     s7  = b[6 + idx];
222     for (m = 0; m < nz; m++) {
223       idx = 7 * vi[m];
224       x1  = t[idx];
225       x2  = t[1 + idx];
226       x3  = t[2 + idx];
227       x4  = t[3 + idx];
228       x5  = t[4 + idx];
229       x6  = t[5 + idx];
230       x7  = t[6 + idx];
231       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
232       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
233       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
234       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
235       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
236       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
237       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
238       v += 49;
239     }
240     idx        = 7 * i;
241     t[idx]     = s1;
242     t[1 + idx] = s2;
243     t[2 + idx] = s3;
244     t[3 + idx] = s4;
245     t[4 + idx] = s5;
246     t[5 + idx] = s6;
247     t[6 + idx] = s7;
248   }
249   /* backward solve the upper triangular */
250   for (i = n - 1; i >= 0; i--) {
251     v   = aa + 49 * (adiag[i + 1] + 1);
252     vi  = aj + adiag[i + 1] + 1;
253     nz  = adiag[i] - adiag[i + 1] - 1;
254     idt = 7 * i;
255     s1  = t[idt];
256     s2  = t[1 + idt];
257     s3  = t[2 + idt];
258     s4  = t[3 + idt];
259     s5  = t[4 + idt];
260     s6  = t[5 + idt];
261     s7  = t[6 + idt];
262     for (m = 0; m < nz; m++) {
263       idx = 7 * vi[m];
264       x1  = t[idx];
265       x2  = t[1 + idx];
266       x3  = t[2 + idx];
267       x4  = t[3 + idx];
268       x5  = t[4 + idx];
269       x6  = t[5 + idx];
270       x7  = t[6 + idx];
271       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
272       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
273       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
274       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
275       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
276       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
277       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
278       v += 49;
279     }
280     idc    = 7 * c[i];
281     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
282     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
283     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
284     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
285     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
286     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
287     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
288   }
289 
290   PetscCall(ISRestoreIndices(isrow, &rout));
291   PetscCall(ISRestoreIndices(iscol, &cout));
292   PetscCall(VecRestoreArrayRead(bb, &b));
293   PetscCall(VecRestoreArray(xx, &x));
294   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
295   PetscFunctionReturn(0);
296 }
297 
298 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx) {
299   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
300   IS                 iscol = a->col, isrow = a->row;
301   const PetscInt    *r, *c, *rout, *cout;
302   const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
303   PetscInt           i, nz, idx, idt, idc;
304   const MatScalar   *aa = a->a, *v;
305   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
306   const PetscScalar *b;
307 
308   PetscFunctionBegin;
309   PetscCall(VecGetArrayRead(bb, &b));
310   PetscCall(VecGetArray(xx, &x));
311   t = a->solve_work;
312 
313   PetscCall(ISGetIndices(isrow, &rout));
314   r = rout;
315   PetscCall(ISGetIndices(iscol, &cout));
316   c = cout + (n - 1);
317 
318   /* forward solve the lower triangular */
319   idx  = 6 * (*r++);
320   t[0] = b[idx];
321   t[1] = b[1 + idx];
322   t[2] = b[2 + idx];
323   t[3] = b[3 + idx];
324   t[4] = b[4 + idx];
325   t[5] = b[5 + idx];
326   for (i = 1; i < n; i++) {
327     v   = aa + 36 * ai[i];
328     vi  = aj + ai[i];
329     nz  = diag[i] - ai[i];
330     idx = 6 * (*r++);
331     s1  = b[idx];
332     s2  = b[1 + idx];
333     s3  = b[2 + idx];
334     s4  = b[3 + idx];
335     s5  = b[4 + idx];
336     s6  = b[5 + idx];
337     while (nz--) {
338       idx = 6 * (*vi++);
339       x1  = t[idx];
340       x2  = t[1 + idx];
341       x3  = t[2 + idx];
342       x4  = t[3 + idx];
343       x5  = t[4 + idx];
344       x6  = t[5 + idx];
345       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
346       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
347       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
348       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
349       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
350       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
351       v += 36;
352     }
353     idx        = 6 * i;
354     t[idx]     = s1;
355     t[1 + idx] = s2;
356     t[2 + idx] = s3;
357     t[3 + idx] = s4;
358     t[4 + idx] = s5;
359     t[5 + idx] = s6;
360   }
361   /* backward solve the upper triangular */
362   for (i = n - 1; i >= 0; i--) {
363     v   = aa + 36 * diag[i] + 36;
364     vi  = aj + diag[i] + 1;
365     nz  = ai[i + 1] - diag[i] - 1;
366     idt = 6 * i;
367     s1  = t[idt];
368     s2  = t[1 + idt];
369     s3  = t[2 + idt];
370     s4  = t[3 + idt];
371     s5  = t[4 + idt];
372     s6  = t[5 + idt];
373     while (nz--) {
374       idx = 6 * (*vi++);
375       x1  = t[idx];
376       x2  = t[1 + idx];
377       x3  = t[2 + idx];
378       x4  = t[3 + idx];
379       x5  = t[4 + idx];
380       x6  = t[5 + idx];
381       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
382       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
383       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
384       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
385       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
386       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
387       v += 36;
388     }
389     idc    = 6 * (*c--);
390     v      = aa + 36 * diag[i];
391     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
392     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
393     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
394     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
395     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
396     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
397   }
398 
399   PetscCall(ISRestoreIndices(isrow, &rout));
400   PetscCall(ISRestoreIndices(iscol, &cout));
401   PetscCall(VecRestoreArrayRead(bb, &b));
402   PetscCall(VecRestoreArray(xx, &x));
403   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
404   PetscFunctionReturn(0);
405 }
406 
407 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx) {
408   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
409   IS                 iscol = a->col, isrow = a->row;
410   const PetscInt    *r, *c, *rout, *cout;
411   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
412   PetscInt           i, nz, idx, idt, idc, m;
413   const MatScalar   *aa = a->a, *v;
414   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
415   const PetscScalar *b;
416 
417   PetscFunctionBegin;
418   PetscCall(VecGetArrayRead(bb, &b));
419   PetscCall(VecGetArray(xx, &x));
420   t = a->solve_work;
421 
422   PetscCall(ISGetIndices(isrow, &rout));
423   r = rout;
424   PetscCall(ISGetIndices(iscol, &cout));
425   c = cout;
426 
427   /* forward solve the lower triangular */
428   idx  = 6 * r[0];
429   t[0] = b[idx];
430   t[1] = b[1 + idx];
431   t[2] = b[2 + idx];
432   t[3] = b[3 + idx];
433   t[4] = b[4 + idx];
434   t[5] = b[5 + idx];
435   for (i = 1; i < n; i++) {
436     v   = aa + 36 * ai[i];
437     vi  = aj + ai[i];
438     nz  = ai[i + 1] - ai[i];
439     idx = 6 * r[i];
440     s1  = b[idx];
441     s2  = b[1 + idx];
442     s3  = b[2 + idx];
443     s4  = b[3 + idx];
444     s5  = b[4 + idx];
445     s6  = b[5 + idx];
446     for (m = 0; m < nz; m++) {
447       idx = 6 * vi[m];
448       x1  = t[idx];
449       x2  = t[1 + idx];
450       x3  = t[2 + idx];
451       x4  = t[3 + idx];
452       x5  = t[4 + idx];
453       x6  = t[5 + idx];
454       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
455       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
456       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
457       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
458       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
459       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
460       v += 36;
461     }
462     idx        = 6 * i;
463     t[idx]     = s1;
464     t[1 + idx] = s2;
465     t[2 + idx] = s3;
466     t[3 + idx] = s4;
467     t[4 + idx] = s5;
468     t[5 + idx] = s6;
469   }
470   /* backward solve the upper triangular */
471   for (i = n - 1; i >= 0; i--) {
472     v   = aa + 36 * (adiag[i + 1] + 1);
473     vi  = aj + adiag[i + 1] + 1;
474     nz  = adiag[i] - adiag[i + 1] - 1;
475     idt = 6 * i;
476     s1  = t[idt];
477     s2  = t[1 + idt];
478     s3  = t[2 + idt];
479     s4  = t[3 + idt];
480     s5  = t[4 + idt];
481     s6  = t[5 + idt];
482     for (m = 0; m < nz; m++) {
483       idx = 6 * vi[m];
484       x1  = t[idx];
485       x2  = t[1 + idx];
486       x3  = t[2 + idx];
487       x4  = t[3 + idx];
488       x5  = t[4 + idx];
489       x6  = t[5 + idx];
490       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
491       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
492       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
493       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
494       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
495       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
496       v += 36;
497     }
498     idc    = 6 * c[i];
499     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
500     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
501     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
502     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
503     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
504     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
505   }
506 
507   PetscCall(ISRestoreIndices(isrow, &rout));
508   PetscCall(ISRestoreIndices(iscol, &cout));
509   PetscCall(VecRestoreArrayRead(bb, &b));
510   PetscCall(VecRestoreArray(xx, &x));
511   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
512   PetscFunctionReturn(0);
513 }
514 
515 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx) {
516   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
517   IS                 iscol = a->col, isrow = a->row;
518   const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
519   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
520   PetscInt           i, nz, idx, idt, idc;
521   const MatScalar   *aa = a->a, *v;
522   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
523   const PetscScalar *b;
524 
525   PetscFunctionBegin;
526   PetscCall(VecGetArrayRead(bb, &b));
527   PetscCall(VecGetArray(xx, &x));
528   t = a->solve_work;
529 
530   PetscCall(ISGetIndices(isrow, &rout));
531   r = rout;
532   PetscCall(ISGetIndices(iscol, &cout));
533   c = cout + (n - 1);
534 
535   /* forward solve the lower triangular */
536   idx  = 5 * (*r++);
537   t[0] = b[idx];
538   t[1] = b[1 + idx];
539   t[2] = b[2 + idx];
540   t[3] = b[3 + idx];
541   t[4] = b[4 + idx];
542   for (i = 1; i < n; i++) {
543     v   = aa + 25 * ai[i];
544     vi  = aj + ai[i];
545     nz  = diag[i] - ai[i];
546     idx = 5 * (*r++);
547     s1  = b[idx];
548     s2  = b[1 + idx];
549     s3  = b[2 + idx];
550     s4  = b[3 + idx];
551     s5  = b[4 + idx];
552     while (nz--) {
553       idx = 5 * (*vi++);
554       x1  = t[idx];
555       x2  = t[1 + idx];
556       x3  = t[2 + idx];
557       x4  = t[3 + idx];
558       x5  = t[4 + idx];
559       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
560       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
561       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
562       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
563       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
564       v += 25;
565     }
566     idx        = 5 * i;
567     t[idx]     = s1;
568     t[1 + idx] = s2;
569     t[2 + idx] = s3;
570     t[3 + idx] = s4;
571     t[4 + idx] = s5;
572   }
573   /* backward solve the upper triangular */
574   for (i = n - 1; i >= 0; i--) {
575     v   = aa + 25 * diag[i] + 25;
576     vi  = aj + diag[i] + 1;
577     nz  = ai[i + 1] - diag[i] - 1;
578     idt = 5 * i;
579     s1  = t[idt];
580     s2  = t[1 + idt];
581     s3  = t[2 + idt];
582     s4  = t[3 + idt];
583     s5  = t[4 + idt];
584     while (nz--) {
585       idx = 5 * (*vi++);
586       x1  = t[idx];
587       x2  = t[1 + idx];
588       x3  = t[2 + idx];
589       x4  = t[3 + idx];
590       x5  = t[4 + idx];
591       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
592       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
593       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
594       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
595       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
596       v += 25;
597     }
598     idc    = 5 * (*c--);
599     v      = aa + 25 * diag[i];
600     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
601     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
602     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
603     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
604     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
605   }
606 
607   PetscCall(ISRestoreIndices(isrow, &rout));
608   PetscCall(ISRestoreIndices(iscol, &cout));
609   PetscCall(VecRestoreArrayRead(bb, &b));
610   PetscCall(VecRestoreArray(xx, &x));
611   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
612   PetscFunctionReturn(0);
613 }
614 
615 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx) {
616   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
617   IS                 iscol = a->col, isrow = a->row;
618   const PetscInt    *r, *c, *rout, *cout;
619   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
620   PetscInt           i, nz, idx, idt, idc, m;
621   const MatScalar   *aa = a->a, *v;
622   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
623   const PetscScalar *b;
624 
625   PetscFunctionBegin;
626   PetscCall(VecGetArrayRead(bb, &b));
627   PetscCall(VecGetArray(xx, &x));
628   t = a->solve_work;
629 
630   PetscCall(ISGetIndices(isrow, &rout));
631   r = rout;
632   PetscCall(ISGetIndices(iscol, &cout));
633   c = cout;
634 
635   /* forward solve the lower triangular */
636   idx  = 5 * r[0];
637   t[0] = b[idx];
638   t[1] = b[1 + idx];
639   t[2] = b[2 + idx];
640   t[3] = b[3 + idx];
641   t[4] = b[4 + idx];
642   for (i = 1; i < n; i++) {
643     v   = aa + 25 * ai[i];
644     vi  = aj + ai[i];
645     nz  = ai[i + 1] - ai[i];
646     idx = 5 * r[i];
647     s1  = b[idx];
648     s2  = b[1 + idx];
649     s3  = b[2 + idx];
650     s4  = b[3 + idx];
651     s5  = b[4 + idx];
652     for (m = 0; m < nz; m++) {
653       idx = 5 * vi[m];
654       x1  = t[idx];
655       x2  = t[1 + idx];
656       x3  = t[2 + idx];
657       x4  = t[3 + idx];
658       x5  = t[4 + idx];
659       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
660       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
661       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
662       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
663       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
664       v += 25;
665     }
666     idx        = 5 * i;
667     t[idx]     = s1;
668     t[1 + idx] = s2;
669     t[2 + idx] = s3;
670     t[3 + idx] = s4;
671     t[4 + idx] = s5;
672   }
673   /* backward solve the upper triangular */
674   for (i = n - 1; i >= 0; i--) {
675     v   = aa + 25 * (adiag[i + 1] + 1);
676     vi  = aj + adiag[i + 1] + 1;
677     nz  = adiag[i] - adiag[i + 1] - 1;
678     idt = 5 * i;
679     s1  = t[idt];
680     s2  = t[1 + idt];
681     s3  = t[2 + idt];
682     s4  = t[3 + idt];
683     s5  = t[4 + idt];
684     for (m = 0; m < nz; m++) {
685       idx = 5 * vi[m];
686       x1  = t[idx];
687       x2  = t[1 + idx];
688       x3  = t[2 + idx];
689       x4  = t[3 + idx];
690       x5  = t[4 + idx];
691       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
692       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
693       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
694       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
695       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
696       v += 25;
697     }
698     idc    = 5 * c[i];
699     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
700     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
701     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
702     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
703     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
704   }
705 
706   PetscCall(ISRestoreIndices(isrow, &rout));
707   PetscCall(ISRestoreIndices(iscol, &cout));
708   PetscCall(VecRestoreArrayRead(bb, &b));
709   PetscCall(VecRestoreArray(xx, &x));
710   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
711   PetscFunctionReturn(0);
712 }
713 
714 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx) {
715   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
716   IS                 iscol = a->col, isrow = a->row;
717   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
718   PetscInt           i, nz, idx, idt, idc;
719   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
720   const MatScalar   *aa = a->a, *v;
721   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
722   const PetscScalar *b;
723 
724   PetscFunctionBegin;
725   PetscCall(VecGetArrayRead(bb, &b));
726   PetscCall(VecGetArray(xx, &x));
727   t = a->solve_work;
728 
729   PetscCall(ISGetIndices(isrow, &rout));
730   r = rout;
731   PetscCall(ISGetIndices(iscol, &cout));
732   c = cout + (n - 1);
733 
734   /* forward solve the lower triangular */
735   idx  = 4 * (*r++);
736   t[0] = b[idx];
737   t[1] = b[1 + idx];
738   t[2] = b[2 + idx];
739   t[3] = b[3 + idx];
740   for (i = 1; i < n; i++) {
741     v   = aa + 16 * ai[i];
742     vi  = aj + ai[i];
743     nz  = diag[i] - ai[i];
744     idx = 4 * (*r++);
745     s1  = b[idx];
746     s2  = b[1 + idx];
747     s3  = b[2 + idx];
748     s4  = b[3 + idx];
749     while (nz--) {
750       idx = 4 * (*vi++);
751       x1  = t[idx];
752       x2  = t[1 + idx];
753       x3  = t[2 + idx];
754       x4  = t[3 + idx];
755       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
756       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
757       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
758       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
759       v += 16;
760     }
761     idx        = 4 * i;
762     t[idx]     = s1;
763     t[1 + idx] = s2;
764     t[2 + idx] = s3;
765     t[3 + idx] = s4;
766   }
767   /* backward solve the upper triangular */
768   for (i = n - 1; i >= 0; i--) {
769     v   = aa + 16 * diag[i] + 16;
770     vi  = aj + diag[i] + 1;
771     nz  = ai[i + 1] - diag[i] - 1;
772     idt = 4 * i;
773     s1  = t[idt];
774     s2  = t[1 + idt];
775     s3  = t[2 + idt];
776     s4  = t[3 + idt];
777     while (nz--) {
778       idx = 4 * (*vi++);
779       x1  = t[idx];
780       x2  = t[1 + idx];
781       x3  = t[2 + idx];
782       x4  = t[3 + idx];
783       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
784       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
785       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
786       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
787       v += 16;
788     }
789     idc    = 4 * (*c--);
790     v      = aa + 16 * diag[i];
791     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
792     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
793     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
794     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
795   }
796 
797   PetscCall(ISRestoreIndices(isrow, &rout));
798   PetscCall(ISRestoreIndices(iscol, &cout));
799   PetscCall(VecRestoreArrayRead(bb, &b));
800   PetscCall(VecRestoreArray(xx, &x));
801   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
802   PetscFunctionReturn(0);
803 }
804 
805 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx) {
806   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
807   IS                 iscol = a->col, isrow = a->row;
808   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
809   PetscInt           i, nz, idx, idt, idc, m;
810   const PetscInt    *r, *c, *rout, *cout;
811   const MatScalar   *aa = a->a, *v;
812   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
813   const PetscScalar *b;
814 
815   PetscFunctionBegin;
816   PetscCall(VecGetArrayRead(bb, &b));
817   PetscCall(VecGetArray(xx, &x));
818   t = a->solve_work;
819 
820   PetscCall(ISGetIndices(isrow, &rout));
821   r = rout;
822   PetscCall(ISGetIndices(iscol, &cout));
823   c = cout;
824 
825   /* forward solve the lower triangular */
826   idx  = 4 * r[0];
827   t[0] = b[idx];
828   t[1] = b[1 + idx];
829   t[2] = b[2 + idx];
830   t[3] = b[3 + idx];
831   for (i = 1; i < n; i++) {
832     v   = aa + 16 * ai[i];
833     vi  = aj + ai[i];
834     nz  = ai[i + 1] - ai[i];
835     idx = 4 * r[i];
836     s1  = b[idx];
837     s2  = b[1 + idx];
838     s3  = b[2 + idx];
839     s4  = b[3 + idx];
840     for (m = 0; m < nz; m++) {
841       idx = 4 * vi[m];
842       x1  = t[idx];
843       x2  = t[1 + idx];
844       x3  = t[2 + idx];
845       x4  = t[3 + idx];
846       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
847       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
848       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
849       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
850       v += 16;
851     }
852     idx        = 4 * i;
853     t[idx]     = s1;
854     t[1 + idx] = s2;
855     t[2 + idx] = s3;
856     t[3 + idx] = s4;
857   }
858   /* backward solve the upper triangular */
859   for (i = n - 1; i >= 0; i--) {
860     v   = aa + 16 * (adiag[i + 1] + 1);
861     vi  = aj + adiag[i + 1] + 1;
862     nz  = adiag[i] - adiag[i + 1] - 1;
863     idt = 4 * i;
864     s1  = t[idt];
865     s2  = t[1 + idt];
866     s3  = t[2 + idt];
867     s4  = t[3 + idt];
868     for (m = 0; m < nz; m++) {
869       idx = 4 * vi[m];
870       x1  = t[idx];
871       x2  = t[1 + idx];
872       x3  = t[2 + idx];
873       x4  = t[3 + idx];
874       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
875       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
876       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
877       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
878       v += 16;
879     }
880     idc    = 4 * c[i];
881     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
882     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
883     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
884     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
885   }
886 
887   PetscCall(ISRestoreIndices(isrow, &rout));
888   PetscCall(ISRestoreIndices(iscol, &cout));
889   PetscCall(VecRestoreArrayRead(bb, &b));
890   PetscCall(VecRestoreArray(xx, &x));
891   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
892   PetscFunctionReturn(0);
893 }
894 
895 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx) {
896   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
897   IS                 iscol = a->col, isrow = a->row;
898   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
899   PetscInt           i, nz, idx, idt, idc;
900   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
901   const MatScalar   *aa = a->a, *v;
902   MatScalar          s1, s2, s3, s4, x1, x2, x3, x4, *t;
903   PetscScalar       *x;
904   const PetscScalar *b;
905 
906   PetscFunctionBegin;
907   PetscCall(VecGetArrayRead(bb, &b));
908   PetscCall(VecGetArray(xx, &x));
909   t = (MatScalar *)a->solve_work;
910 
911   PetscCall(ISGetIndices(isrow, &rout));
912   r = rout;
913   PetscCall(ISGetIndices(iscol, &cout));
914   c = cout + (n - 1);
915 
916   /* forward solve the lower triangular */
917   idx  = 4 * (*r++);
918   t[0] = (MatScalar)b[idx];
919   t[1] = (MatScalar)b[1 + idx];
920   t[2] = (MatScalar)b[2 + idx];
921   t[3] = (MatScalar)b[3 + idx];
922   for (i = 1; i < n; i++) {
923     v   = aa + 16 * ai[i];
924     vi  = aj + ai[i];
925     nz  = diag[i] - ai[i];
926     idx = 4 * (*r++);
927     s1  = (MatScalar)b[idx];
928     s2  = (MatScalar)b[1 + idx];
929     s3  = (MatScalar)b[2 + idx];
930     s4  = (MatScalar)b[3 + idx];
931     while (nz--) {
932       idx = 4 * (*vi++);
933       x1  = t[idx];
934       x2  = t[1 + idx];
935       x3  = t[2 + idx];
936       x4  = t[3 + idx];
937       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
938       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
939       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
940       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
941       v += 16;
942     }
943     idx        = 4 * i;
944     t[idx]     = s1;
945     t[1 + idx] = s2;
946     t[2 + idx] = s3;
947     t[3 + idx] = s4;
948   }
949   /* backward solve the upper triangular */
950   for (i = n - 1; i >= 0; i--) {
951     v   = aa + 16 * diag[i] + 16;
952     vi  = aj + diag[i] + 1;
953     nz  = ai[i + 1] - diag[i] - 1;
954     idt = 4 * i;
955     s1  = t[idt];
956     s2  = t[1 + idt];
957     s3  = t[2 + idt];
958     s4  = t[3 + idt];
959     while (nz--) {
960       idx = 4 * (*vi++);
961       x1  = t[idx];
962       x2  = t[1 + idx];
963       x3  = t[2 + idx];
964       x4  = t[3 + idx];
965       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
966       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
967       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
968       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
969       v += 16;
970     }
971     idc        = 4 * (*c--);
972     v          = aa + 16 * diag[i];
973     t[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
974     t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
975     t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
976     t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
977     x[idc]     = (PetscScalar)t[idt];
978     x[1 + idc] = (PetscScalar)t[1 + idt];
979     x[2 + idc] = (PetscScalar)t[2 + idt];
980     x[3 + idc] = (PetscScalar)t[3 + idt];
981   }
982 
983   PetscCall(ISRestoreIndices(isrow, &rout));
984   PetscCall(ISRestoreIndices(iscol, &cout));
985   PetscCall(VecRestoreArrayRead(bb, &b));
986   PetscCall(VecRestoreArray(xx, &x));
987   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
988   PetscFunctionReturn(0);
989 }
990 
991 #if defined(PETSC_HAVE_SSE)
992 
993 #include PETSC_HAVE_SSE
994 
995 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx) {
996   /*
997      Note: This code uses demotion of double
998      to float when performing the mixed-mode computation.
999      This may not be numerically reasonable for all applications.
1000   */
1001   Mat_SeqBAIJ    *a     = (Mat_SeqBAIJ *)A->data;
1002   IS              iscol = a->col, isrow = a->row;
1003   PetscInt        i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
1004   const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1005   MatScalar      *aa = a->a, *v;
1006   PetscScalar    *x, *b, *t;
1007 
1008   /* Make space in temp stack for 16 Byte Aligned arrays */
1009   float         ssealignedspace[11], *tmps, *tmpx;
1010   unsigned long offset;
1011 
1012   PetscFunctionBegin;
1013   SSE_SCOPE_BEGIN;
1014 
1015   offset = (unsigned long)ssealignedspace % 16;
1016   if (offset) offset = (16 - offset) / 4;
1017   tmps = &ssealignedspace[offset];
1018   tmpx = &ssealignedspace[offset + 4];
1019   PREFETCH_NTA(aa + 16 * ai[1]);
1020 
1021   PetscCall(VecGetArray(bb, &b));
1022   PetscCall(VecGetArray(xx, &x));
1023   t = a->solve_work;
1024 
1025   PetscCall(ISGetIndices(isrow, &rout));
1026   r = rout;
1027   PetscCall(ISGetIndices(iscol, &cout));
1028   c = cout + (n - 1);
1029 
1030   /* forward solve the lower triangular */
1031   idx  = 4 * (*r++);
1032   t[0] = b[idx];
1033   t[1] = b[1 + idx];
1034   t[2] = b[2 + idx];
1035   t[3] = b[3 + idx];
1036   v    = aa + 16 * ai[1];
1037 
1038   for (i = 1; i < n;) {
1039     PREFETCH_NTA(&v[8]);
1040     vi  = aj + ai[i];
1041     nz  = diag[i] - ai[i];
1042     idx = 4 * (*r++);
1043 
1044     /* Demote sum from double to float */
1045     CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
1046     LOAD_PS(tmps, XMM7);
1047 
1048     while (nz--) {
1049       PREFETCH_NTA(&v[16]);
1050       idx = 4 * (*vi++);
1051 
1052       /* Demote solution (so far) from double to float */
1053       CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);
1054 
1055       /* 4x4 Matrix-Vector product with negative accumulation: */
1056       SSE_INLINE_BEGIN_2(tmpx, v)
1057       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1058 
1059       /* First Column */
1060       SSE_COPY_PS(XMM0, XMM6)
1061       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1062       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1063       SSE_SUB_PS(XMM7, XMM0)
1064 
1065       /* Second Column */
1066       SSE_COPY_PS(XMM1, XMM6)
1067       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1068       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1069       SSE_SUB_PS(XMM7, XMM1)
1070 
1071       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1072 
1073       /* Third Column */
1074       SSE_COPY_PS(XMM2, XMM6)
1075       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1076       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1077       SSE_SUB_PS(XMM7, XMM2)
1078 
1079       /* Fourth Column */
1080       SSE_COPY_PS(XMM3, XMM6)
1081       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1082       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1083       SSE_SUB_PS(XMM7, XMM3)
1084       SSE_INLINE_END_2
1085 
1086       v += 16;
1087     }
1088     idx = 4 * i;
1089     v   = aa + 16 * ai[++i];
1090     PREFETCH_NTA(v);
1091     STORE_PS(tmps, XMM7);
1092 
1093     /* Promote result from float to double */
1094     CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
1095   }
1096   /* backward solve the upper triangular */
1097   idt  = 4 * (n - 1);
1098   ai16 = 16 * diag[n - 1];
1099   v    = aa + ai16 + 16;
1100   for (i = n - 1; i >= 0;) {
1101     PREFETCH_NTA(&v[8]);
1102     vi = aj + diag[i] + 1;
1103     nz = ai[i + 1] - diag[i] - 1;
1104 
1105     /* Demote accumulator from double to float */
1106     CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
1107     LOAD_PS(tmps, XMM7);
1108 
1109     while (nz--) {
1110       PREFETCH_NTA(&v[16]);
1111       idx = 4 * (*vi++);
1112 
1113       /* Demote solution (so far) from double to float */
1114       CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);
1115 
1116       /* 4x4 Matrix-Vector Product with negative accumulation: */
1117       SSE_INLINE_BEGIN_2(tmpx, v)
1118       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1119 
1120       /* First Column */
1121       SSE_COPY_PS(XMM0, XMM6)
1122       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1123       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1124       SSE_SUB_PS(XMM7, XMM0)
1125 
1126       /* Second Column */
1127       SSE_COPY_PS(XMM1, XMM6)
1128       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1129       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1130       SSE_SUB_PS(XMM7, XMM1)
1131 
1132       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1133 
1134       /* Third Column */
1135       SSE_COPY_PS(XMM2, XMM6)
1136       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1137       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1138       SSE_SUB_PS(XMM7, XMM2)
1139 
1140       /* Fourth Column */
1141       SSE_COPY_PS(XMM3, XMM6)
1142       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1143       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1144       SSE_SUB_PS(XMM7, XMM3)
1145       SSE_INLINE_END_2
1146       v += 16;
1147     }
1148     v    = aa + ai16;
1149     ai16 = 16 * diag[--i];
1150     PREFETCH_NTA(aa + ai16 + 16);
1151     /*
1152        Scale the result by the diagonal 4x4 block,
1153        which was inverted as part of the factorization
1154     */
1155     SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
1156     /* First Column */
1157     SSE_COPY_PS(XMM0, XMM7)
1158     SSE_SHUFFLE(XMM0, XMM0, 0x00)
1159     SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
1160 
1161     /* Second Column */
1162     SSE_COPY_PS(XMM1, XMM7)
1163     SSE_SHUFFLE(XMM1, XMM1, 0x55)
1164     SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
1165     SSE_ADD_PS(XMM0, XMM1)
1166 
1167     SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
1168 
1169     /* Third Column */
1170     SSE_COPY_PS(XMM2, XMM7)
1171     SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1172     SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
1173     SSE_ADD_PS(XMM0, XMM2)
1174 
1175     /* Fourth Column */
1176     SSE_COPY_PS(XMM3, XMM7)
1177     SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1178     SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
1179     SSE_ADD_PS(XMM0, XMM3)
1180 
1181     SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
1182     SSE_INLINE_END_3
1183 
1184     /* Promote solution from float to double */
1185     CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);
1186 
1187     /* Apply reordering to t and stream into x.    */
1188     /* This way, x doesn't pollute the cache.      */
1189     /* Be careful with size: 2 doubles = 4 floats! */
1190     idc = 4 * (*c--);
1191     SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
1192     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1193     SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
1194     SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
1195     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1196     SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
1197     SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
1198     SSE_INLINE_END_2
1199     v = aa + ai16 + 16;
1200     idt -= 4;
1201   }
1202 
1203   PetscCall(ISRestoreIndices(isrow, &rout));
1204   PetscCall(ISRestoreIndices(iscol, &cout));
1205   PetscCall(VecRestoreArray(bb, &b));
1206   PetscCall(VecRestoreArray(xx, &x));
1207   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
1208   SSE_SCOPE_END;
1209   PetscFunctionReturn(0);
1210 }
1211 
1212 #endif
1213 
1214 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx) {
1215   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1216   IS                 iscol = a->col, isrow = a->row;
1217   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1218   PetscInt           i, nz, idx, idt, idc;
1219   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1220   const MatScalar   *aa = a->a, *v;
1221   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1222   const PetscScalar *b;
1223 
1224   PetscFunctionBegin;
1225   PetscCall(VecGetArrayRead(bb, &b));
1226   PetscCall(VecGetArray(xx, &x));
1227   t = a->solve_work;
1228 
1229   PetscCall(ISGetIndices(isrow, &rout));
1230   r = rout;
1231   PetscCall(ISGetIndices(iscol, &cout));
1232   c = cout + (n - 1);
1233 
1234   /* forward solve the lower triangular */
1235   idx  = 3 * (*r++);
1236   t[0] = b[idx];
1237   t[1] = b[1 + idx];
1238   t[2] = b[2 + idx];
1239   for (i = 1; i < n; i++) {
1240     v   = aa + 9 * ai[i];
1241     vi  = aj + ai[i];
1242     nz  = diag[i] - ai[i];
1243     idx = 3 * (*r++);
1244     s1  = b[idx];
1245     s2  = b[1 + idx];
1246     s3  = b[2 + idx];
1247     while (nz--) {
1248       idx = 3 * (*vi++);
1249       x1  = t[idx];
1250       x2  = t[1 + idx];
1251       x3  = t[2 + idx];
1252       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1253       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1254       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1255       v += 9;
1256     }
1257     idx        = 3 * i;
1258     t[idx]     = s1;
1259     t[1 + idx] = s2;
1260     t[2 + idx] = s3;
1261   }
1262   /* backward solve the upper triangular */
1263   for (i = n - 1; i >= 0; i--) {
1264     v   = aa + 9 * diag[i] + 9;
1265     vi  = aj + diag[i] + 1;
1266     nz  = ai[i + 1] - diag[i] - 1;
1267     idt = 3 * i;
1268     s1  = t[idt];
1269     s2  = t[1 + idt];
1270     s3  = t[2 + idt];
1271     while (nz--) {
1272       idx = 3 * (*vi++);
1273       x1  = t[idx];
1274       x2  = t[1 + idx];
1275       x3  = t[2 + idx];
1276       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1277       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1278       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1279       v += 9;
1280     }
1281     idc    = 3 * (*c--);
1282     v      = aa + 9 * diag[i];
1283     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1284     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1285     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1286   }
1287   PetscCall(ISRestoreIndices(isrow, &rout));
1288   PetscCall(ISRestoreIndices(iscol, &cout));
1289   PetscCall(VecRestoreArrayRead(bb, &b));
1290   PetscCall(VecRestoreArray(xx, &x));
1291   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1292   PetscFunctionReturn(0);
1293 }
1294 
1295 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx) {
1296   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1297   IS                 iscol = a->col, isrow = a->row;
1298   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1299   PetscInt           i, nz, idx, idt, idc, m;
1300   const PetscInt    *r, *c, *rout, *cout;
1301   const MatScalar   *aa = a->a, *v;
1302   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1303   const PetscScalar *b;
1304 
1305   PetscFunctionBegin;
1306   PetscCall(VecGetArrayRead(bb, &b));
1307   PetscCall(VecGetArray(xx, &x));
1308   t = a->solve_work;
1309 
1310   PetscCall(ISGetIndices(isrow, &rout));
1311   r = rout;
1312   PetscCall(ISGetIndices(iscol, &cout));
1313   c = cout;
1314 
1315   /* forward solve the lower triangular */
1316   idx  = 3 * r[0];
1317   t[0] = b[idx];
1318   t[1] = b[1 + idx];
1319   t[2] = b[2 + idx];
1320   for (i = 1; i < n; i++) {
1321     v   = aa + 9 * ai[i];
1322     vi  = aj + ai[i];
1323     nz  = ai[i + 1] - ai[i];
1324     idx = 3 * r[i];
1325     s1  = b[idx];
1326     s2  = b[1 + idx];
1327     s3  = b[2 + idx];
1328     for (m = 0; m < nz; m++) {
1329       idx = 3 * vi[m];
1330       x1  = t[idx];
1331       x2  = t[1 + idx];
1332       x3  = t[2 + idx];
1333       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1334       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1335       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1336       v += 9;
1337     }
1338     idx        = 3 * i;
1339     t[idx]     = s1;
1340     t[1 + idx] = s2;
1341     t[2 + idx] = s3;
1342   }
1343   /* backward solve the upper triangular */
1344   for (i = n - 1; i >= 0; i--) {
1345     v   = aa + 9 * (adiag[i + 1] + 1);
1346     vi  = aj + adiag[i + 1] + 1;
1347     nz  = adiag[i] - adiag[i + 1] - 1;
1348     idt = 3 * i;
1349     s1  = t[idt];
1350     s2  = t[1 + idt];
1351     s3  = t[2 + idt];
1352     for (m = 0; m < nz; m++) {
1353       idx = 3 * vi[m];
1354       x1  = t[idx];
1355       x2  = t[1 + idx];
1356       x3  = t[2 + idx];
1357       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1358       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1359       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1360       v += 9;
1361     }
1362     idc    = 3 * c[i];
1363     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1364     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1365     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1366   }
1367   PetscCall(ISRestoreIndices(isrow, &rout));
1368   PetscCall(ISRestoreIndices(iscol, &cout));
1369   PetscCall(VecRestoreArrayRead(bb, &b));
1370   PetscCall(VecRestoreArray(xx, &x));
1371   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1372   PetscFunctionReturn(0);
1373 }
1374 
1375 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx) {
1376   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1377   IS                 iscol = a->col, isrow = a->row;
1378   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1379   PetscInt           i, nz, idx, idt, idc;
1380   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1381   const MatScalar   *aa = a->a, *v;
1382   PetscScalar       *x, s1, s2, x1, x2, *t;
1383   const PetscScalar *b;
1384 
1385   PetscFunctionBegin;
1386   PetscCall(VecGetArrayRead(bb, &b));
1387   PetscCall(VecGetArray(xx, &x));
1388   t = a->solve_work;
1389 
1390   PetscCall(ISGetIndices(isrow, &rout));
1391   r = rout;
1392   PetscCall(ISGetIndices(iscol, &cout));
1393   c = cout + (n - 1);
1394 
1395   /* forward solve the lower triangular */
1396   idx  = 2 * (*r++);
1397   t[0] = b[idx];
1398   t[1] = b[1 + idx];
1399   for (i = 1; i < n; i++) {
1400     v   = aa + 4 * ai[i];
1401     vi  = aj + ai[i];
1402     nz  = diag[i] - ai[i];
1403     idx = 2 * (*r++);
1404     s1  = b[idx];
1405     s2  = b[1 + idx];
1406     while (nz--) {
1407       idx = 2 * (*vi++);
1408       x1  = t[idx];
1409       x2  = t[1 + idx];
1410       s1 -= v[0] * x1 + v[2] * x2;
1411       s2 -= v[1] * x1 + v[3] * x2;
1412       v += 4;
1413     }
1414     idx        = 2 * i;
1415     t[idx]     = s1;
1416     t[1 + idx] = s2;
1417   }
1418   /* backward solve the upper triangular */
1419   for (i = n - 1; i >= 0; i--) {
1420     v   = aa + 4 * diag[i] + 4;
1421     vi  = aj + diag[i] + 1;
1422     nz  = ai[i + 1] - diag[i] - 1;
1423     idt = 2 * i;
1424     s1  = t[idt];
1425     s2  = t[1 + idt];
1426     while (nz--) {
1427       idx = 2 * (*vi++);
1428       x1  = t[idx];
1429       x2  = t[1 + idx];
1430       s1 -= v[0] * x1 + v[2] * x2;
1431       s2 -= v[1] * x1 + v[3] * x2;
1432       v += 4;
1433     }
1434     idc    = 2 * (*c--);
1435     v      = aa + 4 * diag[i];
1436     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1437     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1438   }
1439   PetscCall(ISRestoreIndices(isrow, &rout));
1440   PetscCall(ISRestoreIndices(iscol, &cout));
1441   PetscCall(VecRestoreArrayRead(bb, &b));
1442   PetscCall(VecRestoreArray(xx, &x));
1443   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1444   PetscFunctionReturn(0);
1445 }
1446 
1447 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx) {
1448   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1449   IS                 iscol = a->col, isrow = a->row;
1450   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1451   PetscInt           i, nz, idx, jdx, idt, idc, m;
1452   const PetscInt    *r, *c, *rout, *cout;
1453   const MatScalar   *aa = a->a, *v;
1454   PetscScalar       *x, s1, s2, x1, x2, *t;
1455   const PetscScalar *b;
1456 
1457   PetscFunctionBegin;
1458   PetscCall(VecGetArrayRead(bb, &b));
1459   PetscCall(VecGetArray(xx, &x));
1460   t = a->solve_work;
1461 
1462   PetscCall(ISGetIndices(isrow, &rout));
1463   r = rout;
1464   PetscCall(ISGetIndices(iscol, &cout));
1465   c = cout;
1466 
1467   /* forward solve the lower triangular */
1468   idx  = 2 * r[0];
1469   t[0] = b[idx];
1470   t[1] = b[1 + idx];
1471   for (i = 1; i < n; i++) {
1472     v   = aa + 4 * ai[i];
1473     vi  = aj + ai[i];
1474     nz  = ai[i + 1] - ai[i];
1475     idx = 2 * r[i];
1476     s1  = b[idx];
1477     s2  = b[1 + idx];
1478     for (m = 0; m < nz; m++) {
1479       jdx = 2 * vi[m];
1480       x1  = t[jdx];
1481       x2  = t[1 + jdx];
1482       s1 -= v[0] * x1 + v[2] * x2;
1483       s2 -= v[1] * x1 + v[3] * x2;
1484       v += 4;
1485     }
1486     idx        = 2 * i;
1487     t[idx]     = s1;
1488     t[1 + idx] = s2;
1489   }
1490   /* backward solve the upper triangular */
1491   for (i = n - 1; i >= 0; i--) {
1492     v   = aa + 4 * (adiag[i + 1] + 1);
1493     vi  = aj + adiag[i + 1] + 1;
1494     nz  = adiag[i] - adiag[i + 1] - 1;
1495     idt = 2 * i;
1496     s1  = t[idt];
1497     s2  = t[1 + idt];
1498     for (m = 0; m < nz; m++) {
1499       idx = 2 * vi[m];
1500       x1  = t[idx];
1501       x2  = t[1 + idx];
1502       s1 -= v[0] * x1 + v[2] * x2;
1503       s2 -= v[1] * x1 + v[3] * x2;
1504       v += 4;
1505     }
1506     idc    = 2 * c[i];
1507     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1508     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1509   }
1510   PetscCall(ISRestoreIndices(isrow, &rout));
1511   PetscCall(ISRestoreIndices(iscol, &cout));
1512   PetscCall(VecRestoreArrayRead(bb, &b));
1513   PetscCall(VecRestoreArray(xx, &x));
1514   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1515   PetscFunctionReturn(0);
1516 }
1517 
1518 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx) {
1519   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1520   IS                 iscol = a->col, isrow = a->row;
1521   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1522   PetscInt           i, nz;
1523   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1524   const MatScalar   *aa = a->a, *v;
1525   PetscScalar       *x, s1, *t;
1526   const PetscScalar *b;
1527 
1528   PetscFunctionBegin;
1529   if (!n) PetscFunctionReturn(0);
1530 
1531   PetscCall(VecGetArrayRead(bb, &b));
1532   PetscCall(VecGetArray(xx, &x));
1533   t = a->solve_work;
1534 
1535   PetscCall(ISGetIndices(isrow, &rout));
1536   r = rout;
1537   PetscCall(ISGetIndices(iscol, &cout));
1538   c = cout + (n - 1);
1539 
1540   /* forward solve the lower triangular */
1541   t[0] = b[*r++];
1542   for (i = 1; i < n; i++) {
1543     v  = aa + ai[i];
1544     vi = aj + ai[i];
1545     nz = diag[i] - ai[i];
1546     s1 = b[*r++];
1547     while (nz--) { s1 -= (*v++) * t[*vi++]; }
1548     t[i] = s1;
1549   }
1550   /* backward solve the upper triangular */
1551   for (i = n - 1; i >= 0; i--) {
1552     v  = aa + diag[i] + 1;
1553     vi = aj + diag[i] + 1;
1554     nz = ai[i + 1] - diag[i] - 1;
1555     s1 = t[i];
1556     while (nz--) { s1 -= (*v++) * t[*vi++]; }
1557     x[*c--] = t[i] = aa[diag[i]] * s1;
1558   }
1559 
1560   PetscCall(ISRestoreIndices(isrow, &rout));
1561   PetscCall(ISRestoreIndices(iscol, &cout));
1562   PetscCall(VecRestoreArrayRead(bb, &b));
1563   PetscCall(VecRestoreArray(xx, &x));
1564   PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
1565   PetscFunctionReturn(0);
1566 }
1567 
1568 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx) {
1569   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1570   IS                 iscol = a->col, isrow = a->row;
1571   PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1572   const PetscInt    *rout, *cout, *r, *c;
1573   PetscScalar       *x, *tmp, sum;
1574   const PetscScalar *b;
1575   const MatScalar   *aa = a->a, *v;
1576 
1577   PetscFunctionBegin;
1578   if (!n) PetscFunctionReturn(0);
1579 
1580   PetscCall(VecGetArrayRead(bb, &b));
1581   PetscCall(VecGetArray(xx, &x));
1582   tmp = a->solve_work;
1583 
1584   PetscCall(ISGetIndices(isrow, &rout));
1585   r = rout;
1586   PetscCall(ISGetIndices(iscol, &cout));
1587   c = cout;
1588 
1589   /* forward solve the lower triangular */
1590   tmp[0] = b[r[0]];
1591   v      = aa;
1592   vi     = aj;
1593   for (i = 1; i < n; i++) {
1594     nz  = ai[i + 1] - ai[i];
1595     sum = b[r[i]];
1596     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1597     tmp[i] = sum;
1598     v += nz;
1599     vi += nz;
1600   }
1601 
1602   /* backward solve the upper triangular */
1603   for (i = n - 1; i >= 0; i--) {
1604     v   = aa + adiag[i + 1] + 1;
1605     vi  = aj + adiag[i + 1] + 1;
1606     nz  = adiag[i] - adiag[i + 1] - 1;
1607     sum = tmp[i];
1608     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1609     x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1610   }
1611 
1612   PetscCall(ISRestoreIndices(isrow, &rout));
1613   PetscCall(ISRestoreIndices(iscol, &cout));
1614   PetscCall(VecRestoreArrayRead(bb, &b));
1615   PetscCall(VecRestoreArray(xx, &x));
1616   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1617   PetscFunctionReturn(0);
1618 }
1619