xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision 98d129c30f3ee9fdddc40fdbc5a989b7be64f888)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
5 {
6   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
7   IS                 iscol = a->col, isrow = a->row;
8   const PetscInt    *r, *c, *rout, *cout;
9   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
10   PetscInt           i, nz;
11   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
12   const MatScalar   *aa = a->a, *v;
13   PetscScalar       *x, *s, *t, *ls;
14   const PetscScalar *b;
15 
16   PetscFunctionBegin;
17   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
18   PetscCall(VecGetArrayRead(bb, &b));
19   PetscCall(VecGetArray(xx, &x));
20   t = a->solve_work;
21 
22   PetscCall(ISGetIndices(isrow, &rout));
23   r = rout;
24   PetscCall(ISGetIndices(iscol, &cout));
25   c = cout + (n - 1);
26 
27   /* forward solve the lower triangular */
28   PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
29   for (i = 1; i < n; i++) {
30     v  = aa + bs2 * ai[i];
31     vi = aj + ai[i];
32     nz = a->diag[i] - ai[i];
33     s  = t + bs * i;
34     PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
35     while (nz--) {
36       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
37       v += bs2;
38     }
39   }
40   /* backward solve the upper triangular */
41   ls = a->solve_work + A->cmap->n;
42   for (i = n - 1; i >= 0; i--) {
43     v  = aa + bs2 * (a->diag[i] + 1);
44     vi = aj + a->diag[i] + 1;
45     nz = ai[i + 1] - a->diag[i] - 1;
46     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
47     while (nz--) {
48       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
49       v += bs2;
50     }
51     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
52     PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
53   }
54 
55   PetscCall(ISRestoreIndices(isrow, &rout));
56   PetscCall(ISRestoreIndices(iscol, &cout));
57   PetscCall(VecRestoreArrayRead(bb, &b));
58   PetscCall(VecRestoreArray(xx, &x));
59   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
60   PetscFunctionReturn(PETSC_SUCCESS);
61 }
62 
63 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
64 {
65   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
66   IS                 iscol = a->col, isrow = a->row;
67   const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
68   const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
69   PetscInt           i, nz, idx, idt, idc;
70   const MatScalar   *aa = a->a, *v;
71   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
72   const PetscScalar *b;
73 
74   PetscFunctionBegin;
75   PetscCall(VecGetArrayRead(bb, &b));
76   PetscCall(VecGetArray(xx, &x));
77   t = a->solve_work;
78 
79   PetscCall(ISGetIndices(isrow, &rout));
80   r = rout;
81   PetscCall(ISGetIndices(iscol, &cout));
82   c = cout + (n - 1);
83 
84   /* forward solve the lower triangular */
85   idx  = 7 * (*r++);
86   t[0] = b[idx];
87   t[1] = b[1 + idx];
88   t[2] = b[2 + idx];
89   t[3] = b[3 + idx];
90   t[4] = b[4 + idx];
91   t[5] = b[5 + idx];
92   t[6] = b[6 + idx];
93 
94   for (i = 1; i < n; i++) {
95     v   = aa + 49 * ai[i];
96     vi  = aj + ai[i];
97     nz  = diag[i] - ai[i];
98     idx = 7 * (*r++);
99     s1  = b[idx];
100     s2  = b[1 + idx];
101     s3  = b[2 + idx];
102     s4  = b[3 + idx];
103     s5  = b[4 + idx];
104     s6  = b[5 + idx];
105     s7  = b[6 + idx];
106     while (nz--) {
107       idx = 7 * (*vi++);
108       x1  = t[idx];
109       x2  = t[1 + idx];
110       x3  = t[2 + idx];
111       x4  = t[3 + idx];
112       x5  = t[4 + idx];
113       x6  = t[5 + idx];
114       x7  = t[6 + idx];
115       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
116       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
117       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
118       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
119       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
120       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
121       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
122       v += 49;
123     }
124     idx        = 7 * i;
125     t[idx]     = s1;
126     t[1 + idx] = s2;
127     t[2 + idx] = s3;
128     t[3 + idx] = s4;
129     t[4 + idx] = s5;
130     t[5 + idx] = s6;
131     t[6 + idx] = s7;
132   }
133   /* backward solve the upper triangular */
134   for (i = n - 1; i >= 0; i--) {
135     v   = aa + 49 * diag[i] + 49;
136     vi  = aj + diag[i] + 1;
137     nz  = ai[i + 1] - diag[i] - 1;
138     idt = 7 * i;
139     s1  = t[idt];
140     s2  = t[1 + idt];
141     s3  = t[2 + idt];
142     s4  = t[3 + idt];
143     s5  = t[4 + idt];
144     s6  = t[5 + idt];
145     s7  = t[6 + idt];
146     while (nz--) {
147       idx = 7 * (*vi++);
148       x1  = t[idx];
149       x2  = t[1 + idx];
150       x3  = t[2 + idx];
151       x4  = t[3 + idx];
152       x5  = t[4 + idx];
153       x6  = t[5 + idx];
154       x7  = t[6 + idx];
155       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
156       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
157       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
158       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
159       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
160       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
161       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
162       v += 49;
163     }
164     idc    = 7 * (*c--);
165     v      = aa + 49 * diag[i];
166     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
167     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
168     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
169     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
170     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
171     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
172     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
173   }
174 
175   PetscCall(ISRestoreIndices(isrow, &rout));
176   PetscCall(ISRestoreIndices(iscol, &cout));
177   PetscCall(VecRestoreArrayRead(bb, &b));
178   PetscCall(VecRestoreArray(xx, &x));
179   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
180   PetscFunctionReturn(PETSC_SUCCESS);
181 }
182 
183 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
184 {
185   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
186   IS                 iscol = a->col, isrow = a->row;
187   const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
188   const PetscInt     n = a->mbs, *rout, *cout, *vi;
189   PetscInt           i, nz, idx, idt, idc, m;
190   const MatScalar   *aa = a->a, *v;
191   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
192   const PetscScalar *b;
193 
194   PetscFunctionBegin;
195   PetscCall(VecGetArrayRead(bb, &b));
196   PetscCall(VecGetArray(xx, &x));
197   t = a->solve_work;
198 
199   PetscCall(ISGetIndices(isrow, &rout));
200   r = rout;
201   PetscCall(ISGetIndices(iscol, &cout));
202   c = cout;
203 
204   /* forward solve the lower triangular */
205   idx  = 7 * r[0];
206   t[0] = b[idx];
207   t[1] = b[1 + idx];
208   t[2] = b[2 + idx];
209   t[3] = b[3 + idx];
210   t[4] = b[4 + idx];
211   t[5] = b[5 + idx];
212   t[6] = b[6 + idx];
213 
214   for (i = 1; i < n; i++) {
215     v   = aa + 49 * ai[i];
216     vi  = aj + ai[i];
217     nz  = ai[i + 1] - ai[i];
218     idx = 7 * r[i];
219     s1  = b[idx];
220     s2  = b[1 + idx];
221     s3  = b[2 + idx];
222     s4  = b[3 + idx];
223     s5  = b[4 + idx];
224     s6  = b[5 + idx];
225     s7  = b[6 + idx];
226     for (m = 0; m < nz; m++) {
227       idx = 7 * vi[m];
228       x1  = t[idx];
229       x2  = t[1 + idx];
230       x3  = t[2 + idx];
231       x4  = t[3 + idx];
232       x5  = t[4 + idx];
233       x6  = t[5 + idx];
234       x7  = t[6 + idx];
235       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
236       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
237       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
238       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
239       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
240       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
241       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
242       v += 49;
243     }
244     idx        = 7 * i;
245     t[idx]     = s1;
246     t[1 + idx] = s2;
247     t[2 + idx] = s3;
248     t[3 + idx] = s4;
249     t[4 + idx] = s5;
250     t[5 + idx] = s6;
251     t[6 + idx] = s7;
252   }
253   /* backward solve the upper triangular */
254   for (i = n - 1; i >= 0; i--) {
255     v   = aa + 49 * (adiag[i + 1] + 1);
256     vi  = aj + adiag[i + 1] + 1;
257     nz  = adiag[i] - adiag[i + 1] - 1;
258     idt = 7 * i;
259     s1  = t[idt];
260     s2  = t[1 + idt];
261     s3  = t[2 + idt];
262     s4  = t[3 + idt];
263     s5  = t[4 + idt];
264     s6  = t[5 + idt];
265     s7  = t[6 + idt];
266     for (m = 0; m < nz; m++) {
267       idx = 7 * vi[m];
268       x1  = t[idx];
269       x2  = t[1 + idx];
270       x3  = t[2 + idx];
271       x4  = t[3 + idx];
272       x5  = t[4 + idx];
273       x6  = t[5 + idx];
274       x7  = t[6 + idx];
275       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
276       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
277       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
278       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
279       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
280       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
281       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
282       v += 49;
283     }
284     idc    = 7 * c[i];
285     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
286     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
287     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
288     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
289     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
290     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
291     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
292   }
293 
294   PetscCall(ISRestoreIndices(isrow, &rout));
295   PetscCall(ISRestoreIndices(iscol, &cout));
296   PetscCall(VecRestoreArrayRead(bb, &b));
297   PetscCall(VecRestoreArray(xx, &x));
298   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
299   PetscFunctionReturn(PETSC_SUCCESS);
300 }
301 
302 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
303 {
304   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
305   IS                 iscol = a->col, isrow = a->row;
306   const PetscInt    *r, *c, *rout, *cout;
307   const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
308   PetscInt           i, nz, idx, idt, idc;
309   const MatScalar   *aa = a->a, *v;
310   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
311   const PetscScalar *b;
312 
313   PetscFunctionBegin;
314   PetscCall(VecGetArrayRead(bb, &b));
315   PetscCall(VecGetArray(xx, &x));
316   t = a->solve_work;
317 
318   PetscCall(ISGetIndices(isrow, &rout));
319   r = rout;
320   PetscCall(ISGetIndices(iscol, &cout));
321   c = cout + (n - 1);
322 
323   /* forward solve the lower triangular */
324   idx  = 6 * (*r++);
325   t[0] = b[idx];
326   t[1] = b[1 + idx];
327   t[2] = b[2 + idx];
328   t[3] = b[3 + idx];
329   t[4] = b[4 + idx];
330   t[5] = b[5 + idx];
331   for (i = 1; i < n; i++) {
332     v   = aa + 36 * ai[i];
333     vi  = aj + ai[i];
334     nz  = diag[i] - ai[i];
335     idx = 6 * (*r++);
336     s1  = b[idx];
337     s2  = b[1 + idx];
338     s3  = b[2 + idx];
339     s4  = b[3 + idx];
340     s5  = b[4 + idx];
341     s6  = b[5 + idx];
342     while (nz--) {
343       idx = 6 * (*vi++);
344       x1  = t[idx];
345       x2  = t[1 + idx];
346       x3  = t[2 + idx];
347       x4  = t[3 + idx];
348       x5  = t[4 + idx];
349       x6  = t[5 + idx];
350       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
351       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
352       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
353       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
354       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
355       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
356       v += 36;
357     }
358     idx        = 6 * i;
359     t[idx]     = s1;
360     t[1 + idx] = s2;
361     t[2 + idx] = s3;
362     t[3 + idx] = s4;
363     t[4 + idx] = s5;
364     t[5 + idx] = s6;
365   }
366   /* backward solve the upper triangular */
367   for (i = n - 1; i >= 0; i--) {
368     v   = aa + 36 * diag[i] + 36;
369     vi  = aj + diag[i] + 1;
370     nz  = ai[i + 1] - diag[i] - 1;
371     idt = 6 * i;
372     s1  = t[idt];
373     s2  = t[1 + idt];
374     s3  = t[2 + idt];
375     s4  = t[3 + idt];
376     s5  = t[4 + idt];
377     s6  = t[5 + idt];
378     while (nz--) {
379       idx = 6 * (*vi++);
380       x1  = t[idx];
381       x2  = t[1 + idx];
382       x3  = t[2 + idx];
383       x4  = t[3 + idx];
384       x5  = t[4 + idx];
385       x6  = t[5 + idx];
386       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
387       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
388       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
389       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
390       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
391       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
392       v += 36;
393     }
394     idc    = 6 * (*c--);
395     v      = aa + 36 * diag[i];
396     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
397     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
398     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
399     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
400     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
401     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
402   }
403 
404   PetscCall(ISRestoreIndices(isrow, &rout));
405   PetscCall(ISRestoreIndices(iscol, &cout));
406   PetscCall(VecRestoreArrayRead(bb, &b));
407   PetscCall(VecRestoreArray(xx, &x));
408   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
409   PetscFunctionReturn(PETSC_SUCCESS);
410 }
411 
412 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
413 {
414   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
415   IS                 iscol = a->col, isrow = a->row;
416   const PetscInt    *r, *c, *rout, *cout;
417   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
418   PetscInt           i, nz, idx, idt, idc, m;
419   const MatScalar   *aa = a->a, *v;
420   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
421   const PetscScalar *b;
422 
423   PetscFunctionBegin;
424   PetscCall(VecGetArrayRead(bb, &b));
425   PetscCall(VecGetArray(xx, &x));
426   t = a->solve_work;
427 
428   PetscCall(ISGetIndices(isrow, &rout));
429   r = rout;
430   PetscCall(ISGetIndices(iscol, &cout));
431   c = cout;
432 
433   /* forward solve the lower triangular */
434   idx  = 6 * r[0];
435   t[0] = b[idx];
436   t[1] = b[1 + idx];
437   t[2] = b[2 + idx];
438   t[3] = b[3 + idx];
439   t[4] = b[4 + idx];
440   t[5] = b[5 + idx];
441   for (i = 1; i < n; i++) {
442     v   = aa + 36 * ai[i];
443     vi  = aj + ai[i];
444     nz  = ai[i + 1] - ai[i];
445     idx = 6 * r[i];
446     s1  = b[idx];
447     s2  = b[1 + idx];
448     s3  = b[2 + idx];
449     s4  = b[3 + idx];
450     s5  = b[4 + idx];
451     s6  = b[5 + idx];
452     for (m = 0; m < nz; m++) {
453       idx = 6 * vi[m];
454       x1  = t[idx];
455       x2  = t[1 + idx];
456       x3  = t[2 + idx];
457       x4  = t[3 + idx];
458       x5  = t[4 + idx];
459       x6  = t[5 + idx];
460       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
461       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
462       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
463       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
464       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
465       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
466       v += 36;
467     }
468     idx        = 6 * i;
469     t[idx]     = s1;
470     t[1 + idx] = s2;
471     t[2 + idx] = s3;
472     t[3 + idx] = s4;
473     t[4 + idx] = s5;
474     t[5 + idx] = s6;
475   }
476   /* backward solve the upper triangular */
477   for (i = n - 1; i >= 0; i--) {
478     v   = aa + 36 * (adiag[i + 1] + 1);
479     vi  = aj + adiag[i + 1] + 1;
480     nz  = adiag[i] - adiag[i + 1] - 1;
481     idt = 6 * i;
482     s1  = t[idt];
483     s2  = t[1 + idt];
484     s3  = t[2 + idt];
485     s4  = t[3 + idt];
486     s5  = t[4 + idt];
487     s6  = t[5 + idt];
488     for (m = 0; m < nz; m++) {
489       idx = 6 * vi[m];
490       x1  = t[idx];
491       x2  = t[1 + idx];
492       x3  = t[2 + idx];
493       x4  = t[3 + idx];
494       x5  = t[4 + idx];
495       x6  = t[5 + idx];
496       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
497       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
498       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
499       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
500       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
501       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
502       v += 36;
503     }
504     idc    = 6 * c[i];
505     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
506     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
507     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
508     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
509     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
510     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
511   }
512 
513   PetscCall(ISRestoreIndices(isrow, &rout));
514   PetscCall(ISRestoreIndices(iscol, &cout));
515   PetscCall(VecRestoreArrayRead(bb, &b));
516   PetscCall(VecRestoreArray(xx, &x));
517   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
518   PetscFunctionReturn(PETSC_SUCCESS);
519 }
520 
521 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
522 {
523   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
524   IS                 iscol = a->col, isrow = a->row;
525   const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
526   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
527   PetscInt           i, nz, idx, idt, idc;
528   const MatScalar   *aa = a->a, *v;
529   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
530   const PetscScalar *b;
531 
532   PetscFunctionBegin;
533   PetscCall(VecGetArrayRead(bb, &b));
534   PetscCall(VecGetArray(xx, &x));
535   t = a->solve_work;
536 
537   PetscCall(ISGetIndices(isrow, &rout));
538   r = rout;
539   PetscCall(ISGetIndices(iscol, &cout));
540   c = cout + (n - 1);
541 
542   /* forward solve the lower triangular */
543   idx  = 5 * (*r++);
544   t[0] = b[idx];
545   t[1] = b[1 + idx];
546   t[2] = b[2 + idx];
547   t[3] = b[3 + idx];
548   t[4] = b[4 + idx];
549   for (i = 1; i < n; i++) {
550     v   = aa + 25 * ai[i];
551     vi  = aj + ai[i];
552     nz  = diag[i] - ai[i];
553     idx = 5 * (*r++);
554     s1  = b[idx];
555     s2  = b[1 + idx];
556     s3  = b[2 + idx];
557     s4  = b[3 + idx];
558     s5  = b[4 + idx];
559     while (nz--) {
560       idx = 5 * (*vi++);
561       x1  = t[idx];
562       x2  = t[1 + idx];
563       x3  = t[2 + idx];
564       x4  = t[3 + idx];
565       x5  = t[4 + idx];
566       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
567       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
568       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
569       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
570       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
571       v += 25;
572     }
573     idx        = 5 * i;
574     t[idx]     = s1;
575     t[1 + idx] = s2;
576     t[2 + idx] = s3;
577     t[3 + idx] = s4;
578     t[4 + idx] = s5;
579   }
580   /* backward solve the upper triangular */
581   for (i = n - 1; i >= 0; i--) {
582     v   = aa + 25 * diag[i] + 25;
583     vi  = aj + diag[i] + 1;
584     nz  = ai[i + 1] - diag[i] - 1;
585     idt = 5 * i;
586     s1  = t[idt];
587     s2  = t[1 + idt];
588     s3  = t[2 + idt];
589     s4  = t[3 + idt];
590     s5  = t[4 + idt];
591     while (nz--) {
592       idx = 5 * (*vi++);
593       x1  = t[idx];
594       x2  = t[1 + idx];
595       x3  = t[2 + idx];
596       x4  = t[3 + idx];
597       x5  = t[4 + idx];
598       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
599       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
600       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
601       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
602       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
603       v += 25;
604     }
605     idc    = 5 * (*c--);
606     v      = aa + 25 * diag[i];
607     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
608     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
609     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
610     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
611     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
612   }
613 
614   PetscCall(ISRestoreIndices(isrow, &rout));
615   PetscCall(ISRestoreIndices(iscol, &cout));
616   PetscCall(VecRestoreArrayRead(bb, &b));
617   PetscCall(VecRestoreArray(xx, &x));
618   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
619   PetscFunctionReturn(PETSC_SUCCESS);
620 }
621 
622 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
623 {
624   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
625   IS                 iscol = a->col, isrow = a->row;
626   const PetscInt    *r, *c, *rout, *cout;
627   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
628   PetscInt           i, nz, idx, idt, idc, m;
629   const MatScalar   *aa = a->a, *v;
630   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
631   const PetscScalar *b;
632 
633   PetscFunctionBegin;
634   PetscCall(VecGetArrayRead(bb, &b));
635   PetscCall(VecGetArray(xx, &x));
636   t = a->solve_work;
637 
638   PetscCall(ISGetIndices(isrow, &rout));
639   r = rout;
640   PetscCall(ISGetIndices(iscol, &cout));
641   c = cout;
642 
643   /* forward solve the lower triangular */
644   idx  = 5 * r[0];
645   t[0] = b[idx];
646   t[1] = b[1 + idx];
647   t[2] = b[2 + idx];
648   t[3] = b[3 + idx];
649   t[4] = b[4 + idx];
650   for (i = 1; i < n; i++) {
651     v   = aa + 25 * ai[i];
652     vi  = aj + ai[i];
653     nz  = ai[i + 1] - ai[i];
654     idx = 5 * r[i];
655     s1  = b[idx];
656     s2  = b[1 + idx];
657     s3  = b[2 + idx];
658     s4  = b[3 + idx];
659     s5  = b[4 + idx];
660     for (m = 0; m < nz; m++) {
661       idx = 5 * vi[m];
662       x1  = t[idx];
663       x2  = t[1 + idx];
664       x3  = t[2 + idx];
665       x4  = t[3 + idx];
666       x5  = t[4 + idx];
667       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
668       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
669       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
670       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
671       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
672       v += 25;
673     }
674     idx        = 5 * i;
675     t[idx]     = s1;
676     t[1 + idx] = s2;
677     t[2 + idx] = s3;
678     t[3 + idx] = s4;
679     t[4 + idx] = s5;
680   }
681   /* backward solve the upper triangular */
682   for (i = n - 1; i >= 0; i--) {
683     v   = aa + 25 * (adiag[i + 1] + 1);
684     vi  = aj + adiag[i + 1] + 1;
685     nz  = adiag[i] - adiag[i + 1] - 1;
686     idt = 5 * i;
687     s1  = t[idt];
688     s2  = t[1 + idt];
689     s3  = t[2 + idt];
690     s4  = t[3 + idt];
691     s5  = t[4 + idt];
692     for (m = 0; m < nz; m++) {
693       idx = 5 * vi[m];
694       x1  = t[idx];
695       x2  = t[1 + idx];
696       x3  = t[2 + idx];
697       x4  = t[3 + idx];
698       x5  = t[4 + idx];
699       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
700       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
701       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
702       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
703       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
704       v += 25;
705     }
706     idc    = 5 * c[i];
707     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
708     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
709     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
710     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
711     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
712   }
713 
714   PetscCall(ISRestoreIndices(isrow, &rout));
715   PetscCall(ISRestoreIndices(iscol, &cout));
716   PetscCall(VecRestoreArrayRead(bb, &b));
717   PetscCall(VecRestoreArray(xx, &x));
718   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
719   PetscFunctionReturn(PETSC_SUCCESS);
720 }
721 
722 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
723 {
724   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
725   IS                 iscol = a->col, isrow = a->row;
726   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
727   PetscInt           i, nz, idx, idt, idc;
728   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
729   const MatScalar   *aa = a->a, *v;
730   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
731   const PetscScalar *b;
732 
733   PetscFunctionBegin;
734   PetscCall(VecGetArrayRead(bb, &b));
735   PetscCall(VecGetArray(xx, &x));
736   t = a->solve_work;
737 
738   PetscCall(ISGetIndices(isrow, &rout));
739   r = rout;
740   PetscCall(ISGetIndices(iscol, &cout));
741   c = cout + (n - 1);
742 
743   /* forward solve the lower triangular */
744   idx  = 4 * (*r++);
745   t[0] = b[idx];
746   t[1] = b[1 + idx];
747   t[2] = b[2 + idx];
748   t[3] = b[3 + idx];
749   for (i = 1; i < n; i++) {
750     v   = aa + 16 * ai[i];
751     vi  = aj + ai[i];
752     nz  = diag[i] - ai[i];
753     idx = 4 * (*r++);
754     s1  = b[idx];
755     s2  = b[1 + idx];
756     s3  = b[2 + idx];
757     s4  = b[3 + idx];
758     while (nz--) {
759       idx = 4 * (*vi++);
760       x1  = t[idx];
761       x2  = t[1 + idx];
762       x3  = t[2 + idx];
763       x4  = t[3 + idx];
764       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
765       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
766       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
767       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
768       v += 16;
769     }
770     idx        = 4 * i;
771     t[idx]     = s1;
772     t[1 + idx] = s2;
773     t[2 + idx] = s3;
774     t[3 + idx] = s4;
775   }
776   /* backward solve the upper triangular */
777   for (i = n - 1; i >= 0; i--) {
778     v   = aa + 16 * diag[i] + 16;
779     vi  = aj + diag[i] + 1;
780     nz  = ai[i + 1] - diag[i] - 1;
781     idt = 4 * i;
782     s1  = t[idt];
783     s2  = t[1 + idt];
784     s3  = t[2 + idt];
785     s4  = t[3 + idt];
786     while (nz--) {
787       idx = 4 * (*vi++);
788       x1  = t[idx];
789       x2  = t[1 + idx];
790       x3  = t[2 + idx];
791       x4  = t[3 + idx];
792       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
793       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
794       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
795       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
796       v += 16;
797     }
798     idc    = 4 * (*c--);
799     v      = aa + 16 * diag[i];
800     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
801     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
802     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
803     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
804   }
805 
806   PetscCall(ISRestoreIndices(isrow, &rout));
807   PetscCall(ISRestoreIndices(iscol, &cout));
808   PetscCall(VecRestoreArrayRead(bb, &b));
809   PetscCall(VecRestoreArray(xx, &x));
810   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
811   PetscFunctionReturn(PETSC_SUCCESS);
812 }
813 
814 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
815 {
816   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
817   IS                 iscol = a->col, isrow = a->row;
818   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
819   PetscInt           i, nz, idx, idt, idc, m;
820   const PetscInt    *r, *c, *rout, *cout;
821   const MatScalar   *aa = a->a, *v;
822   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
823   const PetscScalar *b;
824 
825   PetscFunctionBegin;
826   PetscCall(VecGetArrayRead(bb, &b));
827   PetscCall(VecGetArray(xx, &x));
828   t = a->solve_work;
829 
830   PetscCall(ISGetIndices(isrow, &rout));
831   r = rout;
832   PetscCall(ISGetIndices(iscol, &cout));
833   c = cout;
834 
835   /* forward solve the lower triangular */
836   idx  = 4 * r[0];
837   t[0] = b[idx];
838   t[1] = b[1 + idx];
839   t[2] = b[2 + idx];
840   t[3] = b[3 + idx];
841   for (i = 1; i < n; i++) {
842     v   = aa + 16 * ai[i];
843     vi  = aj + ai[i];
844     nz  = ai[i + 1] - ai[i];
845     idx = 4 * r[i];
846     s1  = b[idx];
847     s2  = b[1 + idx];
848     s3  = b[2 + idx];
849     s4  = b[3 + idx];
850     for (m = 0; m < nz; m++) {
851       idx = 4 * vi[m];
852       x1  = t[idx];
853       x2  = t[1 + idx];
854       x3  = t[2 + idx];
855       x4  = t[3 + idx];
856       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
857       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
858       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
859       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
860       v += 16;
861     }
862     idx        = 4 * i;
863     t[idx]     = s1;
864     t[1 + idx] = s2;
865     t[2 + idx] = s3;
866     t[3 + idx] = s4;
867   }
868   /* backward solve the upper triangular */
869   for (i = n - 1; i >= 0; i--) {
870     v   = aa + 16 * (adiag[i + 1] + 1);
871     vi  = aj + adiag[i + 1] + 1;
872     nz  = adiag[i] - adiag[i + 1] - 1;
873     idt = 4 * i;
874     s1  = t[idt];
875     s2  = t[1 + idt];
876     s3  = t[2 + idt];
877     s4  = t[3 + idt];
878     for (m = 0; m < nz; m++) {
879       idx = 4 * vi[m];
880       x1  = t[idx];
881       x2  = t[1 + idx];
882       x3  = t[2 + idx];
883       x4  = t[3 + idx];
884       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
885       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
886       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
887       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
888       v += 16;
889     }
890     idc    = 4 * c[i];
891     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
892     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
893     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
894     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
895   }
896 
897   PetscCall(ISRestoreIndices(isrow, &rout));
898   PetscCall(ISRestoreIndices(iscol, &cout));
899   PetscCall(VecRestoreArrayRead(bb, &b));
900   PetscCall(VecRestoreArray(xx, &x));
901   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
902   PetscFunctionReturn(PETSC_SUCCESS);
903 }
904 
905 #if defined(PETSC_HAVE_SSE)
906 
907   #include PETSC_HAVE_SSE
908 
909 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx)
910 {
911   /*
912      Note: This code uses demotion of double
913      to float when performing the mixed-mode computation.
914      This may not be numerically reasonable for all applications.
915   */
916   Mat_SeqBAIJ    *a     = (Mat_SeqBAIJ *)A->data;
917   IS              iscol = a->col, isrow = a->row;
918   PetscInt        i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
919   const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
920   MatScalar      *aa = a->a, *v;
921   PetscScalar    *x, *b, *t;
922 
923   /* Make space in temp stack for 16 Byte Aligned arrays */
924   float         ssealignedspace[11], *tmps, *tmpx;
925   unsigned long offset;
926 
927   PetscFunctionBegin;
928   SSE_SCOPE_BEGIN;
929 
930   offset = (unsigned long)ssealignedspace % 16;
931   if (offset) offset = (16 - offset) / 4;
932   tmps = &ssealignedspace[offset];
933   tmpx = &ssealignedspace[offset + 4];
934   PREFETCH_NTA(aa + 16 * ai[1]);
935 
936   PetscCall(VecGetArray(bb, &b));
937   PetscCall(VecGetArray(xx, &x));
938   t = a->solve_work;
939 
940   PetscCall(ISGetIndices(isrow, &rout));
941   r = rout;
942   PetscCall(ISGetIndices(iscol, &cout));
943   c = cout + (n - 1);
944 
945   /* forward solve the lower triangular */
946   idx  = 4 * (*r++);
947   t[0] = b[idx];
948   t[1] = b[1 + idx];
949   t[2] = b[2 + idx];
950   t[3] = b[3 + idx];
951   v    = aa + 16 * ai[1];
952 
953   for (i = 1; i < n;) {
954     PREFETCH_NTA(&v[8]);
955     vi  = aj + ai[i];
956     nz  = diag[i] - ai[i];
957     idx = 4 * (*r++);
958 
959     /* Demote sum from double to float */
960     CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
961     LOAD_PS(tmps, XMM7);
962 
963     while (nz--) {
964       PREFETCH_NTA(&v[16]);
965       idx = 4 * (*vi++);
966 
967       /* Demote solution (so far) from double to float */
968       CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);
969 
970       /* 4x4 Matrix-Vector product with negative accumulation: */
971       SSE_INLINE_BEGIN_2(tmpx, v)
972       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
973 
974       /* First Column */
975       SSE_COPY_PS(XMM0, XMM6)
976       SSE_SHUFFLE(XMM0, XMM0, 0x00)
977       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
978       SSE_SUB_PS(XMM7, XMM0)
979 
980       /* Second Column */
981       SSE_COPY_PS(XMM1, XMM6)
982       SSE_SHUFFLE(XMM1, XMM1, 0x55)
983       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
984       SSE_SUB_PS(XMM7, XMM1)
985 
986       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
987 
988       /* Third Column */
989       SSE_COPY_PS(XMM2, XMM6)
990       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
991       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
992       SSE_SUB_PS(XMM7, XMM2)
993 
994       /* Fourth Column */
995       SSE_COPY_PS(XMM3, XMM6)
996       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
997       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
998       SSE_SUB_PS(XMM7, XMM3)
999       SSE_INLINE_END_2
1000 
1001       v += 16;
1002     }
1003     idx = 4 * i;
1004     v   = aa + 16 * ai[++i];
1005     PREFETCH_NTA(v);
1006     STORE_PS(tmps, XMM7);
1007 
1008     /* Promote result from float to double */
1009     CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
1010   }
1011   /* backward solve the upper triangular */
1012   idt  = 4 * (n - 1);
1013   ai16 = 16 * diag[n - 1];
1014   v    = aa + ai16 + 16;
1015   for (i = n - 1; i >= 0;) {
1016     PREFETCH_NTA(&v[8]);
1017     vi = aj + diag[i] + 1;
1018     nz = ai[i + 1] - diag[i] - 1;
1019 
1020     /* Demote accumulator from double to float */
1021     CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
1022     LOAD_PS(tmps, XMM7);
1023 
1024     while (nz--) {
1025       PREFETCH_NTA(&v[16]);
1026       idx = 4 * (*vi++);
1027 
1028       /* Demote solution (so far) from double to float */
1029       CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);
1030 
1031       /* 4x4 Matrix-Vector Product with negative accumulation: */
1032       SSE_INLINE_BEGIN_2(tmpx, v)
1033       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1034 
1035       /* First Column */
1036       SSE_COPY_PS(XMM0, XMM6)
1037       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1038       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1039       SSE_SUB_PS(XMM7, XMM0)
1040 
1041       /* Second Column */
1042       SSE_COPY_PS(XMM1, XMM6)
1043       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1044       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1045       SSE_SUB_PS(XMM7, XMM1)
1046 
1047       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1048 
1049       /* Third Column */
1050       SSE_COPY_PS(XMM2, XMM6)
1051       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1052       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1053       SSE_SUB_PS(XMM7, XMM2)
1054 
1055       /* Fourth Column */
1056       SSE_COPY_PS(XMM3, XMM6)
1057       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1058       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1059       SSE_SUB_PS(XMM7, XMM3)
1060       SSE_INLINE_END_2
1061       v += 16;
1062     }
1063     v    = aa + ai16;
1064     ai16 = 16 * diag[--i];
1065     PREFETCH_NTA(aa + ai16 + 16);
1066     /*
1067        Scale the result by the diagonal 4x4 block,
1068        which was inverted as part of the factorization
1069     */
1070     SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
1071     /* First Column */
1072     SSE_COPY_PS(XMM0, XMM7)
1073     SSE_SHUFFLE(XMM0, XMM0, 0x00)
1074     SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
1075 
1076     /* Second Column */
1077     SSE_COPY_PS(XMM1, XMM7)
1078     SSE_SHUFFLE(XMM1, XMM1, 0x55)
1079     SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
1080     SSE_ADD_PS(XMM0, XMM1)
1081 
1082     SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
1083 
1084     /* Third Column */
1085     SSE_COPY_PS(XMM2, XMM7)
1086     SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1087     SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
1088     SSE_ADD_PS(XMM0, XMM2)
1089 
1090     /* Fourth Column */
1091     SSE_COPY_PS(XMM3, XMM7)
1092     SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1093     SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
1094     SSE_ADD_PS(XMM0, XMM3)
1095 
1096     SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
1097     SSE_INLINE_END_3
1098 
1099     /* Promote solution from float to double */
1100     CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);
1101 
1102     /* Apply reordering to t and stream into x.    */
1103     /* This way, x doesn't pollute the cache.      */
1104     /* Be careful with size: 2 doubles = 4 floats! */
1105     idc = 4 * (*c--);
1106     SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
1107     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1108     SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
1109     SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
1110     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1111     SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
1112     SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
1113     SSE_INLINE_END_2
1114     v = aa + ai16 + 16;
1115     idt -= 4;
1116   }
1117 
1118   PetscCall(ISRestoreIndices(isrow, &rout));
1119   PetscCall(ISRestoreIndices(iscol, &cout));
1120   PetscCall(VecRestoreArray(bb, &b));
1121   PetscCall(VecRestoreArray(xx, &x));
1122   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
1123   SSE_SCOPE_END;
1124   PetscFunctionReturn(PETSC_SUCCESS);
1125 }
1126 
1127 #endif
1128 
1129 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1130 {
1131   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1132   IS                 iscol = a->col, isrow = a->row;
1133   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1134   PetscInt           i, nz, idx, idt, idc;
1135   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1136   const MatScalar   *aa = a->a, *v;
1137   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1138   const PetscScalar *b;
1139 
1140   PetscFunctionBegin;
1141   PetscCall(VecGetArrayRead(bb, &b));
1142   PetscCall(VecGetArray(xx, &x));
1143   t = a->solve_work;
1144 
1145   PetscCall(ISGetIndices(isrow, &rout));
1146   r = rout;
1147   PetscCall(ISGetIndices(iscol, &cout));
1148   c = cout + (n - 1);
1149 
1150   /* forward solve the lower triangular */
1151   idx  = 3 * (*r++);
1152   t[0] = b[idx];
1153   t[1] = b[1 + idx];
1154   t[2] = b[2 + idx];
1155   for (i = 1; i < n; i++) {
1156     v   = aa + 9 * ai[i];
1157     vi  = aj + ai[i];
1158     nz  = diag[i] - ai[i];
1159     idx = 3 * (*r++);
1160     s1  = b[idx];
1161     s2  = b[1 + idx];
1162     s3  = b[2 + idx];
1163     while (nz--) {
1164       idx = 3 * (*vi++);
1165       x1  = t[idx];
1166       x2  = t[1 + idx];
1167       x3  = t[2 + idx];
1168       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1169       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1170       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1171       v += 9;
1172     }
1173     idx        = 3 * i;
1174     t[idx]     = s1;
1175     t[1 + idx] = s2;
1176     t[2 + idx] = s3;
1177   }
1178   /* backward solve the upper triangular */
1179   for (i = n - 1; i >= 0; i--) {
1180     v   = aa + 9 * diag[i] + 9;
1181     vi  = aj + diag[i] + 1;
1182     nz  = ai[i + 1] - diag[i] - 1;
1183     idt = 3 * i;
1184     s1  = t[idt];
1185     s2  = t[1 + idt];
1186     s3  = t[2 + idt];
1187     while (nz--) {
1188       idx = 3 * (*vi++);
1189       x1  = t[idx];
1190       x2  = t[1 + idx];
1191       x3  = t[2 + idx];
1192       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1193       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1194       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1195       v += 9;
1196     }
1197     idc    = 3 * (*c--);
1198     v      = aa + 9 * diag[i];
1199     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1200     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1201     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1202   }
1203   PetscCall(ISRestoreIndices(isrow, &rout));
1204   PetscCall(ISRestoreIndices(iscol, &cout));
1205   PetscCall(VecRestoreArrayRead(bb, &b));
1206   PetscCall(VecRestoreArray(xx, &x));
1207   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1208   PetscFunctionReturn(PETSC_SUCCESS);
1209 }
1210 
1211 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
1212 {
1213   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1214   IS                 iscol = a->col, isrow = a->row;
1215   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1216   PetscInt           i, nz, idx, idt, idc, m;
1217   const PetscInt    *r, *c, *rout, *cout;
1218   const MatScalar   *aa = a->a, *v;
1219   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1220   const PetscScalar *b;
1221 
1222   PetscFunctionBegin;
1223   PetscCall(VecGetArrayRead(bb, &b));
1224   PetscCall(VecGetArray(xx, &x));
1225   t = a->solve_work;
1226 
1227   PetscCall(ISGetIndices(isrow, &rout));
1228   r = rout;
1229   PetscCall(ISGetIndices(iscol, &cout));
1230   c = cout;
1231 
1232   /* forward solve the lower triangular */
1233   idx  = 3 * r[0];
1234   t[0] = b[idx];
1235   t[1] = b[1 + idx];
1236   t[2] = b[2 + idx];
1237   for (i = 1; i < n; i++) {
1238     v   = aa + 9 * ai[i];
1239     vi  = aj + ai[i];
1240     nz  = ai[i + 1] - ai[i];
1241     idx = 3 * r[i];
1242     s1  = b[idx];
1243     s2  = b[1 + idx];
1244     s3  = b[2 + idx];
1245     for (m = 0; m < nz; m++) {
1246       idx = 3 * vi[m];
1247       x1  = t[idx];
1248       x2  = t[1 + idx];
1249       x3  = t[2 + idx];
1250       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1251       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1252       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1253       v += 9;
1254     }
1255     idx        = 3 * i;
1256     t[idx]     = s1;
1257     t[1 + idx] = s2;
1258     t[2 + idx] = s3;
1259   }
1260   /* backward solve the upper triangular */
1261   for (i = n - 1; i >= 0; i--) {
1262     v   = aa + 9 * (adiag[i + 1] + 1);
1263     vi  = aj + adiag[i + 1] + 1;
1264     nz  = adiag[i] - adiag[i + 1] - 1;
1265     idt = 3 * i;
1266     s1  = t[idt];
1267     s2  = t[1 + idt];
1268     s3  = t[2 + idt];
1269     for (m = 0; m < nz; m++) {
1270       idx = 3 * vi[m];
1271       x1  = t[idx];
1272       x2  = t[1 + idx];
1273       x3  = t[2 + idx];
1274       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1275       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1276       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1277       v += 9;
1278     }
1279     idc    = 3 * c[i];
1280     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1281     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1282     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1283   }
1284   PetscCall(ISRestoreIndices(isrow, &rout));
1285   PetscCall(ISRestoreIndices(iscol, &cout));
1286   PetscCall(VecRestoreArrayRead(bb, &b));
1287   PetscCall(VecRestoreArray(xx, &x));
1288   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1289   PetscFunctionReturn(PETSC_SUCCESS);
1290 }
1291 
1292 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1293 {
1294   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1295   IS                 iscol = a->col, isrow = a->row;
1296   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1297   PetscInt           i, nz, idx, idt, idc;
1298   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1299   const MatScalar   *aa = a->a, *v;
1300   PetscScalar       *x, s1, s2, x1, x2, *t;
1301   const PetscScalar *b;
1302 
1303   PetscFunctionBegin;
1304   PetscCall(VecGetArrayRead(bb, &b));
1305   PetscCall(VecGetArray(xx, &x));
1306   t = a->solve_work;
1307 
1308   PetscCall(ISGetIndices(isrow, &rout));
1309   r = rout;
1310   PetscCall(ISGetIndices(iscol, &cout));
1311   c = cout + (n - 1);
1312 
1313   /* forward solve the lower triangular */
1314   idx  = 2 * (*r++);
1315   t[0] = b[idx];
1316   t[1] = b[1 + idx];
1317   for (i = 1; i < n; i++) {
1318     v   = aa + 4 * ai[i];
1319     vi  = aj + ai[i];
1320     nz  = diag[i] - ai[i];
1321     idx = 2 * (*r++);
1322     s1  = b[idx];
1323     s2  = b[1 + idx];
1324     while (nz--) {
1325       idx = 2 * (*vi++);
1326       x1  = t[idx];
1327       x2  = t[1 + idx];
1328       s1 -= v[0] * x1 + v[2] * x2;
1329       s2 -= v[1] * x1 + v[3] * x2;
1330       v += 4;
1331     }
1332     idx        = 2 * i;
1333     t[idx]     = s1;
1334     t[1 + idx] = s2;
1335   }
1336   /* backward solve the upper triangular */
1337   for (i = n - 1; i >= 0; i--) {
1338     v   = aa + 4 * diag[i] + 4;
1339     vi  = aj + diag[i] + 1;
1340     nz  = ai[i + 1] - diag[i] - 1;
1341     idt = 2 * i;
1342     s1  = t[idt];
1343     s2  = t[1 + idt];
1344     while (nz--) {
1345       idx = 2 * (*vi++);
1346       x1  = t[idx];
1347       x2  = t[1 + idx];
1348       s1 -= v[0] * x1 + v[2] * x2;
1349       s2 -= v[1] * x1 + v[3] * x2;
1350       v += 4;
1351     }
1352     idc    = 2 * (*c--);
1353     v      = aa + 4 * diag[i];
1354     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1355     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1356   }
1357   PetscCall(ISRestoreIndices(isrow, &rout));
1358   PetscCall(ISRestoreIndices(iscol, &cout));
1359   PetscCall(VecRestoreArrayRead(bb, &b));
1360   PetscCall(VecRestoreArray(xx, &x));
1361   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1362   PetscFunctionReturn(PETSC_SUCCESS);
1363 }
1364 
1365 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1366 {
1367   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1368   IS                 iscol = a->col, isrow = a->row;
1369   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1370   PetscInt           i, nz, idx, jdx, idt, idc, m;
1371   const PetscInt    *r, *c, *rout, *cout;
1372   const MatScalar   *aa = a->a, *v;
1373   PetscScalar       *x, s1, s2, x1, x2, *t;
1374   const PetscScalar *b;
1375 
1376   PetscFunctionBegin;
1377   PetscCall(VecGetArrayRead(bb, &b));
1378   PetscCall(VecGetArray(xx, &x));
1379   t = a->solve_work;
1380 
1381   PetscCall(ISGetIndices(isrow, &rout));
1382   r = rout;
1383   PetscCall(ISGetIndices(iscol, &cout));
1384   c = cout;
1385 
1386   /* forward solve the lower triangular */
1387   idx  = 2 * r[0];
1388   t[0] = b[idx];
1389   t[1] = b[1 + idx];
1390   for (i = 1; i < n; i++) {
1391     v   = aa + 4 * ai[i];
1392     vi  = aj + ai[i];
1393     nz  = ai[i + 1] - ai[i];
1394     idx = 2 * r[i];
1395     s1  = b[idx];
1396     s2  = b[1 + idx];
1397     for (m = 0; m < nz; m++) {
1398       jdx = 2 * vi[m];
1399       x1  = t[jdx];
1400       x2  = t[1 + jdx];
1401       s1 -= v[0] * x1 + v[2] * x2;
1402       s2 -= v[1] * x1 + v[3] * x2;
1403       v += 4;
1404     }
1405     idx        = 2 * i;
1406     t[idx]     = s1;
1407     t[1 + idx] = s2;
1408   }
1409   /* backward solve the upper triangular */
1410   for (i = n - 1; i >= 0; i--) {
1411     v   = aa + 4 * (adiag[i + 1] + 1);
1412     vi  = aj + adiag[i + 1] + 1;
1413     nz  = adiag[i] - adiag[i + 1] - 1;
1414     idt = 2 * i;
1415     s1  = t[idt];
1416     s2  = t[1 + idt];
1417     for (m = 0; m < nz; m++) {
1418       idx = 2 * vi[m];
1419       x1  = t[idx];
1420       x2  = t[1 + idx];
1421       s1 -= v[0] * x1 + v[2] * x2;
1422       s2 -= v[1] * x1 + v[3] * x2;
1423       v += 4;
1424     }
1425     idc    = 2 * c[i];
1426     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1427     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1428   }
1429   PetscCall(ISRestoreIndices(isrow, &rout));
1430   PetscCall(ISRestoreIndices(iscol, &cout));
1431   PetscCall(VecRestoreArrayRead(bb, &b));
1432   PetscCall(VecRestoreArray(xx, &x));
1433   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1434   PetscFunctionReturn(PETSC_SUCCESS);
1435 }
1436 
1437 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1438 {
1439   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1440   IS                 iscol = a->col, isrow = a->row;
1441   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1442   PetscInt           i, nz;
1443   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1444   const MatScalar   *aa = a->a, *v;
1445   PetscScalar       *x, s1, *t;
1446   const PetscScalar *b;
1447 
1448   PetscFunctionBegin;
1449   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1450 
1451   PetscCall(VecGetArrayRead(bb, &b));
1452   PetscCall(VecGetArray(xx, &x));
1453   t = a->solve_work;
1454 
1455   PetscCall(ISGetIndices(isrow, &rout));
1456   r = rout;
1457   PetscCall(ISGetIndices(iscol, &cout));
1458   c = cout + (n - 1);
1459 
1460   /* forward solve the lower triangular */
1461   t[0] = b[*r++];
1462   for (i = 1; i < n; i++) {
1463     v  = aa + ai[i];
1464     vi = aj + ai[i];
1465     nz = diag[i] - ai[i];
1466     s1 = b[*r++];
1467     while (nz--) s1 -= (*v++) * t[*vi++];
1468     t[i] = s1;
1469   }
1470   /* backward solve the upper triangular */
1471   for (i = n - 1; i >= 0; i--) {
1472     v  = aa + diag[i] + 1;
1473     vi = aj + diag[i] + 1;
1474     nz = ai[i + 1] - diag[i] - 1;
1475     s1 = t[i];
1476     while (nz--) s1 -= (*v++) * t[*vi++];
1477     x[*c--] = t[i] = aa[diag[i]] * s1;
1478   }
1479 
1480   PetscCall(ISRestoreIndices(isrow, &rout));
1481   PetscCall(ISRestoreIndices(iscol, &cout));
1482   PetscCall(VecRestoreArrayRead(bb, &b));
1483   PetscCall(VecRestoreArray(xx, &x));
1484   PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
1485   PetscFunctionReturn(PETSC_SUCCESS);
1486 }
1487 
1488 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1489 {
1490   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1491   IS                 iscol = a->col, isrow = a->row;
1492   PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1493   const PetscInt    *rout, *cout, *r, *c;
1494   PetscScalar       *x, *tmp, sum;
1495   const PetscScalar *b;
1496   const MatScalar   *aa = a->a, *v;
1497 
1498   PetscFunctionBegin;
1499   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1500 
1501   PetscCall(VecGetArrayRead(bb, &b));
1502   PetscCall(VecGetArray(xx, &x));
1503   tmp = a->solve_work;
1504 
1505   PetscCall(ISGetIndices(isrow, &rout));
1506   r = rout;
1507   PetscCall(ISGetIndices(iscol, &cout));
1508   c = cout;
1509 
1510   /* forward solve the lower triangular */
1511   tmp[0] = b[r[0]];
1512   v      = aa;
1513   vi     = aj;
1514   for (i = 1; i < n; i++) {
1515     nz  = ai[i + 1] - ai[i];
1516     sum = b[r[i]];
1517     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1518     tmp[i] = sum;
1519     v += nz;
1520     vi += nz;
1521   }
1522 
1523   /* backward solve the upper triangular */
1524   for (i = n - 1; i >= 0; i--) {
1525     v   = aa + adiag[i + 1] + 1;
1526     vi  = aj + adiag[i + 1] + 1;
1527     nz  = adiag[i] - adiag[i + 1] - 1;
1528     sum = tmp[i];
1529     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1530     x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1531   }
1532 
1533   PetscCall(ISRestoreIndices(isrow, &rout));
1534   PetscCall(ISRestoreIndices(iscol, &cout));
1535   PetscCall(VecRestoreArrayRead(bb, &b));
1536   PetscCall(VecRestoreArray(xx, &x));
1537   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1538   PetscFunctionReturn(PETSC_SUCCESS);
1539 }
1540