xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision bcee047adeeb73090d7e36cc71e39fc287cdbb97)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
5 {
6   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
7   IS                 iscol = a->col, isrow = a->row;
8   const PetscInt    *r, *c, *rout, *cout;
9   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
10   PetscInt           i, nz;
11   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
12   const MatScalar   *aa = a->a, *v;
13   PetscScalar       *x, *s, *t, *ls;
14   const PetscScalar *b;
15 
16   PetscFunctionBegin;
17   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
18   PetscCall(VecGetArrayRead(bb, &b));
19   PetscCall(VecGetArray(xx, &x));
20   t = a->solve_work;
21 
22   PetscCall(ISGetIndices(isrow, &rout));
23   r = rout;
24   PetscCall(ISGetIndices(iscol, &cout));
25   c = cout + (n - 1);
26 
27   /* forward solve the lower triangular */
28   PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
29   for (i = 1; i < n; i++) {
30     v  = aa + bs2 * ai[i];
31     vi = aj + ai[i];
32     nz = a->diag[i] - ai[i];
33     s  = t + bs * i;
34     PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
35     while (nz--) {
36       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
37       v += bs2;
38     }
39   }
40   /* backward solve the upper triangular */
41   ls = a->solve_work + A->cmap->n;
42   for (i = n - 1; i >= 0; i--) {
43     v  = aa + bs2 * (a->diag[i] + 1);
44     vi = aj + a->diag[i] + 1;
45     nz = ai[i + 1] - a->diag[i] - 1;
46     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
47     while (nz--) {
48       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
49       v += bs2;
50     }
51     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
52     PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
53   }
54 
55   PetscCall(ISRestoreIndices(isrow, &rout));
56   PetscCall(ISRestoreIndices(iscol, &cout));
57   PetscCall(VecRestoreArrayRead(bb, &b));
58   PetscCall(VecRestoreArray(xx, &x));
59   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
60   PetscFunctionReturn(PETSC_SUCCESS);
61 }
62 
63 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
64 {
65   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
66   IS                 iscol = a->col, isrow = a->row;
67   const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
68   const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
69   PetscInt           i, nz, idx, idt, idc;
70   const MatScalar   *aa = a->a, *v;
71   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
72   const PetscScalar *b;
73 
74   PetscFunctionBegin;
75   PetscCall(VecGetArrayRead(bb, &b));
76   PetscCall(VecGetArray(xx, &x));
77   t = a->solve_work;
78 
79   PetscCall(ISGetIndices(isrow, &rout));
80   r = rout;
81   PetscCall(ISGetIndices(iscol, &cout));
82   c = cout + (n - 1);
83 
84   /* forward solve the lower triangular */
85   idx  = 7 * (*r++);
86   t[0] = b[idx];
87   t[1] = b[1 + idx];
88   t[2] = b[2 + idx];
89   t[3] = b[3 + idx];
90   t[4] = b[4 + idx];
91   t[5] = b[5 + idx];
92   t[6] = b[6 + idx];
93 
94   for (i = 1; i < n; i++) {
95     v   = aa + 49 * ai[i];
96     vi  = aj + ai[i];
97     nz  = diag[i] - ai[i];
98     idx = 7 * (*r++);
99     s1  = b[idx];
100     s2  = b[1 + idx];
101     s3  = b[2 + idx];
102     s4  = b[3 + idx];
103     s5  = b[4 + idx];
104     s6  = b[5 + idx];
105     s7  = b[6 + idx];
106     while (nz--) {
107       idx = 7 * (*vi++);
108       x1  = t[idx];
109       x2  = t[1 + idx];
110       x3  = t[2 + idx];
111       x4  = t[3 + idx];
112       x5  = t[4 + idx];
113       x6  = t[5 + idx];
114       x7  = t[6 + idx];
115       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
116       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
117       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
118       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
119       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
120       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
121       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
122       v += 49;
123     }
124     idx        = 7 * i;
125     t[idx]     = s1;
126     t[1 + idx] = s2;
127     t[2 + idx] = s3;
128     t[3 + idx] = s4;
129     t[4 + idx] = s5;
130     t[5 + idx] = s6;
131     t[6 + idx] = s7;
132   }
133   /* backward solve the upper triangular */
134   for (i = n - 1; i >= 0; i--) {
135     v   = aa + 49 * diag[i] + 49;
136     vi  = aj + diag[i] + 1;
137     nz  = ai[i + 1] - diag[i] - 1;
138     idt = 7 * i;
139     s1  = t[idt];
140     s2  = t[1 + idt];
141     s3  = t[2 + idt];
142     s4  = t[3 + idt];
143     s5  = t[4 + idt];
144     s6  = t[5 + idt];
145     s7  = t[6 + idt];
146     while (nz--) {
147       idx = 7 * (*vi++);
148       x1  = t[idx];
149       x2  = t[1 + idx];
150       x3  = t[2 + idx];
151       x4  = t[3 + idx];
152       x5  = t[4 + idx];
153       x6  = t[5 + idx];
154       x7  = t[6 + idx];
155       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
156       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
157       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
158       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
159       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
160       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
161       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
162       v += 49;
163     }
164     idc    = 7 * (*c--);
165     v      = aa + 49 * diag[i];
166     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
167     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
168     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
169     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
170     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
171     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
172     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
173   }
174 
175   PetscCall(ISRestoreIndices(isrow, &rout));
176   PetscCall(ISRestoreIndices(iscol, &cout));
177   PetscCall(VecRestoreArrayRead(bb, &b));
178   PetscCall(VecRestoreArray(xx, &x));
179   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
180   PetscFunctionReturn(PETSC_SUCCESS);
181 }
182 
183 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
184 {
185   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
186   IS                 iscol = a->col, isrow = a->row;
187   const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
188   const PetscInt     n = a->mbs, *rout, *cout, *vi;
189   PetscInt           i, nz, idx, idt, idc, m;
190   const MatScalar   *aa = a->a, *v;
191   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
192   const PetscScalar *b;
193 
194   PetscFunctionBegin;
195   PetscCall(VecGetArrayRead(bb, &b));
196   PetscCall(VecGetArray(xx, &x));
197   t = a->solve_work;
198 
199   PetscCall(ISGetIndices(isrow, &rout));
200   r = rout;
201   PetscCall(ISGetIndices(iscol, &cout));
202   c = cout;
203 
204   /* forward solve the lower triangular */
205   idx  = 7 * r[0];
206   t[0] = b[idx];
207   t[1] = b[1 + idx];
208   t[2] = b[2 + idx];
209   t[3] = b[3 + idx];
210   t[4] = b[4 + idx];
211   t[5] = b[5 + idx];
212   t[6] = b[6 + idx];
213 
214   for (i = 1; i < n; i++) {
215     v   = aa + 49 * ai[i];
216     vi  = aj + ai[i];
217     nz  = ai[i + 1] - ai[i];
218     idx = 7 * r[i];
219     s1  = b[idx];
220     s2  = b[1 + idx];
221     s3  = b[2 + idx];
222     s4  = b[3 + idx];
223     s5  = b[4 + idx];
224     s6  = b[5 + idx];
225     s7  = b[6 + idx];
226     for (m = 0; m < nz; m++) {
227       idx = 7 * vi[m];
228       x1  = t[idx];
229       x2  = t[1 + idx];
230       x3  = t[2 + idx];
231       x4  = t[3 + idx];
232       x5  = t[4 + idx];
233       x6  = t[5 + idx];
234       x7  = t[6 + idx];
235       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
236       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
237       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
238       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
239       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
240       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
241       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
242       v += 49;
243     }
244     idx        = 7 * i;
245     t[idx]     = s1;
246     t[1 + idx] = s2;
247     t[2 + idx] = s3;
248     t[3 + idx] = s4;
249     t[4 + idx] = s5;
250     t[5 + idx] = s6;
251     t[6 + idx] = s7;
252   }
253   /* backward solve the upper triangular */
254   for (i = n - 1; i >= 0; i--) {
255     v   = aa + 49 * (adiag[i + 1] + 1);
256     vi  = aj + adiag[i + 1] + 1;
257     nz  = adiag[i] - adiag[i + 1] - 1;
258     idt = 7 * i;
259     s1  = t[idt];
260     s2  = t[1 + idt];
261     s3  = t[2 + idt];
262     s4  = t[3 + idt];
263     s5  = t[4 + idt];
264     s6  = t[5 + idt];
265     s7  = t[6 + idt];
266     for (m = 0; m < nz; m++) {
267       idx = 7 * vi[m];
268       x1  = t[idx];
269       x2  = t[1 + idx];
270       x3  = t[2 + idx];
271       x4  = t[3 + idx];
272       x5  = t[4 + idx];
273       x6  = t[5 + idx];
274       x7  = t[6 + idx];
275       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
276       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
277       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
278       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
279       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
280       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
281       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
282       v += 49;
283     }
284     idc    = 7 * c[i];
285     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
286     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
287     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
288     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
289     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
290     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
291     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
292   }
293 
294   PetscCall(ISRestoreIndices(isrow, &rout));
295   PetscCall(ISRestoreIndices(iscol, &cout));
296   PetscCall(VecRestoreArrayRead(bb, &b));
297   PetscCall(VecRestoreArray(xx, &x));
298   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
299   PetscFunctionReturn(PETSC_SUCCESS);
300 }
301 
302 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
303 {
304   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
305   IS                 iscol = a->col, isrow = a->row;
306   const PetscInt    *r, *c, *rout, *cout;
307   const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
308   PetscInt           i, nz, idx, idt, idc;
309   const MatScalar   *aa = a->a, *v;
310   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
311   const PetscScalar *b;
312 
313   PetscFunctionBegin;
314   PetscCall(VecGetArrayRead(bb, &b));
315   PetscCall(VecGetArray(xx, &x));
316   t = a->solve_work;
317 
318   PetscCall(ISGetIndices(isrow, &rout));
319   r = rout;
320   PetscCall(ISGetIndices(iscol, &cout));
321   c = cout + (n - 1);
322 
323   /* forward solve the lower triangular */
324   idx  = 6 * (*r++);
325   t[0] = b[idx];
326   t[1] = b[1 + idx];
327   t[2] = b[2 + idx];
328   t[3] = b[3 + idx];
329   t[4] = b[4 + idx];
330   t[5] = b[5 + idx];
331   for (i = 1; i < n; i++) {
332     v   = aa + 36 * ai[i];
333     vi  = aj + ai[i];
334     nz  = diag[i] - ai[i];
335     idx = 6 * (*r++);
336     s1  = b[idx];
337     s2  = b[1 + idx];
338     s3  = b[2 + idx];
339     s4  = b[3 + idx];
340     s5  = b[4 + idx];
341     s6  = b[5 + idx];
342     while (nz--) {
343       idx = 6 * (*vi++);
344       x1  = t[idx];
345       x2  = t[1 + idx];
346       x3  = t[2 + idx];
347       x4  = t[3 + idx];
348       x5  = t[4 + idx];
349       x6  = t[5 + idx];
350       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
351       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
352       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
353       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
354       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
355       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
356       v += 36;
357     }
358     idx        = 6 * i;
359     t[idx]     = s1;
360     t[1 + idx] = s2;
361     t[2 + idx] = s3;
362     t[3 + idx] = s4;
363     t[4 + idx] = s5;
364     t[5 + idx] = s6;
365   }
366   /* backward solve the upper triangular */
367   for (i = n - 1; i >= 0; i--) {
368     v   = aa + 36 * diag[i] + 36;
369     vi  = aj + diag[i] + 1;
370     nz  = ai[i + 1] - diag[i] - 1;
371     idt = 6 * i;
372     s1  = t[idt];
373     s2  = t[1 + idt];
374     s3  = t[2 + idt];
375     s4  = t[3 + idt];
376     s5  = t[4 + idt];
377     s6  = t[5 + idt];
378     while (nz--) {
379       idx = 6 * (*vi++);
380       x1  = t[idx];
381       x2  = t[1 + idx];
382       x3  = t[2 + idx];
383       x4  = t[3 + idx];
384       x5  = t[4 + idx];
385       x6  = t[5 + idx];
386       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
387       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
388       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
389       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
390       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
391       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
392       v += 36;
393     }
394     idc    = 6 * (*c--);
395     v      = aa + 36 * diag[i];
396     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
397     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
398     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
399     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
400     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
401     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
402   }
403 
404   PetscCall(ISRestoreIndices(isrow, &rout));
405   PetscCall(ISRestoreIndices(iscol, &cout));
406   PetscCall(VecRestoreArrayRead(bb, &b));
407   PetscCall(VecRestoreArray(xx, &x));
408   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
409   PetscFunctionReturn(PETSC_SUCCESS);
410 }
411 
412 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
413 {
414   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
415   IS                 iscol = a->col, isrow = a->row;
416   const PetscInt    *r, *c, *rout, *cout;
417   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
418   PetscInt           i, nz, idx, idt, idc, m;
419   const MatScalar   *aa = a->a, *v;
420   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
421   const PetscScalar *b;
422 
423   PetscFunctionBegin;
424   PetscCall(VecGetArrayRead(bb, &b));
425   PetscCall(VecGetArray(xx, &x));
426   t = a->solve_work;
427 
428   PetscCall(ISGetIndices(isrow, &rout));
429   r = rout;
430   PetscCall(ISGetIndices(iscol, &cout));
431   c = cout;
432 
433   /* forward solve the lower triangular */
434   idx  = 6 * r[0];
435   t[0] = b[idx];
436   t[1] = b[1 + idx];
437   t[2] = b[2 + idx];
438   t[3] = b[3 + idx];
439   t[4] = b[4 + idx];
440   t[5] = b[5 + idx];
441   for (i = 1; i < n; i++) {
442     v   = aa + 36 * ai[i];
443     vi  = aj + ai[i];
444     nz  = ai[i + 1] - ai[i];
445     idx = 6 * r[i];
446     s1  = b[idx];
447     s2  = b[1 + idx];
448     s3  = b[2 + idx];
449     s4  = b[3 + idx];
450     s5  = b[4 + idx];
451     s6  = b[5 + idx];
452     for (m = 0; m < nz; m++) {
453       idx = 6 * vi[m];
454       x1  = t[idx];
455       x2  = t[1 + idx];
456       x3  = t[2 + idx];
457       x4  = t[3 + idx];
458       x5  = t[4 + idx];
459       x6  = t[5 + idx];
460       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
461       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
462       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
463       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
464       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
465       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
466       v += 36;
467     }
468     idx        = 6 * i;
469     t[idx]     = s1;
470     t[1 + idx] = s2;
471     t[2 + idx] = s3;
472     t[3 + idx] = s4;
473     t[4 + idx] = s5;
474     t[5 + idx] = s6;
475   }
476   /* backward solve the upper triangular */
477   for (i = n - 1; i >= 0; i--) {
478     v   = aa + 36 * (adiag[i + 1] + 1);
479     vi  = aj + adiag[i + 1] + 1;
480     nz  = adiag[i] - adiag[i + 1] - 1;
481     idt = 6 * i;
482     s1  = t[idt];
483     s2  = t[1 + idt];
484     s3  = t[2 + idt];
485     s4  = t[3 + idt];
486     s5  = t[4 + idt];
487     s6  = t[5 + idt];
488     for (m = 0; m < nz; m++) {
489       idx = 6 * vi[m];
490       x1  = t[idx];
491       x2  = t[1 + idx];
492       x3  = t[2 + idx];
493       x4  = t[3 + idx];
494       x5  = t[4 + idx];
495       x6  = t[5 + idx];
496       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
497       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
498       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
499       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
500       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
501       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
502       v += 36;
503     }
504     idc    = 6 * c[i];
505     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
506     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
507     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
508     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
509     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
510     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
511   }
512 
513   PetscCall(ISRestoreIndices(isrow, &rout));
514   PetscCall(ISRestoreIndices(iscol, &cout));
515   PetscCall(VecRestoreArrayRead(bb, &b));
516   PetscCall(VecRestoreArray(xx, &x));
517   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
518   PetscFunctionReturn(PETSC_SUCCESS);
519 }
520 
521 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
522 {
523   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
524   IS                 iscol = a->col, isrow = a->row;
525   const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
526   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
527   PetscInt           i, nz, idx, idt, idc;
528   const MatScalar   *aa = a->a, *v;
529   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
530   const PetscScalar *b;
531 
532   PetscFunctionBegin;
533   PetscCall(VecGetArrayRead(bb, &b));
534   PetscCall(VecGetArray(xx, &x));
535   t = a->solve_work;
536 
537   PetscCall(ISGetIndices(isrow, &rout));
538   r = rout;
539   PetscCall(ISGetIndices(iscol, &cout));
540   c = cout + (n - 1);
541 
542   /* forward solve the lower triangular */
543   idx  = 5 * (*r++);
544   t[0] = b[idx];
545   t[1] = b[1 + idx];
546   t[2] = b[2 + idx];
547   t[3] = b[3 + idx];
548   t[4] = b[4 + idx];
549   for (i = 1; i < n; i++) {
550     v   = aa + 25 * ai[i];
551     vi  = aj + ai[i];
552     nz  = diag[i] - ai[i];
553     idx = 5 * (*r++);
554     s1  = b[idx];
555     s2  = b[1 + idx];
556     s3  = b[2 + idx];
557     s4  = b[3 + idx];
558     s5  = b[4 + idx];
559     while (nz--) {
560       idx = 5 * (*vi++);
561       x1  = t[idx];
562       x2  = t[1 + idx];
563       x3  = t[2 + idx];
564       x4  = t[3 + idx];
565       x5  = t[4 + idx];
566       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
567       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
568       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
569       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
570       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
571       v += 25;
572     }
573     idx        = 5 * i;
574     t[idx]     = s1;
575     t[1 + idx] = s2;
576     t[2 + idx] = s3;
577     t[3 + idx] = s4;
578     t[4 + idx] = s5;
579   }
580   /* backward solve the upper triangular */
581   for (i = n - 1; i >= 0; i--) {
582     v   = aa + 25 * diag[i] + 25;
583     vi  = aj + diag[i] + 1;
584     nz  = ai[i + 1] - diag[i] - 1;
585     idt = 5 * i;
586     s1  = t[idt];
587     s2  = t[1 + idt];
588     s3  = t[2 + idt];
589     s4  = t[3 + idt];
590     s5  = t[4 + idt];
591     while (nz--) {
592       idx = 5 * (*vi++);
593       x1  = t[idx];
594       x2  = t[1 + idx];
595       x3  = t[2 + idx];
596       x4  = t[3 + idx];
597       x5  = t[4 + idx];
598       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
599       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
600       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
601       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
602       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
603       v += 25;
604     }
605     idc    = 5 * (*c--);
606     v      = aa + 25 * diag[i];
607     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
608     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
609     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
610     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
611     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
612   }
613 
614   PetscCall(ISRestoreIndices(isrow, &rout));
615   PetscCall(ISRestoreIndices(iscol, &cout));
616   PetscCall(VecRestoreArrayRead(bb, &b));
617   PetscCall(VecRestoreArray(xx, &x));
618   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
619   PetscFunctionReturn(PETSC_SUCCESS);
620 }
621 
622 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
623 {
624   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
625   IS                 iscol = a->col, isrow = a->row;
626   const PetscInt    *r, *c, *rout, *cout;
627   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
628   PetscInt           i, nz, idx, idt, idc, m;
629   const MatScalar   *aa = a->a, *v;
630   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
631   const PetscScalar *b;
632 
633   PetscFunctionBegin;
634   PetscCall(VecGetArrayRead(bb, &b));
635   PetscCall(VecGetArray(xx, &x));
636   t = a->solve_work;
637 
638   PetscCall(ISGetIndices(isrow, &rout));
639   r = rout;
640   PetscCall(ISGetIndices(iscol, &cout));
641   c = cout;
642 
643   /* forward solve the lower triangular */
644   idx  = 5 * r[0];
645   t[0] = b[idx];
646   t[1] = b[1 + idx];
647   t[2] = b[2 + idx];
648   t[3] = b[3 + idx];
649   t[4] = b[4 + idx];
650   for (i = 1; i < n; i++) {
651     v   = aa + 25 * ai[i];
652     vi  = aj + ai[i];
653     nz  = ai[i + 1] - ai[i];
654     idx = 5 * r[i];
655     s1  = b[idx];
656     s2  = b[1 + idx];
657     s3  = b[2 + idx];
658     s4  = b[3 + idx];
659     s5  = b[4 + idx];
660     for (m = 0; m < nz; m++) {
661       idx = 5 * vi[m];
662       x1  = t[idx];
663       x2  = t[1 + idx];
664       x3  = t[2 + idx];
665       x4  = t[3 + idx];
666       x5  = t[4 + idx];
667       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
668       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
669       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
670       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
671       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
672       v += 25;
673     }
674     idx        = 5 * i;
675     t[idx]     = s1;
676     t[1 + idx] = s2;
677     t[2 + idx] = s3;
678     t[3 + idx] = s4;
679     t[4 + idx] = s5;
680   }
681   /* backward solve the upper triangular */
682   for (i = n - 1; i >= 0; i--) {
683     v   = aa + 25 * (adiag[i + 1] + 1);
684     vi  = aj + adiag[i + 1] + 1;
685     nz  = adiag[i] - adiag[i + 1] - 1;
686     idt = 5 * i;
687     s1  = t[idt];
688     s2  = t[1 + idt];
689     s3  = t[2 + idt];
690     s4  = t[3 + idt];
691     s5  = t[4 + idt];
692     for (m = 0; m < nz; m++) {
693       idx = 5 * vi[m];
694       x1  = t[idx];
695       x2  = t[1 + idx];
696       x3  = t[2 + idx];
697       x4  = t[3 + idx];
698       x5  = t[4 + idx];
699       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
700       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
701       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
702       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
703       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
704       v += 25;
705     }
706     idc    = 5 * c[i];
707     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
708     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
709     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
710     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
711     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
712   }
713 
714   PetscCall(ISRestoreIndices(isrow, &rout));
715   PetscCall(ISRestoreIndices(iscol, &cout));
716   PetscCall(VecRestoreArrayRead(bb, &b));
717   PetscCall(VecRestoreArray(xx, &x));
718   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
719   PetscFunctionReturn(PETSC_SUCCESS);
720 }
721 
722 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
723 {
724   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
725   IS                 iscol = a->col, isrow = a->row;
726   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
727   PetscInt           i, nz, idx, idt, idc;
728   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
729   const MatScalar   *aa = a->a, *v;
730   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
731   const PetscScalar *b;
732 
733   PetscFunctionBegin;
734   PetscCall(VecGetArrayRead(bb, &b));
735   PetscCall(VecGetArray(xx, &x));
736   t = a->solve_work;
737 
738   PetscCall(ISGetIndices(isrow, &rout));
739   r = rout;
740   PetscCall(ISGetIndices(iscol, &cout));
741   c = cout + (n - 1);
742 
743   /* forward solve the lower triangular */
744   idx  = 4 * (*r++);
745   t[0] = b[idx];
746   t[1] = b[1 + idx];
747   t[2] = b[2 + idx];
748   t[3] = b[3 + idx];
749   for (i = 1; i < n; i++) {
750     v   = aa + 16 * ai[i];
751     vi  = aj + ai[i];
752     nz  = diag[i] - ai[i];
753     idx = 4 * (*r++);
754     s1  = b[idx];
755     s2  = b[1 + idx];
756     s3  = b[2 + idx];
757     s4  = b[3 + idx];
758     while (nz--) {
759       idx = 4 * (*vi++);
760       x1  = t[idx];
761       x2  = t[1 + idx];
762       x3  = t[2 + idx];
763       x4  = t[3 + idx];
764       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
765       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
766       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
767       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
768       v += 16;
769     }
770     idx        = 4 * i;
771     t[idx]     = s1;
772     t[1 + idx] = s2;
773     t[2 + idx] = s3;
774     t[3 + idx] = s4;
775   }
776   /* backward solve the upper triangular */
777   for (i = n - 1; i >= 0; i--) {
778     v   = aa + 16 * diag[i] + 16;
779     vi  = aj + diag[i] + 1;
780     nz  = ai[i + 1] - diag[i] - 1;
781     idt = 4 * i;
782     s1  = t[idt];
783     s2  = t[1 + idt];
784     s3  = t[2 + idt];
785     s4  = t[3 + idt];
786     while (nz--) {
787       idx = 4 * (*vi++);
788       x1  = t[idx];
789       x2  = t[1 + idx];
790       x3  = t[2 + idx];
791       x4  = t[3 + idx];
792       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
793       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
794       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
795       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
796       v += 16;
797     }
798     idc    = 4 * (*c--);
799     v      = aa + 16 * diag[i];
800     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
801     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
802     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
803     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
804   }
805 
806   PetscCall(ISRestoreIndices(isrow, &rout));
807   PetscCall(ISRestoreIndices(iscol, &cout));
808   PetscCall(VecRestoreArrayRead(bb, &b));
809   PetscCall(VecRestoreArray(xx, &x));
810   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
811   PetscFunctionReturn(PETSC_SUCCESS);
812 }
813 
814 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
815 {
816   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
817   IS                 iscol = a->col, isrow = a->row;
818   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
819   PetscInt           i, nz, idx, idt, idc, m;
820   const PetscInt    *r, *c, *rout, *cout;
821   const MatScalar   *aa = a->a, *v;
822   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
823   const PetscScalar *b;
824 
825   PetscFunctionBegin;
826   PetscCall(VecGetArrayRead(bb, &b));
827   PetscCall(VecGetArray(xx, &x));
828   t = a->solve_work;
829 
830   PetscCall(ISGetIndices(isrow, &rout));
831   r = rout;
832   PetscCall(ISGetIndices(iscol, &cout));
833   c = cout;
834 
835   /* forward solve the lower triangular */
836   idx  = 4 * r[0];
837   t[0] = b[idx];
838   t[1] = b[1 + idx];
839   t[2] = b[2 + idx];
840   t[3] = b[3 + idx];
841   for (i = 1; i < n; i++) {
842     v   = aa + 16 * ai[i];
843     vi  = aj + ai[i];
844     nz  = ai[i + 1] - ai[i];
845     idx = 4 * r[i];
846     s1  = b[idx];
847     s2  = b[1 + idx];
848     s3  = b[2 + idx];
849     s4  = b[3 + idx];
850     for (m = 0; m < nz; m++) {
851       idx = 4 * vi[m];
852       x1  = t[idx];
853       x2  = t[1 + idx];
854       x3  = t[2 + idx];
855       x4  = t[3 + idx];
856       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
857       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
858       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
859       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
860       v += 16;
861     }
862     idx        = 4 * i;
863     t[idx]     = s1;
864     t[1 + idx] = s2;
865     t[2 + idx] = s3;
866     t[3 + idx] = s4;
867   }
868   /* backward solve the upper triangular */
869   for (i = n - 1; i >= 0; i--) {
870     v   = aa + 16 * (adiag[i + 1] + 1);
871     vi  = aj + adiag[i + 1] + 1;
872     nz  = adiag[i] - adiag[i + 1] - 1;
873     idt = 4 * i;
874     s1  = t[idt];
875     s2  = t[1 + idt];
876     s3  = t[2 + idt];
877     s4  = t[3 + idt];
878     for (m = 0; m < nz; m++) {
879       idx = 4 * vi[m];
880       x1  = t[idx];
881       x2  = t[1 + idx];
882       x3  = t[2 + idx];
883       x4  = t[3 + idx];
884       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
885       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
886       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
887       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
888       v += 16;
889     }
890     idc    = 4 * c[i];
891     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
892     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
893     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
894     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
895   }
896 
897   PetscCall(ISRestoreIndices(isrow, &rout));
898   PetscCall(ISRestoreIndices(iscol, &cout));
899   PetscCall(VecRestoreArrayRead(bb, &b));
900   PetscCall(VecRestoreArray(xx, &x));
901   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
902   PetscFunctionReturn(PETSC_SUCCESS);
903 }
904 
905 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx)
906 {
907   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
908   IS                 iscol = a->col, isrow = a->row;
909   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
910   PetscInt           i, nz, idx, idt, idc;
911   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
912   const MatScalar   *aa = a->a, *v;
913   MatScalar          s1, s2, s3, s4, x1, x2, x3, x4, *t;
914   PetscScalar       *x;
915   const PetscScalar *b;
916 
917   PetscFunctionBegin;
918   PetscCall(VecGetArrayRead(bb, &b));
919   PetscCall(VecGetArray(xx, &x));
920   t = (MatScalar *)a->solve_work;
921 
922   PetscCall(ISGetIndices(isrow, &rout));
923   r = rout;
924   PetscCall(ISGetIndices(iscol, &cout));
925   c = cout + (n - 1);
926 
927   /* forward solve the lower triangular */
928   idx  = 4 * (*r++);
929   t[0] = (MatScalar)b[idx];
930   t[1] = (MatScalar)b[1 + idx];
931   t[2] = (MatScalar)b[2 + idx];
932   t[3] = (MatScalar)b[3 + idx];
933   for (i = 1; i < n; i++) {
934     v   = aa + 16 * ai[i];
935     vi  = aj + ai[i];
936     nz  = diag[i] - ai[i];
937     idx = 4 * (*r++);
938     s1  = (MatScalar)b[idx];
939     s2  = (MatScalar)b[1 + idx];
940     s3  = (MatScalar)b[2 + idx];
941     s4  = (MatScalar)b[3 + idx];
942     while (nz--) {
943       idx = 4 * (*vi++);
944       x1  = t[idx];
945       x2  = t[1 + idx];
946       x3  = t[2 + idx];
947       x4  = t[3 + idx];
948       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
949       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
950       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
951       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
952       v += 16;
953     }
954     idx        = 4 * i;
955     t[idx]     = s1;
956     t[1 + idx] = s2;
957     t[2 + idx] = s3;
958     t[3 + idx] = s4;
959   }
960   /* backward solve the upper triangular */
961   for (i = n - 1; i >= 0; i--) {
962     v   = aa + 16 * diag[i] + 16;
963     vi  = aj + diag[i] + 1;
964     nz  = ai[i + 1] - diag[i] - 1;
965     idt = 4 * i;
966     s1  = t[idt];
967     s2  = t[1 + idt];
968     s3  = t[2 + idt];
969     s4  = t[3 + idt];
970     while (nz--) {
971       idx = 4 * (*vi++);
972       x1  = t[idx];
973       x2  = t[1 + idx];
974       x3  = t[2 + idx];
975       x4  = t[3 + idx];
976       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
977       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
978       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
979       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
980       v += 16;
981     }
982     idc        = 4 * (*c--);
983     v          = aa + 16 * diag[i];
984     t[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
985     t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
986     t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
987     t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
988     x[idc]     = (PetscScalar)t[idt];
989     x[1 + idc] = (PetscScalar)t[1 + idt];
990     x[2 + idc] = (PetscScalar)t[2 + idt];
991     x[3 + idc] = (PetscScalar)t[3 + idt];
992   }
993 
994   PetscCall(ISRestoreIndices(isrow, &rout));
995   PetscCall(ISRestoreIndices(iscol, &cout));
996   PetscCall(VecRestoreArrayRead(bb, &b));
997   PetscCall(VecRestoreArray(xx, &x));
998   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
999   PetscFunctionReturn(PETSC_SUCCESS);
1000 }
1001 
1002 #if defined(PETSC_HAVE_SSE)
1003 
1004   #include PETSC_HAVE_SSE
1005 
1006 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx)
1007 {
1008   /*
1009      Note: This code uses demotion of double
1010      to float when performing the mixed-mode computation.
1011      This may not be numerically reasonable for all applications.
1012   */
1013   Mat_SeqBAIJ    *a     = (Mat_SeqBAIJ *)A->data;
1014   IS              iscol = a->col, isrow = a->row;
1015   PetscInt        i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
1016   const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1017   MatScalar      *aa = a->a, *v;
1018   PetscScalar    *x, *b, *t;
1019 
1020   /* Make space in temp stack for 16 Byte Aligned arrays */
1021   float         ssealignedspace[11], *tmps, *tmpx;
1022   unsigned long offset;
1023 
1024   PetscFunctionBegin;
1025   SSE_SCOPE_BEGIN;
1026 
1027   offset = (unsigned long)ssealignedspace % 16;
1028   if (offset) offset = (16 - offset) / 4;
1029   tmps = &ssealignedspace[offset];
1030   tmpx = &ssealignedspace[offset + 4];
1031   PREFETCH_NTA(aa + 16 * ai[1]);
1032 
1033   PetscCall(VecGetArray(bb, &b));
1034   PetscCall(VecGetArray(xx, &x));
1035   t = a->solve_work;
1036 
1037   PetscCall(ISGetIndices(isrow, &rout));
1038   r = rout;
1039   PetscCall(ISGetIndices(iscol, &cout));
1040   c = cout + (n - 1);
1041 
1042   /* forward solve the lower triangular */
1043   idx  = 4 * (*r++);
1044   t[0] = b[idx];
1045   t[1] = b[1 + idx];
1046   t[2] = b[2 + idx];
1047   t[3] = b[3 + idx];
1048   v    = aa + 16 * ai[1];
1049 
1050   for (i = 1; i < n;) {
1051     PREFETCH_NTA(&v[8]);
1052     vi  = aj + ai[i];
1053     nz  = diag[i] - ai[i];
1054     idx = 4 * (*r++);
1055 
1056     /* Demote sum from double to float */
1057     CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
1058     LOAD_PS(tmps, XMM7);
1059 
1060     while (nz--) {
1061       PREFETCH_NTA(&v[16]);
1062       idx = 4 * (*vi++);
1063 
1064       /* Demote solution (so far) from double to float */
1065       CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);
1066 
1067       /* 4x4 Matrix-Vector product with negative accumulation: */
1068       SSE_INLINE_BEGIN_2(tmpx, v)
1069       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1070 
1071       /* First Column */
1072       SSE_COPY_PS(XMM0, XMM6)
1073       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1074       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1075       SSE_SUB_PS(XMM7, XMM0)
1076 
1077       /* Second Column */
1078       SSE_COPY_PS(XMM1, XMM6)
1079       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1080       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1081       SSE_SUB_PS(XMM7, XMM1)
1082 
1083       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1084 
1085       /* Third Column */
1086       SSE_COPY_PS(XMM2, XMM6)
1087       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1088       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1089       SSE_SUB_PS(XMM7, XMM2)
1090 
1091       /* Fourth Column */
1092       SSE_COPY_PS(XMM3, XMM6)
1093       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1094       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1095       SSE_SUB_PS(XMM7, XMM3)
1096       SSE_INLINE_END_2
1097 
1098       v += 16;
1099     }
1100     idx = 4 * i;
1101     v   = aa + 16 * ai[++i];
1102     PREFETCH_NTA(v);
1103     STORE_PS(tmps, XMM7);
1104 
1105     /* Promote result from float to double */
1106     CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
1107   }
1108   /* backward solve the upper triangular */
1109   idt  = 4 * (n - 1);
1110   ai16 = 16 * diag[n - 1];
1111   v    = aa + ai16 + 16;
1112   for (i = n - 1; i >= 0;) {
1113     PREFETCH_NTA(&v[8]);
1114     vi = aj + diag[i] + 1;
1115     nz = ai[i + 1] - diag[i] - 1;
1116 
1117     /* Demote accumulator from double to float */
1118     CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
1119     LOAD_PS(tmps, XMM7);
1120 
1121     while (nz--) {
1122       PREFETCH_NTA(&v[16]);
1123       idx = 4 * (*vi++);
1124 
1125       /* Demote solution (so far) from double to float */
1126       CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);
1127 
1128       /* 4x4 Matrix-Vector Product with negative accumulation: */
1129       SSE_INLINE_BEGIN_2(tmpx, v)
1130       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1131 
1132       /* First Column */
1133       SSE_COPY_PS(XMM0, XMM6)
1134       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1135       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1136       SSE_SUB_PS(XMM7, XMM0)
1137 
1138       /* Second Column */
1139       SSE_COPY_PS(XMM1, XMM6)
1140       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1141       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1142       SSE_SUB_PS(XMM7, XMM1)
1143 
1144       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1145 
1146       /* Third Column */
1147       SSE_COPY_PS(XMM2, XMM6)
1148       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1149       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1150       SSE_SUB_PS(XMM7, XMM2)
1151 
1152       /* Fourth Column */
1153       SSE_COPY_PS(XMM3, XMM6)
1154       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1155       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1156       SSE_SUB_PS(XMM7, XMM3)
1157       SSE_INLINE_END_2
1158       v += 16;
1159     }
1160     v    = aa + ai16;
1161     ai16 = 16 * diag[--i];
1162     PREFETCH_NTA(aa + ai16 + 16);
1163     /*
1164        Scale the result by the diagonal 4x4 block,
1165        which was inverted as part of the factorization
1166     */
1167     SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
1168     /* First Column */
1169     SSE_COPY_PS(XMM0, XMM7)
1170     SSE_SHUFFLE(XMM0, XMM0, 0x00)
1171     SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
1172 
1173     /* Second Column */
1174     SSE_COPY_PS(XMM1, XMM7)
1175     SSE_SHUFFLE(XMM1, XMM1, 0x55)
1176     SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
1177     SSE_ADD_PS(XMM0, XMM1)
1178 
1179     SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
1180 
1181     /* Third Column */
1182     SSE_COPY_PS(XMM2, XMM7)
1183     SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1184     SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
1185     SSE_ADD_PS(XMM0, XMM2)
1186 
1187     /* Fourth Column */
1188     SSE_COPY_PS(XMM3, XMM7)
1189     SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1190     SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
1191     SSE_ADD_PS(XMM0, XMM3)
1192 
1193     SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
1194     SSE_INLINE_END_3
1195 
1196     /* Promote solution from float to double */
1197     CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);
1198 
1199     /* Apply reordering to t and stream into x.    */
1200     /* This way, x doesn't pollute the cache.      */
1201     /* Be careful with size: 2 doubles = 4 floats! */
1202     idc = 4 * (*c--);
1203     SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
1204     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1205     SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
1206     SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
1207     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1208     SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
1209     SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
1210     SSE_INLINE_END_2
1211     v = aa + ai16 + 16;
1212     idt -= 4;
1213   }
1214 
1215   PetscCall(ISRestoreIndices(isrow, &rout));
1216   PetscCall(ISRestoreIndices(iscol, &cout));
1217   PetscCall(VecRestoreArray(bb, &b));
1218   PetscCall(VecRestoreArray(xx, &x));
1219   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
1220   SSE_SCOPE_END;
1221   PetscFunctionReturn(PETSC_SUCCESS);
1222 }
1223 
1224 #endif
1225 
1226 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1227 {
1228   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1229   IS                 iscol = a->col, isrow = a->row;
1230   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1231   PetscInt           i, nz, idx, idt, idc;
1232   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1233   const MatScalar   *aa = a->a, *v;
1234   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1235   const PetscScalar *b;
1236 
1237   PetscFunctionBegin;
1238   PetscCall(VecGetArrayRead(bb, &b));
1239   PetscCall(VecGetArray(xx, &x));
1240   t = a->solve_work;
1241 
1242   PetscCall(ISGetIndices(isrow, &rout));
1243   r = rout;
1244   PetscCall(ISGetIndices(iscol, &cout));
1245   c = cout + (n - 1);
1246 
1247   /* forward solve the lower triangular */
1248   idx  = 3 * (*r++);
1249   t[0] = b[idx];
1250   t[1] = b[1 + idx];
1251   t[2] = b[2 + idx];
1252   for (i = 1; i < n; i++) {
1253     v   = aa + 9 * ai[i];
1254     vi  = aj + ai[i];
1255     nz  = diag[i] - ai[i];
1256     idx = 3 * (*r++);
1257     s1  = b[idx];
1258     s2  = b[1 + idx];
1259     s3  = b[2 + idx];
1260     while (nz--) {
1261       idx = 3 * (*vi++);
1262       x1  = t[idx];
1263       x2  = t[1 + idx];
1264       x3  = t[2 + idx];
1265       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1266       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1267       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1268       v += 9;
1269     }
1270     idx        = 3 * i;
1271     t[idx]     = s1;
1272     t[1 + idx] = s2;
1273     t[2 + idx] = s3;
1274   }
1275   /* backward solve the upper triangular */
1276   for (i = n - 1; i >= 0; i--) {
1277     v   = aa + 9 * diag[i] + 9;
1278     vi  = aj + diag[i] + 1;
1279     nz  = ai[i + 1] - diag[i] - 1;
1280     idt = 3 * i;
1281     s1  = t[idt];
1282     s2  = t[1 + idt];
1283     s3  = t[2 + idt];
1284     while (nz--) {
1285       idx = 3 * (*vi++);
1286       x1  = t[idx];
1287       x2  = t[1 + idx];
1288       x3  = t[2 + idx];
1289       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1290       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1291       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1292       v += 9;
1293     }
1294     idc    = 3 * (*c--);
1295     v      = aa + 9 * diag[i];
1296     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1297     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1298     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1299   }
1300   PetscCall(ISRestoreIndices(isrow, &rout));
1301   PetscCall(ISRestoreIndices(iscol, &cout));
1302   PetscCall(VecRestoreArrayRead(bb, &b));
1303   PetscCall(VecRestoreArray(xx, &x));
1304   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1305   PetscFunctionReturn(PETSC_SUCCESS);
1306 }
1307 
1308 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
1309 {
1310   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1311   IS                 iscol = a->col, isrow = a->row;
1312   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1313   PetscInt           i, nz, idx, idt, idc, m;
1314   const PetscInt    *r, *c, *rout, *cout;
1315   const MatScalar   *aa = a->a, *v;
1316   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1317   const PetscScalar *b;
1318 
1319   PetscFunctionBegin;
1320   PetscCall(VecGetArrayRead(bb, &b));
1321   PetscCall(VecGetArray(xx, &x));
1322   t = a->solve_work;
1323 
1324   PetscCall(ISGetIndices(isrow, &rout));
1325   r = rout;
1326   PetscCall(ISGetIndices(iscol, &cout));
1327   c = cout;
1328 
1329   /* forward solve the lower triangular */
1330   idx  = 3 * r[0];
1331   t[0] = b[idx];
1332   t[1] = b[1 + idx];
1333   t[2] = b[2 + idx];
1334   for (i = 1; i < n; i++) {
1335     v   = aa + 9 * ai[i];
1336     vi  = aj + ai[i];
1337     nz  = ai[i + 1] - ai[i];
1338     idx = 3 * r[i];
1339     s1  = b[idx];
1340     s2  = b[1 + idx];
1341     s3  = b[2 + idx];
1342     for (m = 0; m < nz; m++) {
1343       idx = 3 * vi[m];
1344       x1  = t[idx];
1345       x2  = t[1 + idx];
1346       x3  = t[2 + idx];
1347       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1348       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1349       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1350       v += 9;
1351     }
1352     idx        = 3 * i;
1353     t[idx]     = s1;
1354     t[1 + idx] = s2;
1355     t[2 + idx] = s3;
1356   }
1357   /* backward solve the upper triangular */
1358   for (i = n - 1; i >= 0; i--) {
1359     v   = aa + 9 * (adiag[i + 1] + 1);
1360     vi  = aj + adiag[i + 1] + 1;
1361     nz  = adiag[i] - adiag[i + 1] - 1;
1362     idt = 3 * i;
1363     s1  = t[idt];
1364     s2  = t[1 + idt];
1365     s3  = t[2 + idt];
1366     for (m = 0; m < nz; m++) {
1367       idx = 3 * vi[m];
1368       x1  = t[idx];
1369       x2  = t[1 + idx];
1370       x3  = t[2 + idx];
1371       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1372       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1373       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1374       v += 9;
1375     }
1376     idc    = 3 * c[i];
1377     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1378     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1379     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1380   }
1381   PetscCall(ISRestoreIndices(isrow, &rout));
1382   PetscCall(ISRestoreIndices(iscol, &cout));
1383   PetscCall(VecRestoreArrayRead(bb, &b));
1384   PetscCall(VecRestoreArray(xx, &x));
1385   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1386   PetscFunctionReturn(PETSC_SUCCESS);
1387 }
1388 
1389 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1390 {
1391   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1392   IS                 iscol = a->col, isrow = a->row;
1393   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1394   PetscInt           i, nz, idx, idt, idc;
1395   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1396   const MatScalar   *aa = a->a, *v;
1397   PetscScalar       *x, s1, s2, x1, x2, *t;
1398   const PetscScalar *b;
1399 
1400   PetscFunctionBegin;
1401   PetscCall(VecGetArrayRead(bb, &b));
1402   PetscCall(VecGetArray(xx, &x));
1403   t = a->solve_work;
1404 
1405   PetscCall(ISGetIndices(isrow, &rout));
1406   r = rout;
1407   PetscCall(ISGetIndices(iscol, &cout));
1408   c = cout + (n - 1);
1409 
1410   /* forward solve the lower triangular */
1411   idx  = 2 * (*r++);
1412   t[0] = b[idx];
1413   t[1] = b[1 + idx];
1414   for (i = 1; i < n; i++) {
1415     v   = aa + 4 * ai[i];
1416     vi  = aj + ai[i];
1417     nz  = diag[i] - ai[i];
1418     idx = 2 * (*r++);
1419     s1  = b[idx];
1420     s2  = b[1 + idx];
1421     while (nz--) {
1422       idx = 2 * (*vi++);
1423       x1  = t[idx];
1424       x2  = t[1 + idx];
1425       s1 -= v[0] * x1 + v[2] * x2;
1426       s2 -= v[1] * x1 + v[3] * x2;
1427       v += 4;
1428     }
1429     idx        = 2 * i;
1430     t[idx]     = s1;
1431     t[1 + idx] = s2;
1432   }
1433   /* backward solve the upper triangular */
1434   for (i = n - 1; i >= 0; i--) {
1435     v   = aa + 4 * diag[i] + 4;
1436     vi  = aj + diag[i] + 1;
1437     nz  = ai[i + 1] - diag[i] - 1;
1438     idt = 2 * i;
1439     s1  = t[idt];
1440     s2  = t[1 + idt];
1441     while (nz--) {
1442       idx = 2 * (*vi++);
1443       x1  = t[idx];
1444       x2  = t[1 + idx];
1445       s1 -= v[0] * x1 + v[2] * x2;
1446       s2 -= v[1] * x1 + v[3] * x2;
1447       v += 4;
1448     }
1449     idc    = 2 * (*c--);
1450     v      = aa + 4 * diag[i];
1451     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1452     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1453   }
1454   PetscCall(ISRestoreIndices(isrow, &rout));
1455   PetscCall(ISRestoreIndices(iscol, &cout));
1456   PetscCall(VecRestoreArrayRead(bb, &b));
1457   PetscCall(VecRestoreArray(xx, &x));
1458   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1459   PetscFunctionReturn(PETSC_SUCCESS);
1460 }
1461 
1462 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1463 {
1464   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1465   IS                 iscol = a->col, isrow = a->row;
1466   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1467   PetscInt           i, nz, idx, jdx, idt, idc, m;
1468   const PetscInt    *r, *c, *rout, *cout;
1469   const MatScalar   *aa = a->a, *v;
1470   PetscScalar       *x, s1, s2, x1, x2, *t;
1471   const PetscScalar *b;
1472 
1473   PetscFunctionBegin;
1474   PetscCall(VecGetArrayRead(bb, &b));
1475   PetscCall(VecGetArray(xx, &x));
1476   t = a->solve_work;
1477 
1478   PetscCall(ISGetIndices(isrow, &rout));
1479   r = rout;
1480   PetscCall(ISGetIndices(iscol, &cout));
1481   c = cout;
1482 
1483   /* forward solve the lower triangular */
1484   idx  = 2 * r[0];
1485   t[0] = b[idx];
1486   t[1] = b[1 + idx];
1487   for (i = 1; i < n; i++) {
1488     v   = aa + 4 * ai[i];
1489     vi  = aj + ai[i];
1490     nz  = ai[i + 1] - ai[i];
1491     idx = 2 * r[i];
1492     s1  = b[idx];
1493     s2  = b[1 + idx];
1494     for (m = 0; m < nz; m++) {
1495       jdx = 2 * vi[m];
1496       x1  = t[jdx];
1497       x2  = t[1 + jdx];
1498       s1 -= v[0] * x1 + v[2] * x2;
1499       s2 -= v[1] * x1 + v[3] * x2;
1500       v += 4;
1501     }
1502     idx        = 2 * i;
1503     t[idx]     = s1;
1504     t[1 + idx] = s2;
1505   }
1506   /* backward solve the upper triangular */
1507   for (i = n - 1; i >= 0; i--) {
1508     v   = aa + 4 * (adiag[i + 1] + 1);
1509     vi  = aj + adiag[i + 1] + 1;
1510     nz  = adiag[i] - adiag[i + 1] - 1;
1511     idt = 2 * i;
1512     s1  = t[idt];
1513     s2  = t[1 + idt];
1514     for (m = 0; m < nz; m++) {
1515       idx = 2 * vi[m];
1516       x1  = t[idx];
1517       x2  = t[1 + idx];
1518       s1 -= v[0] * x1 + v[2] * x2;
1519       s2 -= v[1] * x1 + v[3] * x2;
1520       v += 4;
1521     }
1522     idc    = 2 * c[i];
1523     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1524     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1525   }
1526   PetscCall(ISRestoreIndices(isrow, &rout));
1527   PetscCall(ISRestoreIndices(iscol, &cout));
1528   PetscCall(VecRestoreArrayRead(bb, &b));
1529   PetscCall(VecRestoreArray(xx, &x));
1530   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1531   PetscFunctionReturn(PETSC_SUCCESS);
1532 }
1533 
1534 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1535 {
1536   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1537   IS                 iscol = a->col, isrow = a->row;
1538   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1539   PetscInt           i, nz;
1540   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1541   const MatScalar   *aa = a->a, *v;
1542   PetscScalar       *x, s1, *t;
1543   const PetscScalar *b;
1544 
1545   PetscFunctionBegin;
1546   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1547 
1548   PetscCall(VecGetArrayRead(bb, &b));
1549   PetscCall(VecGetArray(xx, &x));
1550   t = a->solve_work;
1551 
1552   PetscCall(ISGetIndices(isrow, &rout));
1553   r = rout;
1554   PetscCall(ISGetIndices(iscol, &cout));
1555   c = cout + (n - 1);
1556 
1557   /* forward solve the lower triangular */
1558   t[0] = b[*r++];
1559   for (i = 1; i < n; i++) {
1560     v  = aa + ai[i];
1561     vi = aj + ai[i];
1562     nz = diag[i] - ai[i];
1563     s1 = b[*r++];
1564     while (nz--) s1 -= (*v++) * t[*vi++];
1565     t[i] = s1;
1566   }
1567   /* backward solve the upper triangular */
1568   for (i = n - 1; i >= 0; i--) {
1569     v  = aa + diag[i] + 1;
1570     vi = aj + diag[i] + 1;
1571     nz = ai[i + 1] - diag[i] - 1;
1572     s1 = t[i];
1573     while (nz--) s1 -= (*v++) * t[*vi++];
1574     x[*c--] = t[i] = aa[diag[i]] * s1;
1575   }
1576 
1577   PetscCall(ISRestoreIndices(isrow, &rout));
1578   PetscCall(ISRestoreIndices(iscol, &cout));
1579   PetscCall(VecRestoreArrayRead(bb, &b));
1580   PetscCall(VecRestoreArray(xx, &x));
1581   PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
1582   PetscFunctionReturn(PETSC_SUCCESS);
1583 }
1584 
1585 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1586 {
1587   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1588   IS                 iscol = a->col, isrow = a->row;
1589   PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1590   const PetscInt    *rout, *cout, *r, *c;
1591   PetscScalar       *x, *tmp, sum;
1592   const PetscScalar *b;
1593   const MatScalar   *aa = a->a, *v;
1594 
1595   PetscFunctionBegin;
1596   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1597 
1598   PetscCall(VecGetArrayRead(bb, &b));
1599   PetscCall(VecGetArray(xx, &x));
1600   tmp = a->solve_work;
1601 
1602   PetscCall(ISGetIndices(isrow, &rout));
1603   r = rout;
1604   PetscCall(ISGetIndices(iscol, &cout));
1605   c = cout;
1606 
1607   /* forward solve the lower triangular */
1608   tmp[0] = b[r[0]];
1609   v      = aa;
1610   vi     = aj;
1611   for (i = 1; i < n; i++) {
1612     nz  = ai[i + 1] - ai[i];
1613     sum = b[r[i]];
1614     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1615     tmp[i] = sum;
1616     v += nz;
1617     vi += nz;
1618   }
1619 
1620   /* backward solve the upper triangular */
1621   for (i = n - 1; i >= 0; i--) {
1622     v   = aa + adiag[i + 1] + 1;
1623     vi  = aj + adiag[i + 1] + 1;
1624     nz  = adiag[i] - adiag[i + 1] - 1;
1625     sum = tmp[i];
1626     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1627     x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1628   }
1629 
1630   PetscCall(ISRestoreIndices(isrow, &rout));
1631   PetscCall(ISRestoreIndices(iscol, &cout));
1632   PetscCall(VecRestoreArrayRead(bb, &b));
1633   PetscCall(VecRestoreArray(xx, &x));
1634   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1635   PetscFunctionReturn(PETSC_SUCCESS);
1636 }
1637