xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision df4cd43f92eaa320656440c40edb1046daee8f75)
1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3 
4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
5 {
6   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
7   IS                 iscol = a->col, isrow = a->row;
8   const PetscInt    *r, *c, *rout, *cout;
9   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
10   PetscInt           i, nz;
11   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
12   const MatScalar   *aa = a->a, *v;
13   PetscScalar       *x, *s, *t, *ls;
14   const PetscScalar *b;
15 
16   PetscFunctionBegin;
17   PetscCall(VecGetArrayRead(bb, &b));
18   PetscCall(VecGetArray(xx, &x));
19   t = a->solve_work;
20 
21   PetscCall(ISGetIndices(isrow, &rout));
22   r = rout;
23   PetscCall(ISGetIndices(iscol, &cout));
24   c = cout + (n - 1);
25 
26   /* forward solve the lower triangular */
27   PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
28   for (i = 1; i < n; i++) {
29     v  = aa + bs2 * ai[i];
30     vi = aj + ai[i];
31     nz = a->diag[i] - ai[i];
32     s  = t + bs * i;
33     PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
34     while (nz--) {
35       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
36       v += bs2;
37     }
38   }
39   /* backward solve the upper triangular */
40   ls = a->solve_work + A->cmap->n;
41   for (i = n - 1; i >= 0; i--) {
42     v  = aa + bs2 * (a->diag[i] + 1);
43     vi = aj + a->diag[i] + 1;
44     nz = ai[i + 1] - a->diag[i] - 1;
45     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
46     while (nz--) {
47       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
48       v += bs2;
49     }
50     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
51     PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
52   }
53 
54   PetscCall(ISRestoreIndices(isrow, &rout));
55   PetscCall(ISRestoreIndices(iscol, &cout));
56   PetscCall(VecRestoreArrayRead(bb, &b));
57   PetscCall(VecRestoreArray(xx, &x));
58   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
59   PetscFunctionReturn(PETSC_SUCCESS);
60 }
61 
62 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
63 {
64   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
65   IS                 iscol = a->col, isrow = a->row;
66   const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
67   const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
68   PetscInt           i, nz, idx, idt, idc;
69   const MatScalar   *aa = a->a, *v;
70   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
71   const PetscScalar *b;
72 
73   PetscFunctionBegin;
74   PetscCall(VecGetArrayRead(bb, &b));
75   PetscCall(VecGetArray(xx, &x));
76   t = a->solve_work;
77 
78   PetscCall(ISGetIndices(isrow, &rout));
79   r = rout;
80   PetscCall(ISGetIndices(iscol, &cout));
81   c = cout + (n - 1);
82 
83   /* forward solve the lower triangular */
84   idx  = 7 * (*r++);
85   t[0] = b[idx];
86   t[1] = b[1 + idx];
87   t[2] = b[2 + idx];
88   t[3] = b[3 + idx];
89   t[4] = b[4 + idx];
90   t[5] = b[5 + idx];
91   t[6] = b[6 + idx];
92 
93   for (i = 1; i < n; i++) {
94     v   = aa + 49 * ai[i];
95     vi  = aj + ai[i];
96     nz  = diag[i] - ai[i];
97     idx = 7 * (*r++);
98     s1  = b[idx];
99     s2  = b[1 + idx];
100     s3  = b[2 + idx];
101     s4  = b[3 + idx];
102     s5  = b[4 + idx];
103     s6  = b[5 + idx];
104     s7  = b[6 + idx];
105     while (nz--) {
106       idx = 7 * (*vi++);
107       x1  = t[idx];
108       x2  = t[1 + idx];
109       x3  = t[2 + idx];
110       x4  = t[3 + idx];
111       x5  = t[4 + idx];
112       x6  = t[5 + idx];
113       x7  = t[6 + idx];
114       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
115       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
116       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
117       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
118       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
119       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
120       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
121       v += 49;
122     }
123     idx        = 7 * i;
124     t[idx]     = s1;
125     t[1 + idx] = s2;
126     t[2 + idx] = s3;
127     t[3 + idx] = s4;
128     t[4 + idx] = s5;
129     t[5 + idx] = s6;
130     t[6 + idx] = s7;
131   }
132   /* backward solve the upper triangular */
133   for (i = n - 1; i >= 0; i--) {
134     v   = aa + 49 * diag[i] + 49;
135     vi  = aj + diag[i] + 1;
136     nz  = ai[i + 1] - diag[i] - 1;
137     idt = 7 * i;
138     s1  = t[idt];
139     s2  = t[1 + idt];
140     s3  = t[2 + idt];
141     s4  = t[3 + idt];
142     s5  = t[4 + idt];
143     s6  = t[5 + idt];
144     s7  = t[6 + idt];
145     while (nz--) {
146       idx = 7 * (*vi++);
147       x1  = t[idx];
148       x2  = t[1 + idx];
149       x3  = t[2 + idx];
150       x4  = t[3 + idx];
151       x5  = t[4 + idx];
152       x6  = t[5 + idx];
153       x7  = t[6 + idx];
154       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
155       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
156       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
157       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
158       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
159       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
160       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
161       v += 49;
162     }
163     idc    = 7 * (*c--);
164     v      = aa + 49 * diag[i];
165     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
166     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
167     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
168     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
169     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
170     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
171     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
172   }
173 
174   PetscCall(ISRestoreIndices(isrow, &rout));
175   PetscCall(ISRestoreIndices(iscol, &cout));
176   PetscCall(VecRestoreArrayRead(bb, &b));
177   PetscCall(VecRestoreArray(xx, &x));
178   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
179   PetscFunctionReturn(PETSC_SUCCESS);
180 }
181 
182 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
183 {
184   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
185   IS                 iscol = a->col, isrow = a->row;
186   const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
187   const PetscInt     n = a->mbs, *rout, *cout, *vi;
188   PetscInt           i, nz, idx, idt, idc, m;
189   const MatScalar   *aa = a->a, *v;
190   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
191   const PetscScalar *b;
192 
193   PetscFunctionBegin;
194   PetscCall(VecGetArrayRead(bb, &b));
195   PetscCall(VecGetArray(xx, &x));
196   t = a->solve_work;
197 
198   PetscCall(ISGetIndices(isrow, &rout));
199   r = rout;
200   PetscCall(ISGetIndices(iscol, &cout));
201   c = cout;
202 
203   /* forward solve the lower triangular */
204   idx  = 7 * r[0];
205   t[0] = b[idx];
206   t[1] = b[1 + idx];
207   t[2] = b[2 + idx];
208   t[3] = b[3 + idx];
209   t[4] = b[4 + idx];
210   t[5] = b[5 + idx];
211   t[6] = b[6 + idx];
212 
213   for (i = 1; i < n; i++) {
214     v   = aa + 49 * ai[i];
215     vi  = aj + ai[i];
216     nz  = ai[i + 1] - ai[i];
217     idx = 7 * r[i];
218     s1  = b[idx];
219     s2  = b[1 + idx];
220     s3  = b[2 + idx];
221     s4  = b[3 + idx];
222     s5  = b[4 + idx];
223     s6  = b[5 + idx];
224     s7  = b[6 + idx];
225     for (m = 0; m < nz; m++) {
226       idx = 7 * vi[m];
227       x1  = t[idx];
228       x2  = t[1 + idx];
229       x3  = t[2 + idx];
230       x4  = t[3 + idx];
231       x5  = t[4 + idx];
232       x6  = t[5 + idx];
233       x7  = t[6 + idx];
234       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
235       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
236       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
237       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
238       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
239       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
240       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
241       v += 49;
242     }
243     idx        = 7 * i;
244     t[idx]     = s1;
245     t[1 + idx] = s2;
246     t[2 + idx] = s3;
247     t[3 + idx] = s4;
248     t[4 + idx] = s5;
249     t[5 + idx] = s6;
250     t[6 + idx] = s7;
251   }
252   /* backward solve the upper triangular */
253   for (i = n - 1; i >= 0; i--) {
254     v   = aa + 49 * (adiag[i + 1] + 1);
255     vi  = aj + adiag[i + 1] + 1;
256     nz  = adiag[i] - adiag[i + 1] - 1;
257     idt = 7 * i;
258     s1  = t[idt];
259     s2  = t[1 + idt];
260     s3  = t[2 + idt];
261     s4  = t[3 + idt];
262     s5  = t[4 + idt];
263     s6  = t[5 + idt];
264     s7  = t[6 + idt];
265     for (m = 0; m < nz; m++) {
266       idx = 7 * vi[m];
267       x1  = t[idx];
268       x2  = t[1 + idx];
269       x3  = t[2 + idx];
270       x4  = t[3 + idx];
271       x5  = t[4 + idx];
272       x6  = t[5 + idx];
273       x7  = t[6 + idx];
274       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
275       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
276       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
277       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
278       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
279       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
280       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
281       v += 49;
282     }
283     idc    = 7 * c[i];
284     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
285     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
286     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
287     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
288     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
289     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
290     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
291   }
292 
293   PetscCall(ISRestoreIndices(isrow, &rout));
294   PetscCall(ISRestoreIndices(iscol, &cout));
295   PetscCall(VecRestoreArrayRead(bb, &b));
296   PetscCall(VecRestoreArray(xx, &x));
297   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
298   PetscFunctionReturn(PETSC_SUCCESS);
299 }
300 
301 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
302 {
303   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
304   IS                 iscol = a->col, isrow = a->row;
305   const PetscInt    *r, *c, *rout, *cout;
306   const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
307   PetscInt           i, nz, idx, idt, idc;
308   const MatScalar   *aa = a->a, *v;
309   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
310   const PetscScalar *b;
311 
312   PetscFunctionBegin;
313   PetscCall(VecGetArrayRead(bb, &b));
314   PetscCall(VecGetArray(xx, &x));
315   t = a->solve_work;
316 
317   PetscCall(ISGetIndices(isrow, &rout));
318   r = rout;
319   PetscCall(ISGetIndices(iscol, &cout));
320   c = cout + (n - 1);
321 
322   /* forward solve the lower triangular */
323   idx  = 6 * (*r++);
324   t[0] = b[idx];
325   t[1] = b[1 + idx];
326   t[2] = b[2 + idx];
327   t[3] = b[3 + idx];
328   t[4] = b[4 + idx];
329   t[5] = b[5 + idx];
330   for (i = 1; i < n; i++) {
331     v   = aa + 36 * ai[i];
332     vi  = aj + ai[i];
333     nz  = diag[i] - ai[i];
334     idx = 6 * (*r++);
335     s1  = b[idx];
336     s2  = b[1 + idx];
337     s3  = b[2 + idx];
338     s4  = b[3 + idx];
339     s5  = b[4 + idx];
340     s6  = b[5 + idx];
341     while (nz--) {
342       idx = 6 * (*vi++);
343       x1  = t[idx];
344       x2  = t[1 + idx];
345       x3  = t[2 + idx];
346       x4  = t[3 + idx];
347       x5  = t[4 + idx];
348       x6  = t[5 + idx];
349       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
350       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
351       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
352       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
353       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
354       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
355       v += 36;
356     }
357     idx        = 6 * i;
358     t[idx]     = s1;
359     t[1 + idx] = s2;
360     t[2 + idx] = s3;
361     t[3 + idx] = s4;
362     t[4 + idx] = s5;
363     t[5 + idx] = s6;
364   }
365   /* backward solve the upper triangular */
366   for (i = n - 1; i >= 0; i--) {
367     v   = aa + 36 * diag[i] + 36;
368     vi  = aj + diag[i] + 1;
369     nz  = ai[i + 1] - diag[i] - 1;
370     idt = 6 * i;
371     s1  = t[idt];
372     s2  = t[1 + idt];
373     s3  = t[2 + idt];
374     s4  = t[3 + idt];
375     s5  = t[4 + idt];
376     s6  = t[5 + idt];
377     while (nz--) {
378       idx = 6 * (*vi++);
379       x1  = t[idx];
380       x2  = t[1 + idx];
381       x3  = t[2 + idx];
382       x4  = t[3 + idx];
383       x5  = t[4 + idx];
384       x6  = t[5 + idx];
385       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
386       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
387       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
388       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
389       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
390       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
391       v += 36;
392     }
393     idc    = 6 * (*c--);
394     v      = aa + 36 * diag[i];
395     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
396     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
397     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
398     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
399     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
400     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
401   }
402 
403   PetscCall(ISRestoreIndices(isrow, &rout));
404   PetscCall(ISRestoreIndices(iscol, &cout));
405   PetscCall(VecRestoreArrayRead(bb, &b));
406   PetscCall(VecRestoreArray(xx, &x));
407   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
408   PetscFunctionReturn(PETSC_SUCCESS);
409 }
410 
411 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
412 {
413   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
414   IS                 iscol = a->col, isrow = a->row;
415   const PetscInt    *r, *c, *rout, *cout;
416   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
417   PetscInt           i, nz, idx, idt, idc, m;
418   const MatScalar   *aa = a->a, *v;
419   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
420   const PetscScalar *b;
421 
422   PetscFunctionBegin;
423   PetscCall(VecGetArrayRead(bb, &b));
424   PetscCall(VecGetArray(xx, &x));
425   t = a->solve_work;
426 
427   PetscCall(ISGetIndices(isrow, &rout));
428   r = rout;
429   PetscCall(ISGetIndices(iscol, &cout));
430   c = cout;
431 
432   /* forward solve the lower triangular */
433   idx  = 6 * r[0];
434   t[0] = b[idx];
435   t[1] = b[1 + idx];
436   t[2] = b[2 + idx];
437   t[3] = b[3 + idx];
438   t[4] = b[4 + idx];
439   t[5] = b[5 + idx];
440   for (i = 1; i < n; i++) {
441     v   = aa + 36 * ai[i];
442     vi  = aj + ai[i];
443     nz  = ai[i + 1] - ai[i];
444     idx = 6 * r[i];
445     s1  = b[idx];
446     s2  = b[1 + idx];
447     s3  = b[2 + idx];
448     s4  = b[3 + idx];
449     s5  = b[4 + idx];
450     s6  = b[5 + idx];
451     for (m = 0; m < nz; m++) {
452       idx = 6 * vi[m];
453       x1  = t[idx];
454       x2  = t[1 + idx];
455       x3  = t[2 + idx];
456       x4  = t[3 + idx];
457       x5  = t[4 + idx];
458       x6  = t[5 + idx];
459       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
460       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
461       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
462       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
463       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
464       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
465       v += 36;
466     }
467     idx        = 6 * i;
468     t[idx]     = s1;
469     t[1 + idx] = s2;
470     t[2 + idx] = s3;
471     t[3 + idx] = s4;
472     t[4 + idx] = s5;
473     t[5 + idx] = s6;
474   }
475   /* backward solve the upper triangular */
476   for (i = n - 1; i >= 0; i--) {
477     v   = aa + 36 * (adiag[i + 1] + 1);
478     vi  = aj + adiag[i + 1] + 1;
479     nz  = adiag[i] - adiag[i + 1] - 1;
480     idt = 6 * i;
481     s1  = t[idt];
482     s2  = t[1 + idt];
483     s3  = t[2 + idt];
484     s4  = t[3 + idt];
485     s5  = t[4 + idt];
486     s6  = t[5 + idt];
487     for (m = 0; m < nz; m++) {
488       idx = 6 * vi[m];
489       x1  = t[idx];
490       x2  = t[1 + idx];
491       x3  = t[2 + idx];
492       x4  = t[3 + idx];
493       x5  = t[4 + idx];
494       x6  = t[5 + idx];
495       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
496       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
497       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
498       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
499       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
500       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
501       v += 36;
502     }
503     idc    = 6 * c[i];
504     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
505     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
506     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
507     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
508     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
509     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
510   }
511 
512   PetscCall(ISRestoreIndices(isrow, &rout));
513   PetscCall(ISRestoreIndices(iscol, &cout));
514   PetscCall(VecRestoreArrayRead(bb, &b));
515   PetscCall(VecRestoreArray(xx, &x));
516   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
517   PetscFunctionReturn(PETSC_SUCCESS);
518 }
519 
520 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
521 {
522   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
523   IS                 iscol = a->col, isrow = a->row;
524   const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
525   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
526   PetscInt           i, nz, idx, idt, idc;
527   const MatScalar   *aa = a->a, *v;
528   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
529   const PetscScalar *b;
530 
531   PetscFunctionBegin;
532   PetscCall(VecGetArrayRead(bb, &b));
533   PetscCall(VecGetArray(xx, &x));
534   t = a->solve_work;
535 
536   PetscCall(ISGetIndices(isrow, &rout));
537   r = rout;
538   PetscCall(ISGetIndices(iscol, &cout));
539   c = cout + (n - 1);
540 
541   /* forward solve the lower triangular */
542   idx  = 5 * (*r++);
543   t[0] = b[idx];
544   t[1] = b[1 + idx];
545   t[2] = b[2 + idx];
546   t[3] = b[3 + idx];
547   t[4] = b[4 + idx];
548   for (i = 1; i < n; i++) {
549     v   = aa + 25 * ai[i];
550     vi  = aj + ai[i];
551     nz  = diag[i] - ai[i];
552     idx = 5 * (*r++);
553     s1  = b[idx];
554     s2  = b[1 + idx];
555     s3  = b[2 + idx];
556     s4  = b[3 + idx];
557     s5  = b[4 + idx];
558     while (nz--) {
559       idx = 5 * (*vi++);
560       x1  = t[idx];
561       x2  = t[1 + idx];
562       x3  = t[2 + idx];
563       x4  = t[3 + idx];
564       x5  = t[4 + idx];
565       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
566       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
567       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
568       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
569       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
570       v += 25;
571     }
572     idx        = 5 * i;
573     t[idx]     = s1;
574     t[1 + idx] = s2;
575     t[2 + idx] = s3;
576     t[3 + idx] = s4;
577     t[4 + idx] = s5;
578   }
579   /* backward solve the upper triangular */
580   for (i = n - 1; i >= 0; i--) {
581     v   = aa + 25 * diag[i] + 25;
582     vi  = aj + diag[i] + 1;
583     nz  = ai[i + 1] - diag[i] - 1;
584     idt = 5 * i;
585     s1  = t[idt];
586     s2  = t[1 + idt];
587     s3  = t[2 + idt];
588     s4  = t[3 + idt];
589     s5  = t[4 + idt];
590     while (nz--) {
591       idx = 5 * (*vi++);
592       x1  = t[idx];
593       x2  = t[1 + idx];
594       x3  = t[2 + idx];
595       x4  = t[3 + idx];
596       x5  = t[4 + idx];
597       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
598       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
599       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
600       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
601       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
602       v += 25;
603     }
604     idc    = 5 * (*c--);
605     v      = aa + 25 * diag[i];
606     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
607     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
608     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
609     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
610     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
611   }
612 
613   PetscCall(ISRestoreIndices(isrow, &rout));
614   PetscCall(ISRestoreIndices(iscol, &cout));
615   PetscCall(VecRestoreArrayRead(bb, &b));
616   PetscCall(VecRestoreArray(xx, &x));
617   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
618   PetscFunctionReturn(PETSC_SUCCESS);
619 }
620 
621 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
622 {
623   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
624   IS                 iscol = a->col, isrow = a->row;
625   const PetscInt    *r, *c, *rout, *cout;
626   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
627   PetscInt           i, nz, idx, idt, idc, m;
628   const MatScalar   *aa = a->a, *v;
629   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
630   const PetscScalar *b;
631 
632   PetscFunctionBegin;
633   PetscCall(VecGetArrayRead(bb, &b));
634   PetscCall(VecGetArray(xx, &x));
635   t = a->solve_work;
636 
637   PetscCall(ISGetIndices(isrow, &rout));
638   r = rout;
639   PetscCall(ISGetIndices(iscol, &cout));
640   c = cout;
641 
642   /* forward solve the lower triangular */
643   idx  = 5 * r[0];
644   t[0] = b[idx];
645   t[1] = b[1 + idx];
646   t[2] = b[2 + idx];
647   t[3] = b[3 + idx];
648   t[4] = b[4 + idx];
649   for (i = 1; i < n; i++) {
650     v   = aa + 25 * ai[i];
651     vi  = aj + ai[i];
652     nz  = ai[i + 1] - ai[i];
653     idx = 5 * r[i];
654     s1  = b[idx];
655     s2  = b[1 + idx];
656     s3  = b[2 + idx];
657     s4  = b[3 + idx];
658     s5  = b[4 + idx];
659     for (m = 0; m < nz; m++) {
660       idx = 5 * vi[m];
661       x1  = t[idx];
662       x2  = t[1 + idx];
663       x3  = t[2 + idx];
664       x4  = t[3 + idx];
665       x5  = t[4 + idx];
666       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
667       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
668       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
669       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
670       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
671       v += 25;
672     }
673     idx        = 5 * i;
674     t[idx]     = s1;
675     t[1 + idx] = s2;
676     t[2 + idx] = s3;
677     t[3 + idx] = s4;
678     t[4 + idx] = s5;
679   }
680   /* backward solve the upper triangular */
681   for (i = n - 1; i >= 0; i--) {
682     v   = aa + 25 * (adiag[i + 1] + 1);
683     vi  = aj + adiag[i + 1] + 1;
684     nz  = adiag[i] - adiag[i + 1] - 1;
685     idt = 5 * i;
686     s1  = t[idt];
687     s2  = t[1 + idt];
688     s3  = t[2 + idt];
689     s4  = t[3 + idt];
690     s5  = t[4 + idt];
691     for (m = 0; m < nz; m++) {
692       idx = 5 * vi[m];
693       x1  = t[idx];
694       x2  = t[1 + idx];
695       x3  = t[2 + idx];
696       x4  = t[3 + idx];
697       x5  = t[4 + idx];
698       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
699       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
700       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
701       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
702       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
703       v += 25;
704     }
705     idc    = 5 * c[i];
706     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
707     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
708     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
709     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
710     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
711   }
712 
713   PetscCall(ISRestoreIndices(isrow, &rout));
714   PetscCall(ISRestoreIndices(iscol, &cout));
715   PetscCall(VecRestoreArrayRead(bb, &b));
716   PetscCall(VecRestoreArray(xx, &x));
717   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
718   PetscFunctionReturn(PETSC_SUCCESS);
719 }
720 
721 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
722 {
723   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
724   IS                 iscol = a->col, isrow = a->row;
725   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
726   PetscInt           i, nz, idx, idt, idc;
727   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
728   const MatScalar   *aa = a->a, *v;
729   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
730   const PetscScalar *b;
731 
732   PetscFunctionBegin;
733   PetscCall(VecGetArrayRead(bb, &b));
734   PetscCall(VecGetArray(xx, &x));
735   t = a->solve_work;
736 
737   PetscCall(ISGetIndices(isrow, &rout));
738   r = rout;
739   PetscCall(ISGetIndices(iscol, &cout));
740   c = cout + (n - 1);
741 
742   /* forward solve the lower triangular */
743   idx  = 4 * (*r++);
744   t[0] = b[idx];
745   t[1] = b[1 + idx];
746   t[2] = b[2 + idx];
747   t[3] = b[3 + idx];
748   for (i = 1; i < n; i++) {
749     v   = aa + 16 * ai[i];
750     vi  = aj + ai[i];
751     nz  = diag[i] - ai[i];
752     idx = 4 * (*r++);
753     s1  = b[idx];
754     s2  = b[1 + idx];
755     s3  = b[2 + idx];
756     s4  = b[3 + idx];
757     while (nz--) {
758       idx = 4 * (*vi++);
759       x1  = t[idx];
760       x2  = t[1 + idx];
761       x3  = t[2 + idx];
762       x4  = t[3 + idx];
763       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
764       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
765       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
766       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
767       v += 16;
768     }
769     idx        = 4 * i;
770     t[idx]     = s1;
771     t[1 + idx] = s2;
772     t[2 + idx] = s3;
773     t[3 + idx] = s4;
774   }
775   /* backward solve the upper triangular */
776   for (i = n - 1; i >= 0; i--) {
777     v   = aa + 16 * diag[i] + 16;
778     vi  = aj + diag[i] + 1;
779     nz  = ai[i + 1] - diag[i] - 1;
780     idt = 4 * i;
781     s1  = t[idt];
782     s2  = t[1 + idt];
783     s3  = t[2 + idt];
784     s4  = t[3 + idt];
785     while (nz--) {
786       idx = 4 * (*vi++);
787       x1  = t[idx];
788       x2  = t[1 + idx];
789       x3  = t[2 + idx];
790       x4  = t[3 + idx];
791       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
792       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
793       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
794       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
795       v += 16;
796     }
797     idc    = 4 * (*c--);
798     v      = aa + 16 * diag[i];
799     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
800     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
801     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
802     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
803   }
804 
805   PetscCall(ISRestoreIndices(isrow, &rout));
806   PetscCall(ISRestoreIndices(iscol, &cout));
807   PetscCall(VecRestoreArrayRead(bb, &b));
808   PetscCall(VecRestoreArray(xx, &x));
809   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
810   PetscFunctionReturn(PETSC_SUCCESS);
811 }
812 
813 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
814 {
815   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
816   IS                 iscol = a->col, isrow = a->row;
817   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
818   PetscInt           i, nz, idx, idt, idc, m;
819   const PetscInt    *r, *c, *rout, *cout;
820   const MatScalar   *aa = a->a, *v;
821   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
822   const PetscScalar *b;
823 
824   PetscFunctionBegin;
825   PetscCall(VecGetArrayRead(bb, &b));
826   PetscCall(VecGetArray(xx, &x));
827   t = a->solve_work;
828 
829   PetscCall(ISGetIndices(isrow, &rout));
830   r = rout;
831   PetscCall(ISGetIndices(iscol, &cout));
832   c = cout;
833 
834   /* forward solve the lower triangular */
835   idx  = 4 * r[0];
836   t[0] = b[idx];
837   t[1] = b[1 + idx];
838   t[2] = b[2 + idx];
839   t[3] = b[3 + idx];
840   for (i = 1; i < n; i++) {
841     v   = aa + 16 * ai[i];
842     vi  = aj + ai[i];
843     nz  = ai[i + 1] - ai[i];
844     idx = 4 * r[i];
845     s1  = b[idx];
846     s2  = b[1 + idx];
847     s3  = b[2 + idx];
848     s4  = b[3 + idx];
849     for (m = 0; m < nz; m++) {
850       idx = 4 * vi[m];
851       x1  = t[idx];
852       x2  = t[1 + idx];
853       x3  = t[2 + idx];
854       x4  = t[3 + idx];
855       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
856       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
857       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
858       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
859       v += 16;
860     }
861     idx        = 4 * i;
862     t[idx]     = s1;
863     t[1 + idx] = s2;
864     t[2 + idx] = s3;
865     t[3 + idx] = s4;
866   }
867   /* backward solve the upper triangular */
868   for (i = n - 1; i >= 0; i--) {
869     v   = aa + 16 * (adiag[i + 1] + 1);
870     vi  = aj + adiag[i + 1] + 1;
871     nz  = adiag[i] - adiag[i + 1] - 1;
872     idt = 4 * i;
873     s1  = t[idt];
874     s2  = t[1 + idt];
875     s3  = t[2 + idt];
876     s4  = t[3 + idt];
877     for (m = 0; m < nz; m++) {
878       idx = 4 * vi[m];
879       x1  = t[idx];
880       x2  = t[1 + idx];
881       x3  = t[2 + idx];
882       x4  = t[3 + idx];
883       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
884       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
885       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
886       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
887       v += 16;
888     }
889     idc    = 4 * c[i];
890     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
891     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
892     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
893     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
894   }
895 
896   PetscCall(ISRestoreIndices(isrow, &rout));
897   PetscCall(ISRestoreIndices(iscol, &cout));
898   PetscCall(VecRestoreArrayRead(bb, &b));
899   PetscCall(VecRestoreArray(xx, &x));
900   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
901   PetscFunctionReturn(PETSC_SUCCESS);
902 }
903 
904 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx)
905 {
906   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
907   IS                 iscol = a->col, isrow = a->row;
908   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
909   PetscInt           i, nz, idx, idt, idc;
910   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
911   const MatScalar   *aa = a->a, *v;
912   MatScalar          s1, s2, s3, s4, x1, x2, x3, x4, *t;
913   PetscScalar       *x;
914   const PetscScalar *b;
915 
916   PetscFunctionBegin;
917   PetscCall(VecGetArrayRead(bb, &b));
918   PetscCall(VecGetArray(xx, &x));
919   t = (MatScalar *)a->solve_work;
920 
921   PetscCall(ISGetIndices(isrow, &rout));
922   r = rout;
923   PetscCall(ISGetIndices(iscol, &cout));
924   c = cout + (n - 1);
925 
926   /* forward solve the lower triangular */
927   idx  = 4 * (*r++);
928   t[0] = (MatScalar)b[idx];
929   t[1] = (MatScalar)b[1 + idx];
930   t[2] = (MatScalar)b[2 + idx];
931   t[3] = (MatScalar)b[3 + idx];
932   for (i = 1; i < n; i++) {
933     v   = aa + 16 * ai[i];
934     vi  = aj + ai[i];
935     nz  = diag[i] - ai[i];
936     idx = 4 * (*r++);
937     s1  = (MatScalar)b[idx];
938     s2  = (MatScalar)b[1 + idx];
939     s3  = (MatScalar)b[2 + idx];
940     s4  = (MatScalar)b[3 + idx];
941     while (nz--) {
942       idx = 4 * (*vi++);
943       x1  = t[idx];
944       x2  = t[1 + idx];
945       x3  = t[2 + idx];
946       x4  = t[3 + idx];
947       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
948       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
949       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
950       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
951       v += 16;
952     }
953     idx        = 4 * i;
954     t[idx]     = s1;
955     t[1 + idx] = s2;
956     t[2 + idx] = s3;
957     t[3 + idx] = s4;
958   }
959   /* backward solve the upper triangular */
960   for (i = n - 1; i >= 0; i--) {
961     v   = aa + 16 * diag[i] + 16;
962     vi  = aj + diag[i] + 1;
963     nz  = ai[i + 1] - diag[i] - 1;
964     idt = 4 * i;
965     s1  = t[idt];
966     s2  = t[1 + idt];
967     s3  = t[2 + idt];
968     s4  = t[3 + idt];
969     while (nz--) {
970       idx = 4 * (*vi++);
971       x1  = t[idx];
972       x2  = t[1 + idx];
973       x3  = t[2 + idx];
974       x4  = t[3 + idx];
975       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
976       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
977       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
978       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
979       v += 16;
980     }
981     idc        = 4 * (*c--);
982     v          = aa + 16 * diag[i];
983     t[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
984     t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
985     t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
986     t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
987     x[idc]     = (PetscScalar)t[idt];
988     x[1 + idc] = (PetscScalar)t[1 + idt];
989     x[2 + idc] = (PetscScalar)t[2 + idt];
990     x[3 + idc] = (PetscScalar)t[3 + idt];
991   }
992 
993   PetscCall(ISRestoreIndices(isrow, &rout));
994   PetscCall(ISRestoreIndices(iscol, &cout));
995   PetscCall(VecRestoreArrayRead(bb, &b));
996   PetscCall(VecRestoreArray(xx, &x));
997   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
998   PetscFunctionReturn(PETSC_SUCCESS);
999 }
1000 
1001 #if defined(PETSC_HAVE_SSE)
1002 
1003   #include PETSC_HAVE_SSE
1004 
1005 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx)
1006 {
1007   /*
1008      Note: This code uses demotion of double
1009      to float when performing the mixed-mode computation.
1010      This may not be numerically reasonable for all applications.
1011   */
1012   Mat_SeqBAIJ    *a     = (Mat_SeqBAIJ *)A->data;
1013   IS              iscol = a->col, isrow = a->row;
1014   PetscInt        i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
1015   const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
1016   MatScalar      *aa = a->a, *v;
1017   PetscScalar    *x, *b, *t;
1018 
1019   /* Make space in temp stack for 16 Byte Aligned arrays */
1020   float         ssealignedspace[11], *tmps, *tmpx;
1021   unsigned long offset;
1022 
1023   PetscFunctionBegin;
1024   SSE_SCOPE_BEGIN;
1025 
1026   offset = (unsigned long)ssealignedspace % 16;
1027   if (offset) offset = (16 - offset) / 4;
1028   tmps = &ssealignedspace[offset];
1029   tmpx = &ssealignedspace[offset + 4];
1030   PREFETCH_NTA(aa + 16 * ai[1]);
1031 
1032   PetscCall(VecGetArray(bb, &b));
1033   PetscCall(VecGetArray(xx, &x));
1034   t = a->solve_work;
1035 
1036   PetscCall(ISGetIndices(isrow, &rout));
1037   r = rout;
1038   PetscCall(ISGetIndices(iscol, &cout));
1039   c = cout + (n - 1);
1040 
1041   /* forward solve the lower triangular */
1042   idx  = 4 * (*r++);
1043   t[0] = b[idx];
1044   t[1] = b[1 + idx];
1045   t[2] = b[2 + idx];
1046   t[3] = b[3 + idx];
1047   v    = aa + 16 * ai[1];
1048 
1049   for (i = 1; i < n;) {
1050     PREFETCH_NTA(&v[8]);
1051     vi  = aj + ai[i];
1052     nz  = diag[i] - ai[i];
1053     idx = 4 * (*r++);
1054 
1055     /* Demote sum from double to float */
1056     CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
1057     LOAD_PS(tmps, XMM7);
1058 
1059     while (nz--) {
1060       PREFETCH_NTA(&v[16]);
1061       idx = 4 * (*vi++);
1062 
1063       /* Demote solution (so far) from double to float */
1064       CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);
1065 
1066       /* 4x4 Matrix-Vector product with negative accumulation: */
1067       SSE_INLINE_BEGIN_2(tmpx, v)
1068       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1069 
1070       /* First Column */
1071       SSE_COPY_PS(XMM0, XMM6)
1072       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1073       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1074       SSE_SUB_PS(XMM7, XMM0)
1075 
1076       /* Second Column */
1077       SSE_COPY_PS(XMM1, XMM6)
1078       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1079       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1080       SSE_SUB_PS(XMM7, XMM1)
1081 
1082       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1083 
1084       /* Third Column */
1085       SSE_COPY_PS(XMM2, XMM6)
1086       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1087       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1088       SSE_SUB_PS(XMM7, XMM2)
1089 
1090       /* Fourth Column */
1091       SSE_COPY_PS(XMM3, XMM6)
1092       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1093       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1094       SSE_SUB_PS(XMM7, XMM3)
1095       SSE_INLINE_END_2
1096 
1097       v += 16;
1098     }
1099     idx = 4 * i;
1100     v   = aa + 16 * ai[++i];
1101     PREFETCH_NTA(v);
1102     STORE_PS(tmps, XMM7);
1103 
1104     /* Promote result from float to double */
1105     CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
1106   }
1107   /* backward solve the upper triangular */
1108   idt  = 4 * (n - 1);
1109   ai16 = 16 * diag[n - 1];
1110   v    = aa + ai16 + 16;
1111   for (i = n - 1; i >= 0;) {
1112     PREFETCH_NTA(&v[8]);
1113     vi = aj + diag[i] + 1;
1114     nz = ai[i + 1] - diag[i] - 1;
1115 
1116     /* Demote accumulator from double to float */
1117     CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
1118     LOAD_PS(tmps, XMM7);
1119 
1120     while (nz--) {
1121       PREFETCH_NTA(&v[16]);
1122       idx = 4 * (*vi++);
1123 
1124       /* Demote solution (so far) from double to float */
1125       CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);
1126 
1127       /* 4x4 Matrix-Vector Product with negative accumulation: */
1128       SSE_INLINE_BEGIN_2(tmpx, v)
1129       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
1130 
1131       /* First Column */
1132       SSE_COPY_PS(XMM0, XMM6)
1133       SSE_SHUFFLE(XMM0, XMM0, 0x00)
1134       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
1135       SSE_SUB_PS(XMM7, XMM0)
1136 
1137       /* Second Column */
1138       SSE_COPY_PS(XMM1, XMM6)
1139       SSE_SHUFFLE(XMM1, XMM1, 0x55)
1140       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
1141       SSE_SUB_PS(XMM7, XMM1)
1142 
1143       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
1144 
1145       /* Third Column */
1146       SSE_COPY_PS(XMM2, XMM6)
1147       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1148       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
1149       SSE_SUB_PS(XMM7, XMM2)
1150 
1151       /* Fourth Column */
1152       SSE_COPY_PS(XMM3, XMM6)
1153       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1154       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
1155       SSE_SUB_PS(XMM7, XMM3)
1156       SSE_INLINE_END_2
1157       v += 16;
1158     }
1159     v    = aa + ai16;
1160     ai16 = 16 * diag[--i];
1161     PREFETCH_NTA(aa + ai16 + 16);
1162     /*
1163        Scale the result by the diagonal 4x4 block,
1164        which was inverted as part of the factorization
1165     */
1166     SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
1167     /* First Column */
1168     SSE_COPY_PS(XMM0, XMM7)
1169     SSE_SHUFFLE(XMM0, XMM0, 0x00)
1170     SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
1171 
1172     /* Second Column */
1173     SSE_COPY_PS(XMM1, XMM7)
1174     SSE_SHUFFLE(XMM1, XMM1, 0x55)
1175     SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
1176     SSE_ADD_PS(XMM0, XMM1)
1177 
1178     SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
1179 
1180     /* Third Column */
1181     SSE_COPY_PS(XMM2, XMM7)
1182     SSE_SHUFFLE(XMM2, XMM2, 0xAA)
1183     SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
1184     SSE_ADD_PS(XMM0, XMM2)
1185 
1186     /* Fourth Column */
1187     SSE_COPY_PS(XMM3, XMM7)
1188     SSE_SHUFFLE(XMM3, XMM3, 0xFF)
1189     SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
1190     SSE_ADD_PS(XMM0, XMM3)
1191 
1192     SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
1193     SSE_INLINE_END_3
1194 
1195     /* Promote solution from float to double */
1196     CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);
1197 
1198     /* Apply reordering to t and stream into x.    */
1199     /* This way, x doesn't pollute the cache.      */
1200     /* Be careful with size: 2 doubles = 4 floats! */
1201     idc = 4 * (*c--);
1202     SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
1203     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
1204     SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
1205     SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
1206     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
1207     SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
1208     SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
1209     SSE_INLINE_END_2
1210     v = aa + ai16 + 16;
1211     idt -= 4;
1212   }
1213 
1214   PetscCall(ISRestoreIndices(isrow, &rout));
1215   PetscCall(ISRestoreIndices(iscol, &cout));
1216   PetscCall(VecRestoreArray(bb, &b));
1217   PetscCall(VecRestoreArray(xx, &x));
1218   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
1219   SSE_SCOPE_END;
1220   PetscFunctionReturn(PETSC_SUCCESS);
1221 }
1222 
1223 #endif
1224 
1225 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1226 {
1227   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1228   IS                 iscol = a->col, isrow = a->row;
1229   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1230   PetscInt           i, nz, idx, idt, idc;
1231   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1232   const MatScalar   *aa = a->a, *v;
1233   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1234   const PetscScalar *b;
1235 
1236   PetscFunctionBegin;
1237   PetscCall(VecGetArrayRead(bb, &b));
1238   PetscCall(VecGetArray(xx, &x));
1239   t = a->solve_work;
1240 
1241   PetscCall(ISGetIndices(isrow, &rout));
1242   r = rout;
1243   PetscCall(ISGetIndices(iscol, &cout));
1244   c = cout + (n - 1);
1245 
1246   /* forward solve the lower triangular */
1247   idx  = 3 * (*r++);
1248   t[0] = b[idx];
1249   t[1] = b[1 + idx];
1250   t[2] = b[2 + idx];
1251   for (i = 1; i < n; i++) {
1252     v   = aa + 9 * ai[i];
1253     vi  = aj + ai[i];
1254     nz  = diag[i] - ai[i];
1255     idx = 3 * (*r++);
1256     s1  = b[idx];
1257     s2  = b[1 + idx];
1258     s3  = b[2 + idx];
1259     while (nz--) {
1260       idx = 3 * (*vi++);
1261       x1  = t[idx];
1262       x2  = t[1 + idx];
1263       x3  = t[2 + idx];
1264       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1265       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1266       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1267       v += 9;
1268     }
1269     idx        = 3 * i;
1270     t[idx]     = s1;
1271     t[1 + idx] = s2;
1272     t[2 + idx] = s3;
1273   }
1274   /* backward solve the upper triangular */
1275   for (i = n - 1; i >= 0; i--) {
1276     v   = aa + 9 * diag[i] + 9;
1277     vi  = aj + diag[i] + 1;
1278     nz  = ai[i + 1] - diag[i] - 1;
1279     idt = 3 * i;
1280     s1  = t[idt];
1281     s2  = t[1 + idt];
1282     s3  = t[2 + idt];
1283     while (nz--) {
1284       idx = 3 * (*vi++);
1285       x1  = t[idx];
1286       x2  = t[1 + idx];
1287       x3  = t[2 + idx];
1288       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1289       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1290       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1291       v += 9;
1292     }
1293     idc    = 3 * (*c--);
1294     v      = aa + 9 * diag[i];
1295     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1296     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1297     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1298   }
1299   PetscCall(ISRestoreIndices(isrow, &rout));
1300   PetscCall(ISRestoreIndices(iscol, &cout));
1301   PetscCall(VecRestoreArrayRead(bb, &b));
1302   PetscCall(VecRestoreArray(xx, &x));
1303   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1304   PetscFunctionReturn(PETSC_SUCCESS);
1305 }
1306 
1307 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
1308 {
1309   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1310   IS                 iscol = a->col, isrow = a->row;
1311   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1312   PetscInt           i, nz, idx, idt, idc, m;
1313   const PetscInt    *r, *c, *rout, *cout;
1314   const MatScalar   *aa = a->a, *v;
1315   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
1316   const PetscScalar *b;
1317 
1318   PetscFunctionBegin;
1319   PetscCall(VecGetArrayRead(bb, &b));
1320   PetscCall(VecGetArray(xx, &x));
1321   t = a->solve_work;
1322 
1323   PetscCall(ISGetIndices(isrow, &rout));
1324   r = rout;
1325   PetscCall(ISGetIndices(iscol, &cout));
1326   c = cout;
1327 
1328   /* forward solve the lower triangular */
1329   idx  = 3 * r[0];
1330   t[0] = b[idx];
1331   t[1] = b[1 + idx];
1332   t[2] = b[2 + idx];
1333   for (i = 1; i < n; i++) {
1334     v   = aa + 9 * ai[i];
1335     vi  = aj + ai[i];
1336     nz  = ai[i + 1] - ai[i];
1337     idx = 3 * r[i];
1338     s1  = b[idx];
1339     s2  = b[1 + idx];
1340     s3  = b[2 + idx];
1341     for (m = 0; m < nz; m++) {
1342       idx = 3 * vi[m];
1343       x1  = t[idx];
1344       x2  = t[1 + idx];
1345       x3  = t[2 + idx];
1346       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1347       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1348       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1349       v += 9;
1350     }
1351     idx        = 3 * i;
1352     t[idx]     = s1;
1353     t[1 + idx] = s2;
1354     t[2 + idx] = s3;
1355   }
1356   /* backward solve the upper triangular */
1357   for (i = n - 1; i >= 0; i--) {
1358     v   = aa + 9 * (adiag[i + 1] + 1);
1359     vi  = aj + adiag[i + 1] + 1;
1360     nz  = adiag[i] - adiag[i + 1] - 1;
1361     idt = 3 * i;
1362     s1  = t[idt];
1363     s2  = t[1 + idt];
1364     s3  = t[2 + idt];
1365     for (m = 0; m < nz; m++) {
1366       idx = 3 * vi[m];
1367       x1  = t[idx];
1368       x2  = t[1 + idx];
1369       x3  = t[2 + idx];
1370       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
1371       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
1372       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
1373       v += 9;
1374     }
1375     idc    = 3 * c[i];
1376     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
1377     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
1378     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
1379   }
1380   PetscCall(ISRestoreIndices(isrow, &rout));
1381   PetscCall(ISRestoreIndices(iscol, &cout));
1382   PetscCall(VecRestoreArrayRead(bb, &b));
1383   PetscCall(VecRestoreArray(xx, &x));
1384   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1385   PetscFunctionReturn(PETSC_SUCCESS);
1386 }
1387 
1388 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1389 {
1390   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1391   IS                 iscol = a->col, isrow = a->row;
1392   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1393   PetscInt           i, nz, idx, idt, idc;
1394   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1395   const MatScalar   *aa = a->a, *v;
1396   PetscScalar       *x, s1, s2, x1, x2, *t;
1397   const PetscScalar *b;
1398 
1399   PetscFunctionBegin;
1400   PetscCall(VecGetArrayRead(bb, &b));
1401   PetscCall(VecGetArray(xx, &x));
1402   t = a->solve_work;
1403 
1404   PetscCall(ISGetIndices(isrow, &rout));
1405   r = rout;
1406   PetscCall(ISGetIndices(iscol, &cout));
1407   c = cout + (n - 1);
1408 
1409   /* forward solve the lower triangular */
1410   idx  = 2 * (*r++);
1411   t[0] = b[idx];
1412   t[1] = b[1 + idx];
1413   for (i = 1; i < n; i++) {
1414     v   = aa + 4 * ai[i];
1415     vi  = aj + ai[i];
1416     nz  = diag[i] - ai[i];
1417     idx = 2 * (*r++);
1418     s1  = b[idx];
1419     s2  = b[1 + idx];
1420     while (nz--) {
1421       idx = 2 * (*vi++);
1422       x1  = t[idx];
1423       x2  = t[1 + idx];
1424       s1 -= v[0] * x1 + v[2] * x2;
1425       s2 -= v[1] * x1 + v[3] * x2;
1426       v += 4;
1427     }
1428     idx        = 2 * i;
1429     t[idx]     = s1;
1430     t[1 + idx] = s2;
1431   }
1432   /* backward solve the upper triangular */
1433   for (i = n - 1; i >= 0; i--) {
1434     v   = aa + 4 * diag[i] + 4;
1435     vi  = aj + diag[i] + 1;
1436     nz  = ai[i + 1] - diag[i] - 1;
1437     idt = 2 * i;
1438     s1  = t[idt];
1439     s2  = t[1 + idt];
1440     while (nz--) {
1441       idx = 2 * (*vi++);
1442       x1  = t[idx];
1443       x2  = t[1 + idx];
1444       s1 -= v[0] * x1 + v[2] * x2;
1445       s2 -= v[1] * x1 + v[3] * x2;
1446       v += 4;
1447     }
1448     idc    = 2 * (*c--);
1449     v      = aa + 4 * diag[i];
1450     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1451     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1452   }
1453   PetscCall(ISRestoreIndices(isrow, &rout));
1454   PetscCall(ISRestoreIndices(iscol, &cout));
1455   PetscCall(VecRestoreArrayRead(bb, &b));
1456   PetscCall(VecRestoreArray(xx, &x));
1457   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1458   PetscFunctionReturn(PETSC_SUCCESS);
1459 }
1460 
1461 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1462 {
1463   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1464   IS                 iscol = a->col, isrow = a->row;
1465   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
1466   PetscInt           i, nz, idx, jdx, idt, idc, m;
1467   const PetscInt    *r, *c, *rout, *cout;
1468   const MatScalar   *aa = a->a, *v;
1469   PetscScalar       *x, s1, s2, x1, x2, *t;
1470   const PetscScalar *b;
1471 
1472   PetscFunctionBegin;
1473   PetscCall(VecGetArrayRead(bb, &b));
1474   PetscCall(VecGetArray(xx, &x));
1475   t = a->solve_work;
1476 
1477   PetscCall(ISGetIndices(isrow, &rout));
1478   r = rout;
1479   PetscCall(ISGetIndices(iscol, &cout));
1480   c = cout;
1481 
1482   /* forward solve the lower triangular */
1483   idx  = 2 * r[0];
1484   t[0] = b[idx];
1485   t[1] = b[1 + idx];
1486   for (i = 1; i < n; i++) {
1487     v   = aa + 4 * ai[i];
1488     vi  = aj + ai[i];
1489     nz  = ai[i + 1] - ai[i];
1490     idx = 2 * r[i];
1491     s1  = b[idx];
1492     s2  = b[1 + idx];
1493     for (m = 0; m < nz; m++) {
1494       jdx = 2 * vi[m];
1495       x1  = t[jdx];
1496       x2  = t[1 + jdx];
1497       s1 -= v[0] * x1 + v[2] * x2;
1498       s2 -= v[1] * x1 + v[3] * x2;
1499       v += 4;
1500     }
1501     idx        = 2 * i;
1502     t[idx]     = s1;
1503     t[1 + idx] = s2;
1504   }
1505   /* backward solve the upper triangular */
1506   for (i = n - 1; i >= 0; i--) {
1507     v   = aa + 4 * (adiag[i + 1] + 1);
1508     vi  = aj + adiag[i + 1] + 1;
1509     nz  = adiag[i] - adiag[i + 1] - 1;
1510     idt = 2 * i;
1511     s1  = t[idt];
1512     s2  = t[1 + idt];
1513     for (m = 0; m < nz; m++) {
1514       idx = 2 * vi[m];
1515       x1  = t[idx];
1516       x2  = t[1 + idx];
1517       s1 -= v[0] * x1 + v[2] * x2;
1518       s2 -= v[1] * x1 + v[3] * x2;
1519       v += 4;
1520     }
1521     idc    = 2 * c[i];
1522     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
1523     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
1524   }
1525   PetscCall(ISRestoreIndices(isrow, &rout));
1526   PetscCall(ISRestoreIndices(iscol, &cout));
1527   PetscCall(VecRestoreArrayRead(bb, &b));
1528   PetscCall(VecRestoreArray(xx, &x));
1529   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1530   PetscFunctionReturn(PETSC_SUCCESS);
1531 }
1532 
1533 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1534 {
1535   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1536   IS                 iscol = a->col, isrow = a->row;
1537   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
1538   PetscInt           i, nz;
1539   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
1540   const MatScalar   *aa = a->a, *v;
1541   PetscScalar       *x, s1, *t;
1542   const PetscScalar *b;
1543 
1544   PetscFunctionBegin;
1545   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1546 
1547   PetscCall(VecGetArrayRead(bb, &b));
1548   PetscCall(VecGetArray(xx, &x));
1549   t = a->solve_work;
1550 
1551   PetscCall(ISGetIndices(isrow, &rout));
1552   r = rout;
1553   PetscCall(ISGetIndices(iscol, &cout));
1554   c = cout + (n - 1);
1555 
1556   /* forward solve the lower triangular */
1557   t[0] = b[*r++];
1558   for (i = 1; i < n; i++) {
1559     v  = aa + ai[i];
1560     vi = aj + ai[i];
1561     nz = diag[i] - ai[i];
1562     s1 = b[*r++];
1563     while (nz--) s1 -= (*v++) * t[*vi++];
1564     t[i] = s1;
1565   }
1566   /* backward solve the upper triangular */
1567   for (i = n - 1; i >= 0; i--) {
1568     v  = aa + diag[i] + 1;
1569     vi = aj + diag[i] + 1;
1570     nz = ai[i + 1] - diag[i] - 1;
1571     s1 = t[i];
1572     while (nz--) s1 -= (*v++) * t[*vi++];
1573     x[*c--] = t[i] = aa[diag[i]] * s1;
1574   }
1575 
1576   PetscCall(ISRestoreIndices(isrow, &rout));
1577   PetscCall(ISRestoreIndices(iscol, &cout));
1578   PetscCall(VecRestoreArrayRead(bb, &b));
1579   PetscCall(VecRestoreArray(xx, &x));
1580   PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
1581   PetscFunctionReturn(PETSC_SUCCESS);
1582 }
1583 
1584 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1585 {
1586   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1587   IS                 iscol = a->col, isrow = a->row;
1588   PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
1589   const PetscInt    *rout, *cout, *r, *c;
1590   PetscScalar       *x, *tmp, sum;
1591   const PetscScalar *b;
1592   const MatScalar   *aa = a->a, *v;
1593 
1594   PetscFunctionBegin;
1595   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
1596 
1597   PetscCall(VecGetArrayRead(bb, &b));
1598   PetscCall(VecGetArray(xx, &x));
1599   tmp = a->solve_work;
1600 
1601   PetscCall(ISGetIndices(isrow, &rout));
1602   r = rout;
1603   PetscCall(ISGetIndices(iscol, &cout));
1604   c = cout;
1605 
1606   /* forward solve the lower triangular */
1607   tmp[0] = b[r[0]];
1608   v      = aa;
1609   vi     = aj;
1610   for (i = 1; i < n; i++) {
1611     nz  = ai[i + 1] - ai[i];
1612     sum = b[r[i]];
1613     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1614     tmp[i] = sum;
1615     v += nz;
1616     vi += nz;
1617   }
1618 
1619   /* backward solve the upper triangular */
1620   for (i = n - 1; i >= 0; i--) {
1621     v   = aa + adiag[i + 1] + 1;
1622     vi  = aj + adiag[i + 1] + 1;
1623     nz  = adiag[i] - adiag[i + 1] - 1;
1624     sum = tmp[i];
1625     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
1626     x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
1627   }
1628 
1629   PetscCall(ISRestoreIndices(isrow, &rout));
1630   PetscCall(ISRestoreIndices(iscol, &cout));
1631   PetscCall(VecRestoreArrayRead(bb, &b));
1632   PetscCall(VecRestoreArray(xx, &x));
1633   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1634   PetscFunctionReturn(PETSC_SUCCESS);
1635 }
1636