xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision bcee047adeeb73090d7e36cc71e39fc287cdbb97)
1 
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include <../src/mat/impls/sbaij/seq/sbaij.h>
7 #include <../src/mat/impls/baij/seq/baij.h>
8 #include <petsc/private/kernels/blockinvert.h>
9 
10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
11 {
12   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
13   IS                 isrow = a->row;
14   PetscInt           mbs = a->mbs, *ai = a->i, *aj = a->j;
15   const PetscInt    *r;
16   PetscInt           nz, *vj, k, idx, k1;
17   PetscInt           bs = A->rmap->bs, bs2 = a->bs2;
18   const MatScalar   *aa = a->a, *v, *diag;
19   PetscScalar       *x, *xk, *xj, *xk_tmp, *t;
20   const PetscScalar *b;
21 
22   PetscFunctionBegin;
23   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
24   PetscCall(VecGetArrayRead(bb, &b));
25   PetscCall(VecGetArray(xx, &x));
26   t = a->solve_work;
27   PetscCall(ISGetIndices(isrow, &r));
28   PetscCall(PetscMalloc1(bs, &xk_tmp));
29 
30   /* solve U^T * D * y = b by forward substitution */
31   xk = t;
32   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
33     idx = bs * r[k];
34     for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1];
35   }
36   for (k = 0; k < mbs; k++) {
37     v  = aa + bs2 * ai[k];
38     xk = t + k * bs;                          /* Dk*xk = k-th block of x */
39     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
40     nz = ai[k + 1] - ai[k];
41     vj = aj + ai[k];
42     xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */
43     while (nz--) {
44       /* x(:) += U(k,:)^T*(Dk*xk) */
45       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
46       vj++;
47       xj = t + (*vj) * bs;
48       v += bs2;
49     }
50     /* xk = inv(Dk)*(Dk*xk) */
51     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
52     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
53   }
54 
55   /* solve U*x = y by back substitution */
56   for (k = mbs - 1; k >= 0; k--) {
57     v  = aa + bs2 * ai[k];
58     xk = t + k * bs; /* xk */
59     nz = ai[k + 1] - ai[k];
60     vj = aj + ai[k];
61     xj = t + (*vj) * bs;
62     while (nz--) {
63       /* xk += U(k,:)*x(:) */
64       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
65       vj++;
66       v += bs2;
67       xj = t + (*vj) * bs;
68     }
69     idx = bs * r[k];
70     for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++;
71   }
72 
73   PetscCall(PetscFree(xk_tmp));
74   PetscCall(ISRestoreIndices(isrow, &r));
75   PetscCall(VecRestoreArrayRead(bb, &b));
76   PetscCall(VecRestoreArray(xx, &x));
77   PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs));
78   PetscFunctionReturn(PETSC_SUCCESS);
79 }
80 
81 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
82 {
83   PetscFunctionBegin;
84   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet");
85 }
86 
87 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
88 {
89   PetscFunctionBegin;
90   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented");
91 }
92 
93 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
94 {
95   PetscInt         nz, k;
96   const PetscInt  *vj, bs2 = bs * bs;
97   const MatScalar *v, *diag;
98   PetscScalar     *xk, *xj, *xk_tmp;
99 
100   PetscFunctionBegin;
101   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
102   PetscCall(PetscMalloc1(bs, &xk_tmp));
103   for (k = 0; k < mbs; k++) {
104     v  = aa + bs2 * ai[k];
105     xk = x + k * bs;                          /* Dk*xk = k-th block of x */
106     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
107     nz = ai[k + 1] - ai[k];
108     vj = aj + ai[k];
109     xj = x + (size_t)(*vj) * bs; /* *vj-th block of x, *vj>k */
110     while (nz--) {
111       /* x(:) += U(k,:)^T*(Dk*xk) */
112       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
113       vj++;
114       xj = x + (size_t)(*vj) * bs;
115       v += bs2;
116     }
117     /* xk = inv(Dk)*(Dk*xk) */
118     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
119     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
120   }
121   PetscCall(PetscFree(xk_tmp));
122   PetscFunctionReturn(PETSC_SUCCESS);
123 }
124 
125 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
126 {
127   PetscInt         nz, k;
128   const PetscInt  *vj, bs2 = bs * bs;
129   const MatScalar *v;
130   PetscScalar     *xk, *xj;
131 
132   PetscFunctionBegin;
133   for (k = mbs - 1; k >= 0; k--) {
134     v  = aa + bs2 * ai[k];
135     xk = x + k * bs; /* xk */
136     nz = ai[k + 1] - ai[k];
137     vj = aj + ai[k];
138     xj = x + (size_t)(*vj) * bs;
139     while (nz--) {
140       /* xk += U(k,:)*x(:) */
141       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
142       vj++;
143       v += bs2;
144       xj = x + (size_t)(*vj) * bs;
145     }
146   }
147   PetscFunctionReturn(PETSC_SUCCESS);
148 }
149 
150 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
151 {
152   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
153   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
154   PetscInt           bs = A->rmap->bs;
155   const MatScalar   *aa = a->a;
156   PetscScalar       *x;
157   const PetscScalar *b;
158 
159   PetscFunctionBegin;
160   PetscCall(VecGetArrayRead(bb, &b));
161   PetscCall(VecGetArray(xx, &x));
162 
163   /* solve U^T * D * y = b by forward substitution */
164   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
165   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
166 
167   /* solve U*x = y by back substitution */
168   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
169 
170   PetscCall(VecRestoreArrayRead(bb, &b));
171   PetscCall(VecRestoreArray(xx, &x));
172   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs));
173   PetscFunctionReturn(PETSC_SUCCESS);
174 }
175 
176 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
177 {
178   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
179   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
180   PetscInt           bs = A->rmap->bs;
181   const MatScalar   *aa = a->a;
182   const PetscScalar *b;
183   PetscScalar       *x;
184 
185   PetscFunctionBegin;
186   PetscCall(VecGetArrayRead(bb, &b));
187   PetscCall(VecGetArray(xx, &x));
188   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
189   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
190   PetscCall(VecRestoreArrayRead(bb, &b));
191   PetscCall(VecRestoreArray(xx, &x));
192   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs));
193   PetscFunctionReturn(PETSC_SUCCESS);
194 }
195 
196 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
197 {
198   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
199   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
200   PetscInt           bs = A->rmap->bs;
201   const MatScalar   *aa = a->a;
202   const PetscScalar *b;
203   PetscScalar       *x;
204 
205   PetscFunctionBegin;
206   PetscCall(VecGetArrayRead(bb, &b));
207   PetscCall(VecGetArray(xx, &x));
208   PetscCall(PetscArraycpy(x, b, bs * mbs));
209   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
210   PetscCall(VecRestoreArrayRead(bb, &b));
211   PetscCall(VecRestoreArray(xx, &x));
212   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
213   PetscFunctionReturn(PETSC_SUCCESS);
214 }
215 
216 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
217 {
218   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
219   IS                 isrow = a->row;
220   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
221   PetscInt           nz, k, idx;
222   const MatScalar   *aa = a->a, *v, *d;
223   PetscScalar       *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp;
224   const PetscScalar *b;
225 
226   PetscFunctionBegin;
227   PetscCall(VecGetArrayRead(bb, &b));
228   PetscCall(VecGetArray(xx, &x));
229   t = a->solve_work;
230   PetscCall(ISGetIndices(isrow, &r));
231 
232   /* solve U^T * D * y = b by forward substitution */
233   tp = t;
234   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
235     idx   = 7 * r[k];
236     tp[0] = b[idx];
237     tp[1] = b[idx + 1];
238     tp[2] = b[idx + 2];
239     tp[3] = b[idx + 3];
240     tp[4] = b[idx + 4];
241     tp[5] = b[idx + 5];
242     tp[6] = b[idx + 6];
243     tp += 7;
244   }
245 
246   for (k = 0; k < mbs; k++) {
247     v  = aa + 49 * ai[k];
248     vj = aj + ai[k];
249     tp = t + k * 7;
250     x0 = tp[0];
251     x1 = tp[1];
252     x2 = tp[2];
253     x3 = tp[3];
254     x4 = tp[4];
255     x5 = tp[5];
256     x6 = tp[6];
257     nz = ai[k + 1] - ai[k];
258     tp = t + (*vj) * 7;
259     while (nz--) {
260       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
261       tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
262       tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
263       tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
264       tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
265       tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
266       tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
267       vj++;
268       tp = t + (*vj) * 7;
269       v += 49;
270     }
271 
272     /* xk = inv(Dk)*(Dk*xk) */
273     d     = aa + k * 49; /* ptr to inv(Dk) */
274     tp    = t + k * 7;
275     tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
276     tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
277     tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
278     tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
279     tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
280     tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
281     tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
282   }
283 
284   /* solve U*x = y by back substitution */
285   for (k = mbs - 1; k >= 0; k--) {
286     v  = aa + 49 * ai[k];
287     vj = aj + ai[k];
288     tp = t + k * 7;
289     x0 = tp[0];
290     x1 = tp[1];
291     x2 = tp[2];
292     x3 = tp[3];
293     x4 = tp[4];
294     x5 = tp[5];
295     x6 = tp[6]; /* xk */
296     nz = ai[k + 1] - ai[k];
297 
298     tp = t + (*vj) * 7;
299     while (nz--) {
300       /* xk += U(k,:)*x(:) */
301       x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6];
302       x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6];
303       x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6];
304       x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6];
305       x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6];
306       x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6];
307       x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6];
308       vj++;
309       tp = t + (*vj) * 7;
310       v += 49;
311     }
312     tp         = t + k * 7;
313     tp[0]      = x0;
314     tp[1]      = x1;
315     tp[2]      = x2;
316     tp[3]      = x3;
317     tp[4]      = x4;
318     tp[5]      = x5;
319     tp[6]      = x6;
320     idx        = 7 * r[k];
321     x[idx]     = x0;
322     x[idx + 1] = x1;
323     x[idx + 2] = x2;
324     x[idx + 3] = x3;
325     x[idx + 4] = x4;
326     x[idx + 5] = x5;
327     x[idx + 6] = x6;
328   }
329 
330   PetscCall(ISRestoreIndices(isrow, &r));
331   PetscCall(VecRestoreArrayRead(bb, &b));
332   PetscCall(VecRestoreArray(xx, &x));
333   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
334   PetscFunctionReturn(PETSC_SUCCESS);
335 }
336 
337 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
338 {
339   const MatScalar *v, *d;
340   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
341   PetscInt         nz, k;
342   const PetscInt  *vj;
343 
344   PetscFunctionBegin;
345   for (k = 0; k < mbs; k++) {
346     v  = aa + 49 * ai[k];
347     xp = x + k * 7;
348     x0 = xp[0];
349     x1 = xp[1];
350     x2 = xp[2];
351     x3 = xp[3];
352     x4 = xp[4];
353     x5 = xp[5];
354     x6 = xp[6]; /* Dk*xk = k-th block of x */
355     nz = ai[k + 1] - ai[k];
356     vj = aj + ai[k];
357     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
358     PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
359     while (nz--) {
360       xp = x + (*vj) * 7;
361       /* x(:) += U(k,:)^T*(Dk*xk) */
362       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
363       xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
364       xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
365       xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
366       xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
367       xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
368       xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
369       vj++;
370       v += 49;
371     }
372     /* xk = inv(Dk)*(Dk*xk) */
373     d     = aa + k * 49; /* ptr to inv(Dk) */
374     xp    = x + k * 7;
375     xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
376     xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
377     xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
378     xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
379     xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
380     xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
381     xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
382   }
383   PetscFunctionReturn(PETSC_SUCCESS);
384 }
385 
386 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
387 {
388   const MatScalar *v;
389   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
390   PetscInt         nz, k;
391   const PetscInt  *vj;
392 
393   PetscFunctionBegin;
394   for (k = mbs - 1; k >= 0; k--) {
395     v  = aa + 49 * ai[k];
396     xp = x + k * 7;
397     x0 = xp[0];
398     x1 = xp[1];
399     x2 = xp[2];
400     x3 = xp[3];
401     x4 = xp[4];
402     x5 = xp[5];
403     x6 = xp[6]; /* xk */
404     nz = ai[k + 1] - ai[k];
405     vj = aj + ai[k];
406     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
407     PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
408     while (nz--) {
409       xp = x + (*vj) * 7;
410       /* xk += U(k,:)*x(:) */
411       x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6];
412       x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6];
413       x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6];
414       x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6];
415       x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6];
416       x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6];
417       x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6];
418       vj++;
419       v += 49;
420     }
421     xp    = x + k * 7;
422     xp[0] = x0;
423     xp[1] = x1;
424     xp[2] = x2;
425     xp[3] = x3;
426     xp[4] = x4;
427     xp[5] = x5;
428     xp[6] = x6;
429   }
430   PetscFunctionReturn(PETSC_SUCCESS);
431 }
432 
433 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
434 {
435   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
436   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
437   const MatScalar   *aa = a->a;
438   PetscScalar       *x;
439   const PetscScalar *b;
440 
441   PetscFunctionBegin;
442   PetscCall(VecGetArrayRead(bb, &b));
443   PetscCall(VecGetArray(xx, &x));
444 
445   /* solve U^T * D * y = b by forward substitution */
446   PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */
447   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
448 
449   /* solve U*x = y by back substitution */
450   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
451 
452   PetscCall(VecRestoreArrayRead(bb, &b));
453   PetscCall(VecRestoreArray(xx, &x));
454   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
455   PetscFunctionReturn(PETSC_SUCCESS);
456 }
457 
458 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
459 {
460   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
461   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
462   const MatScalar   *aa = a->a;
463   PetscScalar       *x;
464   const PetscScalar *b;
465 
466   PetscFunctionBegin;
467   PetscCall(VecGetArrayRead(bb, &b));
468   PetscCall(VecGetArray(xx, &x));
469   PetscCall(PetscArraycpy(x, b, 7 * mbs));
470   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
471   PetscCall(VecRestoreArrayRead(bb, &b));
472   PetscCall(VecRestoreArray(xx, &x));
473   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
474   PetscFunctionReturn(PETSC_SUCCESS);
475 }
476 
477 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
478 {
479   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
480   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
481   const MatScalar   *aa = a->a;
482   PetscScalar       *x;
483   const PetscScalar *b;
484 
485   PetscFunctionBegin;
486   PetscCall(VecGetArrayRead(bb, &b));
487   PetscCall(VecGetArray(xx, &x));
488   PetscCall(PetscArraycpy(x, b, 7 * mbs));
489   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
490   PetscCall(VecRestoreArrayRead(bb, &b));
491   PetscCall(VecRestoreArray(xx, &x));
492   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
493   PetscFunctionReturn(PETSC_SUCCESS);
494 }
495 
496 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
497 {
498   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
499   IS                 isrow = a->row;
500   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
501   PetscInt           nz, k, idx;
502   const MatScalar   *aa = a->a, *v, *d;
503   PetscScalar       *x, x0, x1, x2, x3, x4, x5, *t, *tp;
504   const PetscScalar *b;
505 
506   PetscFunctionBegin;
507   PetscCall(VecGetArrayRead(bb, &b));
508   PetscCall(VecGetArray(xx, &x));
509   t = a->solve_work;
510   PetscCall(ISGetIndices(isrow, &r));
511 
512   /* solve U^T * D * y = b by forward substitution */
513   tp = t;
514   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
515     idx   = 6 * r[k];
516     tp[0] = b[idx];
517     tp[1] = b[idx + 1];
518     tp[2] = b[idx + 2];
519     tp[3] = b[idx + 3];
520     tp[4] = b[idx + 4];
521     tp[5] = b[idx + 5];
522     tp += 6;
523   }
524 
525   for (k = 0; k < mbs; k++) {
526     v  = aa + 36 * ai[k];
527     vj = aj + ai[k];
528     tp = t + k * 6;
529     x0 = tp[0];
530     x1 = tp[1];
531     x2 = tp[2];
532     x3 = tp[3];
533     x4 = tp[4];
534     x5 = tp[5];
535     nz = ai[k + 1] - ai[k];
536     tp = t + (*vj) * 6;
537     while (nz--) {
538       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
539       tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
540       tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
541       tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
542       tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
543       tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
544       vj++;
545       tp = t + (*vj) * 6;
546       v += 36;
547     }
548 
549     /* xk = inv(Dk)*(Dk*xk) */
550     d     = aa + k * 36; /* ptr to inv(Dk) */
551     tp    = t + k * 6;
552     tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
553     tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
554     tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
555     tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
556     tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
557     tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
558   }
559 
560   /* solve U*x = y by back substitution */
561   for (k = mbs - 1; k >= 0; k--) {
562     v  = aa + 36 * ai[k];
563     vj = aj + ai[k];
564     tp = t + k * 6;
565     x0 = tp[0];
566     x1 = tp[1];
567     x2 = tp[2];
568     x3 = tp[3];
569     x4 = tp[4];
570     x5 = tp[5]; /* xk */
571     nz = ai[k + 1] - ai[k];
572 
573     tp = t + (*vj) * 6;
574     while (nz--) {
575       /* xk += U(k,:)*x(:) */
576       x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5];
577       x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5];
578       x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5];
579       x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5];
580       x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5];
581       x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5];
582       vj++;
583       tp = t + (*vj) * 6;
584       v += 36;
585     }
586     tp         = t + k * 6;
587     tp[0]      = x0;
588     tp[1]      = x1;
589     tp[2]      = x2;
590     tp[3]      = x3;
591     tp[4]      = x4;
592     tp[5]      = x5;
593     idx        = 6 * r[k];
594     x[idx]     = x0;
595     x[idx + 1] = x1;
596     x[idx + 2] = x2;
597     x[idx + 3] = x3;
598     x[idx + 4] = x4;
599     x[idx + 5] = x5;
600   }
601 
602   PetscCall(ISRestoreIndices(isrow, &r));
603   PetscCall(VecRestoreArrayRead(bb, &b));
604   PetscCall(VecRestoreArray(xx, &x));
605   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
606   PetscFunctionReturn(PETSC_SUCCESS);
607 }
608 
609 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
610 {
611   const MatScalar *v, *d;
612   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
613   PetscInt         nz, k;
614   const PetscInt  *vj;
615 
616   PetscFunctionBegin;
617   for (k = 0; k < mbs; k++) {
618     v  = aa + 36 * ai[k];
619     xp = x + k * 6;
620     x0 = xp[0];
621     x1 = xp[1];
622     x2 = xp[2];
623     x3 = xp[3];
624     x4 = xp[4];
625     x5 = xp[5]; /* Dk*xk = k-th block of x */
626     nz = ai[k + 1] - ai[k];
627     vj = aj + ai[k];
628     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
629     PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
630     while (nz--) {
631       xp = x + (*vj) * 6;
632       /* x(:) += U(k,:)^T*(Dk*xk) */
633       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
634       xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
635       xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
636       xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
637       xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
638       xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
639       vj++;
640       v += 36;
641     }
642     /* xk = inv(Dk)*(Dk*xk) */
643     d     = aa + k * 36; /* ptr to inv(Dk) */
644     xp    = x + k * 6;
645     xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
646     xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
647     xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
648     xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
649     xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
650     xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
651   }
652   PetscFunctionReturn(PETSC_SUCCESS);
653 }
654 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
655 {
656   const MatScalar *v;
657   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
658   PetscInt         nz, k;
659   const PetscInt  *vj;
660 
661   PetscFunctionBegin;
662   for (k = mbs - 1; k >= 0; k--) {
663     v  = aa + 36 * ai[k];
664     xp = x + k * 6;
665     x0 = xp[0];
666     x1 = xp[1];
667     x2 = xp[2];
668     x3 = xp[3];
669     x4 = xp[4];
670     x5 = xp[5]; /* xk */
671     nz = ai[k + 1] - ai[k];
672     vj = aj + ai[k];
673     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
674     PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
675     while (nz--) {
676       xp = x + (*vj) * 6;
677       /* xk += U(k,:)*x(:) */
678       x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5];
679       x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5];
680       x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5];
681       x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5];
682       x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5];
683       x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5];
684       vj++;
685       v += 36;
686     }
687     xp    = x + k * 6;
688     xp[0] = x0;
689     xp[1] = x1;
690     xp[2] = x2;
691     xp[3] = x3;
692     xp[4] = x4;
693     xp[5] = x5;
694   }
695   PetscFunctionReturn(PETSC_SUCCESS);
696 }
697 
698 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
699 {
700   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
701   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
702   const MatScalar   *aa = a->a;
703   PetscScalar       *x;
704   const PetscScalar *b;
705 
706   PetscFunctionBegin;
707   PetscCall(VecGetArrayRead(bb, &b));
708   PetscCall(VecGetArray(xx, &x));
709 
710   /* solve U^T * D * y = b by forward substitution */
711   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
712   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
713 
714   /* solve U*x = y by back substitution */
715   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
716 
717   PetscCall(VecRestoreArrayRead(bb, &b));
718   PetscCall(VecRestoreArray(xx, &x));
719   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
720   PetscFunctionReturn(PETSC_SUCCESS);
721 }
722 
723 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
724 {
725   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
726   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
727   const MatScalar   *aa = a->a;
728   PetscScalar       *x;
729   const PetscScalar *b;
730 
731   PetscFunctionBegin;
732   PetscCall(VecGetArrayRead(bb, &b));
733   PetscCall(VecGetArray(xx, &x));
734   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
735   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
736   PetscCall(VecRestoreArrayRead(bb, &b));
737   PetscCall(VecRestoreArray(xx, &x));
738   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
739   PetscFunctionReturn(PETSC_SUCCESS);
740 }
741 
742 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
743 {
744   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
745   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
746   const MatScalar   *aa = a->a;
747   PetscScalar       *x;
748   const PetscScalar *b;
749 
750   PetscFunctionBegin;
751   PetscCall(VecGetArrayRead(bb, &b));
752   PetscCall(VecGetArray(xx, &x));
753   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
754   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
755   PetscCall(VecRestoreArrayRead(bb, &b));
756   PetscCall(VecRestoreArray(xx, &x));
757   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
758   PetscFunctionReturn(PETSC_SUCCESS);
759 }
760 
761 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
762 {
763   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
764   IS                 isrow = a->row;
765   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
766   const PetscInt    *r, *vj;
767   PetscInt           nz, k, idx;
768   const MatScalar   *aa = a->a, *v, *diag;
769   PetscScalar       *x, x0, x1, x2, x3, x4, *t, *tp;
770   const PetscScalar *b;
771 
772   PetscFunctionBegin;
773   PetscCall(VecGetArrayRead(bb, &b));
774   PetscCall(VecGetArray(xx, &x));
775   t = a->solve_work;
776   PetscCall(ISGetIndices(isrow, &r));
777 
778   /* solve U^T * D * y = b by forward substitution */
779   tp = t;
780   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
781     idx   = 5 * r[k];
782     tp[0] = b[idx];
783     tp[1] = b[idx + 1];
784     tp[2] = b[idx + 2];
785     tp[3] = b[idx + 3];
786     tp[4] = b[idx + 4];
787     tp += 5;
788   }
789 
790   for (k = 0; k < mbs; k++) {
791     v  = aa + 25 * ai[k];
792     vj = aj + ai[k];
793     tp = t + k * 5;
794     x0 = tp[0];
795     x1 = tp[1];
796     x2 = tp[2];
797     x3 = tp[3];
798     x4 = tp[4];
799     nz = ai[k + 1] - ai[k];
800 
801     tp = t + (*vj) * 5;
802     while (nz--) {
803       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
804       tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
805       tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
806       tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
807       tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
808       vj++;
809       tp = t + (*vj) * 5;
810       v += 25;
811     }
812 
813     /* xk = inv(Dk)*(Dk*xk) */
814     diag  = aa + k * 25; /* ptr to inv(Dk) */
815     tp    = t + k * 5;
816     tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
817     tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
818     tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
819     tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
820     tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
821   }
822 
823   /* solve U*x = y by back substitution */
824   for (k = mbs - 1; k >= 0; k--) {
825     v  = aa + 25 * ai[k];
826     vj = aj + ai[k];
827     tp = t + k * 5;
828     x0 = tp[0];
829     x1 = tp[1];
830     x2 = tp[2];
831     x3 = tp[3];
832     x4 = tp[4]; /* xk */
833     nz = ai[k + 1] - ai[k];
834 
835     tp = t + (*vj) * 5;
836     while (nz--) {
837       /* xk += U(k,:)*x(:) */
838       x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4];
839       x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4];
840       x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4];
841       x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4];
842       x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4];
843       vj++;
844       tp = t + (*vj) * 5;
845       v += 25;
846     }
847     tp         = t + k * 5;
848     tp[0]      = x0;
849     tp[1]      = x1;
850     tp[2]      = x2;
851     tp[3]      = x3;
852     tp[4]      = x4;
853     idx        = 5 * r[k];
854     x[idx]     = x0;
855     x[idx + 1] = x1;
856     x[idx + 2] = x2;
857     x[idx + 3] = x3;
858     x[idx + 4] = x4;
859   }
860 
861   PetscCall(ISRestoreIndices(isrow, &r));
862   PetscCall(VecRestoreArrayRead(bb, &b));
863   PetscCall(VecRestoreArray(xx, &x));
864   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
865   PetscFunctionReturn(PETSC_SUCCESS);
866 }
867 
868 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
869 {
870   const MatScalar *v, *diag;
871   PetscScalar     *xp, x0, x1, x2, x3, x4;
872   PetscInt         nz, k;
873   const PetscInt  *vj;
874 
875   PetscFunctionBegin;
876   for (k = 0; k < mbs; k++) {
877     v  = aa + 25 * ai[k];
878     xp = x + k * 5;
879     x0 = xp[0];
880     x1 = xp[1];
881     x2 = xp[2];
882     x3 = xp[3];
883     x4 = xp[4]; /* Dk*xk = k-th block of x */
884     nz = ai[k + 1] - ai[k];
885     vj = aj + ai[k];
886     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
887     PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
888     while (nz--) {
889       xp = x + (*vj) * 5;
890       /* x(:) += U(k,:)^T*(Dk*xk) */
891       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
892       xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
893       xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
894       xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
895       xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
896       vj++;
897       v += 25;
898     }
899     /* xk = inv(Dk)*(Dk*xk) */
900     diag  = aa + k * 25; /* ptr to inv(Dk) */
901     xp    = x + k * 5;
902     xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
903     xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
904     xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
905     xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
906     xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
907   }
908   PetscFunctionReturn(PETSC_SUCCESS);
909 }
910 
911 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
912 {
913   const MatScalar *v;
914   PetscScalar     *xp, x0, x1, x2, x3, x4;
915   PetscInt         nz, k;
916   const PetscInt  *vj;
917 
918   PetscFunctionBegin;
919   for (k = mbs - 1; k >= 0; k--) {
920     v  = aa + 25 * ai[k];
921     xp = x + k * 5;
922     x0 = xp[0];
923     x1 = xp[1];
924     x2 = xp[2];
925     x3 = xp[3];
926     x4 = xp[4]; /* xk */
927     nz = ai[k + 1] - ai[k];
928     vj = aj + ai[k];
929     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
930     PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
931     while (nz--) {
932       xp = x + (*vj) * 5;
933       /* xk += U(k,:)*x(:) */
934       x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4];
935       x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4];
936       x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4];
937       x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4];
938       x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4];
939       vj++;
940       v += 25;
941     }
942     xp    = x + k * 5;
943     xp[0] = x0;
944     xp[1] = x1;
945     xp[2] = x2;
946     xp[3] = x3;
947     xp[4] = x4;
948   }
949   PetscFunctionReturn(PETSC_SUCCESS);
950 }
951 
952 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
953 {
954   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
955   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
956   const MatScalar   *aa = a->a;
957   PetscScalar       *x;
958   const PetscScalar *b;
959 
960   PetscFunctionBegin;
961   PetscCall(VecGetArrayRead(bb, &b));
962   PetscCall(VecGetArray(xx, &x));
963 
964   /* solve U^T * D * y = b by forward substitution */
965   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
966   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
967 
968   /* solve U*x = y by back substitution */
969   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
970 
971   PetscCall(VecRestoreArrayRead(bb, &b));
972   PetscCall(VecRestoreArray(xx, &x));
973   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
974   PetscFunctionReturn(PETSC_SUCCESS);
975 }
976 
977 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
978 {
979   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
980   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
981   const MatScalar   *aa = a->a;
982   PetscScalar       *x;
983   const PetscScalar *b;
984 
985   PetscFunctionBegin;
986   PetscCall(VecGetArrayRead(bb, &b));
987   PetscCall(VecGetArray(xx, &x));
988   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
989   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
990   PetscCall(VecRestoreArrayRead(bb, &b));
991   PetscCall(VecRestoreArray(xx, &x));
992   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
993   PetscFunctionReturn(PETSC_SUCCESS);
994 }
995 
996 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
997 {
998   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
999   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1000   const MatScalar   *aa = a->a;
1001   PetscScalar       *x;
1002   const PetscScalar *b;
1003 
1004   PetscFunctionBegin;
1005   PetscCall(VecGetArrayRead(bb, &b));
1006   PetscCall(VecGetArray(xx, &x));
1007   PetscCall(PetscArraycpy(x, b, 5 * mbs));
1008   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
1009   PetscCall(VecRestoreArrayRead(bb, &b));
1010   PetscCall(VecRestoreArray(xx, &x));
1011   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1012   PetscFunctionReturn(PETSC_SUCCESS);
1013 }
1014 
1015 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
1016 {
1017   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1018   IS                 isrow = a->row;
1019   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1020   const PetscInt    *r, *vj;
1021   PetscInt           nz, k, idx;
1022   const MatScalar   *aa = a->a, *v, *diag;
1023   PetscScalar       *x, x0, x1, x2, x3, *t, *tp;
1024   const PetscScalar *b;
1025 
1026   PetscFunctionBegin;
1027   PetscCall(VecGetArrayRead(bb, &b));
1028   PetscCall(VecGetArray(xx, &x));
1029   t = a->solve_work;
1030   PetscCall(ISGetIndices(isrow, &r));
1031 
1032   /* solve U^T * D * y = b by forward substitution */
1033   tp = t;
1034   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1035     idx   = 4 * r[k];
1036     tp[0] = b[idx];
1037     tp[1] = b[idx + 1];
1038     tp[2] = b[idx + 2];
1039     tp[3] = b[idx + 3];
1040     tp += 4;
1041   }
1042 
1043   for (k = 0; k < mbs; k++) {
1044     v  = aa + 16 * ai[k];
1045     vj = aj + ai[k];
1046     tp = t + k * 4;
1047     x0 = tp[0];
1048     x1 = tp[1];
1049     x2 = tp[2];
1050     x3 = tp[3];
1051     nz = ai[k + 1] - ai[k];
1052 
1053     tp = t + (*vj) * 4;
1054     while (nz--) {
1055       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1056       tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1057       tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1058       tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1059       vj++;
1060       tp = t + (*vj) * 4;
1061       v += 16;
1062     }
1063 
1064     /* xk = inv(Dk)*(Dk*xk) */
1065     diag  = aa + k * 16; /* ptr to inv(Dk) */
1066     tp    = t + k * 4;
1067     tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1068     tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1069     tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1070     tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1071   }
1072 
1073   /* solve U*x = y by back substitution */
1074   for (k = mbs - 1; k >= 0; k--) {
1075     v  = aa + 16 * ai[k];
1076     vj = aj + ai[k];
1077     tp = t + k * 4;
1078     x0 = tp[0];
1079     x1 = tp[1];
1080     x2 = tp[2];
1081     x3 = tp[3]; /* xk */
1082     nz = ai[k + 1] - ai[k];
1083 
1084     tp = t + (*vj) * 4;
1085     while (nz--) {
1086       /* xk += U(k,:)*x(:) */
1087       x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3];
1088       x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3];
1089       x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3];
1090       x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3];
1091       vj++;
1092       tp = t + (*vj) * 4;
1093       v += 16;
1094     }
1095     tp         = t + k * 4;
1096     tp[0]      = x0;
1097     tp[1]      = x1;
1098     tp[2]      = x2;
1099     tp[3]      = x3;
1100     idx        = 4 * r[k];
1101     x[idx]     = x0;
1102     x[idx + 1] = x1;
1103     x[idx + 2] = x2;
1104     x[idx + 3] = x3;
1105   }
1106 
1107   PetscCall(ISRestoreIndices(isrow, &r));
1108   PetscCall(VecRestoreArrayRead(bb, &b));
1109   PetscCall(VecRestoreArray(xx, &x));
1110   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1111   PetscFunctionReturn(PETSC_SUCCESS);
1112 }
1113 
1114 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1115 {
1116   const MatScalar *v, *diag;
1117   PetscScalar     *xp, x0, x1, x2, x3;
1118   PetscInt         nz, k;
1119   const PetscInt  *vj;
1120 
1121   PetscFunctionBegin;
1122   for (k = 0; k < mbs; k++) {
1123     v  = aa + 16 * ai[k];
1124     xp = x + k * 4;
1125     x0 = xp[0];
1126     x1 = xp[1];
1127     x2 = xp[2];
1128     x3 = xp[3]; /* Dk*xk = k-th block of x */
1129     nz = ai[k + 1] - ai[k];
1130     vj = aj + ai[k];
1131     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1132     PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1133     while (nz--) {
1134       xp = x + (*vj) * 4;
1135       /* x(:) += U(k,:)^T*(Dk*xk) */
1136       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1137       xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1138       xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1139       xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1140       vj++;
1141       v += 16;
1142     }
1143     /* xk = inv(Dk)*(Dk*xk) */
1144     diag  = aa + k * 16; /* ptr to inv(Dk) */
1145     xp    = x + k * 4;
1146     xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1147     xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1148     xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1149     xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1150   }
1151   PetscFunctionReturn(PETSC_SUCCESS);
1152 }
1153 
1154 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1155 {
1156   const MatScalar *v;
1157   PetscScalar     *xp, x0, x1, x2, x3;
1158   PetscInt         nz, k;
1159   const PetscInt  *vj;
1160 
1161   PetscFunctionBegin;
1162   for (k = mbs - 1; k >= 0; k--) {
1163     v  = aa + 16 * ai[k];
1164     xp = x + k * 4;
1165     x0 = xp[0];
1166     x1 = xp[1];
1167     x2 = xp[2];
1168     x3 = xp[3]; /* xk */
1169     nz = ai[k + 1] - ai[k];
1170     vj = aj + ai[k];
1171     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1172     PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1173     while (nz--) {
1174       xp = x + (*vj) * 4;
1175       /* xk += U(k,:)*x(:) */
1176       x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3];
1177       x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3];
1178       x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3];
1179       x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3];
1180       vj++;
1181       v += 16;
1182     }
1183     xp    = x + k * 4;
1184     xp[0] = x0;
1185     xp[1] = x1;
1186     xp[2] = x2;
1187     xp[3] = x3;
1188   }
1189   PetscFunctionReturn(PETSC_SUCCESS);
1190 }
1191 
1192 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1193 {
1194   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1195   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1196   const MatScalar   *aa = a->a;
1197   PetscScalar       *x;
1198   const PetscScalar *b;
1199 
1200   PetscFunctionBegin;
1201   PetscCall(VecGetArrayRead(bb, &b));
1202   PetscCall(VecGetArray(xx, &x));
1203 
1204   /* solve U^T * D * y = b by forward substitution */
1205   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1206   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1207 
1208   /* solve U*x = y by back substitution */
1209   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1210   PetscCall(VecRestoreArrayRead(bb, &b));
1211   PetscCall(VecRestoreArray(xx, &x));
1212   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1213   PetscFunctionReturn(PETSC_SUCCESS);
1214 }
1215 
1216 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1217 {
1218   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1219   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1220   const MatScalar   *aa = a->a;
1221   PetscScalar       *x;
1222   const PetscScalar *b;
1223 
1224   PetscFunctionBegin;
1225   PetscCall(VecGetArrayRead(bb, &b));
1226   PetscCall(VecGetArray(xx, &x));
1227   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1228   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1229   PetscCall(VecRestoreArrayRead(bb, &b));
1230   PetscCall(VecRestoreArray(xx, &x));
1231   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1232   PetscFunctionReturn(PETSC_SUCCESS);
1233 }
1234 
1235 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1236 {
1237   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1238   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1239   const MatScalar   *aa = a->a;
1240   PetscScalar       *x;
1241   const PetscScalar *b;
1242 
1243   PetscFunctionBegin;
1244   PetscCall(VecGetArrayRead(bb, &b));
1245   PetscCall(VecGetArray(xx, &x));
1246   PetscCall(PetscArraycpy(x, b, 4 * mbs));
1247   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1248   PetscCall(VecRestoreArrayRead(bb, &b));
1249   PetscCall(VecRestoreArray(xx, &x));
1250   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1251   PetscFunctionReturn(PETSC_SUCCESS);
1252 }
1253 
1254 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1255 {
1256   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1257   IS                 isrow = a->row;
1258   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1259   const PetscInt    *r;
1260   PetscInt           nz, k, idx;
1261   const PetscInt    *vj;
1262   const MatScalar   *aa = a->a, *v, *diag;
1263   PetscScalar       *x, x0, x1, x2, *t, *tp;
1264   const PetscScalar *b;
1265 
1266   PetscFunctionBegin;
1267   PetscCall(VecGetArrayRead(bb, &b));
1268   PetscCall(VecGetArray(xx, &x));
1269   t = a->solve_work;
1270   PetscCall(ISGetIndices(isrow, &r));
1271 
1272   /* solve U^T * D * y = b by forward substitution */
1273   tp = t;
1274   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1275     idx   = 3 * r[k];
1276     tp[0] = b[idx];
1277     tp[1] = b[idx + 1];
1278     tp[2] = b[idx + 2];
1279     tp += 3;
1280   }
1281 
1282   for (k = 0; k < mbs; k++) {
1283     v  = aa + 9 * ai[k];
1284     vj = aj + ai[k];
1285     tp = t + k * 3;
1286     x0 = tp[0];
1287     x1 = tp[1];
1288     x2 = tp[2];
1289     nz = ai[k + 1] - ai[k];
1290 
1291     tp = t + (*vj) * 3;
1292     while (nz--) {
1293       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1294       tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1295       tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1296       vj++;
1297       tp = t + (*vj) * 3;
1298       v += 9;
1299     }
1300 
1301     /* xk = inv(Dk)*(Dk*xk) */
1302     diag  = aa + k * 9; /* ptr to inv(Dk) */
1303     tp    = t + k * 3;
1304     tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1305     tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1306     tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1307   }
1308 
1309   /* solve U*x = y by back substitution */
1310   for (k = mbs - 1; k >= 0; k--) {
1311     v  = aa + 9 * ai[k];
1312     vj = aj + ai[k];
1313     tp = t + k * 3;
1314     x0 = tp[0];
1315     x1 = tp[1];
1316     x2 = tp[2]; /* xk */
1317     nz = ai[k + 1] - ai[k];
1318 
1319     tp = t + (*vj) * 3;
1320     while (nz--) {
1321       /* xk += U(k,:)*x(:) */
1322       x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2];
1323       x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2];
1324       x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2];
1325       vj++;
1326       tp = t + (*vj) * 3;
1327       v += 9;
1328     }
1329     tp         = t + k * 3;
1330     tp[0]      = x0;
1331     tp[1]      = x1;
1332     tp[2]      = x2;
1333     idx        = 3 * r[k];
1334     x[idx]     = x0;
1335     x[idx + 1] = x1;
1336     x[idx + 2] = x2;
1337   }
1338 
1339   PetscCall(ISRestoreIndices(isrow, &r));
1340   PetscCall(VecRestoreArrayRead(bb, &b));
1341   PetscCall(VecRestoreArray(xx, &x));
1342   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1343   PetscFunctionReturn(PETSC_SUCCESS);
1344 }
1345 
1346 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1347 {
1348   const MatScalar *v, *diag;
1349   PetscScalar     *xp, x0, x1, x2;
1350   PetscInt         nz, k;
1351   const PetscInt  *vj;
1352 
1353   PetscFunctionBegin;
1354   for (k = 0; k < mbs; k++) {
1355     v  = aa + 9 * ai[k];
1356     xp = x + k * 3;
1357     x0 = xp[0];
1358     x1 = xp[1];
1359     x2 = xp[2]; /* Dk*xk = k-th block of x */
1360     nz = ai[k + 1] - ai[k];
1361     vj = aj + ai[k];
1362     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1363     PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1364     while (nz--) {
1365       xp = x + (*vj) * 3;
1366       /* x(:) += U(k,:)^T*(Dk*xk) */
1367       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1368       xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1369       xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1370       vj++;
1371       v += 9;
1372     }
1373     /* xk = inv(Dk)*(Dk*xk) */
1374     diag  = aa + k * 9; /* ptr to inv(Dk) */
1375     xp    = x + k * 3;
1376     xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1377     xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1378     xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1379   }
1380   PetscFunctionReturn(PETSC_SUCCESS);
1381 }
1382 
1383 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1384 {
1385   const MatScalar *v;
1386   PetscScalar     *xp, x0, x1, x2;
1387   PetscInt         nz, k;
1388   const PetscInt  *vj;
1389 
1390   PetscFunctionBegin;
1391   for (k = mbs - 1; k >= 0; k--) {
1392     v  = aa + 9 * ai[k];
1393     xp = x + k * 3;
1394     x0 = xp[0];
1395     x1 = xp[1];
1396     x2 = xp[2]; /* xk */
1397     nz = ai[k + 1] - ai[k];
1398     vj = aj + ai[k];
1399     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1400     PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1401     while (nz--) {
1402       xp = x + (*vj) * 3;
1403       /* xk += U(k,:)*x(:) */
1404       x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2];
1405       x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2];
1406       x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2];
1407       vj++;
1408       v += 9;
1409     }
1410     xp    = x + k * 3;
1411     xp[0] = x0;
1412     xp[1] = x1;
1413     xp[2] = x2;
1414   }
1415   PetscFunctionReturn(PETSC_SUCCESS);
1416 }
1417 
1418 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1419 {
1420   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1421   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1422   const MatScalar   *aa = a->a;
1423   PetscScalar       *x;
1424   const PetscScalar *b;
1425 
1426   PetscFunctionBegin;
1427   PetscCall(VecGetArrayRead(bb, &b));
1428   PetscCall(VecGetArray(xx, &x));
1429 
1430   /* solve U^T * D * y = b by forward substitution */
1431   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1432   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1433 
1434   /* solve U*x = y by back substitution */
1435   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1436 
1437   PetscCall(VecRestoreArrayRead(bb, &b));
1438   PetscCall(VecRestoreArray(xx, &x));
1439   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1440   PetscFunctionReturn(PETSC_SUCCESS);
1441 }
1442 
1443 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1444 {
1445   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1446   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1447   const MatScalar   *aa = a->a;
1448   PetscScalar       *x;
1449   const PetscScalar *b;
1450 
1451   PetscFunctionBegin;
1452   PetscCall(VecGetArrayRead(bb, &b));
1453   PetscCall(VecGetArray(xx, &x));
1454   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1455   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1456   PetscCall(VecRestoreArrayRead(bb, &b));
1457   PetscCall(VecRestoreArray(xx, &x));
1458   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1459   PetscFunctionReturn(PETSC_SUCCESS);
1460 }
1461 
1462 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1463 {
1464   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1465   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1466   const MatScalar   *aa = a->a;
1467   PetscScalar       *x;
1468   const PetscScalar *b;
1469 
1470   PetscFunctionBegin;
1471   PetscCall(VecGetArrayRead(bb, &b));
1472   PetscCall(VecGetArray(xx, &x));
1473   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1474   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1475   PetscCall(VecRestoreArrayRead(bb, &b));
1476   PetscCall(VecRestoreArray(xx, &x));
1477   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1478   PetscFunctionReturn(PETSC_SUCCESS);
1479 }
1480 
1481 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1482 {
1483   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1484   IS                 isrow = a->row;
1485   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1486   const PetscInt    *r, *vj;
1487   PetscInt           nz, k, k2, idx;
1488   const MatScalar   *aa = a->a, *v, *diag;
1489   PetscScalar       *x, x0, x1, *t;
1490   const PetscScalar *b;
1491 
1492   PetscFunctionBegin;
1493   PetscCall(VecGetArrayRead(bb, &b));
1494   PetscCall(VecGetArray(xx, &x));
1495   t = a->solve_work;
1496   PetscCall(ISGetIndices(isrow, &r));
1497 
1498   /* solve U^T * D * y = perm(b) by forward substitution */
1499   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1500     idx          = 2 * r[k];
1501     t[k * 2]     = b[idx];
1502     t[k * 2 + 1] = b[idx + 1];
1503   }
1504   for (k = 0; k < mbs; k++) {
1505     v  = aa + 4 * ai[k];
1506     vj = aj + ai[k];
1507     k2 = k * 2;
1508     x0 = t[k2];
1509     x1 = t[k2 + 1];
1510     nz = ai[k + 1] - ai[k];
1511     while (nz--) {
1512       t[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1513       t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1514       vj++;
1515       v += 4;
1516     }
1517     diag      = aa + k * 4; /* ptr to inv(Dk) */
1518     t[k2]     = diag[0] * x0 + diag[2] * x1;
1519     t[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1520   }
1521 
1522   /* solve U*x = y by back substitution */
1523   for (k = mbs - 1; k >= 0; k--) {
1524     v  = aa + 4 * ai[k];
1525     vj = aj + ai[k];
1526     k2 = k * 2;
1527     x0 = t[k2];
1528     x1 = t[k2 + 1];
1529     nz = ai[k + 1] - ai[k];
1530     while (nz--) {
1531       x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1];
1532       x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1];
1533       vj++;
1534       v += 4;
1535     }
1536     t[k2]      = x0;
1537     t[k2 + 1]  = x1;
1538     idx        = 2 * r[k];
1539     x[idx]     = x0;
1540     x[idx + 1] = x1;
1541   }
1542 
1543   PetscCall(ISRestoreIndices(isrow, &r));
1544   PetscCall(VecRestoreArrayRead(bb, &b));
1545   PetscCall(VecRestoreArray(xx, &x));
1546   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1547   PetscFunctionReturn(PETSC_SUCCESS);
1548 }
1549 
1550 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1551 {
1552   const MatScalar *v, *diag;
1553   PetscScalar      x0, x1;
1554   PetscInt         nz, k, k2;
1555   const PetscInt  *vj;
1556 
1557   PetscFunctionBegin;
1558   for (k = 0; k < mbs; k++) {
1559     v  = aa + 4 * ai[k];
1560     vj = aj + ai[k];
1561     k2 = k * 2;
1562     x0 = x[k2];
1563     x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */
1564     nz = ai[k + 1] - ai[k];
1565     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1566     PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1567     while (nz--) {
1568       /* x(:) += U(k,:)^T*(Dk*xk) */
1569       x[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1570       x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1571       vj++;
1572       v += 4;
1573     }
1574     /* xk = inv(Dk)*(Dk*xk) */
1575     diag      = aa + k * 4; /* ptr to inv(Dk) */
1576     x[k2]     = diag[0] * x0 + diag[2] * x1;
1577     x[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1578   }
1579   PetscFunctionReturn(PETSC_SUCCESS);
1580 }
1581 
1582 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1583 {
1584   const MatScalar *v;
1585   PetscScalar      x0, x1;
1586   PetscInt         nz, k, k2;
1587   const PetscInt  *vj;
1588 
1589   PetscFunctionBegin;
1590   for (k = mbs - 1; k >= 0; k--) {
1591     v  = aa + 4 * ai[k];
1592     vj = aj + ai[k];
1593     k2 = k * 2;
1594     x0 = x[k2];
1595     x1 = x[k2 + 1]; /* xk */
1596     nz = ai[k + 1] - ai[k];
1597     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1598     PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1599     while (nz--) {
1600       /* xk += U(k,:)*x(:) */
1601       x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1];
1602       x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1];
1603       vj++;
1604       v += 4;
1605     }
1606     x[k2]     = x0;
1607     x[k2 + 1] = x1;
1608   }
1609   PetscFunctionReturn(PETSC_SUCCESS);
1610 }
1611 
1612 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1613 {
1614   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1615   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1616   const MatScalar   *aa = a->a;
1617   PetscScalar       *x;
1618   const PetscScalar *b;
1619 
1620   PetscFunctionBegin;
1621   PetscCall(VecGetArrayRead(bb, &b));
1622   PetscCall(VecGetArray(xx, &x));
1623 
1624   /* solve U^T * D * y = b by forward substitution */
1625   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1626   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1627 
1628   /* solve U*x = y by back substitution */
1629   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1630 
1631   PetscCall(VecRestoreArrayRead(bb, &b));
1632   PetscCall(VecRestoreArray(xx, &x));
1633   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1634   PetscFunctionReturn(PETSC_SUCCESS);
1635 }
1636 
1637 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1638 {
1639   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1640   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1641   const MatScalar   *aa = a->a;
1642   PetscScalar       *x;
1643   const PetscScalar *b;
1644 
1645   PetscFunctionBegin;
1646   PetscCall(VecGetArrayRead(bb, &b));
1647   PetscCall(VecGetArray(xx, &x));
1648   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1649   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1650   PetscCall(VecRestoreArrayRead(bb, &b));
1651   PetscCall(VecRestoreArray(xx, &x));
1652   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1653   PetscFunctionReturn(PETSC_SUCCESS);
1654 }
1655 
1656 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1657 {
1658   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1659   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1660   const MatScalar   *aa = a->a;
1661   PetscScalar       *x;
1662   const PetscScalar *b;
1663 
1664   PetscFunctionBegin;
1665   PetscCall(VecGetArrayRead(bb, &b));
1666   PetscCall(VecGetArray(xx, &x));
1667   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1668   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1669   PetscCall(VecRestoreArrayRead(bb, &b));
1670   PetscCall(VecRestoreArray(xx, &x));
1671   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1672   PetscFunctionReturn(PETSC_SUCCESS);
1673 }
1674 
1675 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1676 {
1677   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1678   IS                 isrow = a->row;
1679   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1680   const MatScalar   *aa = a->a, *v;
1681   const PetscScalar *b;
1682   PetscScalar       *x, xk, *t;
1683   PetscInt           nz, k, j;
1684 
1685   PetscFunctionBegin;
1686   PetscCall(VecGetArrayRead(bb, &b));
1687   PetscCall(VecGetArray(xx, &x));
1688   t = a->solve_work;
1689   PetscCall(ISGetIndices(isrow, &rp));
1690 
1691   /* solve U^T*D*y = perm(b) by forward substitution */
1692   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1693   for (k = 0; k < mbs; k++) {
1694     v  = aa + ai[k];
1695     vj = aj + ai[k];
1696     xk = t[k];
1697     nz = ai[k + 1] - ai[k] - 1;
1698     for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk;
1699     t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */
1700   }
1701 
1702   /* solve U*perm(x) = y by back substitution */
1703   for (k = mbs - 1; k >= 0; k--) {
1704     v  = aa + adiag[k] - 1;
1705     vj = aj + adiag[k] - 1;
1706     nz = ai[k + 1] - ai[k] - 1;
1707     for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]];
1708     x[rp[k]] = t[k];
1709   }
1710 
1711   PetscCall(ISRestoreIndices(isrow, &rp));
1712   PetscCall(VecRestoreArrayRead(bb, &b));
1713   PetscCall(VecRestoreArray(xx, &x));
1714   PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs));
1715   PetscFunctionReturn(PETSC_SUCCESS);
1716 }
1717 
1718 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1719 {
1720   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1721   IS                 isrow = a->row;
1722   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1723   const MatScalar   *aa = a->a, *v;
1724   PetscScalar       *x, xk, *t;
1725   const PetscScalar *b;
1726   PetscInt           nz, k;
1727 
1728   PetscFunctionBegin;
1729   PetscCall(VecGetArrayRead(bb, &b));
1730   PetscCall(VecGetArray(xx, &x));
1731   t = a->solve_work;
1732   PetscCall(ISGetIndices(isrow, &rp));
1733 
1734   /* solve U^T*D*y = perm(b) by forward substitution */
1735   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1736   for (k = 0; k < mbs; k++) {
1737     v  = aa + ai[k] + 1;
1738     vj = aj + ai[k] + 1;
1739     xk = t[k];
1740     nz = ai[k + 1] - ai[k] - 1;
1741     while (nz--) t[*vj++] += (*v++) * xk;
1742     t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */
1743   }
1744 
1745   /* solve U*perm(x) = y by back substitution */
1746   for (k = mbs - 1; k >= 0; k--) {
1747     v  = aa + ai[k] + 1;
1748     vj = aj + ai[k] + 1;
1749     nz = ai[k + 1] - ai[k] - 1;
1750     while (nz--) t[k] += (*v++) * t[*vj++];
1751     x[rp[k]] = t[k];
1752   }
1753 
1754   PetscCall(ISRestoreIndices(isrow, &rp));
1755   PetscCall(VecRestoreArrayRead(bb, &b));
1756   PetscCall(VecRestoreArray(xx, &x));
1757   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
1758   PetscFunctionReturn(PETSC_SUCCESS);
1759 }
1760 
1761 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1762 {
1763   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1764   IS                 isrow = a->row;
1765   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1766   const MatScalar   *aa = a->a, *v;
1767   PetscReal          diagk;
1768   PetscScalar       *x, xk;
1769   const PetscScalar *b;
1770   PetscInt           nz, k;
1771 
1772   PetscFunctionBegin;
1773   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1774   PetscCall(VecGetArrayRead(bb, &b));
1775   PetscCall(VecGetArray(xx, &x));
1776   PetscCall(ISGetIndices(isrow, &rp));
1777 
1778   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1779   for (k = 0; k < mbs; k++) {
1780     v  = aa + ai[k];
1781     vj = aj + ai[k];
1782     xk = x[k];
1783     nz = ai[k + 1] - ai[k] - 1;
1784     while (nz--) x[*vj++] += (*v++) * xk;
1785 
1786     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1787     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1788     x[k] = xk * PetscSqrtReal(diagk);
1789   }
1790   PetscCall(ISRestoreIndices(isrow, &rp));
1791   PetscCall(VecRestoreArrayRead(bb, &b));
1792   PetscCall(VecRestoreArray(xx, &x));
1793   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1794   PetscFunctionReturn(PETSC_SUCCESS);
1795 }
1796 
1797 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1798 {
1799   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1800   IS                 isrow = a->row;
1801   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1802   const MatScalar   *aa = a->a, *v;
1803   PetscReal          diagk;
1804   PetscScalar       *x, xk;
1805   const PetscScalar *b;
1806   PetscInt           nz, k;
1807 
1808   PetscFunctionBegin;
1809   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1810   PetscCall(VecGetArrayRead(bb, &b));
1811   PetscCall(VecGetArray(xx, &x));
1812   PetscCall(ISGetIndices(isrow, &rp));
1813 
1814   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1815   for (k = 0; k < mbs; k++) {
1816     v  = aa + ai[k] + 1;
1817     vj = aj + ai[k] + 1;
1818     xk = x[k];
1819     nz = ai[k + 1] - ai[k] - 1;
1820     while (nz--) x[*vj++] += (*v++) * xk;
1821 
1822     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1823     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1824     x[k] = xk * PetscSqrtReal(diagk);
1825   }
1826   PetscCall(ISRestoreIndices(isrow, &rp));
1827   PetscCall(VecRestoreArrayRead(bb, &b));
1828   PetscCall(VecRestoreArray(xx, &x));
1829   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1830   PetscFunctionReturn(PETSC_SUCCESS);
1831 }
1832 
1833 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1834 {
1835   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1836   IS                 isrow = a->row;
1837   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1838   const MatScalar   *aa = a->a, *v;
1839   PetscReal          diagk;
1840   PetscScalar       *x, *t;
1841   const PetscScalar *b;
1842   PetscInt           nz, k;
1843 
1844   PetscFunctionBegin;
1845   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1846   PetscCall(VecGetArrayRead(bb, &b));
1847   PetscCall(VecGetArray(xx, &x));
1848   t = a->solve_work;
1849   PetscCall(ISGetIndices(isrow, &rp));
1850 
1851   for (k = mbs - 1; k >= 0; k--) {
1852     v     = aa + ai[k];
1853     vj    = aj + ai[k];
1854     diagk = PetscRealPart(aa[adiag[k]]);
1855     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1856     t[k] = b[k] * PetscSqrtReal(diagk);
1857     nz   = ai[k + 1] - ai[k] - 1;
1858     while (nz--) t[k] += (*v++) * t[*vj++];
1859     x[rp[k]] = t[k];
1860   }
1861   PetscCall(ISRestoreIndices(isrow, &rp));
1862   PetscCall(VecRestoreArrayRead(bb, &b));
1863   PetscCall(VecRestoreArray(xx, &x));
1864   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1865   PetscFunctionReturn(PETSC_SUCCESS);
1866 }
1867 
1868 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1869 {
1870   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1871   IS                 isrow = a->row;
1872   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1873   const MatScalar   *aa = a->a, *v;
1874   PetscReal          diagk;
1875   PetscScalar       *x, *t;
1876   const PetscScalar *b;
1877   PetscInt           nz, k;
1878 
1879   PetscFunctionBegin;
1880   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1881   PetscCall(VecGetArrayRead(bb, &b));
1882   PetscCall(VecGetArray(xx, &x));
1883   t = a->solve_work;
1884   PetscCall(ISGetIndices(isrow, &rp));
1885 
1886   for (k = mbs - 1; k >= 0; k--) {
1887     v     = aa + ai[k] + 1;
1888     vj    = aj + ai[k] + 1;
1889     diagk = PetscRealPart(aa[ai[k]]);
1890     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1891     t[k] = b[k] * PetscSqrtReal(diagk);
1892     nz   = ai[k + 1] - ai[k] - 1;
1893     while (nz--) t[k] += (*v++) * t[*vj++];
1894     x[rp[k]] = t[k];
1895   }
1896   PetscCall(ISRestoreIndices(isrow, &rp));
1897   PetscCall(VecRestoreArrayRead(bb, &b));
1898   PetscCall(VecRestoreArray(xx, &x));
1899   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1900   PetscFunctionReturn(PETSC_SUCCESS);
1901 }
1902 
1903 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx)
1904 {
1905   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1906 
1907   PetscFunctionBegin;
1908   if (A->rmap->bs == 1) {
1909     PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v));
1910   } else {
1911     IS                 isrow = a->row;
1912     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1913     const MatScalar   *aa = a->a, *v;
1914     PetscScalar       *x, *t;
1915     const PetscScalar *b;
1916     PetscInt           nz, k, n, i, j;
1917 
1918     if (bb->n > a->solves_work_n) {
1919       PetscCall(PetscFree(a->solves_work));
1920       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1921       a->solves_work_n = bb->n;
1922     }
1923     n = bb->n;
1924     PetscCall(VecGetArrayRead(bb->v, &b));
1925     PetscCall(VecGetArray(xx->v, &x));
1926     t = a->solves_work;
1927 
1928     PetscCall(ISGetIndices(isrow, &rp));
1929 
1930     /* solve U^T*D*y = perm(b) by forward substitution */
1931     for (k = 0; k < mbs; k++) {
1932       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1933     }
1934     for (k = 0; k < mbs; k++) {
1935       v  = aa + ai[k];
1936       vj = aj + ai[k];
1937       nz = ai[k + 1] - ai[k] - 1;
1938       for (j = 0; j < nz; j++) {
1939         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1940         v++;
1941         vj++;
1942       }
1943       for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1944     }
1945 
1946     /* solve U*perm(x) = y by back substitution */
1947     for (k = mbs - 1; k >= 0; k--) {
1948       v  = aa + ai[k] - 1;
1949       vj = aj + ai[k] - 1;
1950       nz = ai[k + 1] - ai[k] - 1;
1951       for (j = 0; j < nz; j++) {
1952         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1953         v++;
1954         vj++;
1955       }
1956       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1957     }
1958 
1959     PetscCall(ISRestoreIndices(isrow, &rp));
1960     PetscCall(VecRestoreArrayRead(bb->v, &b));
1961     PetscCall(VecRestoreArray(xx->v, &x));
1962     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
1963   }
1964   PetscFunctionReturn(PETSC_SUCCESS);
1965 }
1966 
1967 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx)
1968 {
1969   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1970 
1971   PetscFunctionBegin;
1972   if (A->rmap->bs == 1) {
1973     PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v));
1974   } else {
1975     IS                 isrow = a->row;
1976     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1977     const MatScalar   *aa = a->a, *v;
1978     PetscScalar       *x, *t;
1979     const PetscScalar *b;
1980     PetscInt           nz, k, n, i;
1981 
1982     if (bb->n > a->solves_work_n) {
1983       PetscCall(PetscFree(a->solves_work));
1984       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1985       a->solves_work_n = bb->n;
1986     }
1987     n = bb->n;
1988     PetscCall(VecGetArrayRead(bb->v, &b));
1989     PetscCall(VecGetArray(xx->v, &x));
1990     t = a->solves_work;
1991 
1992     PetscCall(ISGetIndices(isrow, &rp));
1993 
1994     /* solve U^T*D*y = perm(b) by forward substitution */
1995     for (k = 0; k < mbs; k++) {
1996       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1997     }
1998     for (k = 0; k < mbs; k++) {
1999       v  = aa + ai[k];
2000       vj = aj + ai[k];
2001       nz = ai[k + 1] - ai[k];
2002       while (nz--) {
2003         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
2004         v++;
2005         vj++;
2006       }
2007       for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */
2008     }
2009 
2010     /* solve U*perm(x) = y by back substitution */
2011     for (k = mbs - 1; k >= 0; k--) {
2012       v  = aa + ai[k];
2013       vj = aj + ai[k];
2014       nz = ai[k + 1] - ai[k];
2015       while (nz--) {
2016         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
2017         v++;
2018         vj++;
2019       }
2020       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
2021     }
2022 
2023     PetscCall(ISRestoreIndices(isrow, &rp));
2024     PetscCall(VecRestoreArrayRead(bb->v, &b));
2025     PetscCall(VecRestoreArray(xx->v, &x));
2026     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
2027   }
2028   PetscFunctionReturn(PETSC_SUCCESS);
2029 }
2030 
2031 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2032 {
2033   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2034   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2035   const MatScalar   *aa = a->a, *v;
2036   const PetscScalar *b;
2037   PetscScalar       *x, xi;
2038   PetscInt           nz, i, j;
2039 
2040   PetscFunctionBegin;
2041   PetscCall(VecGetArrayRead(bb, &b));
2042   PetscCall(VecGetArray(xx, &x));
2043   /* solve U^T*D*y = b by forward substitution */
2044   PetscCall(PetscArraycpy(x, b, mbs));
2045   for (i = 0; i < mbs; i++) {
2046     v  = aa + ai[i];
2047     vj = aj + ai[i];
2048     xi = x[i];
2049     nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2050     for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2051     x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2052   }
2053   /* solve U*x = y by backward substitution */
2054   for (i = mbs - 2; i >= 0; i--) {
2055     xi = x[i];
2056     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2057     vj = aj + adiag[i] - 1;
2058     nz = ai[i + 1] - ai[i] - 1;
2059     for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2060     x[i] = xi;
2061   }
2062   PetscCall(VecRestoreArrayRead(bb, &b));
2063   PetscCall(VecRestoreArray(xx, &x));
2064   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2065   PetscFunctionReturn(PETSC_SUCCESS);
2066 }
2067 
2068 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X)
2069 {
2070   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2071   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2072   const MatScalar   *aa = a->a, *v;
2073   const PetscScalar *b;
2074   PetscScalar       *x, xi;
2075   PetscInt           nz, i, j, neq, ldb, ldx;
2076   PetscBool          isdense;
2077 
2078   PetscFunctionBegin;
2079   if (!mbs) PetscFunctionReturn(PETSC_SUCCESS);
2080   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
2081   PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
2082   if (X != B) {
2083     PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
2084     PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
2085   }
2086   PetscCall(MatDenseGetArrayRead(B, &b));
2087   PetscCall(MatDenseGetLDA(B, &ldb));
2088   PetscCall(MatDenseGetArray(X, &x));
2089   PetscCall(MatDenseGetLDA(X, &ldx));
2090   for (neq = 0; neq < B->cmap->n; neq++) {
2091     /* solve U^T*D*y = b by forward substitution */
2092     PetscCall(PetscArraycpy(x, b, mbs));
2093     for (i = 0; i < mbs; i++) {
2094       v  = aa + ai[i];
2095       vj = aj + ai[i];
2096       xi = x[i];
2097       nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2098       for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2099       x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2100     }
2101     /* solve U*x = y by backward substitution */
2102     for (i = mbs - 2; i >= 0; i--) {
2103       xi = x[i];
2104       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2105       vj = aj + adiag[i] - 1;
2106       nz = ai[i + 1] - ai[i] - 1;
2107       for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2108       x[i] = xi;
2109     }
2110     b += ldb;
2111     x += ldx;
2112   }
2113   PetscCall(MatDenseRestoreArrayRead(B, &b));
2114   PetscCall(MatDenseRestoreArray(X, &x));
2115   PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs)));
2116   PetscFunctionReturn(PETSC_SUCCESS);
2117 }
2118 
2119 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2120 {
2121   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2122   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2123   const MatScalar   *aa = a->a, *v;
2124   PetscScalar       *x, xk;
2125   const PetscScalar *b;
2126   PetscInt           nz, k;
2127 
2128   PetscFunctionBegin;
2129   PetscCall(VecGetArrayRead(bb, &b));
2130   PetscCall(VecGetArray(xx, &x));
2131 
2132   /* solve U^T*D*y = b by forward substitution */
2133   PetscCall(PetscArraycpy(x, b, mbs));
2134   for (k = 0; k < mbs; k++) {
2135     v  = aa + ai[k] + 1;
2136     vj = aj + ai[k] + 1;
2137     xk = x[k];
2138     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2139     while (nz--) x[*vj++] += (*v++) * xk;
2140     x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2141   }
2142 
2143   /* solve U*x = y by back substitution */
2144   for (k = mbs - 2; k >= 0; k--) {
2145     v  = aa + ai[k] + 1;
2146     vj = aj + ai[k] + 1;
2147     xk = x[k];
2148     nz = ai[k + 1] - ai[k] - 1;
2149     while (nz--) xk += (*v++) * x[*vj++];
2150     x[k] = xk;
2151   }
2152 
2153   PetscCall(VecRestoreArrayRead(bb, &b));
2154   PetscCall(VecRestoreArray(xx, &x));
2155   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2156   PetscFunctionReturn(PETSC_SUCCESS);
2157 }
2158 
2159 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2160 {
2161   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2162   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2163   const MatScalar   *aa = a->a, *v;
2164   PetscReal          diagk;
2165   PetscScalar       *x;
2166   const PetscScalar *b;
2167   PetscInt           nz, k;
2168 
2169   PetscFunctionBegin;
2170   /* solve U^T*D^(1/2)*x = b by forward substitution */
2171   PetscCall(VecGetArrayRead(bb, &b));
2172   PetscCall(VecGetArray(xx, &x));
2173   PetscCall(PetscArraycpy(x, b, mbs));
2174   for (k = 0; k < mbs; k++) {
2175     v  = aa + ai[k];
2176     vj = aj + ai[k];
2177     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2178     while (nz--) x[*vj++] += (*v++) * x[k];
2179     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2180     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]]));
2181     x[k] *= PetscSqrtReal(diagk);
2182   }
2183   PetscCall(VecRestoreArrayRead(bb, &b));
2184   PetscCall(VecRestoreArray(xx, &x));
2185   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2186   PetscFunctionReturn(PETSC_SUCCESS);
2187 }
2188 
2189 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2190 {
2191   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2192   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2193   const MatScalar   *aa = a->a, *v;
2194   PetscReal          diagk;
2195   PetscScalar       *x;
2196   const PetscScalar *b;
2197   PetscInt           nz, k;
2198 
2199   PetscFunctionBegin;
2200   /* solve U^T*D^(1/2)*x = b by forward substitution */
2201   PetscCall(VecGetArrayRead(bb, &b));
2202   PetscCall(VecGetArray(xx, &x));
2203   PetscCall(PetscArraycpy(x, b, mbs));
2204   for (k = 0; k < mbs; k++) {
2205     v  = aa + ai[k] + 1;
2206     vj = aj + ai[k] + 1;
2207     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2208     while (nz--) x[*vj++] += (*v++) * x[k];
2209     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2210     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]]));
2211     x[k] *= PetscSqrtReal(diagk);
2212   }
2213   PetscCall(VecRestoreArrayRead(bb, &b));
2214   PetscCall(VecRestoreArray(xx, &x));
2215   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2216   PetscFunctionReturn(PETSC_SUCCESS);
2217 }
2218 
2219 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2220 {
2221   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2222   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2223   const MatScalar   *aa = a->a, *v;
2224   PetscReal          diagk;
2225   PetscScalar       *x;
2226   const PetscScalar *b;
2227   PetscInt           nz, k;
2228 
2229   PetscFunctionBegin;
2230   /* solve D^(1/2)*U*x = b by back substitution */
2231   PetscCall(VecGetArrayRead(bb, &b));
2232   PetscCall(VecGetArray(xx, &x));
2233 
2234   for (k = mbs - 1; k >= 0; k--) {
2235     v     = aa + ai[k];
2236     vj    = aj + ai[k];
2237     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2238     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2239     x[k] = PetscSqrtReal(diagk) * b[k];
2240     nz   = ai[k + 1] - ai[k] - 1;
2241     while (nz--) x[k] += (*v++) * x[*vj++];
2242   }
2243   PetscCall(VecRestoreArrayRead(bb, &b));
2244   PetscCall(VecRestoreArray(xx, &x));
2245   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2246   PetscFunctionReturn(PETSC_SUCCESS);
2247 }
2248 
2249 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2250 {
2251   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2252   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2253   const MatScalar   *aa = a->a, *v;
2254   PetscReal          diagk;
2255   PetscScalar       *x;
2256   const PetscScalar *b;
2257   PetscInt           nz, k;
2258 
2259   PetscFunctionBegin;
2260   /* solve D^(1/2)*U*x = b by back substitution */
2261   PetscCall(VecGetArrayRead(bb, &b));
2262   PetscCall(VecGetArray(xx, &x));
2263 
2264   for (k = mbs - 1; k >= 0; k--) {
2265     v     = aa + ai[k] + 1;
2266     vj    = aj + ai[k] + 1;
2267     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2268     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2269     x[k] = PetscSqrtReal(diagk) * b[k];
2270     nz   = ai[k + 1] - ai[k] - 1;
2271     while (nz--) x[k] += (*v++) * x[*vj++];
2272   }
2273   PetscCall(VecRestoreArrayRead(bb, &b));
2274   PetscCall(VecRestoreArray(xx, &x));
2275   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2276   PetscFunctionReturn(PETSC_SUCCESS);
2277 }
2278 
2279 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2280 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2281 {
2282   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b;
2283   const PetscInt *rip, mbs    = a->mbs, *ai, *aj;
2284   PetscInt       *jutmp, bs   = A->rmap->bs, i;
2285   PetscInt        m, reallocs = 0, *levtmp;
2286   PetscInt       *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
2287   PetscInt        incrlev, *lev, shift, prow, nz;
2288   PetscReal       f = info->fill, levels = info->levels;
2289   PetscBool       perm_identity;
2290 
2291   PetscFunctionBegin;
2292   /* check whether perm is the identity mapping */
2293   PetscCall(ISIdentity(perm, &perm_identity));
2294 
2295   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2296   a->permute = PETSC_FALSE;
2297   ai         = a->i;
2298   aj         = a->j;
2299 
2300   /* initialization */
2301   PetscCall(ISGetIndices(perm, &rip));
2302   umax = (PetscInt)(f * ai[mbs] + 1);
2303   PetscCall(PetscMalloc1(umax, &lev));
2304   umax += mbs + 1;
2305   shift = mbs + 1;
2306   PetscCall(PetscMalloc1(mbs + 1, &iu));
2307   PetscCall(PetscMalloc1(umax, &ju));
2308   iu[0] = mbs + 1;
2309   juidx = mbs + 1;
2310   /* prowl: linked list for pivot row */
2311   PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp));
2312   /* q: linked list for col index */
2313 
2314   for (i = 0; i < mbs; i++) {
2315     prowl[i] = mbs;
2316     q[i]     = 0;
2317   }
2318 
2319   /* for each row k */
2320   for (k = 0; k < mbs; k++) {
2321     nzk  = 0;
2322     q[k] = mbs;
2323     /* copy current row into linked list */
2324     nz = ai[rip[k] + 1] - ai[rip[k]];
2325     j  = ai[rip[k]];
2326     while (nz--) {
2327       vj = rip[aj[j++]];
2328       if (vj > k) {
2329         qm = k;
2330         do {
2331           m  = qm;
2332           qm = q[m];
2333         } while (qm < vj);
2334         PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
2335         nzk++;
2336         q[m]       = vj;
2337         q[vj]      = qm;
2338         levtmp[vj] = 0; /* initialize lev for nonzero element */
2339       }
2340     }
2341 
2342     /* modify nonzero structure of k-th row by computing fill-in
2343        for each row prow to be merged in */
2344     prow = k;
2345     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2346 
2347     while (prow < k) {
2348       /* merge row prow into k-th row */
2349       jmin = iu[prow] + 1;
2350       jmax = iu[prow + 1];
2351       qm   = k;
2352       for (j = jmin; j < jmax; j++) {
2353         incrlev = lev[j - shift] + 1;
2354         if (incrlev > levels) continue;
2355         vj = ju[j];
2356         do {
2357           m  = qm;
2358           qm = q[m];
2359         } while (qm < vj);
2360         if (qm != vj) { /* a fill */
2361           nzk++;
2362           q[m]       = vj;
2363           q[vj]      = qm;
2364           qm         = vj;
2365           levtmp[vj] = incrlev;
2366         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2367       }
2368       prow = prowl[prow]; /* next pivot row */
2369     }
2370 
2371     /* add k to row list for first nonzero element in k-th row */
2372     if (nzk > 1) {
2373       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2374       prowl[k] = prowl[i];
2375       prowl[i] = k;
2376     }
2377     iu[k + 1] = iu[k] + nzk;
2378 
2379     /* allocate more space to ju and lev if needed */
2380     if (iu[k + 1] > umax) {
2381       /* estimate how much additional space we will need */
2382       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2383       /* just double the memory each time */
2384       maxadd = umax;
2385       if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
2386       umax += maxadd;
2387 
2388       /* allocate a longer ju */
2389       PetscCall(PetscMalloc1(umax, &jutmp));
2390       PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
2391       PetscCall(PetscFree(ju));
2392       ju = jutmp;
2393 
2394       PetscCall(PetscMalloc1(umax, &jutmp));
2395       PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift));
2396       PetscCall(PetscFree(lev));
2397       lev = jutmp;
2398       reallocs += 2; /* count how many times we realloc */
2399     }
2400 
2401     /* save nonzero structure of k-th row in ju */
2402     i = k;
2403     while (nzk--) {
2404       i                  = q[i];
2405       ju[juidx]          = i;
2406       lev[juidx - shift] = levtmp[i];
2407       juidx++;
2408     }
2409   }
2410 
2411 #if defined(PETSC_USE_INFO)
2412   if (ai[mbs] != 0) {
2413     PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2414     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
2415     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2416     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
2417     PetscCall(PetscInfo(A, "for best performance.\n"));
2418   } else {
2419     PetscCall(PetscInfo(A, "Empty matrix\n"));
2420   }
2421 #endif
2422 
2423   PetscCall(ISRestoreIndices(perm, &rip));
2424   PetscCall(PetscFree3(prowl, q, levtmp));
2425   PetscCall(PetscFree(lev));
2426 
2427   /* put together the new matrix */
2428   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL));
2429 
2430   b = (Mat_SeqSBAIJ *)(B)->data;
2431   PetscCall(PetscFree2(b->imax, b->ilen));
2432 
2433   b->singlemalloc = PETSC_FALSE;
2434   b->free_a       = PETSC_TRUE;
2435   b->free_ij      = PETSC_TRUE;
2436   /* the next line frees the default space generated by the Create() */
2437   PetscCall(PetscFree3(b->a, b->j, b->i));
2438   PetscCall(PetscMalloc1((iu[mbs] + 1) * a->bs2, &b->a));
2439   b->j    = ju;
2440   b->i    = iu;
2441   b->diag = NULL;
2442   b->ilen = NULL;
2443   b->imax = NULL;
2444 
2445   PetscCall(ISDestroy(&b->row));
2446   PetscCall(ISDestroy(&b->icol));
2447   b->row  = perm;
2448   b->icol = perm;
2449   PetscCall(PetscObjectReference((PetscObject)perm));
2450   PetscCall(PetscObjectReference((PetscObject)perm));
2451   PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
2452   /* In b structure:  Free imax, ilen, old a, old j.
2453      Allocate idnew, solve_work, new a, new j */
2454   b->maxnz = b->nz = iu[mbs];
2455 
2456   (B)->info.factor_mallocs   = reallocs;
2457   (B)->info.fill_ratio_given = f;
2458   if (ai[mbs] != 0) {
2459     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2460   } else {
2461     (B)->info.fill_ratio_needed = 0.0;
2462   }
2463   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity));
2464   PetscFunctionReturn(PETSC_SUCCESS);
2465 }
2466 
2467 /*
2468   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2469 */
2470 #include <petscbt.h>
2471 #include <../src/mat/utils/freespace.h>
2472 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2473 {
2474   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data, *b;
2475   PetscBool          perm_identity, free_ij = PETSC_TRUE, missing;
2476   PetscInt           bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j;
2477   const PetscInt    *rip;
2478   PetscInt           reallocs = 0, i, *ui, *udiag, *cols;
2479   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2480   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr;
2481   PetscReal          fill = info->fill, levels = info->levels;
2482   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2483   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2484   PetscBT            lnkbt;
2485 
2486   PetscFunctionBegin;
2487   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2488   PetscCall(MatMissingDiagonal(A, &missing, &d));
2489   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);
2490   if (bs > 1) {
2491     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
2492     PetscFunctionReturn(PETSC_SUCCESS);
2493   }
2494 
2495   /* check whether perm is the identity mapping */
2496   PetscCall(ISIdentity(perm, &perm_identity));
2497   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2498   a->permute = PETSC_FALSE;
2499 
2500   PetscCall(PetscMalloc1(am + 1, &ui));
2501   PetscCall(PetscMalloc1(am + 1, &udiag));
2502   ui[0] = 0;
2503 
2504   /* ICC(0) without matrix ordering: simply rearrange column indices */
2505   if (!levels) {
2506     /* reuse the column pointers and row offsets for memory savings */
2507     for (i = 0; i < am; i++) {
2508       ncols     = ai[i + 1] - ai[i];
2509       ui[i + 1] = ui[i] + ncols;
2510       udiag[i]  = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
2511     }
2512     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2513     cols = uj;
2514     for (i = 0; i < am; i++) {
2515       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2516       ncols = ai[i + 1] - ai[i] - 1;
2517       for (j = 0; j < ncols; j++) *cols++ = aj[j];
2518       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2519     }
2520   } else { /* case: levels>0 */
2521     PetscCall(ISGetIndices(perm, &rip));
2522 
2523     /* initialization */
2524     /* jl: linked list for storing indices of the pivot rows
2525        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2526     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2527     for (i = 0; i < am; i++) {
2528       jl[i] = am;
2529       il[i] = 0;
2530     }
2531 
2532     /* create and initialize a linked list for storing column indices of the active row k */
2533     nlnk = am + 1;
2534     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2535 
2536     /* initial FreeSpace size is fill*(ai[am]+1) */
2537     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2538 
2539     current_space = free_space;
2540 
2541     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2542 
2543     current_space_lvl = free_space_lvl;
2544 
2545     for (k = 0; k < am; k++) { /* for each active row k */
2546       /* initialize lnk by the column indices of row k */
2547       nzk   = 0;
2548       ncols = ai[k + 1] - ai[k];
2549       PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
2550       cols = aj + ai[k];
2551       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2552       nzk += nlnk;
2553 
2554       /* update lnk by computing fill-in for each pivot row to be merged in */
2555       prow = jl[k]; /* 1st pivot row */
2556 
2557       while (prow < k) {
2558         nextprow = jl[prow];
2559 
2560         /* merge prow into k-th row */
2561         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2562         jmax  = ui[prow + 1];
2563         ncols = jmax - jmin;
2564         i     = jmin - ui[prow];
2565 
2566         cols = uj_ptr[prow] + i;     /* points to the 2nd nzero entry in U(prow,k:am-1) */
2567         uj   = uj_lvl_ptr[prow] + i; /* levels of cols */
2568         j    = *(uj - 1);
2569         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2570         nzk += nlnk;
2571 
2572         /* update il and jl for prow */
2573         if (jmin < jmax) {
2574           il[prow] = jmin;
2575 
2576           j        = *cols;
2577           jl[prow] = jl[j];
2578           jl[j]    = prow;
2579         }
2580         prow = nextprow;
2581       }
2582 
2583       /* if free space is not available, make more free space */
2584       if (current_space->local_remaining < nzk) {
2585         i = am - k + 1;                                    /* num of unfactored rows */
2586         i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2587         PetscCall(PetscFreeSpaceGet(i, &current_space));
2588         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2589         reallocs++;
2590       }
2591 
2592       /* copy data into free_space and free_space_lvl, then initialize lnk */
2593       PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k);
2594       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2595 
2596       /* add the k-th row into il and jl */
2597       if (nzk > 1) {
2598         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2599         jl[k] = jl[i];
2600         jl[i] = k;
2601         il[k] = ui[k] + 1;
2602       }
2603       uj_ptr[k]     = current_space->array;
2604       uj_lvl_ptr[k] = current_space_lvl->array;
2605 
2606       current_space->array += nzk;
2607       current_space->local_used += nzk;
2608       current_space->local_remaining -= nzk;
2609       current_space_lvl->array += nzk;
2610       current_space_lvl->local_used += nzk;
2611       current_space_lvl->local_remaining -= nzk;
2612 
2613       ui[k + 1] = ui[k] + nzk;
2614     }
2615 
2616     PetscCall(ISRestoreIndices(perm, &rip));
2617     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2618 
2619     /* destroy list of free space and other temporary array(s) */
2620     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2621     PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor  */
2622     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2623     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2624 
2625   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2626 
2627   /* put together the new matrix in MATSEQSBAIJ format */
2628   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2629 
2630   b = (Mat_SeqSBAIJ *)(fact)->data;
2631   PetscCall(PetscFree2(b->imax, b->ilen));
2632 
2633   b->singlemalloc = PETSC_FALSE;
2634   b->free_a       = PETSC_TRUE;
2635   b->free_ij      = free_ij;
2636 
2637   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
2638 
2639   b->j         = uj;
2640   b->i         = ui;
2641   b->diag      = udiag;
2642   b->free_diag = PETSC_TRUE;
2643   b->ilen      = NULL;
2644   b->imax      = NULL;
2645   b->row       = perm;
2646   b->col       = perm;
2647 
2648   PetscCall(PetscObjectReference((PetscObject)perm));
2649   PetscCall(PetscObjectReference((PetscObject)perm));
2650 
2651   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2652 
2653   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2654 
2655   b->maxnz = b->nz = ui[am];
2656 
2657   fact->info.factor_mallocs   = reallocs;
2658   fact->info.fill_ratio_given = fill;
2659   if (ai[am] != 0) {
2660     fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am];
2661   } else {
2662     fact->info.fill_ratio_needed = 0.0;
2663   }
2664 #if defined(PETSC_USE_INFO)
2665   if (ai[am] != 0) {
2666     PetscReal af = fact->info.fill_ratio_needed;
2667     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2668     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2669     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2670   } else {
2671     PetscCall(PetscInfo(A, "Empty matrix\n"));
2672   }
2673 #endif
2674   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2675   PetscFunctionReturn(PETSC_SUCCESS);
2676 }
2677 
2678 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2679 {
2680   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
2681   Mat_SeqSBAIJ      *b;
2682   PetscBool          perm_identity, free_ij = PETSC_TRUE;
2683   PetscInt           bs = A->rmap->bs, am = a->mbs;
2684   const PetscInt    *cols, *rip, *ai = a->i, *aj = a->j;
2685   PetscInt           reallocs = 0, i, *ui;
2686   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2687   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
2688   PetscReal          fill = info->fill, levels = info->levels, ratio_needed;
2689   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2690   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2691   PetscBT            lnkbt;
2692 
2693   PetscFunctionBegin;
2694   /*
2695    This code originally uses Modified Sparse Row (MSR) storage
2696    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2697    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2698    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2699    thus the original code in MSR format is still used for these cases.
2700    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2701    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2702   */
2703   if (bs > 1) {
2704     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
2705     PetscFunctionReturn(PETSC_SUCCESS);
2706   }
2707 
2708   /* check whether perm is the identity mapping */
2709   PetscCall(ISIdentity(perm, &perm_identity));
2710   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2711   a->permute = PETSC_FALSE;
2712 
2713   /* special case that simply copies fill pattern */
2714   if (!levels) {
2715     /* reuse the column pointers and row offsets for memory savings */
2716     ui           = a->i;
2717     uj           = a->j;
2718     free_ij      = PETSC_FALSE;
2719     ratio_needed = 1.0;
2720   } else { /* case: levels>0 */
2721     PetscCall(ISGetIndices(perm, &rip));
2722 
2723     /* initialization */
2724     PetscCall(PetscMalloc1(am + 1, &ui));
2725     ui[0] = 0;
2726 
2727     /* jl: linked list for storing indices of the pivot rows
2728        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2729     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2730     for (i = 0; i < am; i++) {
2731       jl[i] = am;
2732       il[i] = 0;
2733     }
2734 
2735     /* create and initialize a linked list for storing column indices of the active row k */
2736     nlnk = am + 1;
2737     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2738 
2739     /* initial FreeSpace size is fill*(ai[am]+1) */
2740     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2741 
2742     current_space = free_space;
2743 
2744     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2745 
2746     current_space_lvl = free_space_lvl;
2747 
2748     for (k = 0; k < am; k++) { /* for each active row k */
2749       /* initialize lnk by the column indices of row rip[k] */
2750       nzk   = 0;
2751       ncols = ai[rip[k] + 1] - ai[rip[k]];
2752       cols  = aj + ai[rip[k]];
2753       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2754       nzk += nlnk;
2755 
2756       /* update lnk by computing fill-in for each pivot row to be merged in */
2757       prow = jl[k]; /* 1st pivot row */
2758 
2759       while (prow < k) {
2760         nextprow = jl[prow];
2761 
2762         /* merge prow into k-th row */
2763         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2764         jmax     = ui[prow + 1];
2765         ncols    = jmax - jmin;
2766         i        = jmin - ui[prow];
2767         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2768         j        = *(uj_lvl_ptr[prow] + i - 1);
2769         cols_lvl = uj_lvl_ptr[prow] + i;
2770         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2771         nzk += nlnk;
2772 
2773         /* update il and jl for prow */
2774         if (jmin < jmax) {
2775           il[prow] = jmin;
2776           j        = *cols;
2777           jl[prow] = jl[j];
2778           jl[j]    = prow;
2779         }
2780         prow = nextprow;
2781       }
2782 
2783       /* if free space is not available, make more free space */
2784       if (current_space->local_remaining < nzk) {
2785         i = am - k + 1;                                                             /* num of unfactored rows */
2786         i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2787         PetscCall(PetscFreeSpaceGet(i, &current_space));
2788         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2789         reallocs++;
2790       }
2791 
2792       /* copy data into free_space and free_space_lvl, then initialize lnk */
2793       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2794 
2795       /* add the k-th row into il and jl */
2796       if (nzk - 1 > 0) {
2797         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2798         jl[k] = jl[i];
2799         jl[i] = k;
2800         il[k] = ui[k] + 1;
2801       }
2802       uj_ptr[k]     = current_space->array;
2803       uj_lvl_ptr[k] = current_space_lvl->array;
2804 
2805       current_space->array += nzk;
2806       current_space->local_used += nzk;
2807       current_space->local_remaining -= nzk;
2808       current_space_lvl->array += nzk;
2809       current_space_lvl->local_used += nzk;
2810       current_space_lvl->local_remaining -= nzk;
2811 
2812       ui[k + 1] = ui[k] + nzk;
2813     }
2814 
2815     PetscCall(ISRestoreIndices(perm, &rip));
2816     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2817 
2818     /* destroy list of free space and other temporary array(s) */
2819     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2820     PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
2821     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2822     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2823     if (ai[am] != 0) {
2824       ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]);
2825     } else {
2826       ratio_needed = 0.0;
2827     }
2828   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2829 
2830   /* put together the new matrix in MATSEQSBAIJ format */
2831   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2832 
2833   b = (Mat_SeqSBAIJ *)(fact)->data;
2834 
2835   PetscCall(PetscFree2(b->imax, b->ilen));
2836 
2837   b->singlemalloc = PETSC_FALSE;
2838   b->free_a       = PETSC_TRUE;
2839   b->free_ij      = free_ij;
2840 
2841   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
2842 
2843   b->j             = uj;
2844   b->i             = ui;
2845   b->diag          = NULL;
2846   b->ilen          = NULL;
2847   b->imax          = NULL;
2848   b->row           = perm;
2849   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2850 
2851   PetscCall(PetscObjectReference((PetscObject)perm));
2852   b->icol = perm;
2853   PetscCall(PetscObjectReference((PetscObject)perm));
2854   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2855 
2856   b->maxnz = b->nz = ui[am];
2857 
2858   fact->info.factor_mallocs    = reallocs;
2859   fact->info.fill_ratio_given  = fill;
2860   fact->info.fill_ratio_needed = ratio_needed;
2861 #if defined(PETSC_USE_INFO)
2862   if (ai[am] != 0) {
2863     PetscReal af = fact->info.fill_ratio_needed;
2864     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2865     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2866     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2867   } else {
2868     PetscCall(PetscInfo(A, "Empty matrix\n"));
2869   }
2870 #endif
2871   if (perm_identity) {
2872     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2873   } else {
2874     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2875   }
2876   PetscFunctionReturn(PETSC_SUCCESS);
2877 }
2878