xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision e8c0849ab8fe171bed529bea27238c9b402db591)
1 /*
2     Factorization code for SBAIJ format.
3 */
4 
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <../src/mat/impls/baij/seq/baij.h>
7 #include <petsc/private/kernels/blockinvert.h>
8 
MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)9 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
10 {
11   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
12   IS                 isrow = a->row;
13   PetscInt           mbs = a->mbs, *ai = a->i, *aj = a->j;
14   const PetscInt    *r;
15   PetscInt           nz, *vj, k, idx, k1;
16   PetscInt           bs = A->rmap->bs, bs2 = a->bs2;
17   const MatScalar   *aa = a->a, *v, *diag;
18   PetscScalar       *x, *xk, *xj, *xk_tmp, *t;
19   const PetscScalar *b;
20 
21   PetscFunctionBegin;
22   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
23   PetscCall(VecGetArrayRead(bb, &b));
24   PetscCall(VecGetArray(xx, &x));
25   t = a->solve_work;
26   PetscCall(ISGetIndices(isrow, &r));
27   PetscCall(PetscMalloc1(bs, &xk_tmp));
28 
29   /* solve U^T * D * y = b by forward substitution */
30   xk = t;
31   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
32     idx = bs * r[k];
33     for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1];
34   }
35   for (k = 0; k < mbs; k++) {
36     v  = aa + bs2 * ai[k];
37     xk = t + k * bs;                          /* Dk*xk = k-th block of x */
38     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
39     nz = ai[k + 1] - ai[k];
40     vj = aj + ai[k];
41     xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */
42     while (nz--) {
43       /* x(:) += U(k,:)^T*(Dk*xk) */
44       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
45       vj++;
46       xj = t + (*vj) * bs;
47       v += bs2;
48     }
49     /* xk = inv(Dk)*(Dk*xk) */
50     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
51     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
52   }
53 
54   /* solve U*x = y by back substitution */
55   for (k = mbs - 1; k >= 0; k--) {
56     v  = aa + bs2 * ai[k];
57     xk = t + k * bs; /* xk */
58     nz = ai[k + 1] - ai[k];
59     vj = aj + ai[k];
60     xj = t + (*vj) * bs;
61     while (nz--) {
62       /* xk += U(k,:)*x(:) */
63       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
64       vj++;
65       v += bs2;
66       xj = t + (*vj) * bs;
67     }
68     idx = bs * r[k];
69     for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++;
70   }
71 
72   PetscCall(PetscFree(xk_tmp));
73   PetscCall(ISRestoreIndices(isrow, &r));
74   PetscCall(VecRestoreArrayRead(bb, &b));
75   PetscCall(VecRestoreArray(xx, &x));
76   PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs));
77   PetscFunctionReturn(PETSC_SUCCESS);
78 }
79 
MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)80 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
81 {
82   PetscFunctionBegin;
83   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet");
84 }
85 
MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)86 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
87 {
88   PetscFunctionBegin;
89   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented");
90 }
91 
MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscInt bs,PetscScalar * x)92 static PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
93 {
94   PetscInt         nz, k;
95   const PetscInt  *vj, bs2 = bs * bs;
96   const MatScalar *v, *diag;
97   PetscScalar     *xk, *xj, *xk_tmp;
98 
99   PetscFunctionBegin;
100   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
101   PetscCall(PetscMalloc1(bs, &xk_tmp));
102   for (k = 0; k < mbs; k++) {
103     v  = aa + bs2 * ai[k];
104     xk = x + k * bs;                          /* Dk*xk = k-th block of x */
105     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
106     nz = ai[k + 1] - ai[k];
107     vj = aj + ai[k];
108     xj = x + (size_t)(*vj) * bs; /* *vj-th block of x, *vj>k */
109     while (nz--) {
110       /* x(:) += U(k,:)^T*(Dk*xk) */
111       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
112       vj++;
113       xj = x + (size_t)(*vj) * bs;
114       v += bs2;
115     }
116     /* xk = inv(Dk)*(Dk*xk) */
117     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
118     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
119   }
120   PetscCall(PetscFree(xk_tmp));
121   PetscFunctionReturn(PETSC_SUCCESS);
122 }
123 
MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscInt bs,PetscScalar * x)124 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
125 {
126   PetscInt         nz, k;
127   const PetscInt  *vj, bs2 = bs * bs;
128   const MatScalar *v;
129   PetscScalar     *xk, *xj;
130 
131   PetscFunctionBegin;
132   for (k = mbs - 1; k >= 0; k--) {
133     v  = aa + bs2 * ai[k];
134     xk = x + k * bs; /* xk */
135     nz = ai[k + 1] - ai[k];
136     vj = aj + ai[k];
137     xj = x + (size_t)(*vj) * bs;
138     while (nz--) {
139       /* xk += U(k,:)*x(:) */
140       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
141       vj++;
142       v += bs2;
143       xj = x + (size_t)(*vj) * bs;
144     }
145   }
146   PetscFunctionReturn(PETSC_SUCCESS);
147 }
148 
MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)149 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
150 {
151   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
152   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
153   PetscInt           bs = A->rmap->bs;
154   const MatScalar   *aa = a->a;
155   PetscScalar       *x;
156   const PetscScalar *b;
157 
158   PetscFunctionBegin;
159   PetscCall(VecGetArrayRead(bb, &b));
160   PetscCall(VecGetArray(xx, &x));
161 
162   /* solve U^T * D * y = b by forward substitution */
163   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
164   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
165 
166   /* solve U*x = y by back substitution */
167   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
168 
169   PetscCall(VecRestoreArrayRead(bb, &b));
170   PetscCall(VecRestoreArray(xx, &x));
171   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs));
172   PetscFunctionReturn(PETSC_SUCCESS);
173 }
174 
MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)175 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
176 {
177   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
178   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
179   PetscInt           bs = A->rmap->bs;
180   const MatScalar   *aa = a->a;
181   const PetscScalar *b;
182   PetscScalar       *x;
183 
184   PetscFunctionBegin;
185   PetscCall(VecGetArrayRead(bb, &b));
186   PetscCall(VecGetArray(xx, &x));
187   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
188   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
189   PetscCall(VecRestoreArrayRead(bb, &b));
190   PetscCall(VecRestoreArray(xx, &x));
191   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs));
192   PetscFunctionReturn(PETSC_SUCCESS);
193 }
194 
MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)195 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
196 {
197   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
198   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
199   PetscInt           bs = A->rmap->bs;
200   const MatScalar   *aa = a->a;
201   const PetscScalar *b;
202   PetscScalar       *x;
203 
204   PetscFunctionBegin;
205   PetscCall(VecGetArrayRead(bb, &b));
206   PetscCall(VecGetArray(xx, &x));
207   PetscCall(PetscArraycpy(x, b, bs * mbs));
208   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
209   PetscCall(VecRestoreArrayRead(bb, &b));
210   PetscCall(VecRestoreArray(xx, &x));
211   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)215 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
216 {
217   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
218   IS                 isrow = a->row;
219   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
220   PetscInt           nz, k, idx;
221   const MatScalar   *aa = a->a, *v, *d;
222   PetscScalar       *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp;
223   const PetscScalar *b;
224 
225   PetscFunctionBegin;
226   PetscCall(VecGetArrayRead(bb, &b));
227   PetscCall(VecGetArray(xx, &x));
228   t = a->solve_work;
229   PetscCall(ISGetIndices(isrow, &r));
230 
231   /* solve U^T * D * y = b by forward substitution */
232   tp = t;
233   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
234     idx   = 7 * r[k];
235     tp[0] = b[idx];
236     tp[1] = b[idx + 1];
237     tp[2] = b[idx + 2];
238     tp[3] = b[idx + 3];
239     tp[4] = b[idx + 4];
240     tp[5] = b[idx + 5];
241     tp[6] = b[idx + 6];
242     tp += 7;
243   }
244 
245   for (k = 0; k < mbs; k++) {
246     v  = aa + 49 * ai[k];
247     vj = aj + ai[k];
248     tp = t + k * 7;
249     x0 = tp[0];
250     x1 = tp[1];
251     x2 = tp[2];
252     x3 = tp[3];
253     x4 = tp[4];
254     x5 = tp[5];
255     x6 = tp[6];
256     nz = ai[k + 1] - ai[k];
257     tp = t + (*vj) * 7;
258     while (nz--) {
259       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
260       tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
261       tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
262       tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
263       tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
264       tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
265       tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
266       vj++;
267       tp = t + (*vj) * 7;
268       v += 49;
269     }
270 
271     /* xk = inv(Dk)*(Dk*xk) */
272     d     = aa + k * 49; /* ptr to inv(Dk) */
273     tp    = t + k * 7;
274     tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
275     tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
276     tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
277     tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
278     tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
279     tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
280     tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
281   }
282 
283   /* solve U*x = y by back substitution */
284   for (k = mbs - 1; k >= 0; k--) {
285     v  = aa + 49 * ai[k];
286     vj = aj + ai[k];
287     tp = t + k * 7;
288     x0 = tp[0];
289     x1 = tp[1];
290     x2 = tp[2];
291     x3 = tp[3];
292     x4 = tp[4];
293     x5 = tp[5];
294     x6 = tp[6]; /* xk */
295     nz = ai[k + 1] - ai[k];
296 
297     tp = t + (*vj) * 7;
298     while (nz--) {
299       /* xk += U(k,:)*x(:) */
300       x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6];
301       x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6];
302       x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6];
303       x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6];
304       x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6];
305       x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6];
306       x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6];
307       vj++;
308       tp = t + (*vj) * 7;
309       v += 49;
310     }
311     tp         = t + k * 7;
312     tp[0]      = x0;
313     tp[1]      = x1;
314     tp[2]      = x2;
315     tp[3]      = x3;
316     tp[4]      = x4;
317     tp[5]      = x5;
318     tp[6]      = x6;
319     idx        = 7 * r[k];
320     x[idx]     = x0;
321     x[idx + 1] = x1;
322     x[idx + 2] = x2;
323     x[idx + 3] = x3;
324     x[idx + 4] = x4;
325     x[idx + 5] = x5;
326     x[idx + 6] = x6;
327   }
328 
329   PetscCall(ISRestoreIndices(isrow, &r));
330   PetscCall(VecRestoreArrayRead(bb, &b));
331   PetscCall(VecRestoreArray(xx, &x));
332   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
333   PetscFunctionReturn(PETSC_SUCCESS);
334 }
335 
MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)336 static PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
337 {
338   const MatScalar *v, *d;
339   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
340   PetscInt         nz, k;
341   const PetscInt  *vj;
342 
343   PetscFunctionBegin;
344   for (k = 0; k < mbs; k++) {
345     v  = aa + 49 * ai[k];
346     xp = x + k * 7;
347     x0 = xp[0];
348     x1 = xp[1];
349     x2 = xp[2];
350     x3 = xp[3];
351     x4 = xp[4];
352     x5 = xp[5];
353     x6 = xp[6]; /* Dk*xk = k-th block of x */
354     nz = ai[k + 1] - ai[k];
355     vj = aj + ai[k];
356     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
357     PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358     while (nz--) {
359       xp = x + (*vj) * 7;
360       /* x(:) += U(k,:)^T*(Dk*xk) */
361       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
362       xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
363       xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
364       xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
365       xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
366       xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
367       xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
368       vj++;
369       v += 49;
370     }
371     /* xk = inv(Dk)*(Dk*xk) */
372     d     = aa + k * 49; /* ptr to inv(Dk) */
373     xp    = x + k * 7;
374     xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
375     xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
376     xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
377     xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
378     xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
379     xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
380     xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
381   }
382   PetscFunctionReturn(PETSC_SUCCESS);
383 }
384 
MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)385 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
386 {
387   const MatScalar *v;
388   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
389   PetscInt         nz, k;
390   const PetscInt  *vj;
391 
392   PetscFunctionBegin;
393   for (k = mbs - 1; k >= 0; k--) {
394     v  = aa + 49 * ai[k];
395     xp = x + k * 7;
396     x0 = xp[0];
397     x1 = xp[1];
398     x2 = xp[2];
399     x3 = xp[3];
400     x4 = xp[4];
401     x5 = xp[5];
402     x6 = xp[6]; /* xk */
403     nz = ai[k + 1] - ai[k];
404     vj = aj + ai[k];
405     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
406     PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
407     while (nz--) {
408       xp = x + (*vj) * 7;
409       /* xk += U(k,:)*x(:) */
410       x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6];
411       x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6];
412       x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6];
413       x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6];
414       x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6];
415       x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6];
416       x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6];
417       vj++;
418       v += 49;
419     }
420     xp    = x + k * 7;
421     xp[0] = x0;
422     xp[1] = x1;
423     xp[2] = x2;
424     xp[3] = x3;
425     xp[4] = x4;
426     xp[5] = x5;
427     xp[6] = x6;
428   }
429   PetscFunctionReturn(PETSC_SUCCESS);
430 }
431 
MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)432 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
433 {
434   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
435   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
436   const MatScalar   *aa = a->a;
437   PetscScalar       *x;
438   const PetscScalar *b;
439 
440   PetscFunctionBegin;
441   PetscCall(VecGetArrayRead(bb, &b));
442   PetscCall(VecGetArray(xx, &x));
443 
444   /* solve U^T * D * y = b by forward substitution */
445   PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */
446   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
447 
448   /* solve U*x = y by back substitution */
449   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
450 
451   PetscCall(VecRestoreArrayRead(bb, &b));
452   PetscCall(VecRestoreArray(xx, &x));
453   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
454   PetscFunctionReturn(PETSC_SUCCESS);
455 }
456 
MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)457 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
458 {
459   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
460   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
461   const MatScalar   *aa = a->a;
462   PetscScalar       *x;
463   const PetscScalar *b;
464 
465   PetscFunctionBegin;
466   PetscCall(VecGetArrayRead(bb, &b));
467   PetscCall(VecGetArray(xx, &x));
468   PetscCall(PetscArraycpy(x, b, 7 * mbs));
469   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
470   PetscCall(VecRestoreArrayRead(bb, &b));
471   PetscCall(VecRestoreArray(xx, &x));
472   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
473   PetscFunctionReturn(PETSC_SUCCESS);
474 }
475 
MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)476 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
477 {
478   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
479   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
480   const MatScalar   *aa = a->a;
481   PetscScalar       *x;
482   const PetscScalar *b;
483 
484   PetscFunctionBegin;
485   PetscCall(VecGetArrayRead(bb, &b));
486   PetscCall(VecGetArray(xx, &x));
487   PetscCall(PetscArraycpy(x, b, 7 * mbs));
488   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
489   PetscCall(VecRestoreArrayRead(bb, &b));
490   PetscCall(VecRestoreArray(xx, &x));
491   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
492   PetscFunctionReturn(PETSC_SUCCESS);
493 }
494 
MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)495 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
496 {
497   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
498   IS                 isrow = a->row;
499   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
500   PetscInt           nz, k, idx;
501   const MatScalar   *aa = a->a, *v, *d;
502   PetscScalar       *x, x0, x1, x2, x3, x4, x5, *t, *tp;
503   const PetscScalar *b;
504 
505   PetscFunctionBegin;
506   PetscCall(VecGetArrayRead(bb, &b));
507   PetscCall(VecGetArray(xx, &x));
508   t = a->solve_work;
509   PetscCall(ISGetIndices(isrow, &r));
510 
511   /* solve U^T * D * y = b by forward substitution */
512   tp = t;
513   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
514     idx   = 6 * r[k];
515     tp[0] = b[idx];
516     tp[1] = b[idx + 1];
517     tp[2] = b[idx + 2];
518     tp[3] = b[idx + 3];
519     tp[4] = b[idx + 4];
520     tp[5] = b[idx + 5];
521     tp += 6;
522   }
523 
524   for (k = 0; k < mbs; k++) {
525     v  = aa + 36 * ai[k];
526     vj = aj + ai[k];
527     tp = t + k * 6;
528     x0 = tp[0];
529     x1 = tp[1];
530     x2 = tp[2];
531     x3 = tp[3];
532     x4 = tp[4];
533     x5 = tp[5];
534     nz = ai[k + 1] - ai[k];
535     tp = t + (*vj) * 6;
536     while (nz--) {
537       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
538       tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
539       tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
540       tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
541       tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
542       tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
543       vj++;
544       tp = t + (*vj) * 6;
545       v += 36;
546     }
547 
548     /* xk = inv(Dk)*(Dk*xk) */
549     d     = aa + k * 36; /* ptr to inv(Dk) */
550     tp    = t + k * 6;
551     tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
552     tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
553     tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
554     tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
555     tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
556     tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
557   }
558 
559   /* solve U*x = y by back substitution */
560   for (k = mbs - 1; k >= 0; k--) {
561     v  = aa + 36 * ai[k];
562     vj = aj + ai[k];
563     tp = t + k * 6;
564     x0 = tp[0];
565     x1 = tp[1];
566     x2 = tp[2];
567     x3 = tp[3];
568     x4 = tp[4];
569     x5 = tp[5]; /* xk */
570     nz = ai[k + 1] - ai[k];
571 
572     tp = t + (*vj) * 6;
573     while (nz--) {
574       /* xk += U(k,:)*x(:) */
575       x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5];
576       x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5];
577       x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5];
578       x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5];
579       x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5];
580       x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5];
581       vj++;
582       tp = t + (*vj) * 6;
583       v += 36;
584     }
585     tp         = t + k * 6;
586     tp[0]      = x0;
587     tp[1]      = x1;
588     tp[2]      = x2;
589     tp[3]      = x3;
590     tp[4]      = x4;
591     tp[5]      = x5;
592     idx        = 6 * r[k];
593     x[idx]     = x0;
594     x[idx + 1] = x1;
595     x[idx + 2] = x2;
596     x[idx + 3] = x3;
597     x[idx + 4] = x4;
598     x[idx + 5] = x5;
599   }
600 
601   PetscCall(ISRestoreIndices(isrow, &r));
602   PetscCall(VecRestoreArrayRead(bb, &b));
603   PetscCall(VecRestoreArray(xx, &x));
604   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
605   PetscFunctionReturn(PETSC_SUCCESS);
606 }
607 
MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)608 static PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
609 {
610   const MatScalar *v, *d;
611   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
612   PetscInt         nz, k;
613   const PetscInt  *vj;
614 
615   PetscFunctionBegin;
616   for (k = 0; k < mbs; k++) {
617     v  = aa + 36 * ai[k];
618     xp = x + k * 6;
619     x0 = xp[0];
620     x1 = xp[1];
621     x2 = xp[2];
622     x3 = xp[3];
623     x4 = xp[4];
624     x5 = xp[5]; /* Dk*xk = k-th block of x */
625     nz = ai[k + 1] - ai[k];
626     vj = aj + ai[k];
627     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
628     PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
629     while (nz--) {
630       xp = x + (*vj) * 6;
631       /* x(:) += U(k,:)^T*(Dk*xk) */
632       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
633       xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
634       xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
635       xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
636       xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
637       xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
638       vj++;
639       v += 36;
640     }
641     /* xk = inv(Dk)*(Dk*xk) */
642     d     = aa + k * 36; /* ptr to inv(Dk) */
643     xp    = x + k * 6;
644     xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
645     xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
646     xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
647     xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
648     xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
649     xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
650   }
651   PetscFunctionReturn(PETSC_SUCCESS);
652 }
MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)653 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
654 {
655   const MatScalar *v;
656   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
657   PetscInt         nz, k;
658   const PetscInt  *vj;
659 
660   PetscFunctionBegin;
661   for (k = mbs - 1; k >= 0; k--) {
662     v  = aa + 36 * ai[k];
663     xp = x + k * 6;
664     x0 = xp[0];
665     x1 = xp[1];
666     x2 = xp[2];
667     x3 = xp[3];
668     x4 = xp[4];
669     x5 = xp[5]; /* xk */
670     nz = ai[k + 1] - ai[k];
671     vj = aj + ai[k];
672     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
673     PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
674     while (nz--) {
675       xp = x + (*vj) * 6;
676       /* xk += U(k,:)*x(:) */
677       x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5];
678       x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5];
679       x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5];
680       x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5];
681       x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5];
682       x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5];
683       vj++;
684       v += 36;
685     }
686     xp    = x + k * 6;
687     xp[0] = x0;
688     xp[1] = x1;
689     xp[2] = x2;
690     xp[3] = x3;
691     xp[4] = x4;
692     xp[5] = x5;
693   }
694   PetscFunctionReturn(PETSC_SUCCESS);
695 }
696 
MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)697 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
698 {
699   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
700   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
701   const MatScalar   *aa = a->a;
702   PetscScalar       *x;
703   const PetscScalar *b;
704 
705   PetscFunctionBegin;
706   PetscCall(VecGetArrayRead(bb, &b));
707   PetscCall(VecGetArray(xx, &x));
708 
709   /* solve U^T * D * y = b by forward substitution */
710   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
711   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
712 
713   /* solve U*x = y by back substitution */
714   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
715 
716   PetscCall(VecRestoreArrayRead(bb, &b));
717   PetscCall(VecRestoreArray(xx, &x));
718   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
719   PetscFunctionReturn(PETSC_SUCCESS);
720 }
721 
MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)722 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
723 {
724   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
725   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
726   const MatScalar   *aa = a->a;
727   PetscScalar       *x;
728   const PetscScalar *b;
729 
730   PetscFunctionBegin;
731   PetscCall(VecGetArrayRead(bb, &b));
732   PetscCall(VecGetArray(xx, &x));
733   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
734   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
735   PetscCall(VecRestoreArrayRead(bb, &b));
736   PetscCall(VecRestoreArray(xx, &x));
737   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
738   PetscFunctionReturn(PETSC_SUCCESS);
739 }
740 
MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)741 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
742 {
743   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
744   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
745   const MatScalar   *aa = a->a;
746   PetscScalar       *x;
747   const PetscScalar *b;
748 
749   PetscFunctionBegin;
750   PetscCall(VecGetArrayRead(bb, &b));
751   PetscCall(VecGetArray(xx, &x));
752   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
753   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
754   PetscCall(VecRestoreArrayRead(bb, &b));
755   PetscCall(VecRestoreArray(xx, &x));
756   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
757   PetscFunctionReturn(PETSC_SUCCESS);
758 }
759 
MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)760 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
761 {
762   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
763   IS                 isrow = a->row;
764   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
765   const PetscInt    *r, *vj;
766   PetscInt           nz, k, idx;
767   const MatScalar   *aa = a->a, *v, *diag;
768   PetscScalar       *x, x0, x1, x2, x3, x4, *t, *tp;
769   const PetscScalar *b;
770 
771   PetscFunctionBegin;
772   PetscCall(VecGetArrayRead(bb, &b));
773   PetscCall(VecGetArray(xx, &x));
774   t = a->solve_work;
775   PetscCall(ISGetIndices(isrow, &r));
776 
777   /* solve U^T * D * y = b by forward substitution */
778   tp = t;
779   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
780     idx   = 5 * r[k];
781     tp[0] = b[idx];
782     tp[1] = b[idx + 1];
783     tp[2] = b[idx + 2];
784     tp[3] = b[idx + 3];
785     tp[4] = b[idx + 4];
786     tp += 5;
787   }
788 
789   for (k = 0; k < mbs; k++) {
790     v  = aa + 25 * ai[k];
791     vj = aj + ai[k];
792     tp = t + k * 5;
793     x0 = tp[0];
794     x1 = tp[1];
795     x2 = tp[2];
796     x3 = tp[3];
797     x4 = tp[4];
798     nz = ai[k + 1] - ai[k];
799 
800     tp = t + (*vj) * 5;
801     while (nz--) {
802       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
803       tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
804       tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
805       tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
806       tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
807       vj++;
808       tp = t + (*vj) * 5;
809       v += 25;
810     }
811 
812     /* xk = inv(Dk)*(Dk*xk) */
813     diag  = aa + k * 25; /* ptr to inv(Dk) */
814     tp    = t + k * 5;
815     tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
816     tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
817     tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
818     tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
819     tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
820   }
821 
822   /* solve U*x = y by back substitution */
823   for (k = mbs - 1; k >= 0; k--) {
824     v  = aa + 25 * ai[k];
825     vj = aj + ai[k];
826     tp = t + k * 5;
827     x0 = tp[0];
828     x1 = tp[1];
829     x2 = tp[2];
830     x3 = tp[3];
831     x4 = tp[4]; /* xk */
832     nz = ai[k + 1] - ai[k];
833 
834     tp = t + (*vj) * 5;
835     while (nz--) {
836       /* xk += U(k,:)*x(:) */
837       x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4];
838       x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4];
839       x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4];
840       x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4];
841       x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4];
842       vj++;
843       tp = t + (*vj) * 5;
844       v += 25;
845     }
846     tp         = t + k * 5;
847     tp[0]      = x0;
848     tp[1]      = x1;
849     tp[2]      = x2;
850     tp[3]      = x3;
851     tp[4]      = x4;
852     idx        = 5 * r[k];
853     x[idx]     = x0;
854     x[idx + 1] = x1;
855     x[idx + 2] = x2;
856     x[idx + 3] = x3;
857     x[idx + 4] = x4;
858   }
859 
860   PetscCall(ISRestoreIndices(isrow, &r));
861   PetscCall(VecRestoreArrayRead(bb, &b));
862   PetscCall(VecRestoreArray(xx, &x));
863   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
864   PetscFunctionReturn(PETSC_SUCCESS);
865 }
866 
MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)867 static PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
868 {
869   const MatScalar *v, *diag;
870   PetscScalar     *xp, x0, x1, x2, x3, x4;
871   PetscInt         nz, k;
872   const PetscInt  *vj;
873 
874   PetscFunctionBegin;
875   for (k = 0; k < mbs; k++) {
876     v  = aa + 25 * ai[k];
877     xp = x + k * 5;
878     x0 = xp[0];
879     x1 = xp[1];
880     x2 = xp[2];
881     x3 = xp[3];
882     x4 = xp[4]; /* Dk*xk = k-th block of x */
883     nz = ai[k + 1] - ai[k];
884     vj = aj + ai[k];
885     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
886     PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
887     while (nz--) {
888       xp = x + (*vj) * 5;
889       /* x(:) += U(k,:)^T*(Dk*xk) */
890       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
891       xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
892       xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
893       xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
894       xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
895       vj++;
896       v += 25;
897     }
898     /* xk = inv(Dk)*(Dk*xk) */
899     diag  = aa + k * 25; /* ptr to inv(Dk) */
900     xp    = x + k * 5;
901     xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
902     xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
903     xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
904     xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
905     xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
906   }
907   PetscFunctionReturn(PETSC_SUCCESS);
908 }
909 
MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)910 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
911 {
912   const MatScalar *v;
913   PetscScalar     *xp, x0, x1, x2, x3, x4;
914   PetscInt         nz, k;
915   const PetscInt  *vj;
916 
917   PetscFunctionBegin;
918   for (k = mbs - 1; k >= 0; k--) {
919     v  = aa + 25 * ai[k];
920     xp = x + k * 5;
921     x0 = xp[0];
922     x1 = xp[1];
923     x2 = xp[2];
924     x3 = xp[3];
925     x4 = xp[4]; /* xk */
926     nz = ai[k + 1] - ai[k];
927     vj = aj + ai[k];
928     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
929     PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
930     while (nz--) {
931       xp = x + (*vj) * 5;
932       /* xk += U(k,:)*x(:) */
933       x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4];
934       x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4];
935       x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4];
936       x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4];
937       x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4];
938       vj++;
939       v += 25;
940     }
941     xp    = x + k * 5;
942     xp[0] = x0;
943     xp[1] = x1;
944     xp[2] = x2;
945     xp[3] = x3;
946     xp[4] = x4;
947   }
948   PetscFunctionReturn(PETSC_SUCCESS);
949 }
950 
MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)951 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
952 {
953   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
954   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
955   const MatScalar   *aa = a->a;
956   PetscScalar       *x;
957   const PetscScalar *b;
958 
959   PetscFunctionBegin;
960   PetscCall(VecGetArrayRead(bb, &b));
961   PetscCall(VecGetArray(xx, &x));
962 
963   /* solve U^T * D * y = b by forward substitution */
964   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
965   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
966 
967   /* solve U*x = y by back substitution */
968   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
969 
970   PetscCall(VecRestoreArrayRead(bb, &b));
971   PetscCall(VecRestoreArray(xx, &x));
972   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
973   PetscFunctionReturn(PETSC_SUCCESS);
974 }
975 
MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)976 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
977 {
978   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
979   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
980   const MatScalar   *aa = a->a;
981   PetscScalar       *x;
982   const PetscScalar *b;
983 
984   PetscFunctionBegin;
985   PetscCall(VecGetArrayRead(bb, &b));
986   PetscCall(VecGetArray(xx, &x));
987   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
988   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
989   PetscCall(VecRestoreArrayRead(bb, &b));
990   PetscCall(VecRestoreArray(xx, &x));
991   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
992   PetscFunctionReturn(PETSC_SUCCESS);
993 }
994 
MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)995 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
996 {
997   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
998   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
999   const MatScalar   *aa = a->a;
1000   PetscScalar       *x;
1001   const PetscScalar *b;
1002 
1003   PetscFunctionBegin;
1004   PetscCall(VecGetArrayRead(bb, &b));
1005   PetscCall(VecGetArray(xx, &x));
1006   PetscCall(PetscArraycpy(x, b, 5 * mbs));
1007   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
1008   PetscCall(VecRestoreArrayRead(bb, &b));
1009   PetscCall(VecRestoreArray(xx, &x));
1010   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1011   PetscFunctionReturn(PETSC_SUCCESS);
1012 }
1013 
MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)1014 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
1015 {
1016   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1017   IS                 isrow = a->row;
1018   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1019   const PetscInt    *r, *vj;
1020   PetscInt           nz, k, idx;
1021   const MatScalar   *aa = a->a, *v, *diag;
1022   PetscScalar       *x, x0, x1, x2, x3, *t, *tp;
1023   const PetscScalar *b;
1024 
1025   PetscFunctionBegin;
1026   PetscCall(VecGetArrayRead(bb, &b));
1027   PetscCall(VecGetArray(xx, &x));
1028   t = a->solve_work;
1029   PetscCall(ISGetIndices(isrow, &r));
1030 
1031   /* solve U^T * D * y = b by forward substitution */
1032   tp = t;
1033   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1034     idx   = 4 * r[k];
1035     tp[0] = b[idx];
1036     tp[1] = b[idx + 1];
1037     tp[2] = b[idx + 2];
1038     tp[3] = b[idx + 3];
1039     tp += 4;
1040   }
1041 
1042   for (k = 0; k < mbs; k++) {
1043     v  = aa + 16 * ai[k];
1044     vj = aj + ai[k];
1045     tp = t + k * 4;
1046     x0 = tp[0];
1047     x1 = tp[1];
1048     x2 = tp[2];
1049     x3 = tp[3];
1050     nz = ai[k + 1] - ai[k];
1051 
1052     tp = t + (*vj) * 4;
1053     while (nz--) {
1054       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1055       tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1056       tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1057       tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1058       vj++;
1059       tp = t + (*vj) * 4;
1060       v += 16;
1061     }
1062 
1063     /* xk = inv(Dk)*(Dk*xk) */
1064     diag  = aa + k * 16; /* ptr to inv(Dk) */
1065     tp    = t + k * 4;
1066     tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1067     tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1068     tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1069     tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1070   }
1071 
1072   /* solve U*x = y by back substitution */
1073   for (k = mbs - 1; k >= 0; k--) {
1074     v  = aa + 16 * ai[k];
1075     vj = aj + ai[k];
1076     tp = t + k * 4;
1077     x0 = tp[0];
1078     x1 = tp[1];
1079     x2 = tp[2];
1080     x3 = tp[3]; /* xk */
1081     nz = ai[k + 1] - ai[k];
1082 
1083     tp = t + (*vj) * 4;
1084     while (nz--) {
1085       /* xk += U(k,:)*x(:) */
1086       x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3];
1087       x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3];
1088       x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3];
1089       x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3];
1090       vj++;
1091       tp = t + (*vj) * 4;
1092       v += 16;
1093     }
1094     tp         = t + k * 4;
1095     tp[0]      = x0;
1096     tp[1]      = x1;
1097     tp[2]      = x2;
1098     tp[3]      = x3;
1099     idx        = 4 * r[k];
1100     x[idx]     = x0;
1101     x[idx + 1] = x1;
1102     x[idx + 2] = x2;
1103     x[idx + 3] = x3;
1104   }
1105 
1106   PetscCall(ISRestoreIndices(isrow, &r));
1107   PetscCall(VecRestoreArrayRead(bb, &b));
1108   PetscCall(VecRestoreArray(xx, &x));
1109   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1110   PetscFunctionReturn(PETSC_SUCCESS);
1111 }
1112 
MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1113 static PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1114 {
1115   const MatScalar *v, *diag;
1116   PetscScalar     *xp, x0, x1, x2, x3;
1117   PetscInt         nz, k;
1118   const PetscInt  *vj;
1119 
1120   PetscFunctionBegin;
1121   for (k = 0; k < mbs; k++) {
1122     v  = aa + 16 * ai[k];
1123     xp = x + k * 4;
1124     x0 = xp[0];
1125     x1 = xp[1];
1126     x2 = xp[2];
1127     x3 = xp[3]; /* Dk*xk = k-th block of x */
1128     nz = ai[k + 1] - ai[k];
1129     vj = aj + ai[k];
1130     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1131     PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1132     while (nz--) {
1133       xp = x + (*vj) * 4;
1134       /* x(:) += U(k,:)^T*(Dk*xk) */
1135       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1136       xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1137       xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1138       xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1139       vj++;
1140       v += 16;
1141     }
1142     /* xk = inv(Dk)*(Dk*xk) */
1143     diag  = aa + k * 16; /* ptr to inv(Dk) */
1144     xp    = x + k * 4;
1145     xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1146     xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1147     xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1148     xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1149   }
1150   PetscFunctionReturn(PETSC_SUCCESS);
1151 }
1152 
MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1153 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1154 {
1155   const MatScalar *v;
1156   PetscScalar     *xp, x0, x1, x2, x3;
1157   PetscInt         nz, k;
1158   const PetscInt  *vj;
1159 
1160   PetscFunctionBegin;
1161   for (k = mbs - 1; k >= 0; k--) {
1162     v  = aa + 16 * ai[k];
1163     xp = x + k * 4;
1164     x0 = xp[0];
1165     x1 = xp[1];
1166     x2 = xp[2];
1167     x3 = xp[3]; /* xk */
1168     nz = ai[k + 1] - ai[k];
1169     vj = aj + ai[k];
1170     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1171     PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1172     while (nz--) {
1173       xp = x + (*vj) * 4;
1174       /* xk += U(k,:)*x(:) */
1175       x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3];
1176       x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3];
1177       x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3];
1178       x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3];
1179       vj++;
1180       v += 16;
1181     }
1182     xp    = x + k * 4;
1183     xp[0] = x0;
1184     xp[1] = x1;
1185     xp[2] = x2;
1186     xp[3] = x3;
1187   }
1188   PetscFunctionReturn(PETSC_SUCCESS);
1189 }
1190 
MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1191 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1192 {
1193   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1194   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1195   const MatScalar   *aa = a->a;
1196   PetscScalar       *x;
1197   const PetscScalar *b;
1198 
1199   PetscFunctionBegin;
1200   PetscCall(VecGetArrayRead(bb, &b));
1201   PetscCall(VecGetArray(xx, &x));
1202 
1203   /* solve U^T * D * y = b by forward substitution */
1204   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1205   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1206 
1207   /* solve U*x = y by back substitution */
1208   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1209   PetscCall(VecRestoreArrayRead(bb, &b));
1210   PetscCall(VecRestoreArray(xx, &x));
1211   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1212   PetscFunctionReturn(PETSC_SUCCESS);
1213 }
1214 
MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1215 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1216 {
1217   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1218   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1219   const MatScalar   *aa = a->a;
1220   PetscScalar       *x;
1221   const PetscScalar *b;
1222 
1223   PetscFunctionBegin;
1224   PetscCall(VecGetArrayRead(bb, &b));
1225   PetscCall(VecGetArray(xx, &x));
1226   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1227   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1228   PetscCall(VecRestoreArrayRead(bb, &b));
1229   PetscCall(VecRestoreArray(xx, &x));
1230   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1231   PetscFunctionReturn(PETSC_SUCCESS);
1232 }
1233 
MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1234 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1235 {
1236   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1237   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1238   const MatScalar   *aa = a->a;
1239   PetscScalar       *x;
1240   const PetscScalar *b;
1241 
1242   PetscFunctionBegin;
1243   PetscCall(VecGetArrayRead(bb, &b));
1244   PetscCall(VecGetArray(xx, &x));
1245   PetscCall(PetscArraycpy(x, b, 4 * mbs));
1246   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1247   PetscCall(VecRestoreArrayRead(bb, &b));
1248   PetscCall(VecRestoreArray(xx, &x));
1249   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1250   PetscFunctionReturn(PETSC_SUCCESS);
1251 }
1252 
MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)1253 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1254 {
1255   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1256   IS                 isrow = a->row;
1257   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1258   const PetscInt    *r;
1259   PetscInt           nz, k, idx;
1260   const PetscInt    *vj;
1261   const MatScalar   *aa = a->a, *v, *diag;
1262   PetscScalar       *x, x0, x1, x2, *t, *tp;
1263   const PetscScalar *b;
1264 
1265   PetscFunctionBegin;
1266   PetscCall(VecGetArrayRead(bb, &b));
1267   PetscCall(VecGetArray(xx, &x));
1268   t = a->solve_work;
1269   PetscCall(ISGetIndices(isrow, &r));
1270 
1271   /* solve U^T * D * y = b by forward substitution */
1272   tp = t;
1273   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1274     idx   = 3 * r[k];
1275     tp[0] = b[idx];
1276     tp[1] = b[idx + 1];
1277     tp[2] = b[idx + 2];
1278     tp += 3;
1279   }
1280 
1281   for (k = 0; k < mbs; k++) {
1282     v  = aa + 9 * ai[k];
1283     vj = aj + ai[k];
1284     tp = t + k * 3;
1285     x0 = tp[0];
1286     x1 = tp[1];
1287     x2 = tp[2];
1288     nz = ai[k + 1] - ai[k];
1289 
1290     tp = t + (*vj) * 3;
1291     while (nz--) {
1292       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1293       tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1294       tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1295       vj++;
1296       tp = t + (*vj) * 3;
1297       v += 9;
1298     }
1299 
1300     /* xk = inv(Dk)*(Dk*xk) */
1301     diag  = aa + k * 9; /* ptr to inv(Dk) */
1302     tp    = t + k * 3;
1303     tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1304     tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1305     tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1306   }
1307 
1308   /* solve U*x = y by back substitution */
1309   for (k = mbs - 1; k >= 0; k--) {
1310     v  = aa + 9 * ai[k];
1311     vj = aj + ai[k];
1312     tp = t + k * 3;
1313     x0 = tp[0];
1314     x1 = tp[1];
1315     x2 = tp[2]; /* xk */
1316     nz = ai[k + 1] - ai[k];
1317 
1318     tp = t + (*vj) * 3;
1319     while (nz--) {
1320       /* xk += U(k,:)*x(:) */
1321       x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2];
1322       x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2];
1323       x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2];
1324       vj++;
1325       tp = t + (*vj) * 3;
1326       v += 9;
1327     }
1328     tp         = t + k * 3;
1329     tp[0]      = x0;
1330     tp[1]      = x1;
1331     tp[2]      = x2;
1332     idx        = 3 * r[k];
1333     x[idx]     = x0;
1334     x[idx + 1] = x1;
1335     x[idx + 2] = x2;
1336   }
1337 
1338   PetscCall(ISRestoreIndices(isrow, &r));
1339   PetscCall(VecRestoreArrayRead(bb, &b));
1340   PetscCall(VecRestoreArray(xx, &x));
1341   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1342   PetscFunctionReturn(PETSC_SUCCESS);
1343 }
1344 
MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1345 static PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1346 {
1347   const MatScalar *v, *diag;
1348   PetscScalar     *xp, x0, x1, x2;
1349   PetscInt         nz, k;
1350   const PetscInt  *vj;
1351 
1352   PetscFunctionBegin;
1353   for (k = 0; k < mbs; k++) {
1354     v  = aa + 9 * ai[k];
1355     xp = x + k * 3;
1356     x0 = xp[0];
1357     x1 = xp[1];
1358     x2 = xp[2]; /* Dk*xk = k-th block of x */
1359     nz = ai[k + 1] - ai[k];
1360     vj = aj + ai[k];
1361     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1362     PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1363     while (nz--) {
1364       xp = x + (*vj) * 3;
1365       /* x(:) += U(k,:)^T*(Dk*xk) */
1366       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1367       xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1368       xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1369       vj++;
1370       v += 9;
1371     }
1372     /* xk = inv(Dk)*(Dk*xk) */
1373     diag  = aa + k * 9; /* ptr to inv(Dk) */
1374     xp    = x + k * 3;
1375     xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1376     xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1377     xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1378   }
1379   PetscFunctionReturn(PETSC_SUCCESS);
1380 }
1381 
MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1382 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1383 {
1384   const MatScalar *v;
1385   PetscScalar     *xp, x0, x1, x2;
1386   PetscInt         nz, k;
1387   const PetscInt  *vj;
1388 
1389   PetscFunctionBegin;
1390   for (k = mbs - 1; k >= 0; k--) {
1391     v  = aa + 9 * ai[k];
1392     xp = x + k * 3;
1393     x0 = xp[0];
1394     x1 = xp[1];
1395     x2 = xp[2]; /* xk */
1396     nz = ai[k + 1] - ai[k];
1397     vj = aj + ai[k];
1398     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1399     PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1400     while (nz--) {
1401       xp = x + (*vj) * 3;
1402       /* xk += U(k,:)*x(:) */
1403       x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2];
1404       x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2];
1405       x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2];
1406       vj++;
1407       v += 9;
1408     }
1409     xp    = x + k * 3;
1410     xp[0] = x0;
1411     xp[1] = x1;
1412     xp[2] = x2;
1413   }
1414   PetscFunctionReturn(PETSC_SUCCESS);
1415 }
1416 
MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1417 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1418 {
1419   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1420   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1421   const MatScalar   *aa = a->a;
1422   PetscScalar       *x;
1423   const PetscScalar *b;
1424 
1425   PetscFunctionBegin;
1426   PetscCall(VecGetArrayRead(bb, &b));
1427   PetscCall(VecGetArray(xx, &x));
1428 
1429   /* solve U^T * D * y = b by forward substitution */
1430   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1431   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1432 
1433   /* solve U*x = y by back substitution */
1434   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1435 
1436   PetscCall(VecRestoreArrayRead(bb, &b));
1437   PetscCall(VecRestoreArray(xx, &x));
1438   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1439   PetscFunctionReturn(PETSC_SUCCESS);
1440 }
1441 
MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1442 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1443 {
1444   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1445   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1446   const MatScalar   *aa = a->a;
1447   PetscScalar       *x;
1448   const PetscScalar *b;
1449 
1450   PetscFunctionBegin;
1451   PetscCall(VecGetArrayRead(bb, &b));
1452   PetscCall(VecGetArray(xx, &x));
1453   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1454   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1455   PetscCall(VecRestoreArrayRead(bb, &b));
1456   PetscCall(VecRestoreArray(xx, &x));
1457   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1458   PetscFunctionReturn(PETSC_SUCCESS);
1459 }
1460 
MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1461 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1462 {
1463   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1464   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1465   const MatScalar   *aa = a->a;
1466   PetscScalar       *x;
1467   const PetscScalar *b;
1468 
1469   PetscFunctionBegin;
1470   PetscCall(VecGetArrayRead(bb, &b));
1471   PetscCall(VecGetArray(xx, &x));
1472   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1473   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1474   PetscCall(VecRestoreArrayRead(bb, &b));
1475   PetscCall(VecRestoreArray(xx, &x));
1476   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1477   PetscFunctionReturn(PETSC_SUCCESS);
1478 }
1479 
MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)1480 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1481 {
1482   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1483   IS                 isrow = a->row;
1484   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1485   const PetscInt    *r, *vj;
1486   PetscInt           nz, k, k2, idx;
1487   const MatScalar   *aa = a->a, *v, *diag;
1488   PetscScalar       *x, x0, x1, *t;
1489   const PetscScalar *b;
1490 
1491   PetscFunctionBegin;
1492   PetscCall(VecGetArrayRead(bb, &b));
1493   PetscCall(VecGetArray(xx, &x));
1494   t = a->solve_work;
1495   PetscCall(ISGetIndices(isrow, &r));
1496 
1497   /* solve U^T * D * y = perm(b) by forward substitution */
1498   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1499     idx          = 2 * r[k];
1500     t[k * 2]     = b[idx];
1501     t[k * 2 + 1] = b[idx + 1];
1502   }
1503   for (k = 0; k < mbs; k++) {
1504     v  = aa + 4 * ai[k];
1505     vj = aj + ai[k];
1506     k2 = k * 2;
1507     x0 = t[k2];
1508     x1 = t[k2 + 1];
1509     nz = ai[k + 1] - ai[k];
1510     while (nz--) {
1511       t[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1512       t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1513       vj++;
1514       v += 4;
1515     }
1516     diag      = aa + k * 4; /* ptr to inv(Dk) */
1517     t[k2]     = diag[0] * x0 + diag[2] * x1;
1518     t[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1519   }
1520 
1521   /* solve U*x = y by back substitution */
1522   for (k = mbs - 1; k >= 0; k--) {
1523     v  = aa + 4 * ai[k];
1524     vj = aj + ai[k];
1525     k2 = k * 2;
1526     x0 = t[k2];
1527     x1 = t[k2 + 1];
1528     nz = ai[k + 1] - ai[k];
1529     while (nz--) {
1530       x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1];
1531       x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1];
1532       vj++;
1533       v += 4;
1534     }
1535     t[k2]      = x0;
1536     t[k2 + 1]  = x1;
1537     idx        = 2 * r[k];
1538     x[idx]     = x0;
1539     x[idx + 1] = x1;
1540   }
1541 
1542   PetscCall(ISRestoreIndices(isrow, &r));
1543   PetscCall(VecRestoreArrayRead(bb, &b));
1544   PetscCall(VecRestoreArray(xx, &x));
1545   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1546   PetscFunctionReturn(PETSC_SUCCESS);
1547 }
1548 
MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1549 static PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1550 {
1551   const MatScalar *v, *diag;
1552   PetscScalar      x0, x1;
1553   PetscInt         nz, k, k2;
1554   const PetscInt  *vj;
1555 
1556   PetscFunctionBegin;
1557   for (k = 0; k < mbs; k++) {
1558     v  = aa + 4 * ai[k];
1559     vj = aj + ai[k];
1560     k2 = k * 2;
1561     x0 = x[k2];
1562     x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */
1563     nz = ai[k + 1] - ai[k];
1564     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1565     PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1566     while (nz--) {
1567       /* x(:) += U(k,:)^T*(Dk*xk) */
1568       x[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1569       x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1570       vj++;
1571       v += 4;
1572     }
1573     /* xk = inv(Dk)*(Dk*xk) */
1574     diag      = aa + k * 4; /* ptr to inv(Dk) */
1575     x[k2]     = diag[0] * x0 + diag[2] * x1;
1576     x[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1577   }
1578   PetscFunctionReturn(PETSC_SUCCESS);
1579 }
1580 
MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1581 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1582 {
1583   const MatScalar *v;
1584   PetscScalar      x0, x1;
1585   PetscInt         nz, k, k2;
1586   const PetscInt  *vj;
1587 
1588   PetscFunctionBegin;
1589   for (k = mbs - 1; k >= 0; k--) {
1590     v  = aa + 4 * ai[k];
1591     vj = aj + ai[k];
1592     k2 = k * 2;
1593     x0 = x[k2];
1594     x1 = x[k2 + 1]; /* xk */
1595     nz = ai[k + 1] - ai[k];
1596     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1597     PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1598     while (nz--) {
1599       /* xk += U(k,:)*x(:) */
1600       x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1];
1601       x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1];
1602       vj++;
1603       v += 4;
1604     }
1605     x[k2]     = x0;
1606     x[k2 + 1] = x1;
1607   }
1608   PetscFunctionReturn(PETSC_SUCCESS);
1609 }
1610 
MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1611 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1612 {
1613   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1614   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1615   const MatScalar   *aa = a->a;
1616   PetscScalar       *x;
1617   const PetscScalar *b;
1618 
1619   PetscFunctionBegin;
1620   PetscCall(VecGetArrayRead(bb, &b));
1621   PetscCall(VecGetArray(xx, &x));
1622 
1623   /* solve U^T * D * y = b by forward substitution */
1624   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1625   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1626 
1627   /* solve U*x = y by back substitution */
1628   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1629 
1630   PetscCall(VecRestoreArrayRead(bb, &b));
1631   PetscCall(VecRestoreArray(xx, &x));
1632   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1633   PetscFunctionReturn(PETSC_SUCCESS);
1634 }
1635 
MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1636 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1637 {
1638   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1639   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1640   const MatScalar   *aa = a->a;
1641   PetscScalar       *x;
1642   const PetscScalar *b;
1643 
1644   PetscFunctionBegin;
1645   PetscCall(VecGetArrayRead(bb, &b));
1646   PetscCall(VecGetArray(xx, &x));
1647   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1648   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1649   PetscCall(VecRestoreArrayRead(bb, &b));
1650   PetscCall(VecRestoreArray(xx, &x));
1651   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1652   PetscFunctionReturn(PETSC_SUCCESS);
1653 }
1654 
MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1655 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1656 {
1657   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1658   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1659   const MatScalar   *aa = a->a;
1660   PetscScalar       *x;
1661   const PetscScalar *b;
1662 
1663   PetscFunctionBegin;
1664   PetscCall(VecGetArrayRead(bb, &b));
1665   PetscCall(VecGetArray(xx, &x));
1666   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1667   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1668   PetscCall(VecRestoreArrayRead(bb, &b));
1669   PetscCall(VecRestoreArray(xx, &x));
1670   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1671   PetscFunctionReturn(PETSC_SUCCESS);
1672 }
1673 
MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)1674 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1675 {
1676   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1677   IS                 isrow = a->row;
1678   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1679   const MatScalar   *aa = a->a, *v;
1680   const PetscScalar *b;
1681   PetscScalar       *x, xk, *t;
1682   PetscInt           nz, k, j;
1683 
1684   PetscFunctionBegin;
1685   PetscCall(VecGetArrayRead(bb, &b));
1686   PetscCall(VecGetArray(xx, &x));
1687   t = a->solve_work;
1688   PetscCall(ISGetIndices(isrow, &rp));
1689 
1690   /* solve U^T*D*y = perm(b) by forward substitution */
1691   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1692   for (k = 0; k < mbs; k++) {
1693     v  = aa + ai[k];
1694     vj = aj + ai[k];
1695     xk = t[k];
1696     nz = ai[k + 1] - ai[k] - 1;
1697     for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk;
1698     t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */
1699   }
1700 
1701   /* solve U*perm(x) = y by back substitution */
1702   for (k = mbs - 1; k >= 0; k--) {
1703     v  = aa + adiag[k] - 1;
1704     vj = aj + adiag[k] - 1;
1705     nz = ai[k + 1] - ai[k] - 1;
1706     for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]];
1707     x[rp[k]] = t[k];
1708   }
1709 
1710   PetscCall(ISRestoreIndices(isrow, &rp));
1711   PetscCall(VecRestoreArrayRead(bb, &b));
1712   PetscCall(VecRestoreArray(xx, &x));
1713   PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs));
1714   PetscFunctionReturn(PETSC_SUCCESS);
1715 }
1716 
MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)1717 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1718 {
1719   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1720   IS                 isrow = a->row;
1721   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1722   const MatScalar   *aa = a->a, *v;
1723   PetscScalar       *x, xk, *t;
1724   const PetscScalar *b;
1725   PetscInt           nz, k;
1726 
1727   PetscFunctionBegin;
1728   PetscCall(VecGetArrayRead(bb, &b));
1729   PetscCall(VecGetArray(xx, &x));
1730   t = a->solve_work;
1731   PetscCall(ISGetIndices(isrow, &rp));
1732 
1733   /* solve U^T*D*y = perm(b) by forward substitution */
1734   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1735   for (k = 0; k < mbs; k++) {
1736     v  = aa + ai[k] + 1;
1737     vj = aj + ai[k] + 1;
1738     xk = t[k];
1739     nz = ai[k + 1] - ai[k] - 1;
1740     while (nz--) t[*vj++] += (*v++) * xk;
1741     t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */
1742   }
1743 
1744   /* solve U*perm(x) = y by back substitution */
1745   for (k = mbs - 1; k >= 0; k--) {
1746     v  = aa + ai[k] + 1;
1747     vj = aj + ai[k] + 1;
1748     nz = ai[k + 1] - ai[k] - 1;
1749     while (nz--) t[k] += (*v++) * t[*vj++];
1750     x[rp[k]] = t[k];
1751   }
1752 
1753   PetscCall(ISRestoreIndices(isrow, &rp));
1754   PetscCall(VecRestoreArrayRead(bb, &b));
1755   PetscCall(VecRestoreArray(xx, &x));
1756   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
1757   PetscFunctionReturn(PETSC_SUCCESS);
1758 }
1759 
MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)1760 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1761 {
1762   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1763   IS                 isrow = a->row;
1764   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1765   const MatScalar   *aa = a->a, *v;
1766   PetscReal          diagk;
1767   PetscScalar       *x, xk;
1768   const PetscScalar *b;
1769   PetscInt           nz, k;
1770 
1771   PetscFunctionBegin;
1772   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1773   PetscCall(VecGetArrayRead(bb, &b));
1774   PetscCall(VecGetArray(xx, &x));
1775   PetscCall(ISGetIndices(isrow, &rp));
1776 
1777   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1778   for (k = 0; k < mbs; k++) {
1779     v  = aa + ai[k];
1780     vj = aj + ai[k];
1781     xk = x[k];
1782     nz = ai[k + 1] - ai[k] - 1;
1783     while (nz--) x[*vj++] += (*v++) * xk;
1784 
1785     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1786     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1787     x[k] = xk * PetscSqrtReal(diagk);
1788   }
1789   PetscCall(ISRestoreIndices(isrow, &rp));
1790   PetscCall(VecRestoreArrayRead(bb, &b));
1791   PetscCall(VecRestoreArray(xx, &x));
1792   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1793   PetscFunctionReturn(PETSC_SUCCESS);
1794 }
1795 
MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)1796 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1797 {
1798   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1799   IS                 isrow = a->row;
1800   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1801   const MatScalar   *aa = a->a, *v;
1802   PetscReal          diagk;
1803   PetscScalar       *x, xk;
1804   const PetscScalar *b;
1805   PetscInt           nz, k;
1806 
1807   PetscFunctionBegin;
1808   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1809   PetscCall(VecGetArrayRead(bb, &b));
1810   PetscCall(VecGetArray(xx, &x));
1811   PetscCall(ISGetIndices(isrow, &rp));
1812 
1813   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1814   for (k = 0; k < mbs; k++) {
1815     v  = aa + ai[k] + 1;
1816     vj = aj + ai[k] + 1;
1817     xk = x[k];
1818     nz = ai[k + 1] - ai[k] - 1;
1819     while (nz--) x[*vj++] += (*v++) * xk;
1820 
1821     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1822     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1823     x[k] = xk * PetscSqrtReal(diagk);
1824   }
1825   PetscCall(ISRestoreIndices(isrow, &rp));
1826   PetscCall(VecRestoreArrayRead(bb, &b));
1827   PetscCall(VecRestoreArray(xx, &x));
1828   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1829   PetscFunctionReturn(PETSC_SUCCESS);
1830 }
1831 
MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)1832 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1833 {
1834   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1835   IS                 isrow = a->row;
1836   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1837   const MatScalar   *aa = a->a, *v;
1838   PetscReal          diagk;
1839   PetscScalar       *x, *t;
1840   const PetscScalar *b;
1841   PetscInt           nz, k;
1842 
1843   PetscFunctionBegin;
1844   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1845   PetscCall(VecGetArrayRead(bb, &b));
1846   PetscCall(VecGetArray(xx, &x));
1847   t = a->solve_work;
1848   PetscCall(ISGetIndices(isrow, &rp));
1849 
1850   for (k = mbs - 1; k >= 0; k--) {
1851     v     = aa + ai[k];
1852     vj    = aj + ai[k];
1853     diagk = PetscRealPart(aa[adiag[k]]);
1854     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1855     t[k] = b[k] * PetscSqrtReal(diagk);
1856     nz   = ai[k + 1] - ai[k] - 1;
1857     while (nz--) t[k] += (*v++) * t[*vj++];
1858     x[rp[k]] = t[k];
1859   }
1860   PetscCall(ISRestoreIndices(isrow, &rp));
1861   PetscCall(VecRestoreArrayRead(bb, &b));
1862   PetscCall(VecRestoreArray(xx, &x));
1863   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1864   PetscFunctionReturn(PETSC_SUCCESS);
1865 }
1866 
MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)1867 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1868 {
1869   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1870   IS                 isrow = a->row;
1871   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1872   const MatScalar   *aa = a->a, *v;
1873   PetscReal          diagk;
1874   PetscScalar       *x, *t;
1875   const PetscScalar *b;
1876   PetscInt           nz, k;
1877 
1878   PetscFunctionBegin;
1879   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1880   PetscCall(VecGetArrayRead(bb, &b));
1881   PetscCall(VecGetArray(xx, &x));
1882   t = a->solve_work;
1883   PetscCall(ISGetIndices(isrow, &rp));
1884 
1885   for (k = mbs - 1; k >= 0; k--) {
1886     v     = aa + ai[k] + 1;
1887     vj    = aj + ai[k] + 1;
1888     diagk = PetscRealPart(aa[ai[k]]);
1889     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1890     t[k] = b[k] * PetscSqrtReal(diagk);
1891     nz   = ai[k + 1] - ai[k] - 1;
1892     while (nz--) t[k] += (*v++) * t[*vj++];
1893     x[rp[k]] = t[k];
1894   }
1895   PetscCall(ISRestoreIndices(isrow, &rp));
1896   PetscCall(VecRestoreArrayRead(bb, &b));
1897   PetscCall(VecRestoreArray(xx, &x));
1898   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1899   PetscFunctionReturn(PETSC_SUCCESS);
1900 }
1901 
MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)1902 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx)
1903 {
1904   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1905 
1906   PetscFunctionBegin;
1907   if (A->rmap->bs == 1) {
1908     PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v));
1909   } else {
1910     IS                 isrow = a->row;
1911     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1912     const MatScalar   *aa = a->a, *v;
1913     PetscScalar       *x, *t;
1914     const PetscScalar *b;
1915     PetscInt           nz, k, n, i, j;
1916 
1917     if (bb->n > a->solves_work_n) {
1918       PetscCall(PetscFree(a->solves_work));
1919       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1920       a->solves_work_n = bb->n;
1921     }
1922     n = bb->n;
1923     PetscCall(VecGetArrayRead(bb->v, &b));
1924     PetscCall(VecGetArray(xx->v, &x));
1925     t = a->solves_work;
1926 
1927     PetscCall(ISGetIndices(isrow, &rp));
1928 
1929     /* solve U^T*D*y = perm(b) by forward substitution */
1930     for (k = 0; k < mbs; k++) {
1931       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1932     }
1933     for (k = 0; k < mbs; k++) {
1934       v  = aa + ai[k];
1935       vj = aj + ai[k];
1936       nz = ai[k + 1] - ai[k] - 1;
1937       for (j = 0; j < nz; j++) {
1938         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1939         v++;
1940         vj++;
1941       }
1942       for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1943     }
1944 
1945     /* solve U*perm(x) = y by back substitution */
1946     for (k = mbs - 1; k >= 0; k--) {
1947       v  = aa + ai[k] - 1;
1948       vj = aj + ai[k] - 1;
1949       nz = ai[k + 1] - ai[k] - 1;
1950       for (j = 0; j < nz; j++) {
1951         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1952         v++;
1953         vj++;
1954       }
1955       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1956     }
1957 
1958     PetscCall(ISRestoreIndices(isrow, &rp));
1959     PetscCall(VecRestoreArrayRead(bb->v, &b));
1960     PetscCall(VecRestoreArray(xx->v, &x));
1961     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
1962   }
1963   PetscFunctionReturn(PETSC_SUCCESS);
1964 }
1965 
MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)1966 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx)
1967 {
1968   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1969 
1970   PetscFunctionBegin;
1971   if (A->rmap->bs == 1) {
1972     PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v));
1973   } else {
1974     IS                 isrow = a->row;
1975     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1976     const MatScalar   *aa = a->a, *v;
1977     PetscScalar       *x, *t;
1978     const PetscScalar *b;
1979     PetscInt           nz, k, n, i;
1980 
1981     if (bb->n > a->solves_work_n) {
1982       PetscCall(PetscFree(a->solves_work));
1983       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1984       a->solves_work_n = bb->n;
1985     }
1986     n = bb->n;
1987     PetscCall(VecGetArrayRead(bb->v, &b));
1988     PetscCall(VecGetArray(xx->v, &x));
1989     t = a->solves_work;
1990 
1991     PetscCall(ISGetIndices(isrow, &rp));
1992 
1993     /* solve U^T*D*y = perm(b) by forward substitution */
1994     for (k = 0; k < mbs; k++) {
1995       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1996     }
1997     for (k = 0; k < mbs; k++) {
1998       v  = aa + ai[k];
1999       vj = aj + ai[k];
2000       nz = ai[k + 1] - ai[k];
2001       while (nz--) {
2002         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
2003         v++;
2004         vj++;
2005       }
2006       for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */
2007     }
2008 
2009     /* solve U*perm(x) = y by back substitution */
2010     for (k = mbs - 1; k >= 0; k--) {
2011       v  = aa + ai[k];
2012       vj = aj + ai[k];
2013       nz = ai[k + 1] - ai[k];
2014       while (nz--) {
2015         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
2016         v++;
2017         vj++;
2018       }
2019       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
2020     }
2021 
2022     PetscCall(ISRestoreIndices(isrow, &rp));
2023     PetscCall(VecRestoreArrayRead(bb->v, &b));
2024     PetscCall(VecRestoreArray(xx->v, &x));
2025     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
2026   }
2027   PetscFunctionReturn(PETSC_SUCCESS);
2028 }
2029 
MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)2030 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2031 {
2032   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2033   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2034   const MatScalar   *aa = a->a, *v;
2035   const PetscScalar *b;
2036   PetscScalar       *x, xi;
2037   PetscInt           nz, i, j;
2038 
2039   PetscFunctionBegin;
2040   PetscCall(VecGetArrayRead(bb, &b));
2041   PetscCall(VecGetArray(xx, &x));
2042   /* solve U^T*D*y = b by forward substitution */
2043   PetscCall(PetscArraycpy(x, b, mbs));
2044   for (i = 0; i < mbs; i++) {
2045     v  = aa + ai[i];
2046     vj = aj + ai[i];
2047     xi = x[i];
2048     nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2049     for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2050     x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2051   }
2052   /* solve U*x = y by backward substitution */
2053   for (i = mbs - 2; i >= 0; i--) {
2054     xi = x[i];
2055     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2056     vj = aj + adiag[i] - 1;
2057     nz = ai[i + 1] - ai[i] - 1;
2058     for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2059     x[i] = xi;
2060   }
2061   PetscCall(VecRestoreArrayRead(bb, &b));
2062   PetscCall(VecRestoreArray(xx, &x));
2063   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2064   PetscFunctionReturn(PETSC_SUCCESS);
2065 }
2066 
MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)2067 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X)
2068 {
2069   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2070   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2071   const MatScalar   *aa = a->a, *v;
2072   const PetscScalar *b;
2073   PetscScalar       *x, xi;
2074   PetscInt           nz, i, j, neq, ldb, ldx;
2075   PetscBool          isdense;
2076 
2077   PetscFunctionBegin;
2078   if (!mbs) PetscFunctionReturn(PETSC_SUCCESS);
2079   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
2080   PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
2081   if (X != B) {
2082     PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
2083     PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
2084   }
2085   PetscCall(MatDenseGetArrayRead(B, &b));
2086   PetscCall(MatDenseGetLDA(B, &ldb));
2087   PetscCall(MatDenseGetArray(X, &x));
2088   PetscCall(MatDenseGetLDA(X, &ldx));
2089   for (neq = 0; neq < B->cmap->n; neq++) {
2090     /* solve U^T*D*y = b by forward substitution */
2091     PetscCall(PetscArraycpy(x, b, mbs));
2092     for (i = 0; i < mbs; i++) {
2093       v  = aa + ai[i];
2094       vj = aj + ai[i];
2095       xi = x[i];
2096       nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2097       for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2098       x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2099     }
2100     /* solve U*x = y by backward substitution */
2101     for (i = mbs - 2; i >= 0; i--) {
2102       xi = x[i];
2103       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2104       vj = aj + adiag[i] - 1;
2105       nz = ai[i + 1] - ai[i] - 1;
2106       for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2107       x[i] = xi;
2108     }
2109     b += ldb;
2110     x += ldx;
2111   }
2112   PetscCall(MatDenseRestoreArrayRead(B, &b));
2113   PetscCall(MatDenseRestoreArray(X, &x));
2114   PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs)));
2115   PetscFunctionReturn(PETSC_SUCCESS);
2116 }
2117 
MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)2118 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2119 {
2120   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2121   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2122   const MatScalar   *aa = a->a, *v;
2123   PetscScalar       *x, xk;
2124   const PetscScalar *b;
2125   PetscInt           nz, k;
2126 
2127   PetscFunctionBegin;
2128   PetscCall(VecGetArrayRead(bb, &b));
2129   PetscCall(VecGetArray(xx, &x));
2130 
2131   /* solve U^T*D*y = b by forward substitution */
2132   PetscCall(PetscArraycpy(x, b, mbs));
2133   for (k = 0; k < mbs; k++) {
2134     v  = aa + ai[k] + 1;
2135     vj = aj + ai[k] + 1;
2136     xk = x[k];
2137     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2138     while (nz--) x[*vj++] += (*v++) * xk;
2139     x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2140   }
2141 
2142   /* solve U*x = y by back substitution */
2143   for (k = mbs - 2; k >= 0; k--) {
2144     v  = aa + ai[k] + 1;
2145     vj = aj + ai[k] + 1;
2146     xk = x[k];
2147     nz = ai[k + 1] - ai[k] - 1;
2148     while (nz--) xk += (*v++) * x[*vj++];
2149     x[k] = xk;
2150   }
2151 
2152   PetscCall(VecRestoreArrayRead(bb, &b));
2153   PetscCall(VecRestoreArray(xx, &x));
2154   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2155   PetscFunctionReturn(PETSC_SUCCESS);
2156 }
2157 
MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)2158 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2159 {
2160   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2161   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2162   const MatScalar   *aa = a->a, *v;
2163   PetscReal          diagk;
2164   PetscScalar       *x;
2165   const PetscScalar *b;
2166   PetscInt           nz, k;
2167 
2168   PetscFunctionBegin;
2169   /* solve U^T*D^(1/2)*x = b by forward substitution */
2170   PetscCall(VecGetArrayRead(bb, &b));
2171   PetscCall(VecGetArray(xx, &x));
2172   PetscCall(PetscArraycpy(x, b, mbs));
2173   for (k = 0; k < mbs; k++) {
2174     v  = aa + ai[k];
2175     vj = aj + ai[k];
2176     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2177     while (nz--) x[*vj++] += (*v++) * x[k];
2178     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2179     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]]));
2180     x[k] *= PetscSqrtReal(diagk);
2181   }
2182   PetscCall(VecRestoreArrayRead(bb, &b));
2183   PetscCall(VecRestoreArray(xx, &x));
2184   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2185   PetscFunctionReturn(PETSC_SUCCESS);
2186 }
2187 
MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)2188 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2189 {
2190   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2191   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2192   const MatScalar   *aa = a->a, *v;
2193   PetscReal          diagk;
2194   PetscScalar       *x;
2195   const PetscScalar *b;
2196   PetscInt           nz, k;
2197 
2198   PetscFunctionBegin;
2199   /* solve U^T*D^(1/2)*x = b by forward substitution */
2200   PetscCall(VecGetArrayRead(bb, &b));
2201   PetscCall(VecGetArray(xx, &x));
2202   PetscCall(PetscArraycpy(x, b, mbs));
2203   for (k = 0; k < mbs; k++) {
2204     v  = aa + ai[k] + 1;
2205     vj = aj + ai[k] + 1;
2206     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2207     while (nz--) x[*vj++] += (*v++) * x[k];
2208     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2209     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]]));
2210     x[k] *= PetscSqrtReal(diagk);
2211   }
2212   PetscCall(VecRestoreArrayRead(bb, &b));
2213   PetscCall(VecRestoreArray(xx, &x));
2214   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2215   PetscFunctionReturn(PETSC_SUCCESS);
2216 }
2217 
MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)2218 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2219 {
2220   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2221   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2222   const MatScalar   *aa = a->a, *v;
2223   PetscReal          diagk;
2224   PetscScalar       *x;
2225   const PetscScalar *b;
2226   PetscInt           nz, k;
2227 
2228   PetscFunctionBegin;
2229   /* solve D^(1/2)*U*x = b by back substitution */
2230   PetscCall(VecGetArrayRead(bb, &b));
2231   PetscCall(VecGetArray(xx, &x));
2232 
2233   for (k = mbs - 1; k >= 0; k--) {
2234     v     = aa + ai[k];
2235     vj    = aj + ai[k];
2236     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2237     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2238     x[k] = PetscSqrtReal(diagk) * b[k];
2239     nz   = ai[k + 1] - ai[k] - 1;
2240     while (nz--) x[k] += (*v++) * x[*vj++];
2241   }
2242   PetscCall(VecRestoreArrayRead(bb, &b));
2243   PetscCall(VecRestoreArray(xx, &x));
2244   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2245   PetscFunctionReturn(PETSC_SUCCESS);
2246 }
2247 
MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)2248 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2249 {
2250   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2251   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2252   const MatScalar   *aa = a->a, *v;
2253   PetscReal          diagk;
2254   PetscScalar       *x;
2255   const PetscScalar *b;
2256   PetscInt           nz, k;
2257 
2258   PetscFunctionBegin;
2259   /* solve D^(1/2)*U*x = b by back substitution */
2260   PetscCall(VecGetArrayRead(bb, &b));
2261   PetscCall(VecGetArray(xx, &x));
2262 
2263   for (k = mbs - 1; k >= 0; k--) {
2264     v     = aa + ai[k] + 1;
2265     vj    = aj + ai[k] + 1;
2266     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2267     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2268     x[k] = PetscSqrtReal(diagk) * b[k];
2269     nz   = ai[k + 1] - ai[k] - 1;
2270     while (nz--) x[k] += (*v++) * x[*vj++];
2271   }
2272   PetscCall(VecRestoreArrayRead(bb, &b));
2273   PetscCall(VecRestoreArray(xx, &x));
2274   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2275   PetscFunctionReturn(PETSC_SUCCESS);
2276 }
2277 
2278 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo * info)2279 static PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2280 {
2281   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b;
2282   const PetscInt *rip, mbs    = a->mbs, *ai, *aj;
2283   PetscInt       *jutmp, bs   = A->rmap->bs, i;
2284   PetscInt        m, reallocs = 0, *levtmp;
2285   PetscInt       *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
2286   PetscInt        incrlev, *lev, shift, prow, nz;
2287   PetscReal       f = info->fill, levels = info->levels;
2288   PetscBool       perm_identity;
2289 
2290   PetscFunctionBegin;
2291   /* check whether perm is the identity mapping */
2292   PetscCall(ISIdentity(perm, &perm_identity));
2293 
2294   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2295   a->permute = PETSC_FALSE;
2296   ai         = a->i;
2297   aj         = a->j;
2298 
2299   /* initialization */
2300   PetscCall(ISGetIndices(perm, &rip));
2301   umax = (PetscInt)(f * ai[mbs] + 1);
2302   PetscCall(PetscMalloc1(umax, &lev));
2303   umax += mbs + 1;
2304   shift = mbs + 1;
2305   PetscCall(PetscMalloc1(mbs + 1, &iu));
2306   PetscCall(PetscMalloc1(umax, &ju));
2307   iu[0] = mbs + 1;
2308   juidx = mbs + 1;
2309   /* prowl: linked list for pivot row */
2310   PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp));
2311   /* q: linked list for col index */
2312 
2313   for (i = 0; i < mbs; i++) {
2314     prowl[i] = mbs;
2315     q[i]     = 0;
2316   }
2317 
2318   /* for each row k */
2319   for (k = 0; k < mbs; k++) {
2320     nzk  = 0;
2321     q[k] = mbs;
2322     /* copy current row into linked list */
2323     nz = ai[rip[k] + 1] - ai[rip[k]];
2324     j  = ai[rip[k]];
2325     while (nz--) {
2326       vj = rip[aj[j++]];
2327       if (vj > k) {
2328         qm = k;
2329         do {
2330           m  = qm;
2331           qm = q[m];
2332         } while (qm < vj);
2333         PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
2334         nzk++;
2335         q[m]       = vj;
2336         q[vj]      = qm;
2337         levtmp[vj] = 0; /* initialize lev for nonzero element */
2338       }
2339     }
2340 
2341     /* modify nonzero structure of k-th row by computing fill-in
2342        for each row prow to be merged in */
2343     prow = k;
2344     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2345 
2346     while (prow < k) {
2347       /* merge row prow into k-th row */
2348       jmin = iu[prow] + 1;
2349       jmax = iu[prow + 1];
2350       qm   = k;
2351       for (j = jmin; j < jmax; j++) {
2352         incrlev = lev[j - shift] + 1;
2353         if (incrlev > levels) continue;
2354         vj = ju[j];
2355         do {
2356           m  = qm;
2357           qm = q[m];
2358         } while (qm < vj);
2359         if (qm != vj) { /* a fill */
2360           nzk++;
2361           q[m]       = vj;
2362           q[vj]      = qm;
2363           qm         = vj;
2364           levtmp[vj] = incrlev;
2365         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2366       }
2367       prow = prowl[prow]; /* next pivot row */
2368     }
2369 
2370     /* add k to row list for first nonzero element in k-th row */
2371     if (nzk > 1) {
2372       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2373       prowl[k] = prowl[i];
2374       prowl[i] = k;
2375     }
2376     iu[k + 1] = iu[k] + nzk;
2377 
2378     /* allocate more space to ju and lev if needed */
2379     if (iu[k + 1] > umax) {
2380       /* estimate how much additional space we will need */
2381       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2382       /* just double the memory each time */
2383       maxadd = umax;
2384       if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
2385       umax += maxadd;
2386 
2387       /* allocate a longer ju */
2388       PetscCall(PetscMalloc1(umax, &jutmp));
2389       PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
2390       PetscCall(PetscFree(ju));
2391       ju = jutmp;
2392 
2393       PetscCall(PetscMalloc1(umax, &jutmp));
2394       PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift));
2395       PetscCall(PetscFree(lev));
2396       lev = jutmp;
2397       reallocs += 2; /* count how many times we realloc */
2398     }
2399 
2400     /* save nonzero structure of k-th row in ju */
2401     i = k;
2402     while (nzk--) {
2403       i                  = q[i];
2404       ju[juidx]          = i;
2405       lev[juidx - shift] = levtmp[i];
2406       juidx++;
2407     }
2408   }
2409 
2410 #if defined(PETSC_USE_INFO)
2411   if (ai[mbs] != 0) {
2412     PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2413     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
2414     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2415     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
2416     PetscCall(PetscInfo(A, "for best performance.\n"));
2417   } else {
2418     PetscCall(PetscInfo(A, "Empty matrix\n"));
2419   }
2420 #endif
2421 
2422   PetscCall(ISRestoreIndices(perm, &rip));
2423   PetscCall(PetscFree3(prowl, q, levtmp));
2424   PetscCall(PetscFree(lev));
2425 
2426   /* put together the new matrix */
2427   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL));
2428 
2429   b = (Mat_SeqSBAIJ *)B->data;
2430   PetscCall(PetscFree2(b->imax, b->ilen));
2431 
2432   /* the next line frees the default space generated by the Create() */
2433   PetscCall(MatSeqXAIJFreeAIJ(B, &b->a, &b->j, &b->i));
2434   PetscCall(PetscShmgetAllocateArray((iu[mbs] + 1) * a->bs2, sizeof(PetscScalar), (void **)&b->a));
2435   b->free_a  = PETSC_TRUE;
2436   b->free_ij = PETSC_TRUE;
2437   b->j       = ju;
2438   b->i       = iu;
2439   b->diag    = NULL;
2440   b->ilen    = NULL;
2441   b->imax    = NULL;
2442 
2443   PetscCall(ISDestroy(&b->row));
2444   PetscCall(ISDestroy(&b->icol));
2445   b->row  = perm;
2446   b->icol = perm;
2447   PetscCall(PetscObjectReference((PetscObject)perm));
2448   PetscCall(PetscObjectReference((PetscObject)perm));
2449   PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
2450   /* In b structure:  Free imax, ilen, old a, old j.
2451      Allocate idnew, solve_work, new a, new j */
2452   b->maxnz = b->nz = iu[mbs];
2453 
2454   B->info.factor_mallocs   = reallocs;
2455   B->info.fill_ratio_given = f;
2456   if (ai[mbs] != 0) {
2457     B->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2458   } else {
2459     B->info.fill_ratio_needed = 0.0;
2460   }
2461   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity));
2462   PetscFunctionReturn(PETSC_SUCCESS);
2463 }
2464 
2465 /*
2466   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2467 */
2468 #include <petscbt.h>
2469 #include <../src/mat/utils/freespace.h>
MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo * info)2470 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2471 {
2472   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data, *b;
2473   PetscBool          perm_identity, free_ij = PETSC_TRUE, diagDense;
2474   PetscInt           bs = A->rmap->bs, am = a->mbs, *ai = a->i, *aj = a->j;
2475   const PetscInt    *rip;
2476   PetscInt           reallocs = 0, i, *ui, *udiag, *cols;
2477   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2478   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr;
2479   PetscReal          fill = info->fill, levels = info->levels;
2480   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2481   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2482   PetscBT            lnkbt;
2483 
2484   PetscFunctionBegin;
2485   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2486   PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, NULL, &diagDense));
2487   PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
2488   if (bs > 1) {
2489     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
2490     PetscFunctionReturn(PETSC_SUCCESS);
2491   }
2492 
2493   /* check whether perm is the identity mapping */
2494   PetscCall(ISIdentity(perm, &perm_identity));
2495   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2496   a->permute = PETSC_FALSE;
2497 
2498   PetscCall(PetscMalloc1(am + 1, &ui));
2499   PetscCall(PetscMalloc1(am + 1, &udiag));
2500   ui[0] = 0;
2501 
2502   /* ICC(0) without matrix ordering: simply rearrange column indices */
2503   if (!levels) {
2504     /* reuse the column pointers and row offsets for memory savings */
2505     for (i = 0; i < am; i++) {
2506       ncols     = ai[i + 1] - ai[i];
2507       ui[i + 1] = ui[i] + ncols;
2508       udiag[i]  = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
2509     }
2510     PetscCall(PetscMalloc1(ui[am], &uj));
2511     cols = uj;
2512     for (i = 0; i < am; i++) {
2513       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2514       ncols = ai[i + 1] - ai[i] - 1;
2515       for (j = 0; j < ncols; j++) *cols++ = aj[j];
2516       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2517     }
2518   } else { /* case: levels>0 */
2519     PetscCall(ISGetIndices(perm, &rip));
2520 
2521     /* initialization */
2522     /* jl: linked list for storing indices of the pivot rows
2523        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2524     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2525     for (i = 0; i < am; i++) {
2526       jl[i] = am;
2527       il[i] = 0;
2528     }
2529 
2530     /* create and initialize a linked list for storing column indices of the active row k */
2531     nlnk = am + 1;
2532     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2533 
2534     /* initial FreeSpace size is fill*(ai[am]+1) */
2535     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2536 
2537     current_space = free_space;
2538 
2539     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2540 
2541     current_space_lvl = free_space_lvl;
2542 
2543     for (k = 0; k < am; k++) { /* for each active row k */
2544       /* initialize lnk by the column indices of row k */
2545       nzk   = 0;
2546       ncols = ai[k + 1] - ai[k];
2547       PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
2548       cols = aj + ai[k];
2549       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2550       nzk += nlnk;
2551 
2552       /* update lnk by computing fill-in for each pivot row to be merged in */
2553       prow = jl[k]; /* 1st pivot row */
2554 
2555       while (prow < k) {
2556         nextprow = jl[prow];
2557 
2558         /* merge prow into k-th row */
2559         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2560         jmax  = ui[prow + 1];
2561         ncols = jmax - jmin;
2562         i     = jmin - ui[prow];
2563 
2564         cols = uj_ptr[prow] + i;     /* points to the 2nd nzero entry in U(prow,k:am-1) */
2565         uj   = uj_lvl_ptr[prow] + i; /* levels of cols */
2566         j    = *(uj - 1);
2567         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2568         nzk += nlnk;
2569 
2570         /* update il and jl for prow */
2571         if (jmin < jmax) {
2572           il[prow] = jmin;
2573 
2574           j        = *cols;
2575           jl[prow] = jl[j];
2576           jl[j]    = prow;
2577         }
2578         prow = nextprow;
2579       }
2580 
2581       /* if free space is not available, make more free space */
2582       if (current_space->local_remaining < nzk) {
2583         i = am - k + 1;                                    /* num of unfactored rows */
2584         i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2585         PetscCall(PetscFreeSpaceGet(i, &current_space));
2586         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2587         reallocs++;
2588       }
2589 
2590       /* copy data into free_space and free_space_lvl, then initialize lnk */
2591       PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k);
2592       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2593 
2594       /* add the k-th row into il and jl */
2595       if (nzk > 1) {
2596         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2597         jl[k] = jl[i];
2598         jl[i] = k;
2599         il[k] = ui[k] + 1;
2600       }
2601       uj_ptr[k]     = current_space->array;
2602       uj_lvl_ptr[k] = current_space_lvl->array;
2603 
2604       current_space->array += nzk;
2605       current_space->local_used += nzk;
2606       current_space->local_remaining -= nzk;
2607       current_space_lvl->array += nzk;
2608       current_space_lvl->local_used += nzk;
2609       current_space_lvl->local_remaining -= nzk;
2610 
2611       ui[k + 1] = ui[k] + nzk;
2612     }
2613 
2614     PetscCall(ISRestoreIndices(perm, &rip));
2615     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2616 
2617     /* destroy list of free space and other temporary array(s) */
2618     PetscCall(PetscMalloc1(ui[am], &uj));
2619     PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor  */
2620     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2621     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2622 
2623   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2624 
2625   /* put together the new matrix in MATSEQSBAIJ format */
2626   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2627 
2628   b = (Mat_SeqSBAIJ *)fact->data;
2629   PetscCall(PetscFree2(b->imax, b->ilen));
2630   PetscCall(PetscShmgetAllocateArray(ui[am], sizeof(PetscScalar), (void **)&b->a));
2631   b->free_a  = PETSC_TRUE;
2632   b->free_ij = free_ij;
2633   b->j       = uj;
2634   b->i       = ui;
2635   b->diag    = udiag;
2636   b->ilen    = NULL;
2637   b->imax    = NULL;
2638   b->row     = perm;
2639   b->col     = perm;
2640 
2641   PetscCall(PetscObjectReference((PetscObject)perm));
2642   PetscCall(PetscObjectReference((PetscObject)perm));
2643 
2644   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2645 
2646   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2647 
2648   b->maxnz = b->nz = ui[am];
2649 
2650   fact->info.factor_mallocs   = reallocs;
2651   fact->info.fill_ratio_given = fill;
2652   if (ai[am] != 0) {
2653     fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am];
2654   } else {
2655     fact->info.fill_ratio_needed = 0.0;
2656   }
2657 #if defined(PETSC_USE_INFO)
2658   if (ai[am] != 0) {
2659     PetscReal af = fact->info.fill_ratio_needed;
2660     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2661     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2662     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2663   } else {
2664     PetscCall(PetscInfo(A, "Empty matrix\n"));
2665   }
2666 #endif
2667   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2668   PetscFunctionReturn(PETSC_SUCCESS);
2669 }
2670 
MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo * info)2671 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2672 {
2673   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
2674   Mat_SeqSBAIJ      *b;
2675   PetscBool          perm_identity, free_ij = PETSC_TRUE;
2676   PetscInt           bs = A->rmap->bs, am = a->mbs;
2677   const PetscInt    *cols, *rip, *ai = a->i, *aj = a->j;
2678   PetscInt           reallocs = 0, i, *ui;
2679   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2680   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
2681   PetscReal          fill = info->fill, levels = info->levels, ratio_needed;
2682   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2683   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2684   PetscBT            lnkbt;
2685 
2686   PetscFunctionBegin;
2687   /*
2688    This code originally uses Modified Sparse Row (MSR) storage
2689    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2690    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2691    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2692    thus the original code in MSR format is still used for these cases.
2693    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2694    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2695   */
2696   if (bs > 1) {
2697     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
2698     PetscFunctionReturn(PETSC_SUCCESS);
2699   }
2700 
2701   /* check whether perm is the identity mapping */
2702   PetscCall(ISIdentity(perm, &perm_identity));
2703   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2704   a->permute = PETSC_FALSE;
2705 
2706   /* special case that simply copies fill pattern */
2707   if (!levels) {
2708     /* reuse the column pointers and row offsets for memory savings */
2709     ui           = a->i;
2710     uj           = a->j;
2711     free_ij      = PETSC_FALSE;
2712     ratio_needed = 1.0;
2713   } else { /* case: levels>0 */
2714     PetscCall(ISGetIndices(perm, &rip));
2715 
2716     /* initialization */
2717     PetscCall(PetscMalloc1(am + 1, &ui));
2718     ui[0] = 0;
2719 
2720     /* jl: linked list for storing indices of the pivot rows
2721        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2722     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2723     for (i = 0; i < am; i++) {
2724       jl[i] = am;
2725       il[i] = 0;
2726     }
2727 
2728     /* create and initialize a linked list for storing column indices of the active row k */
2729     nlnk = am + 1;
2730     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2731 
2732     /* initial FreeSpace size is fill*(ai[am]+1) */
2733     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2734 
2735     current_space = free_space;
2736 
2737     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2738 
2739     current_space_lvl = free_space_lvl;
2740 
2741     for (k = 0; k < am; k++) { /* for each active row k */
2742       /* initialize lnk by the column indices of row rip[k] */
2743       nzk   = 0;
2744       ncols = ai[rip[k] + 1] - ai[rip[k]];
2745       cols  = aj + ai[rip[k]];
2746       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2747       nzk += nlnk;
2748 
2749       /* update lnk by computing fill-in for each pivot row to be merged in */
2750       prow = jl[k]; /* 1st pivot row */
2751 
2752       while (prow < k) {
2753         nextprow = jl[prow];
2754 
2755         /* merge prow into k-th row */
2756         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2757         jmax     = ui[prow + 1];
2758         ncols    = jmax - jmin;
2759         i        = jmin - ui[prow];
2760         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2761         j        = *(uj_lvl_ptr[prow] + i - 1);
2762         cols_lvl = uj_lvl_ptr[prow] + i;
2763         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2764         nzk += nlnk;
2765 
2766         /* update il and jl for prow */
2767         if (jmin < jmax) {
2768           il[prow] = jmin;
2769           j        = *cols;
2770           jl[prow] = jl[j];
2771           jl[j]    = prow;
2772         }
2773         prow = nextprow;
2774       }
2775 
2776       /* if free space is not available, make more free space */
2777       if (current_space->local_remaining < nzk) {
2778         i = am - k + 1;                                                             /* num of unfactored rows */
2779         i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2780         PetscCall(PetscFreeSpaceGet(i, &current_space));
2781         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2782         reallocs++;
2783       }
2784 
2785       /* copy data into free_space and free_space_lvl, then initialize lnk */
2786       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2787 
2788       /* add the k-th row into il and jl */
2789       if (nzk - 1 > 0) {
2790         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2791         jl[k] = jl[i];
2792         jl[i] = k;
2793         il[k] = ui[k] + 1;
2794       }
2795       uj_ptr[k]     = current_space->array;
2796       uj_lvl_ptr[k] = current_space_lvl->array;
2797 
2798       current_space->array += nzk;
2799       current_space->local_used += nzk;
2800       current_space->local_remaining -= nzk;
2801       current_space_lvl->array += nzk;
2802       current_space_lvl->local_used += nzk;
2803       current_space_lvl->local_remaining -= nzk;
2804 
2805       ui[k + 1] = ui[k] + nzk;
2806     }
2807 
2808     PetscCall(ISRestoreIndices(perm, &rip));
2809     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2810 
2811     /* destroy list of free space and other temporary array(s) */
2812     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2813     PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
2814     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2815     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2816     if (ai[am] != 0) {
2817       ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]);
2818     } else {
2819       ratio_needed = 0.0;
2820     }
2821   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2822 
2823   /* put together the new matrix in MATSEQSBAIJ format */
2824   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2825 
2826   b = (Mat_SeqSBAIJ *)fact->data;
2827   PetscCall(PetscFree2(b->imax, b->ilen));
2828   PetscCall(PetscShmgetAllocateArray(ui[am], sizeof(PetscScalar), (void **)&b->a));
2829   b->free_a        = PETSC_TRUE;
2830   b->free_ij       = free_ij;
2831   b->j             = uj;
2832   b->i             = ui;
2833   b->diag          = NULL;
2834   b->ilen          = NULL;
2835   b->imax          = NULL;
2836   b->row           = perm;
2837   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2838 
2839   PetscCall(PetscObjectReference((PetscObject)perm));
2840   b->icol = perm;
2841   PetscCall(PetscObjectReference((PetscObject)perm));
2842   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2843 
2844   b->maxnz = b->nz = ui[am];
2845 
2846   fact->info.factor_mallocs    = reallocs;
2847   fact->info.fill_ratio_given  = fill;
2848   fact->info.fill_ratio_needed = ratio_needed;
2849 #if defined(PETSC_USE_INFO)
2850   if (ai[am] != 0) {
2851     PetscReal af = fact->info.fill_ratio_needed;
2852     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2853     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2854     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2855   } else {
2856     PetscCall(PetscInfo(A, "Empty matrix\n"));
2857   }
2858 #endif
2859   if (perm_identity) {
2860     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2861   } else {
2862     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2863   }
2864   PetscFunctionReturn(PETSC_SUCCESS);
2865 }
2866