1 /*
2 Factorization code for SBAIJ format.
3 */
4
5 #include <../src/mat/impls/sbaij/seq/sbaij.h>
6 #include <../src/mat/impls/baij/seq/baij.h>
7 #include <petsc/private/kernels/blockinvert.h>
8
MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)9 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
10 {
11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
12 IS isrow = a->row;
13 PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
14 const PetscInt *r;
15 PetscInt nz, *vj, k, idx, k1;
16 PetscInt bs = A->rmap->bs, bs2 = a->bs2;
17 const MatScalar *aa = a->a, *v, *diag;
18 PetscScalar *x, *xk, *xj, *xk_tmp, *t;
19 const PetscScalar *b;
20
21 PetscFunctionBegin;
22 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
23 PetscCall(VecGetArrayRead(bb, &b));
24 PetscCall(VecGetArray(xx, &x));
25 t = a->solve_work;
26 PetscCall(ISGetIndices(isrow, &r));
27 PetscCall(PetscMalloc1(bs, &xk_tmp));
28
29 /* solve U^T * D * y = b by forward substitution */
30 xk = t;
31 for (k = 0; k < mbs; k++) { /* t <- perm(b) */
32 idx = bs * r[k];
33 for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1];
34 }
35 for (k = 0; k < mbs; k++) {
36 v = aa + bs2 * ai[k];
37 xk = t + k * bs; /* Dk*xk = k-th block of x */
38 PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
39 nz = ai[k + 1] - ai[k];
40 vj = aj + ai[k];
41 xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */
42 while (nz--) {
43 /* x(:) += U(k,:)^T*(Dk*xk) */
44 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
45 vj++;
46 xj = t + (*vj) * bs;
47 v += bs2;
48 }
49 /* xk = inv(Dk)*(Dk*xk) */
50 diag = aa + k * bs2; /* ptr to inv(Dk) */
51 PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
52 }
53
54 /* solve U*x = y by back substitution */
55 for (k = mbs - 1; k >= 0; k--) {
56 v = aa + bs2 * ai[k];
57 xk = t + k * bs; /* xk */
58 nz = ai[k + 1] - ai[k];
59 vj = aj + ai[k];
60 xj = t + (*vj) * bs;
61 while (nz--) {
62 /* xk += U(k,:)*x(:) */
63 PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
64 vj++;
65 v += bs2;
66 xj = t + (*vj) * bs;
67 }
68 idx = bs * r[k];
69 for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++;
70 }
71
72 PetscCall(PetscFree(xk_tmp));
73 PetscCall(ISRestoreIndices(isrow, &r));
74 PetscCall(VecRestoreArrayRead(bb, &b));
75 PetscCall(VecRestoreArray(xx, &x));
76 PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs));
77 PetscFunctionReturn(PETSC_SUCCESS);
78 }
79
MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)80 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
81 {
82 PetscFunctionBegin;
83 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet");
84 }
85
MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)86 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
87 {
88 PetscFunctionBegin;
89 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented");
90 }
91
MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscInt bs,PetscScalar * x)92 static PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
93 {
94 PetscInt nz, k;
95 const PetscInt *vj, bs2 = bs * bs;
96 const MatScalar *v, *diag;
97 PetscScalar *xk, *xj, *xk_tmp;
98
99 PetscFunctionBegin;
100 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
101 PetscCall(PetscMalloc1(bs, &xk_tmp));
102 for (k = 0; k < mbs; k++) {
103 v = aa + bs2 * ai[k];
104 xk = x + k * bs; /* Dk*xk = k-th block of x */
105 PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
106 nz = ai[k + 1] - ai[k];
107 vj = aj + ai[k];
108 xj = x + (size_t)(*vj) * bs; /* *vj-th block of x, *vj>k */
109 while (nz--) {
110 /* x(:) += U(k,:)^T*(Dk*xk) */
111 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
112 vj++;
113 xj = x + (size_t)(*vj) * bs;
114 v += bs2;
115 }
116 /* xk = inv(Dk)*(Dk*xk) */
117 diag = aa + k * bs2; /* ptr to inv(Dk) */
118 PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
119 }
120 PetscCall(PetscFree(xk_tmp));
121 PetscFunctionReturn(PETSC_SUCCESS);
122 }
123
MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscInt bs,PetscScalar * x)124 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
125 {
126 PetscInt nz, k;
127 const PetscInt *vj, bs2 = bs * bs;
128 const MatScalar *v;
129 PetscScalar *xk, *xj;
130
131 PetscFunctionBegin;
132 for (k = mbs - 1; k >= 0; k--) {
133 v = aa + bs2 * ai[k];
134 xk = x + k * bs; /* xk */
135 nz = ai[k + 1] - ai[k];
136 vj = aj + ai[k];
137 xj = x + (size_t)(*vj) * bs;
138 while (nz--) {
139 /* xk += U(k,:)*x(:) */
140 PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
141 vj++;
142 v += bs2;
143 xj = x + (size_t)(*vj) * bs;
144 }
145 }
146 PetscFunctionReturn(PETSC_SUCCESS);
147 }
148
MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)149 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
150 {
151 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
152 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
153 PetscInt bs = A->rmap->bs;
154 const MatScalar *aa = a->a;
155 PetscScalar *x;
156 const PetscScalar *b;
157
158 PetscFunctionBegin;
159 PetscCall(VecGetArrayRead(bb, &b));
160 PetscCall(VecGetArray(xx, &x));
161
162 /* solve U^T * D * y = b by forward substitution */
163 PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
164 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
165
166 /* solve U*x = y by back substitution */
167 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
168
169 PetscCall(VecRestoreArrayRead(bb, &b));
170 PetscCall(VecRestoreArray(xx, &x));
171 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs));
172 PetscFunctionReturn(PETSC_SUCCESS);
173 }
174
MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)175 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
176 {
177 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
178 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
179 PetscInt bs = A->rmap->bs;
180 const MatScalar *aa = a->a;
181 const PetscScalar *b;
182 PetscScalar *x;
183
184 PetscFunctionBegin;
185 PetscCall(VecGetArrayRead(bb, &b));
186 PetscCall(VecGetArray(xx, &x));
187 PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
188 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
189 PetscCall(VecRestoreArrayRead(bb, &b));
190 PetscCall(VecRestoreArray(xx, &x));
191 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs));
192 PetscFunctionReturn(PETSC_SUCCESS);
193 }
194
MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)195 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
196 {
197 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
198 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
199 PetscInt bs = A->rmap->bs;
200 const MatScalar *aa = a->a;
201 const PetscScalar *b;
202 PetscScalar *x;
203
204 PetscFunctionBegin;
205 PetscCall(VecGetArrayRead(bb, &b));
206 PetscCall(VecGetArray(xx, &x));
207 PetscCall(PetscArraycpy(x, b, bs * mbs));
208 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
209 PetscCall(VecRestoreArrayRead(bb, &b));
210 PetscCall(VecRestoreArray(xx, &x));
211 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
212 PetscFunctionReturn(PETSC_SUCCESS);
213 }
214
MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)215 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
216 {
217 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
218 IS isrow = a->row;
219 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
220 PetscInt nz, k, idx;
221 const MatScalar *aa = a->a, *v, *d;
222 PetscScalar *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp;
223 const PetscScalar *b;
224
225 PetscFunctionBegin;
226 PetscCall(VecGetArrayRead(bb, &b));
227 PetscCall(VecGetArray(xx, &x));
228 t = a->solve_work;
229 PetscCall(ISGetIndices(isrow, &r));
230
231 /* solve U^T * D * y = b by forward substitution */
232 tp = t;
233 for (k = 0; k < mbs; k++) { /* t <- perm(b) */
234 idx = 7 * r[k];
235 tp[0] = b[idx];
236 tp[1] = b[idx + 1];
237 tp[2] = b[idx + 2];
238 tp[3] = b[idx + 3];
239 tp[4] = b[idx + 4];
240 tp[5] = b[idx + 5];
241 tp[6] = b[idx + 6];
242 tp += 7;
243 }
244
245 for (k = 0; k < mbs; k++) {
246 v = aa + 49 * ai[k];
247 vj = aj + ai[k];
248 tp = t + k * 7;
249 x0 = tp[0];
250 x1 = tp[1];
251 x2 = tp[2];
252 x3 = tp[3];
253 x4 = tp[4];
254 x5 = tp[5];
255 x6 = tp[6];
256 nz = ai[k + 1] - ai[k];
257 tp = t + (*vj) * 7;
258 while (nz--) {
259 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
260 tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
261 tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
262 tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
263 tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
264 tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
265 tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
266 vj++;
267 tp = t + (*vj) * 7;
268 v += 49;
269 }
270
271 /* xk = inv(Dk)*(Dk*xk) */
272 d = aa + k * 49; /* ptr to inv(Dk) */
273 tp = t + k * 7;
274 tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
275 tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
276 tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
277 tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
278 tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
279 tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
280 tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
281 }
282
283 /* solve U*x = y by back substitution */
284 for (k = mbs - 1; k >= 0; k--) {
285 v = aa + 49 * ai[k];
286 vj = aj + ai[k];
287 tp = t + k * 7;
288 x0 = tp[0];
289 x1 = tp[1];
290 x2 = tp[2];
291 x3 = tp[3];
292 x4 = tp[4];
293 x5 = tp[5];
294 x6 = tp[6]; /* xk */
295 nz = ai[k + 1] - ai[k];
296
297 tp = t + (*vj) * 7;
298 while (nz--) {
299 /* xk += U(k,:)*x(:) */
300 x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6];
301 x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6];
302 x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6];
303 x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6];
304 x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6];
305 x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6];
306 x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6];
307 vj++;
308 tp = t + (*vj) * 7;
309 v += 49;
310 }
311 tp = t + k * 7;
312 tp[0] = x0;
313 tp[1] = x1;
314 tp[2] = x2;
315 tp[3] = x3;
316 tp[4] = x4;
317 tp[5] = x5;
318 tp[6] = x6;
319 idx = 7 * r[k];
320 x[idx] = x0;
321 x[idx + 1] = x1;
322 x[idx + 2] = x2;
323 x[idx + 3] = x3;
324 x[idx + 4] = x4;
325 x[idx + 5] = x5;
326 x[idx + 6] = x6;
327 }
328
329 PetscCall(ISRestoreIndices(isrow, &r));
330 PetscCall(VecRestoreArrayRead(bb, &b));
331 PetscCall(VecRestoreArray(xx, &x));
332 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
333 PetscFunctionReturn(PETSC_SUCCESS);
334 }
335
MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)336 static PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
337 {
338 const MatScalar *v, *d;
339 PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6;
340 PetscInt nz, k;
341 const PetscInt *vj;
342
343 PetscFunctionBegin;
344 for (k = 0; k < mbs; k++) {
345 v = aa + 49 * ai[k];
346 xp = x + k * 7;
347 x0 = xp[0];
348 x1 = xp[1];
349 x2 = xp[2];
350 x3 = xp[3];
351 x4 = xp[4];
352 x5 = xp[5];
353 x6 = xp[6]; /* Dk*xk = k-th block of x */
354 nz = ai[k + 1] - ai[k];
355 vj = aj + ai[k];
356 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
357 PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358 while (nz--) {
359 xp = x + (*vj) * 7;
360 /* x(:) += U(k,:)^T*(Dk*xk) */
361 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
362 xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
363 xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
364 xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
365 xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
366 xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
367 xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
368 vj++;
369 v += 49;
370 }
371 /* xk = inv(Dk)*(Dk*xk) */
372 d = aa + k * 49; /* ptr to inv(Dk) */
373 xp = x + k * 7;
374 xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
375 xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
376 xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
377 xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
378 xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
379 xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
380 xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
381 }
382 PetscFunctionReturn(PETSC_SUCCESS);
383 }
384
MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)385 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
386 {
387 const MatScalar *v;
388 PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6;
389 PetscInt nz, k;
390 const PetscInt *vj;
391
392 PetscFunctionBegin;
393 for (k = mbs - 1; k >= 0; k--) {
394 v = aa + 49 * ai[k];
395 xp = x + k * 7;
396 x0 = xp[0];
397 x1 = xp[1];
398 x2 = xp[2];
399 x3 = xp[3];
400 x4 = xp[4];
401 x5 = xp[5];
402 x6 = xp[6]; /* xk */
403 nz = ai[k + 1] - ai[k];
404 vj = aj + ai[k];
405 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
406 PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
407 while (nz--) {
408 xp = x + (*vj) * 7;
409 /* xk += U(k,:)*x(:) */
410 x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6];
411 x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6];
412 x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6];
413 x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6];
414 x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6];
415 x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6];
416 x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6];
417 vj++;
418 v += 49;
419 }
420 xp = x + k * 7;
421 xp[0] = x0;
422 xp[1] = x1;
423 xp[2] = x2;
424 xp[3] = x3;
425 xp[4] = x4;
426 xp[5] = x5;
427 xp[6] = x6;
428 }
429 PetscFunctionReturn(PETSC_SUCCESS);
430 }
431
MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)432 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
433 {
434 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
435 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
436 const MatScalar *aa = a->a;
437 PetscScalar *x;
438 const PetscScalar *b;
439
440 PetscFunctionBegin;
441 PetscCall(VecGetArrayRead(bb, &b));
442 PetscCall(VecGetArray(xx, &x));
443
444 /* solve U^T * D * y = b by forward substitution */
445 PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */
446 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
447
448 /* solve U*x = y by back substitution */
449 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
450
451 PetscCall(VecRestoreArrayRead(bb, &b));
452 PetscCall(VecRestoreArray(xx, &x));
453 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
454 PetscFunctionReturn(PETSC_SUCCESS);
455 }
456
MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)457 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
458 {
459 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
460 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
461 const MatScalar *aa = a->a;
462 PetscScalar *x;
463 const PetscScalar *b;
464
465 PetscFunctionBegin;
466 PetscCall(VecGetArrayRead(bb, &b));
467 PetscCall(VecGetArray(xx, &x));
468 PetscCall(PetscArraycpy(x, b, 7 * mbs));
469 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
470 PetscCall(VecRestoreArrayRead(bb, &b));
471 PetscCall(VecRestoreArray(xx, &x));
472 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
473 PetscFunctionReturn(PETSC_SUCCESS);
474 }
475
MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)476 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
477 {
478 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
479 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
480 const MatScalar *aa = a->a;
481 PetscScalar *x;
482 const PetscScalar *b;
483
484 PetscFunctionBegin;
485 PetscCall(VecGetArrayRead(bb, &b));
486 PetscCall(VecGetArray(xx, &x));
487 PetscCall(PetscArraycpy(x, b, 7 * mbs));
488 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
489 PetscCall(VecRestoreArrayRead(bb, &b));
490 PetscCall(VecRestoreArray(xx, &x));
491 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
492 PetscFunctionReturn(PETSC_SUCCESS);
493 }
494
MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)495 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
496 {
497 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
498 IS isrow = a->row;
499 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
500 PetscInt nz, k, idx;
501 const MatScalar *aa = a->a, *v, *d;
502 PetscScalar *x, x0, x1, x2, x3, x4, x5, *t, *tp;
503 const PetscScalar *b;
504
505 PetscFunctionBegin;
506 PetscCall(VecGetArrayRead(bb, &b));
507 PetscCall(VecGetArray(xx, &x));
508 t = a->solve_work;
509 PetscCall(ISGetIndices(isrow, &r));
510
511 /* solve U^T * D * y = b by forward substitution */
512 tp = t;
513 for (k = 0; k < mbs; k++) { /* t <- perm(b) */
514 idx = 6 * r[k];
515 tp[0] = b[idx];
516 tp[1] = b[idx + 1];
517 tp[2] = b[idx + 2];
518 tp[3] = b[idx + 3];
519 tp[4] = b[idx + 4];
520 tp[5] = b[idx + 5];
521 tp += 6;
522 }
523
524 for (k = 0; k < mbs; k++) {
525 v = aa + 36 * ai[k];
526 vj = aj + ai[k];
527 tp = t + k * 6;
528 x0 = tp[0];
529 x1 = tp[1];
530 x2 = tp[2];
531 x3 = tp[3];
532 x4 = tp[4];
533 x5 = tp[5];
534 nz = ai[k + 1] - ai[k];
535 tp = t + (*vj) * 6;
536 while (nz--) {
537 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
538 tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
539 tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
540 tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
541 tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
542 tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
543 vj++;
544 tp = t + (*vj) * 6;
545 v += 36;
546 }
547
548 /* xk = inv(Dk)*(Dk*xk) */
549 d = aa + k * 36; /* ptr to inv(Dk) */
550 tp = t + k * 6;
551 tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
552 tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
553 tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
554 tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
555 tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
556 tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
557 }
558
559 /* solve U*x = y by back substitution */
560 for (k = mbs - 1; k >= 0; k--) {
561 v = aa + 36 * ai[k];
562 vj = aj + ai[k];
563 tp = t + k * 6;
564 x0 = tp[0];
565 x1 = tp[1];
566 x2 = tp[2];
567 x3 = tp[3];
568 x4 = tp[4];
569 x5 = tp[5]; /* xk */
570 nz = ai[k + 1] - ai[k];
571
572 tp = t + (*vj) * 6;
573 while (nz--) {
574 /* xk += U(k,:)*x(:) */
575 x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5];
576 x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5];
577 x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5];
578 x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5];
579 x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5];
580 x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5];
581 vj++;
582 tp = t + (*vj) * 6;
583 v += 36;
584 }
585 tp = t + k * 6;
586 tp[0] = x0;
587 tp[1] = x1;
588 tp[2] = x2;
589 tp[3] = x3;
590 tp[4] = x4;
591 tp[5] = x5;
592 idx = 6 * r[k];
593 x[idx] = x0;
594 x[idx + 1] = x1;
595 x[idx + 2] = x2;
596 x[idx + 3] = x3;
597 x[idx + 4] = x4;
598 x[idx + 5] = x5;
599 }
600
601 PetscCall(ISRestoreIndices(isrow, &r));
602 PetscCall(VecRestoreArrayRead(bb, &b));
603 PetscCall(VecRestoreArray(xx, &x));
604 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
605 PetscFunctionReturn(PETSC_SUCCESS);
606 }
607
MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)608 static PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
609 {
610 const MatScalar *v, *d;
611 PetscScalar *xp, x0, x1, x2, x3, x4, x5;
612 PetscInt nz, k;
613 const PetscInt *vj;
614
615 PetscFunctionBegin;
616 for (k = 0; k < mbs; k++) {
617 v = aa + 36 * ai[k];
618 xp = x + k * 6;
619 x0 = xp[0];
620 x1 = xp[1];
621 x2 = xp[2];
622 x3 = xp[3];
623 x4 = xp[4];
624 x5 = xp[5]; /* Dk*xk = k-th block of x */
625 nz = ai[k + 1] - ai[k];
626 vj = aj + ai[k];
627 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
628 PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
629 while (nz--) {
630 xp = x + (*vj) * 6;
631 /* x(:) += U(k,:)^T*(Dk*xk) */
632 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
633 xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
634 xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
635 xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
636 xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
637 xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
638 vj++;
639 v += 36;
640 }
641 /* xk = inv(Dk)*(Dk*xk) */
642 d = aa + k * 36; /* ptr to inv(Dk) */
643 xp = x + k * 6;
644 xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
645 xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
646 xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
647 xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
648 xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
649 xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
650 }
651 PetscFunctionReturn(PETSC_SUCCESS);
652 }
MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)653 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
654 {
655 const MatScalar *v;
656 PetscScalar *xp, x0, x1, x2, x3, x4, x5;
657 PetscInt nz, k;
658 const PetscInt *vj;
659
660 PetscFunctionBegin;
661 for (k = mbs - 1; k >= 0; k--) {
662 v = aa + 36 * ai[k];
663 xp = x + k * 6;
664 x0 = xp[0];
665 x1 = xp[1];
666 x2 = xp[2];
667 x3 = xp[3];
668 x4 = xp[4];
669 x5 = xp[5]; /* xk */
670 nz = ai[k + 1] - ai[k];
671 vj = aj + ai[k];
672 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
673 PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
674 while (nz--) {
675 xp = x + (*vj) * 6;
676 /* xk += U(k,:)*x(:) */
677 x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5];
678 x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5];
679 x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5];
680 x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5];
681 x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5];
682 x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5];
683 vj++;
684 v += 36;
685 }
686 xp = x + k * 6;
687 xp[0] = x0;
688 xp[1] = x1;
689 xp[2] = x2;
690 xp[3] = x3;
691 xp[4] = x4;
692 xp[5] = x5;
693 }
694 PetscFunctionReturn(PETSC_SUCCESS);
695 }
696
MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)697 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
698 {
699 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
700 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
701 const MatScalar *aa = a->a;
702 PetscScalar *x;
703 const PetscScalar *b;
704
705 PetscFunctionBegin;
706 PetscCall(VecGetArrayRead(bb, &b));
707 PetscCall(VecGetArray(xx, &x));
708
709 /* solve U^T * D * y = b by forward substitution */
710 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
711 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
712
713 /* solve U*x = y by back substitution */
714 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
715
716 PetscCall(VecRestoreArrayRead(bb, &b));
717 PetscCall(VecRestoreArray(xx, &x));
718 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
719 PetscFunctionReturn(PETSC_SUCCESS);
720 }
721
MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)722 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
723 {
724 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
725 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
726 const MatScalar *aa = a->a;
727 PetscScalar *x;
728 const PetscScalar *b;
729
730 PetscFunctionBegin;
731 PetscCall(VecGetArrayRead(bb, &b));
732 PetscCall(VecGetArray(xx, &x));
733 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
734 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
735 PetscCall(VecRestoreArrayRead(bb, &b));
736 PetscCall(VecRestoreArray(xx, &x));
737 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
738 PetscFunctionReturn(PETSC_SUCCESS);
739 }
740
MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)741 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
742 {
743 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
744 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
745 const MatScalar *aa = a->a;
746 PetscScalar *x;
747 const PetscScalar *b;
748
749 PetscFunctionBegin;
750 PetscCall(VecGetArrayRead(bb, &b));
751 PetscCall(VecGetArray(xx, &x));
752 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
753 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
754 PetscCall(VecRestoreArrayRead(bb, &b));
755 PetscCall(VecRestoreArray(xx, &x));
756 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
757 PetscFunctionReturn(PETSC_SUCCESS);
758 }
759
MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)760 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
761 {
762 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
763 IS isrow = a->row;
764 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
765 const PetscInt *r, *vj;
766 PetscInt nz, k, idx;
767 const MatScalar *aa = a->a, *v, *diag;
768 PetscScalar *x, x0, x1, x2, x3, x4, *t, *tp;
769 const PetscScalar *b;
770
771 PetscFunctionBegin;
772 PetscCall(VecGetArrayRead(bb, &b));
773 PetscCall(VecGetArray(xx, &x));
774 t = a->solve_work;
775 PetscCall(ISGetIndices(isrow, &r));
776
777 /* solve U^T * D * y = b by forward substitution */
778 tp = t;
779 for (k = 0; k < mbs; k++) { /* t <- perm(b) */
780 idx = 5 * r[k];
781 tp[0] = b[idx];
782 tp[1] = b[idx + 1];
783 tp[2] = b[idx + 2];
784 tp[3] = b[idx + 3];
785 tp[4] = b[idx + 4];
786 tp += 5;
787 }
788
789 for (k = 0; k < mbs; k++) {
790 v = aa + 25 * ai[k];
791 vj = aj + ai[k];
792 tp = t + k * 5;
793 x0 = tp[0];
794 x1 = tp[1];
795 x2 = tp[2];
796 x3 = tp[3];
797 x4 = tp[4];
798 nz = ai[k + 1] - ai[k];
799
800 tp = t + (*vj) * 5;
801 while (nz--) {
802 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
803 tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
804 tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
805 tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
806 tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
807 vj++;
808 tp = t + (*vj) * 5;
809 v += 25;
810 }
811
812 /* xk = inv(Dk)*(Dk*xk) */
813 diag = aa + k * 25; /* ptr to inv(Dk) */
814 tp = t + k * 5;
815 tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
816 tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
817 tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
818 tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
819 tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
820 }
821
822 /* solve U*x = y by back substitution */
823 for (k = mbs - 1; k >= 0; k--) {
824 v = aa + 25 * ai[k];
825 vj = aj + ai[k];
826 tp = t + k * 5;
827 x0 = tp[0];
828 x1 = tp[1];
829 x2 = tp[2];
830 x3 = tp[3];
831 x4 = tp[4]; /* xk */
832 nz = ai[k + 1] - ai[k];
833
834 tp = t + (*vj) * 5;
835 while (nz--) {
836 /* xk += U(k,:)*x(:) */
837 x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4];
838 x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4];
839 x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4];
840 x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4];
841 x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4];
842 vj++;
843 tp = t + (*vj) * 5;
844 v += 25;
845 }
846 tp = t + k * 5;
847 tp[0] = x0;
848 tp[1] = x1;
849 tp[2] = x2;
850 tp[3] = x3;
851 tp[4] = x4;
852 idx = 5 * r[k];
853 x[idx] = x0;
854 x[idx + 1] = x1;
855 x[idx + 2] = x2;
856 x[idx + 3] = x3;
857 x[idx + 4] = x4;
858 }
859
860 PetscCall(ISRestoreIndices(isrow, &r));
861 PetscCall(VecRestoreArrayRead(bb, &b));
862 PetscCall(VecRestoreArray(xx, &x));
863 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
864 PetscFunctionReturn(PETSC_SUCCESS);
865 }
866
MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)867 static PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
868 {
869 const MatScalar *v, *diag;
870 PetscScalar *xp, x0, x1, x2, x3, x4;
871 PetscInt nz, k;
872 const PetscInt *vj;
873
874 PetscFunctionBegin;
875 for (k = 0; k < mbs; k++) {
876 v = aa + 25 * ai[k];
877 xp = x + k * 5;
878 x0 = xp[0];
879 x1 = xp[1];
880 x2 = xp[2];
881 x3 = xp[3];
882 x4 = xp[4]; /* Dk*xk = k-th block of x */
883 nz = ai[k + 1] - ai[k];
884 vj = aj + ai[k];
885 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
886 PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
887 while (nz--) {
888 xp = x + (*vj) * 5;
889 /* x(:) += U(k,:)^T*(Dk*xk) */
890 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
891 xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
892 xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
893 xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
894 xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
895 vj++;
896 v += 25;
897 }
898 /* xk = inv(Dk)*(Dk*xk) */
899 diag = aa + k * 25; /* ptr to inv(Dk) */
900 xp = x + k * 5;
901 xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
902 xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
903 xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
904 xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
905 xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
906 }
907 PetscFunctionReturn(PETSC_SUCCESS);
908 }
909
MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)910 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
911 {
912 const MatScalar *v;
913 PetscScalar *xp, x0, x1, x2, x3, x4;
914 PetscInt nz, k;
915 const PetscInt *vj;
916
917 PetscFunctionBegin;
918 for (k = mbs - 1; k >= 0; k--) {
919 v = aa + 25 * ai[k];
920 xp = x + k * 5;
921 x0 = xp[0];
922 x1 = xp[1];
923 x2 = xp[2];
924 x3 = xp[3];
925 x4 = xp[4]; /* xk */
926 nz = ai[k + 1] - ai[k];
927 vj = aj + ai[k];
928 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
929 PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
930 while (nz--) {
931 xp = x + (*vj) * 5;
932 /* xk += U(k,:)*x(:) */
933 x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4];
934 x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4];
935 x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4];
936 x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4];
937 x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4];
938 vj++;
939 v += 25;
940 }
941 xp = x + k * 5;
942 xp[0] = x0;
943 xp[1] = x1;
944 xp[2] = x2;
945 xp[3] = x3;
946 xp[4] = x4;
947 }
948 PetscFunctionReturn(PETSC_SUCCESS);
949 }
950
MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)951 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
952 {
953 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
954 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
955 const MatScalar *aa = a->a;
956 PetscScalar *x;
957 const PetscScalar *b;
958
959 PetscFunctionBegin;
960 PetscCall(VecGetArrayRead(bb, &b));
961 PetscCall(VecGetArray(xx, &x));
962
963 /* solve U^T * D * y = b by forward substitution */
964 PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
965 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
966
967 /* solve U*x = y by back substitution */
968 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
969
970 PetscCall(VecRestoreArrayRead(bb, &b));
971 PetscCall(VecRestoreArray(xx, &x));
972 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
973 PetscFunctionReturn(PETSC_SUCCESS);
974 }
975
MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)976 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
977 {
978 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
979 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
980 const MatScalar *aa = a->a;
981 PetscScalar *x;
982 const PetscScalar *b;
983
984 PetscFunctionBegin;
985 PetscCall(VecGetArrayRead(bb, &b));
986 PetscCall(VecGetArray(xx, &x));
987 PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
988 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
989 PetscCall(VecRestoreArrayRead(bb, &b));
990 PetscCall(VecRestoreArray(xx, &x));
991 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
992 PetscFunctionReturn(PETSC_SUCCESS);
993 }
994
MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)995 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
996 {
997 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
998 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
999 const MatScalar *aa = a->a;
1000 PetscScalar *x;
1001 const PetscScalar *b;
1002
1003 PetscFunctionBegin;
1004 PetscCall(VecGetArrayRead(bb, &b));
1005 PetscCall(VecGetArray(xx, &x));
1006 PetscCall(PetscArraycpy(x, b, 5 * mbs));
1007 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
1008 PetscCall(VecRestoreArrayRead(bb, &b));
1009 PetscCall(VecRestoreArray(xx, &x));
1010 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1011 PetscFunctionReturn(PETSC_SUCCESS);
1012 }
1013
MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)1014 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
1015 {
1016 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1017 IS isrow = a->row;
1018 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1019 const PetscInt *r, *vj;
1020 PetscInt nz, k, idx;
1021 const MatScalar *aa = a->a, *v, *diag;
1022 PetscScalar *x, x0, x1, x2, x3, *t, *tp;
1023 const PetscScalar *b;
1024
1025 PetscFunctionBegin;
1026 PetscCall(VecGetArrayRead(bb, &b));
1027 PetscCall(VecGetArray(xx, &x));
1028 t = a->solve_work;
1029 PetscCall(ISGetIndices(isrow, &r));
1030
1031 /* solve U^T * D * y = b by forward substitution */
1032 tp = t;
1033 for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1034 idx = 4 * r[k];
1035 tp[0] = b[idx];
1036 tp[1] = b[idx + 1];
1037 tp[2] = b[idx + 2];
1038 tp[3] = b[idx + 3];
1039 tp += 4;
1040 }
1041
1042 for (k = 0; k < mbs; k++) {
1043 v = aa + 16 * ai[k];
1044 vj = aj + ai[k];
1045 tp = t + k * 4;
1046 x0 = tp[0];
1047 x1 = tp[1];
1048 x2 = tp[2];
1049 x3 = tp[3];
1050 nz = ai[k + 1] - ai[k];
1051
1052 tp = t + (*vj) * 4;
1053 while (nz--) {
1054 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1055 tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1056 tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1057 tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1058 vj++;
1059 tp = t + (*vj) * 4;
1060 v += 16;
1061 }
1062
1063 /* xk = inv(Dk)*(Dk*xk) */
1064 diag = aa + k * 16; /* ptr to inv(Dk) */
1065 tp = t + k * 4;
1066 tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1067 tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1068 tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1069 tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1070 }
1071
1072 /* solve U*x = y by back substitution */
1073 for (k = mbs - 1; k >= 0; k--) {
1074 v = aa + 16 * ai[k];
1075 vj = aj + ai[k];
1076 tp = t + k * 4;
1077 x0 = tp[0];
1078 x1 = tp[1];
1079 x2 = tp[2];
1080 x3 = tp[3]; /* xk */
1081 nz = ai[k + 1] - ai[k];
1082
1083 tp = t + (*vj) * 4;
1084 while (nz--) {
1085 /* xk += U(k,:)*x(:) */
1086 x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3];
1087 x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3];
1088 x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3];
1089 x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3];
1090 vj++;
1091 tp = t + (*vj) * 4;
1092 v += 16;
1093 }
1094 tp = t + k * 4;
1095 tp[0] = x0;
1096 tp[1] = x1;
1097 tp[2] = x2;
1098 tp[3] = x3;
1099 idx = 4 * r[k];
1100 x[idx] = x0;
1101 x[idx + 1] = x1;
1102 x[idx + 2] = x2;
1103 x[idx + 3] = x3;
1104 }
1105
1106 PetscCall(ISRestoreIndices(isrow, &r));
1107 PetscCall(VecRestoreArrayRead(bb, &b));
1108 PetscCall(VecRestoreArray(xx, &x));
1109 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1110 PetscFunctionReturn(PETSC_SUCCESS);
1111 }
1112
MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1113 static PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1114 {
1115 const MatScalar *v, *diag;
1116 PetscScalar *xp, x0, x1, x2, x3;
1117 PetscInt nz, k;
1118 const PetscInt *vj;
1119
1120 PetscFunctionBegin;
1121 for (k = 0; k < mbs; k++) {
1122 v = aa + 16 * ai[k];
1123 xp = x + k * 4;
1124 x0 = xp[0];
1125 x1 = xp[1];
1126 x2 = xp[2];
1127 x3 = xp[3]; /* Dk*xk = k-th block of x */
1128 nz = ai[k + 1] - ai[k];
1129 vj = aj + ai[k];
1130 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1131 PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1132 while (nz--) {
1133 xp = x + (*vj) * 4;
1134 /* x(:) += U(k,:)^T*(Dk*xk) */
1135 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1136 xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1137 xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1138 xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1139 vj++;
1140 v += 16;
1141 }
1142 /* xk = inv(Dk)*(Dk*xk) */
1143 diag = aa + k * 16; /* ptr to inv(Dk) */
1144 xp = x + k * 4;
1145 xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1146 xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1147 xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1148 xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1149 }
1150 PetscFunctionReturn(PETSC_SUCCESS);
1151 }
1152
MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1153 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1154 {
1155 const MatScalar *v;
1156 PetscScalar *xp, x0, x1, x2, x3;
1157 PetscInt nz, k;
1158 const PetscInt *vj;
1159
1160 PetscFunctionBegin;
1161 for (k = mbs - 1; k >= 0; k--) {
1162 v = aa + 16 * ai[k];
1163 xp = x + k * 4;
1164 x0 = xp[0];
1165 x1 = xp[1];
1166 x2 = xp[2];
1167 x3 = xp[3]; /* xk */
1168 nz = ai[k + 1] - ai[k];
1169 vj = aj + ai[k];
1170 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1171 PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1172 while (nz--) {
1173 xp = x + (*vj) * 4;
1174 /* xk += U(k,:)*x(:) */
1175 x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3];
1176 x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3];
1177 x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3];
1178 x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3];
1179 vj++;
1180 v += 16;
1181 }
1182 xp = x + k * 4;
1183 xp[0] = x0;
1184 xp[1] = x1;
1185 xp[2] = x2;
1186 xp[3] = x3;
1187 }
1188 PetscFunctionReturn(PETSC_SUCCESS);
1189 }
1190
MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1191 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1192 {
1193 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1194 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1195 const MatScalar *aa = a->a;
1196 PetscScalar *x;
1197 const PetscScalar *b;
1198
1199 PetscFunctionBegin;
1200 PetscCall(VecGetArrayRead(bb, &b));
1201 PetscCall(VecGetArray(xx, &x));
1202
1203 /* solve U^T * D * y = b by forward substitution */
1204 PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1205 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1206
1207 /* solve U*x = y by back substitution */
1208 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1209 PetscCall(VecRestoreArrayRead(bb, &b));
1210 PetscCall(VecRestoreArray(xx, &x));
1211 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1212 PetscFunctionReturn(PETSC_SUCCESS);
1213 }
1214
MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1215 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1216 {
1217 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1218 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1219 const MatScalar *aa = a->a;
1220 PetscScalar *x;
1221 const PetscScalar *b;
1222
1223 PetscFunctionBegin;
1224 PetscCall(VecGetArrayRead(bb, &b));
1225 PetscCall(VecGetArray(xx, &x));
1226 PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1227 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1228 PetscCall(VecRestoreArrayRead(bb, &b));
1229 PetscCall(VecRestoreArray(xx, &x));
1230 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1231 PetscFunctionReturn(PETSC_SUCCESS);
1232 }
1233
MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1234 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1235 {
1236 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1237 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1238 const MatScalar *aa = a->a;
1239 PetscScalar *x;
1240 const PetscScalar *b;
1241
1242 PetscFunctionBegin;
1243 PetscCall(VecGetArrayRead(bb, &b));
1244 PetscCall(VecGetArray(xx, &x));
1245 PetscCall(PetscArraycpy(x, b, 4 * mbs));
1246 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1247 PetscCall(VecRestoreArrayRead(bb, &b));
1248 PetscCall(VecRestoreArray(xx, &x));
1249 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1250 PetscFunctionReturn(PETSC_SUCCESS);
1251 }
1252
MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)1253 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1254 {
1255 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1256 IS isrow = a->row;
1257 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1258 const PetscInt *r;
1259 PetscInt nz, k, idx;
1260 const PetscInt *vj;
1261 const MatScalar *aa = a->a, *v, *diag;
1262 PetscScalar *x, x0, x1, x2, *t, *tp;
1263 const PetscScalar *b;
1264
1265 PetscFunctionBegin;
1266 PetscCall(VecGetArrayRead(bb, &b));
1267 PetscCall(VecGetArray(xx, &x));
1268 t = a->solve_work;
1269 PetscCall(ISGetIndices(isrow, &r));
1270
1271 /* solve U^T * D * y = b by forward substitution */
1272 tp = t;
1273 for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1274 idx = 3 * r[k];
1275 tp[0] = b[idx];
1276 tp[1] = b[idx + 1];
1277 tp[2] = b[idx + 2];
1278 tp += 3;
1279 }
1280
1281 for (k = 0; k < mbs; k++) {
1282 v = aa + 9 * ai[k];
1283 vj = aj + ai[k];
1284 tp = t + k * 3;
1285 x0 = tp[0];
1286 x1 = tp[1];
1287 x2 = tp[2];
1288 nz = ai[k + 1] - ai[k];
1289
1290 tp = t + (*vj) * 3;
1291 while (nz--) {
1292 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1293 tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1294 tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1295 vj++;
1296 tp = t + (*vj) * 3;
1297 v += 9;
1298 }
1299
1300 /* xk = inv(Dk)*(Dk*xk) */
1301 diag = aa + k * 9; /* ptr to inv(Dk) */
1302 tp = t + k * 3;
1303 tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1304 tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1305 tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1306 }
1307
1308 /* solve U*x = y by back substitution */
1309 for (k = mbs - 1; k >= 0; k--) {
1310 v = aa + 9 * ai[k];
1311 vj = aj + ai[k];
1312 tp = t + k * 3;
1313 x0 = tp[0];
1314 x1 = tp[1];
1315 x2 = tp[2]; /* xk */
1316 nz = ai[k + 1] - ai[k];
1317
1318 tp = t + (*vj) * 3;
1319 while (nz--) {
1320 /* xk += U(k,:)*x(:) */
1321 x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2];
1322 x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2];
1323 x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2];
1324 vj++;
1325 tp = t + (*vj) * 3;
1326 v += 9;
1327 }
1328 tp = t + k * 3;
1329 tp[0] = x0;
1330 tp[1] = x1;
1331 tp[2] = x2;
1332 idx = 3 * r[k];
1333 x[idx] = x0;
1334 x[idx + 1] = x1;
1335 x[idx + 2] = x2;
1336 }
1337
1338 PetscCall(ISRestoreIndices(isrow, &r));
1339 PetscCall(VecRestoreArrayRead(bb, &b));
1340 PetscCall(VecRestoreArray(xx, &x));
1341 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1342 PetscFunctionReturn(PETSC_SUCCESS);
1343 }
1344
MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1345 static PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1346 {
1347 const MatScalar *v, *diag;
1348 PetscScalar *xp, x0, x1, x2;
1349 PetscInt nz, k;
1350 const PetscInt *vj;
1351
1352 PetscFunctionBegin;
1353 for (k = 0; k < mbs; k++) {
1354 v = aa + 9 * ai[k];
1355 xp = x + k * 3;
1356 x0 = xp[0];
1357 x1 = xp[1];
1358 x2 = xp[2]; /* Dk*xk = k-th block of x */
1359 nz = ai[k + 1] - ai[k];
1360 vj = aj + ai[k];
1361 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1362 PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1363 while (nz--) {
1364 xp = x + (*vj) * 3;
1365 /* x(:) += U(k,:)^T*(Dk*xk) */
1366 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1367 xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1368 xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1369 vj++;
1370 v += 9;
1371 }
1372 /* xk = inv(Dk)*(Dk*xk) */
1373 diag = aa + k * 9; /* ptr to inv(Dk) */
1374 xp = x + k * 3;
1375 xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1376 xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1377 xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1378 }
1379 PetscFunctionReturn(PETSC_SUCCESS);
1380 }
1381
MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1382 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1383 {
1384 const MatScalar *v;
1385 PetscScalar *xp, x0, x1, x2;
1386 PetscInt nz, k;
1387 const PetscInt *vj;
1388
1389 PetscFunctionBegin;
1390 for (k = mbs - 1; k >= 0; k--) {
1391 v = aa + 9 * ai[k];
1392 xp = x + k * 3;
1393 x0 = xp[0];
1394 x1 = xp[1];
1395 x2 = xp[2]; /* xk */
1396 nz = ai[k + 1] - ai[k];
1397 vj = aj + ai[k];
1398 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1399 PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1400 while (nz--) {
1401 xp = x + (*vj) * 3;
1402 /* xk += U(k,:)*x(:) */
1403 x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2];
1404 x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2];
1405 x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2];
1406 vj++;
1407 v += 9;
1408 }
1409 xp = x + k * 3;
1410 xp[0] = x0;
1411 xp[1] = x1;
1412 xp[2] = x2;
1413 }
1414 PetscFunctionReturn(PETSC_SUCCESS);
1415 }
1416
MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1417 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1418 {
1419 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1420 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1421 const MatScalar *aa = a->a;
1422 PetscScalar *x;
1423 const PetscScalar *b;
1424
1425 PetscFunctionBegin;
1426 PetscCall(VecGetArrayRead(bb, &b));
1427 PetscCall(VecGetArray(xx, &x));
1428
1429 /* solve U^T * D * y = b by forward substitution */
1430 PetscCall(PetscArraycpy(x, b, 3 * mbs));
1431 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1432
1433 /* solve U*x = y by back substitution */
1434 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1435
1436 PetscCall(VecRestoreArrayRead(bb, &b));
1437 PetscCall(VecRestoreArray(xx, &x));
1438 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1439 PetscFunctionReturn(PETSC_SUCCESS);
1440 }
1441
MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1442 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1443 {
1444 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1445 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1446 const MatScalar *aa = a->a;
1447 PetscScalar *x;
1448 const PetscScalar *b;
1449
1450 PetscFunctionBegin;
1451 PetscCall(VecGetArrayRead(bb, &b));
1452 PetscCall(VecGetArray(xx, &x));
1453 PetscCall(PetscArraycpy(x, b, 3 * mbs));
1454 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1455 PetscCall(VecRestoreArrayRead(bb, &b));
1456 PetscCall(VecRestoreArray(xx, &x));
1457 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1458 PetscFunctionReturn(PETSC_SUCCESS);
1459 }
1460
MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1461 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1462 {
1463 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1464 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1465 const MatScalar *aa = a->a;
1466 PetscScalar *x;
1467 const PetscScalar *b;
1468
1469 PetscFunctionBegin;
1470 PetscCall(VecGetArrayRead(bb, &b));
1471 PetscCall(VecGetArray(xx, &x));
1472 PetscCall(PetscArraycpy(x, b, 3 * mbs));
1473 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1474 PetscCall(VecRestoreArrayRead(bb, &b));
1475 PetscCall(VecRestoreArray(xx, &x));
1476 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1477 PetscFunctionReturn(PETSC_SUCCESS);
1478 }
1479
MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)1480 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1481 {
1482 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1483 IS isrow = a->row;
1484 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1485 const PetscInt *r, *vj;
1486 PetscInt nz, k, k2, idx;
1487 const MatScalar *aa = a->a, *v, *diag;
1488 PetscScalar *x, x0, x1, *t;
1489 const PetscScalar *b;
1490
1491 PetscFunctionBegin;
1492 PetscCall(VecGetArrayRead(bb, &b));
1493 PetscCall(VecGetArray(xx, &x));
1494 t = a->solve_work;
1495 PetscCall(ISGetIndices(isrow, &r));
1496
1497 /* solve U^T * D * y = perm(b) by forward substitution */
1498 for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1499 idx = 2 * r[k];
1500 t[k * 2] = b[idx];
1501 t[k * 2 + 1] = b[idx + 1];
1502 }
1503 for (k = 0; k < mbs; k++) {
1504 v = aa + 4 * ai[k];
1505 vj = aj + ai[k];
1506 k2 = k * 2;
1507 x0 = t[k2];
1508 x1 = t[k2 + 1];
1509 nz = ai[k + 1] - ai[k];
1510 while (nz--) {
1511 t[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1512 t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1513 vj++;
1514 v += 4;
1515 }
1516 diag = aa + k * 4; /* ptr to inv(Dk) */
1517 t[k2] = diag[0] * x0 + diag[2] * x1;
1518 t[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1519 }
1520
1521 /* solve U*x = y by back substitution */
1522 for (k = mbs - 1; k >= 0; k--) {
1523 v = aa + 4 * ai[k];
1524 vj = aj + ai[k];
1525 k2 = k * 2;
1526 x0 = t[k2];
1527 x1 = t[k2 + 1];
1528 nz = ai[k + 1] - ai[k];
1529 while (nz--) {
1530 x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1];
1531 x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1];
1532 vj++;
1533 v += 4;
1534 }
1535 t[k2] = x0;
1536 t[k2 + 1] = x1;
1537 idx = 2 * r[k];
1538 x[idx] = x0;
1539 x[idx + 1] = x1;
1540 }
1541
1542 PetscCall(ISRestoreIndices(isrow, &r));
1543 PetscCall(VecRestoreArrayRead(bb, &b));
1544 PetscCall(VecRestoreArray(xx, &x));
1545 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1546 PetscFunctionReturn(PETSC_SUCCESS);
1547 }
1548
MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1549 static PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1550 {
1551 const MatScalar *v, *diag;
1552 PetscScalar x0, x1;
1553 PetscInt nz, k, k2;
1554 const PetscInt *vj;
1555
1556 PetscFunctionBegin;
1557 for (k = 0; k < mbs; k++) {
1558 v = aa + 4 * ai[k];
1559 vj = aj + ai[k];
1560 k2 = k * 2;
1561 x0 = x[k2];
1562 x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */
1563 nz = ai[k + 1] - ai[k];
1564 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1565 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1566 while (nz--) {
1567 /* x(:) += U(k,:)^T*(Dk*xk) */
1568 x[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1569 x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1570 vj++;
1571 v += 4;
1572 }
1573 /* xk = inv(Dk)*(Dk*xk) */
1574 diag = aa + k * 4; /* ptr to inv(Dk) */
1575 x[k2] = diag[0] * x0 + diag[2] * x1;
1576 x[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1577 }
1578 PetscFunctionReturn(PETSC_SUCCESS);
1579 }
1580
MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1581 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1582 {
1583 const MatScalar *v;
1584 PetscScalar x0, x1;
1585 PetscInt nz, k, k2;
1586 const PetscInt *vj;
1587
1588 PetscFunctionBegin;
1589 for (k = mbs - 1; k >= 0; k--) {
1590 v = aa + 4 * ai[k];
1591 vj = aj + ai[k];
1592 k2 = k * 2;
1593 x0 = x[k2];
1594 x1 = x[k2 + 1]; /* xk */
1595 nz = ai[k + 1] - ai[k];
1596 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1597 PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1598 while (nz--) {
1599 /* xk += U(k,:)*x(:) */
1600 x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1];
1601 x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1];
1602 vj++;
1603 v += 4;
1604 }
1605 x[k2] = x0;
1606 x[k2 + 1] = x1;
1607 }
1608 PetscFunctionReturn(PETSC_SUCCESS);
1609 }
1610
MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1611 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1612 {
1613 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1614 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1615 const MatScalar *aa = a->a;
1616 PetscScalar *x;
1617 const PetscScalar *b;
1618
1619 PetscFunctionBegin;
1620 PetscCall(VecGetArrayRead(bb, &b));
1621 PetscCall(VecGetArray(xx, &x));
1622
1623 /* solve U^T * D * y = b by forward substitution */
1624 PetscCall(PetscArraycpy(x, b, 2 * mbs));
1625 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1626
1627 /* solve U*x = y by back substitution */
1628 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1629
1630 PetscCall(VecRestoreArrayRead(bb, &b));
1631 PetscCall(VecRestoreArray(xx, &x));
1632 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1633 PetscFunctionReturn(PETSC_SUCCESS);
1634 }
1635
MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1636 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1637 {
1638 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1639 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1640 const MatScalar *aa = a->a;
1641 PetscScalar *x;
1642 const PetscScalar *b;
1643
1644 PetscFunctionBegin;
1645 PetscCall(VecGetArrayRead(bb, &b));
1646 PetscCall(VecGetArray(xx, &x));
1647 PetscCall(PetscArraycpy(x, b, 2 * mbs));
1648 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1649 PetscCall(VecRestoreArrayRead(bb, &b));
1650 PetscCall(VecRestoreArray(xx, &x));
1651 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1652 PetscFunctionReturn(PETSC_SUCCESS);
1653 }
1654
MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1655 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1656 {
1657 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1658 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j;
1659 const MatScalar *aa = a->a;
1660 PetscScalar *x;
1661 const PetscScalar *b;
1662
1663 PetscFunctionBegin;
1664 PetscCall(VecGetArrayRead(bb, &b));
1665 PetscCall(VecGetArray(xx, &x));
1666 PetscCall(PetscArraycpy(x, b, 2 * mbs));
1667 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1668 PetscCall(VecRestoreArrayRead(bb, &b));
1669 PetscCall(VecRestoreArray(xx, &x));
1670 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1671 PetscFunctionReturn(PETSC_SUCCESS);
1672 }
1673
MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)1674 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1675 {
1676 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1677 IS isrow = a->row;
1678 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1679 const MatScalar *aa = a->a, *v;
1680 const PetscScalar *b;
1681 PetscScalar *x, xk, *t;
1682 PetscInt nz, k, j;
1683
1684 PetscFunctionBegin;
1685 PetscCall(VecGetArrayRead(bb, &b));
1686 PetscCall(VecGetArray(xx, &x));
1687 t = a->solve_work;
1688 PetscCall(ISGetIndices(isrow, &rp));
1689
1690 /* solve U^T*D*y = perm(b) by forward substitution */
1691 for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1692 for (k = 0; k < mbs; k++) {
1693 v = aa + ai[k];
1694 vj = aj + ai[k];
1695 xk = t[k];
1696 nz = ai[k + 1] - ai[k] - 1;
1697 for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk;
1698 t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */
1699 }
1700
1701 /* solve U*perm(x) = y by back substitution */
1702 for (k = mbs - 1; k >= 0; k--) {
1703 v = aa + adiag[k] - 1;
1704 vj = aj + adiag[k] - 1;
1705 nz = ai[k + 1] - ai[k] - 1;
1706 for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]];
1707 x[rp[k]] = t[k];
1708 }
1709
1710 PetscCall(ISRestoreIndices(isrow, &rp));
1711 PetscCall(VecRestoreArrayRead(bb, &b));
1712 PetscCall(VecRestoreArray(xx, &x));
1713 PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs));
1714 PetscFunctionReturn(PETSC_SUCCESS);
1715 }
1716
MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)1717 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1718 {
1719 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1720 IS isrow = a->row;
1721 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1722 const MatScalar *aa = a->a, *v;
1723 PetscScalar *x, xk, *t;
1724 const PetscScalar *b;
1725 PetscInt nz, k;
1726
1727 PetscFunctionBegin;
1728 PetscCall(VecGetArrayRead(bb, &b));
1729 PetscCall(VecGetArray(xx, &x));
1730 t = a->solve_work;
1731 PetscCall(ISGetIndices(isrow, &rp));
1732
1733 /* solve U^T*D*y = perm(b) by forward substitution */
1734 for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1735 for (k = 0; k < mbs; k++) {
1736 v = aa + ai[k] + 1;
1737 vj = aj + ai[k] + 1;
1738 xk = t[k];
1739 nz = ai[k + 1] - ai[k] - 1;
1740 while (nz--) t[*vj++] += (*v++) * xk;
1741 t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */
1742 }
1743
1744 /* solve U*perm(x) = y by back substitution */
1745 for (k = mbs - 1; k >= 0; k--) {
1746 v = aa + ai[k] + 1;
1747 vj = aj + ai[k] + 1;
1748 nz = ai[k + 1] - ai[k] - 1;
1749 while (nz--) t[k] += (*v++) * t[*vj++];
1750 x[rp[k]] = t[k];
1751 }
1752
1753 PetscCall(ISRestoreIndices(isrow, &rp));
1754 PetscCall(VecRestoreArrayRead(bb, &b));
1755 PetscCall(VecRestoreArray(xx, &x));
1756 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
1757 PetscFunctionReturn(PETSC_SUCCESS);
1758 }
1759
MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)1760 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1761 {
1762 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1763 IS isrow = a->row;
1764 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1765 const MatScalar *aa = a->a, *v;
1766 PetscReal diagk;
1767 PetscScalar *x, xk;
1768 const PetscScalar *b;
1769 PetscInt nz, k;
1770
1771 PetscFunctionBegin;
1772 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1773 PetscCall(VecGetArrayRead(bb, &b));
1774 PetscCall(VecGetArray(xx, &x));
1775 PetscCall(ISGetIndices(isrow, &rp));
1776
1777 for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1778 for (k = 0; k < mbs; k++) {
1779 v = aa + ai[k];
1780 vj = aj + ai[k];
1781 xk = x[k];
1782 nz = ai[k + 1] - ai[k] - 1;
1783 while (nz--) x[*vj++] += (*v++) * xk;
1784
1785 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1786 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1787 x[k] = xk * PetscSqrtReal(diagk);
1788 }
1789 PetscCall(ISRestoreIndices(isrow, &rp));
1790 PetscCall(VecRestoreArrayRead(bb, &b));
1791 PetscCall(VecRestoreArray(xx, &x));
1792 PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1793 PetscFunctionReturn(PETSC_SUCCESS);
1794 }
1795
MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)1796 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1797 {
1798 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1799 IS isrow = a->row;
1800 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1801 const MatScalar *aa = a->a, *v;
1802 PetscReal diagk;
1803 PetscScalar *x, xk;
1804 const PetscScalar *b;
1805 PetscInt nz, k;
1806
1807 PetscFunctionBegin;
1808 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1809 PetscCall(VecGetArrayRead(bb, &b));
1810 PetscCall(VecGetArray(xx, &x));
1811 PetscCall(ISGetIndices(isrow, &rp));
1812
1813 for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1814 for (k = 0; k < mbs; k++) {
1815 v = aa + ai[k] + 1;
1816 vj = aj + ai[k] + 1;
1817 xk = x[k];
1818 nz = ai[k + 1] - ai[k] - 1;
1819 while (nz--) x[*vj++] += (*v++) * xk;
1820
1821 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1822 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1823 x[k] = xk * PetscSqrtReal(diagk);
1824 }
1825 PetscCall(ISRestoreIndices(isrow, &rp));
1826 PetscCall(VecRestoreArrayRead(bb, &b));
1827 PetscCall(VecRestoreArray(xx, &x));
1828 PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1829 PetscFunctionReturn(PETSC_SUCCESS);
1830 }
1831
MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)1832 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1833 {
1834 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1835 IS isrow = a->row;
1836 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1837 const MatScalar *aa = a->a, *v;
1838 PetscReal diagk;
1839 PetscScalar *x, *t;
1840 const PetscScalar *b;
1841 PetscInt nz, k;
1842
1843 PetscFunctionBegin;
1844 /* solve D^(1/2)*U*perm(x) = b by back substitution */
1845 PetscCall(VecGetArrayRead(bb, &b));
1846 PetscCall(VecGetArray(xx, &x));
1847 t = a->solve_work;
1848 PetscCall(ISGetIndices(isrow, &rp));
1849
1850 for (k = mbs - 1; k >= 0; k--) {
1851 v = aa + ai[k];
1852 vj = aj + ai[k];
1853 diagk = PetscRealPart(aa[adiag[k]]);
1854 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1855 t[k] = b[k] * PetscSqrtReal(diagk);
1856 nz = ai[k + 1] - ai[k] - 1;
1857 while (nz--) t[k] += (*v++) * t[*vj++];
1858 x[rp[k]] = t[k];
1859 }
1860 PetscCall(ISRestoreIndices(isrow, &rp));
1861 PetscCall(VecRestoreArrayRead(bb, &b));
1862 PetscCall(VecRestoreArray(xx, &x));
1863 PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1864 PetscFunctionReturn(PETSC_SUCCESS);
1865 }
1866
MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)1867 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1868 {
1869 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1870 IS isrow = a->row;
1871 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1872 const MatScalar *aa = a->a, *v;
1873 PetscReal diagk;
1874 PetscScalar *x, *t;
1875 const PetscScalar *b;
1876 PetscInt nz, k;
1877
1878 PetscFunctionBegin;
1879 /* solve D^(1/2)*U*perm(x) = b by back substitution */
1880 PetscCall(VecGetArrayRead(bb, &b));
1881 PetscCall(VecGetArray(xx, &x));
1882 t = a->solve_work;
1883 PetscCall(ISGetIndices(isrow, &rp));
1884
1885 for (k = mbs - 1; k >= 0; k--) {
1886 v = aa + ai[k] + 1;
1887 vj = aj + ai[k] + 1;
1888 diagk = PetscRealPart(aa[ai[k]]);
1889 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1890 t[k] = b[k] * PetscSqrtReal(diagk);
1891 nz = ai[k + 1] - ai[k] - 1;
1892 while (nz--) t[k] += (*v++) * t[*vj++];
1893 x[rp[k]] = t[k];
1894 }
1895 PetscCall(ISRestoreIndices(isrow, &rp));
1896 PetscCall(VecRestoreArrayRead(bb, &b));
1897 PetscCall(VecRestoreArray(xx, &x));
1898 PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1899 PetscFunctionReturn(PETSC_SUCCESS);
1900 }
1901
MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)1902 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx)
1903 {
1904 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1905
1906 PetscFunctionBegin;
1907 if (A->rmap->bs == 1) {
1908 PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v));
1909 } else {
1910 IS isrow = a->row;
1911 const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1912 const MatScalar *aa = a->a, *v;
1913 PetscScalar *x, *t;
1914 const PetscScalar *b;
1915 PetscInt nz, k, n, i, j;
1916
1917 if (bb->n > a->solves_work_n) {
1918 PetscCall(PetscFree(a->solves_work));
1919 PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1920 a->solves_work_n = bb->n;
1921 }
1922 n = bb->n;
1923 PetscCall(VecGetArrayRead(bb->v, &b));
1924 PetscCall(VecGetArray(xx->v, &x));
1925 t = a->solves_work;
1926
1927 PetscCall(ISGetIndices(isrow, &rp));
1928
1929 /* solve U^T*D*y = perm(b) by forward substitution */
1930 for (k = 0; k < mbs; k++) {
1931 for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1932 }
1933 for (k = 0; k < mbs; k++) {
1934 v = aa + ai[k];
1935 vj = aj + ai[k];
1936 nz = ai[k + 1] - ai[k] - 1;
1937 for (j = 0; j < nz; j++) {
1938 for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1939 v++;
1940 vj++;
1941 }
1942 for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1943 }
1944
1945 /* solve U*perm(x) = y by back substitution */
1946 for (k = mbs - 1; k >= 0; k--) {
1947 v = aa + ai[k] - 1;
1948 vj = aj + ai[k] - 1;
1949 nz = ai[k + 1] - ai[k] - 1;
1950 for (j = 0; j < nz; j++) {
1951 for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1952 v++;
1953 vj++;
1954 }
1955 for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1956 }
1957
1958 PetscCall(ISRestoreIndices(isrow, &rp));
1959 PetscCall(VecRestoreArrayRead(bb->v, &b));
1960 PetscCall(VecRestoreArray(xx->v, &x));
1961 PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
1962 }
1963 PetscFunctionReturn(PETSC_SUCCESS);
1964 }
1965
MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)1966 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx)
1967 {
1968 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1969
1970 PetscFunctionBegin;
1971 if (A->rmap->bs == 1) {
1972 PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v));
1973 } else {
1974 IS isrow = a->row;
1975 const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1976 const MatScalar *aa = a->a, *v;
1977 PetscScalar *x, *t;
1978 const PetscScalar *b;
1979 PetscInt nz, k, n, i;
1980
1981 if (bb->n > a->solves_work_n) {
1982 PetscCall(PetscFree(a->solves_work));
1983 PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1984 a->solves_work_n = bb->n;
1985 }
1986 n = bb->n;
1987 PetscCall(VecGetArrayRead(bb->v, &b));
1988 PetscCall(VecGetArray(xx->v, &x));
1989 t = a->solves_work;
1990
1991 PetscCall(ISGetIndices(isrow, &rp));
1992
1993 /* solve U^T*D*y = perm(b) by forward substitution */
1994 for (k = 0; k < mbs; k++) {
1995 for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1996 }
1997 for (k = 0; k < mbs; k++) {
1998 v = aa + ai[k];
1999 vj = aj + ai[k];
2000 nz = ai[k + 1] - ai[k];
2001 while (nz--) {
2002 for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
2003 v++;
2004 vj++;
2005 }
2006 for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */
2007 }
2008
2009 /* solve U*perm(x) = y by back substitution */
2010 for (k = mbs - 1; k >= 0; k--) {
2011 v = aa + ai[k];
2012 vj = aj + ai[k];
2013 nz = ai[k + 1] - ai[k];
2014 while (nz--) {
2015 for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
2016 v++;
2017 vj++;
2018 }
2019 for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
2020 }
2021
2022 PetscCall(ISRestoreIndices(isrow, &rp));
2023 PetscCall(VecRestoreArrayRead(bb->v, &b));
2024 PetscCall(VecRestoreArray(xx->v, &x));
2025 PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
2026 }
2027 PetscFunctionReturn(PETSC_SUCCESS);
2028 }
2029
MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)2030 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2031 {
2032 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2033 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2034 const MatScalar *aa = a->a, *v;
2035 const PetscScalar *b;
2036 PetscScalar *x, xi;
2037 PetscInt nz, i, j;
2038
2039 PetscFunctionBegin;
2040 PetscCall(VecGetArrayRead(bb, &b));
2041 PetscCall(VecGetArray(xx, &x));
2042 /* solve U^T*D*y = b by forward substitution */
2043 PetscCall(PetscArraycpy(x, b, mbs));
2044 for (i = 0; i < mbs; i++) {
2045 v = aa + ai[i];
2046 vj = aj + ai[i];
2047 xi = x[i];
2048 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2049 for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2050 x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2051 }
2052 /* solve U*x = y by backward substitution */
2053 for (i = mbs - 2; i >= 0; i--) {
2054 xi = x[i];
2055 v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2056 vj = aj + adiag[i] - 1;
2057 nz = ai[i + 1] - ai[i] - 1;
2058 for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2059 x[i] = xi;
2060 }
2061 PetscCall(VecRestoreArrayRead(bb, &b));
2062 PetscCall(VecRestoreArray(xx, &x));
2063 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2064 PetscFunctionReturn(PETSC_SUCCESS);
2065 }
2066
MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)2067 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X)
2068 {
2069 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2070 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2071 const MatScalar *aa = a->a, *v;
2072 const PetscScalar *b;
2073 PetscScalar *x, xi;
2074 PetscInt nz, i, j, neq, ldb, ldx;
2075 PetscBool isdense;
2076
2077 PetscFunctionBegin;
2078 if (!mbs) PetscFunctionReturn(PETSC_SUCCESS);
2079 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
2080 PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
2081 if (X != B) {
2082 PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
2083 PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
2084 }
2085 PetscCall(MatDenseGetArrayRead(B, &b));
2086 PetscCall(MatDenseGetLDA(B, &ldb));
2087 PetscCall(MatDenseGetArray(X, &x));
2088 PetscCall(MatDenseGetLDA(X, &ldx));
2089 for (neq = 0; neq < B->cmap->n; neq++) {
2090 /* solve U^T*D*y = b by forward substitution */
2091 PetscCall(PetscArraycpy(x, b, mbs));
2092 for (i = 0; i < mbs; i++) {
2093 v = aa + ai[i];
2094 vj = aj + ai[i];
2095 xi = x[i];
2096 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2097 for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2098 x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2099 }
2100 /* solve U*x = y by backward substitution */
2101 for (i = mbs - 2; i >= 0; i--) {
2102 xi = x[i];
2103 v = aa + adiag[i] - 1; /* end of row i, excluding diag */
2104 vj = aj + adiag[i] - 1;
2105 nz = ai[i + 1] - ai[i] - 1;
2106 for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2107 x[i] = xi;
2108 }
2109 b += ldb;
2110 x += ldx;
2111 }
2112 PetscCall(MatDenseRestoreArrayRead(B, &b));
2113 PetscCall(MatDenseRestoreArray(X, &x));
2114 PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs)));
2115 PetscFunctionReturn(PETSC_SUCCESS);
2116 }
2117
MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)2118 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2119 {
2120 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2121 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2122 const MatScalar *aa = a->a, *v;
2123 PetscScalar *x, xk;
2124 const PetscScalar *b;
2125 PetscInt nz, k;
2126
2127 PetscFunctionBegin;
2128 PetscCall(VecGetArrayRead(bb, &b));
2129 PetscCall(VecGetArray(xx, &x));
2130
2131 /* solve U^T*D*y = b by forward substitution */
2132 PetscCall(PetscArraycpy(x, b, mbs));
2133 for (k = 0; k < mbs; k++) {
2134 v = aa + ai[k] + 1;
2135 vj = aj + ai[k] + 1;
2136 xk = x[k];
2137 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2138 while (nz--) x[*vj++] += (*v++) * xk;
2139 x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2140 }
2141
2142 /* solve U*x = y by back substitution */
2143 for (k = mbs - 2; k >= 0; k--) {
2144 v = aa + ai[k] + 1;
2145 vj = aj + ai[k] + 1;
2146 xk = x[k];
2147 nz = ai[k + 1] - ai[k] - 1;
2148 while (nz--) xk += (*v++) * x[*vj++];
2149 x[k] = xk;
2150 }
2151
2152 PetscCall(VecRestoreArrayRead(bb, &b));
2153 PetscCall(VecRestoreArray(xx, &x));
2154 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2155 PetscFunctionReturn(PETSC_SUCCESS);
2156 }
2157
MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)2158 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2159 {
2160 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2161 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2162 const MatScalar *aa = a->a, *v;
2163 PetscReal diagk;
2164 PetscScalar *x;
2165 const PetscScalar *b;
2166 PetscInt nz, k;
2167
2168 PetscFunctionBegin;
2169 /* solve U^T*D^(1/2)*x = b by forward substitution */
2170 PetscCall(VecGetArrayRead(bb, &b));
2171 PetscCall(VecGetArray(xx, &x));
2172 PetscCall(PetscArraycpy(x, b, mbs));
2173 for (k = 0; k < mbs; k++) {
2174 v = aa + ai[k];
2175 vj = aj + ai[k];
2176 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2177 while (nz--) x[*vj++] += (*v++) * x[k];
2178 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2179 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]]));
2180 x[k] *= PetscSqrtReal(diagk);
2181 }
2182 PetscCall(VecRestoreArrayRead(bb, &b));
2183 PetscCall(VecRestoreArray(xx, &x));
2184 PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2185 PetscFunctionReturn(PETSC_SUCCESS);
2186 }
2187
MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)2188 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2189 {
2190 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2191 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2192 const MatScalar *aa = a->a, *v;
2193 PetscReal diagk;
2194 PetscScalar *x;
2195 const PetscScalar *b;
2196 PetscInt nz, k;
2197
2198 PetscFunctionBegin;
2199 /* solve U^T*D^(1/2)*x = b by forward substitution */
2200 PetscCall(VecGetArrayRead(bb, &b));
2201 PetscCall(VecGetArray(xx, &x));
2202 PetscCall(PetscArraycpy(x, b, mbs));
2203 for (k = 0; k < mbs; k++) {
2204 v = aa + ai[k] + 1;
2205 vj = aj + ai[k] + 1;
2206 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2207 while (nz--) x[*vj++] += (*v++) * x[k];
2208 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2209 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]]));
2210 x[k] *= PetscSqrtReal(diagk);
2211 }
2212 PetscCall(VecRestoreArrayRead(bb, &b));
2213 PetscCall(VecRestoreArray(xx, &x));
2214 PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2215 PetscFunctionReturn(PETSC_SUCCESS);
2216 }
2217
MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)2218 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2219 {
2220 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2221 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2222 const MatScalar *aa = a->a, *v;
2223 PetscReal diagk;
2224 PetscScalar *x;
2225 const PetscScalar *b;
2226 PetscInt nz, k;
2227
2228 PetscFunctionBegin;
2229 /* solve D^(1/2)*U*x = b by back substitution */
2230 PetscCall(VecGetArrayRead(bb, &b));
2231 PetscCall(VecGetArray(xx, &x));
2232
2233 for (k = mbs - 1; k >= 0; k--) {
2234 v = aa + ai[k];
2235 vj = aj + ai[k];
2236 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2237 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2238 x[k] = PetscSqrtReal(diagk) * b[k];
2239 nz = ai[k + 1] - ai[k] - 1;
2240 while (nz--) x[k] += (*v++) * x[*vj++];
2241 }
2242 PetscCall(VecRestoreArrayRead(bb, &b));
2243 PetscCall(VecRestoreArray(xx, &x));
2244 PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2245 PetscFunctionReturn(PETSC_SUCCESS);
2246 }
2247
MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)2248 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2249 {
2250 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2251 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2252 const MatScalar *aa = a->a, *v;
2253 PetscReal diagk;
2254 PetscScalar *x;
2255 const PetscScalar *b;
2256 PetscInt nz, k;
2257
2258 PetscFunctionBegin;
2259 /* solve D^(1/2)*U*x = b by back substitution */
2260 PetscCall(VecGetArrayRead(bb, &b));
2261 PetscCall(VecGetArray(xx, &x));
2262
2263 for (k = mbs - 1; k >= 0; k--) {
2264 v = aa + ai[k] + 1;
2265 vj = aj + ai[k] + 1;
2266 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2267 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2268 x[k] = PetscSqrtReal(diagk) * b[k];
2269 nz = ai[k + 1] - ai[k] - 1;
2270 while (nz--) x[k] += (*v++) * x[*vj++];
2271 }
2272 PetscCall(VecRestoreArrayRead(bb, &b));
2273 PetscCall(VecRestoreArray(xx, &x));
2274 PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2275 PetscFunctionReturn(PETSC_SUCCESS);
2276 }
2277
2278 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo * info)2279 static PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2280 {
2281 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b;
2282 const PetscInt *rip, mbs = a->mbs, *ai, *aj;
2283 PetscInt *jutmp, bs = A->rmap->bs, i;
2284 PetscInt m, reallocs = 0, *levtmp;
2285 PetscInt *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
2286 PetscInt incrlev, *lev, shift, prow, nz;
2287 PetscReal f = info->fill, levels = info->levels;
2288 PetscBool perm_identity;
2289
2290 PetscFunctionBegin;
2291 /* check whether perm is the identity mapping */
2292 PetscCall(ISIdentity(perm, &perm_identity));
2293
2294 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2295 a->permute = PETSC_FALSE;
2296 ai = a->i;
2297 aj = a->j;
2298
2299 /* initialization */
2300 PetscCall(ISGetIndices(perm, &rip));
2301 umax = (PetscInt)(f * ai[mbs] + 1);
2302 PetscCall(PetscMalloc1(umax, &lev));
2303 umax += mbs + 1;
2304 shift = mbs + 1;
2305 PetscCall(PetscMalloc1(mbs + 1, &iu));
2306 PetscCall(PetscMalloc1(umax, &ju));
2307 iu[0] = mbs + 1;
2308 juidx = mbs + 1;
2309 /* prowl: linked list for pivot row */
2310 PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp));
2311 /* q: linked list for col index */
2312
2313 for (i = 0; i < mbs; i++) {
2314 prowl[i] = mbs;
2315 q[i] = 0;
2316 }
2317
2318 /* for each row k */
2319 for (k = 0; k < mbs; k++) {
2320 nzk = 0;
2321 q[k] = mbs;
2322 /* copy current row into linked list */
2323 nz = ai[rip[k] + 1] - ai[rip[k]];
2324 j = ai[rip[k]];
2325 while (nz--) {
2326 vj = rip[aj[j++]];
2327 if (vj > k) {
2328 qm = k;
2329 do {
2330 m = qm;
2331 qm = q[m];
2332 } while (qm < vj);
2333 PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
2334 nzk++;
2335 q[m] = vj;
2336 q[vj] = qm;
2337 levtmp[vj] = 0; /* initialize lev for nonzero element */
2338 }
2339 }
2340
2341 /* modify nonzero structure of k-th row by computing fill-in
2342 for each row prow to be merged in */
2343 prow = k;
2344 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2345
2346 while (prow < k) {
2347 /* merge row prow into k-th row */
2348 jmin = iu[prow] + 1;
2349 jmax = iu[prow + 1];
2350 qm = k;
2351 for (j = jmin; j < jmax; j++) {
2352 incrlev = lev[j - shift] + 1;
2353 if (incrlev > levels) continue;
2354 vj = ju[j];
2355 do {
2356 m = qm;
2357 qm = q[m];
2358 } while (qm < vj);
2359 if (qm != vj) { /* a fill */
2360 nzk++;
2361 q[m] = vj;
2362 q[vj] = qm;
2363 qm = vj;
2364 levtmp[vj] = incrlev;
2365 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2366 }
2367 prow = prowl[prow]; /* next pivot row */
2368 }
2369
2370 /* add k to row list for first nonzero element in k-th row */
2371 if (nzk > 1) {
2372 i = q[k]; /* col value of first nonzero element in k_th row of U */
2373 prowl[k] = prowl[i];
2374 prowl[i] = k;
2375 }
2376 iu[k + 1] = iu[k] + nzk;
2377
2378 /* allocate more space to ju and lev if needed */
2379 if (iu[k + 1] > umax) {
2380 /* estimate how much additional space we will need */
2381 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2382 /* just double the memory each time */
2383 maxadd = umax;
2384 if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
2385 umax += maxadd;
2386
2387 /* allocate a longer ju */
2388 PetscCall(PetscMalloc1(umax, &jutmp));
2389 PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
2390 PetscCall(PetscFree(ju));
2391 ju = jutmp;
2392
2393 PetscCall(PetscMalloc1(umax, &jutmp));
2394 PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift));
2395 PetscCall(PetscFree(lev));
2396 lev = jutmp;
2397 reallocs += 2; /* count how many times we realloc */
2398 }
2399
2400 /* save nonzero structure of k-th row in ju */
2401 i = k;
2402 while (nzk--) {
2403 i = q[i];
2404 ju[juidx] = i;
2405 lev[juidx - shift] = levtmp[i];
2406 juidx++;
2407 }
2408 }
2409
2410 #if defined(PETSC_USE_INFO)
2411 if (ai[mbs] != 0) {
2412 PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2413 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
2414 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2415 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
2416 PetscCall(PetscInfo(A, "for best performance.\n"));
2417 } else {
2418 PetscCall(PetscInfo(A, "Empty matrix\n"));
2419 }
2420 #endif
2421
2422 PetscCall(ISRestoreIndices(perm, &rip));
2423 PetscCall(PetscFree3(prowl, q, levtmp));
2424 PetscCall(PetscFree(lev));
2425
2426 /* put together the new matrix */
2427 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL));
2428
2429 b = (Mat_SeqSBAIJ *)B->data;
2430 PetscCall(PetscFree2(b->imax, b->ilen));
2431
2432 /* the next line frees the default space generated by the Create() */
2433 PetscCall(MatSeqXAIJFreeAIJ(B, &b->a, &b->j, &b->i));
2434 PetscCall(PetscShmgetAllocateArray((iu[mbs] + 1) * a->bs2, sizeof(PetscScalar), (void **)&b->a));
2435 b->free_a = PETSC_TRUE;
2436 b->free_ij = PETSC_TRUE;
2437 b->j = ju;
2438 b->i = iu;
2439 b->diag = NULL;
2440 b->ilen = NULL;
2441 b->imax = NULL;
2442
2443 PetscCall(ISDestroy(&b->row));
2444 PetscCall(ISDestroy(&b->icol));
2445 b->row = perm;
2446 b->icol = perm;
2447 PetscCall(PetscObjectReference((PetscObject)perm));
2448 PetscCall(PetscObjectReference((PetscObject)perm));
2449 PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
2450 /* In b structure: Free imax, ilen, old a, old j.
2451 Allocate idnew, solve_work, new a, new j */
2452 b->maxnz = b->nz = iu[mbs];
2453
2454 B->info.factor_mallocs = reallocs;
2455 B->info.fill_ratio_given = f;
2456 if (ai[mbs] != 0) {
2457 B->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2458 } else {
2459 B->info.fill_ratio_needed = 0.0;
2460 }
2461 PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity));
2462 PetscFunctionReturn(PETSC_SUCCESS);
2463 }
2464
2465 /*
2466 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2467 */
2468 #include <petscbt.h>
2469 #include <../src/mat/utils/freespace.h>
MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo * info)2470 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2471 {
2472 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b;
2473 PetscBool perm_identity, free_ij = PETSC_TRUE, diagDense;
2474 PetscInt bs = A->rmap->bs, am = a->mbs, *ai = a->i, *aj = a->j;
2475 const PetscInt *rip;
2476 PetscInt reallocs = 0, i, *ui, *udiag, *cols;
2477 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2478 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr;
2479 PetscReal fill = info->fill, levels = info->levels;
2480 PetscFreeSpaceList free_space = NULL, current_space = NULL;
2481 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2482 PetscBT lnkbt;
2483
2484 PetscFunctionBegin;
2485 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2486 PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, NULL, &diagDense));
2487 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
2488 if (bs > 1) {
2489 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
2490 PetscFunctionReturn(PETSC_SUCCESS);
2491 }
2492
2493 /* check whether perm is the identity mapping */
2494 PetscCall(ISIdentity(perm, &perm_identity));
2495 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2496 a->permute = PETSC_FALSE;
2497
2498 PetscCall(PetscMalloc1(am + 1, &ui));
2499 PetscCall(PetscMalloc1(am + 1, &udiag));
2500 ui[0] = 0;
2501
2502 /* ICC(0) without matrix ordering: simply rearrange column indices */
2503 if (!levels) {
2504 /* reuse the column pointers and row offsets for memory savings */
2505 for (i = 0; i < am; i++) {
2506 ncols = ai[i + 1] - ai[i];
2507 ui[i + 1] = ui[i] + ncols;
2508 udiag[i] = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
2509 }
2510 PetscCall(PetscMalloc1(ui[am], &uj));
2511 cols = uj;
2512 for (i = 0; i < am; i++) {
2513 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2514 ncols = ai[i + 1] - ai[i] - 1;
2515 for (j = 0; j < ncols; j++) *cols++ = aj[j];
2516 *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2517 }
2518 } else { /* case: levels>0 */
2519 PetscCall(ISGetIndices(perm, &rip));
2520
2521 /* initialization */
2522 /* jl: linked list for storing indices of the pivot rows
2523 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2524 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2525 for (i = 0; i < am; i++) {
2526 jl[i] = am;
2527 il[i] = 0;
2528 }
2529
2530 /* create and initialize a linked list for storing column indices of the active row k */
2531 nlnk = am + 1;
2532 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2533
2534 /* initial FreeSpace size is fill*(ai[am]+1) */
2535 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2536
2537 current_space = free_space;
2538
2539 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2540
2541 current_space_lvl = free_space_lvl;
2542
2543 for (k = 0; k < am; k++) { /* for each active row k */
2544 /* initialize lnk by the column indices of row k */
2545 nzk = 0;
2546 ncols = ai[k + 1] - ai[k];
2547 PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
2548 cols = aj + ai[k];
2549 PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2550 nzk += nlnk;
2551
2552 /* update lnk by computing fill-in for each pivot row to be merged in */
2553 prow = jl[k]; /* 1st pivot row */
2554
2555 while (prow < k) {
2556 nextprow = jl[prow];
2557
2558 /* merge prow into k-th row */
2559 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2560 jmax = ui[prow + 1];
2561 ncols = jmax - jmin;
2562 i = jmin - ui[prow];
2563
2564 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2565 uj = uj_lvl_ptr[prow] + i; /* levels of cols */
2566 j = *(uj - 1);
2567 PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2568 nzk += nlnk;
2569
2570 /* update il and jl for prow */
2571 if (jmin < jmax) {
2572 il[prow] = jmin;
2573
2574 j = *cols;
2575 jl[prow] = jl[j];
2576 jl[j] = prow;
2577 }
2578 prow = nextprow;
2579 }
2580
2581 /* if free space is not available, make more free space */
2582 if (current_space->local_remaining < nzk) {
2583 i = am - k + 1; /* num of unfactored rows */
2584 i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2585 PetscCall(PetscFreeSpaceGet(i, ¤t_space));
2586 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl));
2587 reallocs++;
2588 }
2589
2590 /* copy data into free_space and free_space_lvl, then initialize lnk */
2591 PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k);
2592 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2593
2594 /* add the k-th row into il and jl */
2595 if (nzk > 1) {
2596 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2597 jl[k] = jl[i];
2598 jl[i] = k;
2599 il[k] = ui[k] + 1;
2600 }
2601 uj_ptr[k] = current_space->array;
2602 uj_lvl_ptr[k] = current_space_lvl->array;
2603
2604 current_space->array += nzk;
2605 current_space->local_used += nzk;
2606 current_space->local_remaining -= nzk;
2607 current_space_lvl->array += nzk;
2608 current_space_lvl->local_used += nzk;
2609 current_space_lvl->local_remaining -= nzk;
2610
2611 ui[k + 1] = ui[k] + nzk;
2612 }
2613
2614 PetscCall(ISRestoreIndices(perm, &rip));
2615 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2616
2617 /* destroy list of free space and other temporary array(s) */
2618 PetscCall(PetscMalloc1(ui[am], &uj));
2619 PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor */
2620 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2621 PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2622
2623 } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2624
2625 /* put together the new matrix in MATSEQSBAIJ format */
2626 PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2627
2628 b = (Mat_SeqSBAIJ *)fact->data;
2629 PetscCall(PetscFree2(b->imax, b->ilen));
2630 PetscCall(PetscShmgetAllocateArray(ui[am], sizeof(PetscScalar), (void **)&b->a));
2631 b->free_a = PETSC_TRUE;
2632 b->free_ij = free_ij;
2633 b->j = uj;
2634 b->i = ui;
2635 b->diag = udiag;
2636 b->ilen = NULL;
2637 b->imax = NULL;
2638 b->row = perm;
2639 b->col = perm;
2640
2641 PetscCall(PetscObjectReference((PetscObject)perm));
2642 PetscCall(PetscObjectReference((PetscObject)perm));
2643
2644 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2645
2646 PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2647
2648 b->maxnz = b->nz = ui[am];
2649
2650 fact->info.factor_mallocs = reallocs;
2651 fact->info.fill_ratio_given = fill;
2652 if (ai[am] != 0) {
2653 fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am];
2654 } else {
2655 fact->info.fill_ratio_needed = 0.0;
2656 }
2657 #if defined(PETSC_USE_INFO)
2658 if (ai[am] != 0) {
2659 PetscReal af = fact->info.fill_ratio_needed;
2660 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2661 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2662 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2663 } else {
2664 PetscCall(PetscInfo(A, "Empty matrix\n"));
2665 }
2666 #endif
2667 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2668 PetscFunctionReturn(PETSC_SUCCESS);
2669 }
2670
MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo * info)2671 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2672 {
2673 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
2674 Mat_SeqSBAIJ *b;
2675 PetscBool perm_identity, free_ij = PETSC_TRUE;
2676 PetscInt bs = A->rmap->bs, am = a->mbs;
2677 const PetscInt *cols, *rip, *ai = a->i, *aj = a->j;
2678 PetscInt reallocs = 0, i, *ui;
2679 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2680 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
2681 PetscReal fill = info->fill, levels = info->levels, ratio_needed;
2682 PetscFreeSpaceList free_space = NULL, current_space = NULL;
2683 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2684 PetscBT lnkbt;
2685
2686 PetscFunctionBegin;
2687 /*
2688 This code originally uses Modified Sparse Row (MSR) storage
2689 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2690 Then it is rewritten so the factor B takes seqsbaij format. However the associated
2691 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2692 thus the original code in MSR format is still used for these cases.
2693 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2694 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2695 */
2696 if (bs > 1) {
2697 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
2698 PetscFunctionReturn(PETSC_SUCCESS);
2699 }
2700
2701 /* check whether perm is the identity mapping */
2702 PetscCall(ISIdentity(perm, &perm_identity));
2703 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2704 a->permute = PETSC_FALSE;
2705
2706 /* special case that simply copies fill pattern */
2707 if (!levels) {
2708 /* reuse the column pointers and row offsets for memory savings */
2709 ui = a->i;
2710 uj = a->j;
2711 free_ij = PETSC_FALSE;
2712 ratio_needed = 1.0;
2713 } else { /* case: levels>0 */
2714 PetscCall(ISGetIndices(perm, &rip));
2715
2716 /* initialization */
2717 PetscCall(PetscMalloc1(am + 1, &ui));
2718 ui[0] = 0;
2719
2720 /* jl: linked list for storing indices of the pivot rows
2721 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2722 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2723 for (i = 0; i < am; i++) {
2724 jl[i] = am;
2725 il[i] = 0;
2726 }
2727
2728 /* create and initialize a linked list for storing column indices of the active row k */
2729 nlnk = am + 1;
2730 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2731
2732 /* initial FreeSpace size is fill*(ai[am]+1) */
2733 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2734
2735 current_space = free_space;
2736
2737 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2738
2739 current_space_lvl = free_space_lvl;
2740
2741 for (k = 0; k < am; k++) { /* for each active row k */
2742 /* initialize lnk by the column indices of row rip[k] */
2743 nzk = 0;
2744 ncols = ai[rip[k] + 1] - ai[rip[k]];
2745 cols = aj + ai[rip[k]];
2746 PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2747 nzk += nlnk;
2748
2749 /* update lnk by computing fill-in for each pivot row to be merged in */
2750 prow = jl[k]; /* 1st pivot row */
2751
2752 while (prow < k) {
2753 nextprow = jl[prow];
2754
2755 /* merge prow into k-th row */
2756 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2757 jmax = ui[prow + 1];
2758 ncols = jmax - jmin;
2759 i = jmin - ui[prow];
2760 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2761 j = *(uj_lvl_ptr[prow] + i - 1);
2762 cols_lvl = uj_lvl_ptr[prow] + i;
2763 PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2764 nzk += nlnk;
2765
2766 /* update il and jl for prow */
2767 if (jmin < jmax) {
2768 il[prow] = jmin;
2769 j = *cols;
2770 jl[prow] = jl[j];
2771 jl[j] = prow;
2772 }
2773 prow = nextprow;
2774 }
2775
2776 /* if free space is not available, make more free space */
2777 if (current_space->local_remaining < nzk) {
2778 i = am - k + 1; /* num of unfactored rows */
2779 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2780 PetscCall(PetscFreeSpaceGet(i, ¤t_space));
2781 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl));
2782 reallocs++;
2783 }
2784
2785 /* copy data into free_space and free_space_lvl, then initialize lnk */
2786 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2787
2788 /* add the k-th row into il and jl */
2789 if (nzk - 1 > 0) {
2790 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2791 jl[k] = jl[i];
2792 jl[i] = k;
2793 il[k] = ui[k] + 1;
2794 }
2795 uj_ptr[k] = current_space->array;
2796 uj_lvl_ptr[k] = current_space_lvl->array;
2797
2798 current_space->array += nzk;
2799 current_space->local_used += nzk;
2800 current_space->local_remaining -= nzk;
2801 current_space_lvl->array += nzk;
2802 current_space_lvl->local_used += nzk;
2803 current_space_lvl->local_remaining -= nzk;
2804
2805 ui[k + 1] = ui[k] + nzk;
2806 }
2807
2808 PetscCall(ISRestoreIndices(perm, &rip));
2809 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2810
2811 /* destroy list of free space and other temporary array(s) */
2812 PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2813 PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
2814 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2815 PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2816 if (ai[am] != 0) {
2817 ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]);
2818 } else {
2819 ratio_needed = 0.0;
2820 }
2821 } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2822
2823 /* put together the new matrix in MATSEQSBAIJ format */
2824 PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2825
2826 b = (Mat_SeqSBAIJ *)fact->data;
2827 PetscCall(PetscFree2(b->imax, b->ilen));
2828 PetscCall(PetscShmgetAllocateArray(ui[am], sizeof(PetscScalar), (void **)&b->a));
2829 b->free_a = PETSC_TRUE;
2830 b->free_ij = free_ij;
2831 b->j = uj;
2832 b->i = ui;
2833 b->diag = NULL;
2834 b->ilen = NULL;
2835 b->imax = NULL;
2836 b->row = perm;
2837 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2838
2839 PetscCall(PetscObjectReference((PetscObject)perm));
2840 b->icol = perm;
2841 PetscCall(PetscObjectReference((PetscObject)perm));
2842 PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2843
2844 b->maxnz = b->nz = ui[am];
2845
2846 fact->info.factor_mallocs = reallocs;
2847 fact->info.fill_ratio_given = fill;
2848 fact->info.fill_ratio_needed = ratio_needed;
2849 #if defined(PETSC_USE_INFO)
2850 if (ai[am] != 0) {
2851 PetscReal af = fact->info.fill_ratio_needed;
2852 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2853 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2854 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2855 } else {
2856 PetscCall(PetscInfo(A, "Empty matrix\n"));
2857 }
2858 #endif
2859 if (perm_identity) {
2860 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2861 } else {
2862 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2863 }
2864 PetscFunctionReturn(PETSC_SUCCESS);
2865 }
2866