1 /*
2 Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6
MatLUFactorNumeric_SeqBAIJ_2(Mat B,Mat A,const MatFactorInfo * info)7 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
8 {
9 Mat C = B;
10 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
11 IS isrow = b->row, isicol = b->icol;
12 const PetscInt *r, *ic;
13 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
14 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
15 MatScalar *rtmp, *pc, *mwork, *pv;
16 MatScalar *aa = a->a, *v;
17 PetscReal shift = info->shiftamount;
18 const PetscBool allowzeropivot = PetscNot(A->erroriffailure);
19
20 PetscFunctionBegin;
21 PetscCall(ISGetIndices(isrow, &r));
22 PetscCall(ISGetIndices(isicol, &ic));
23
24 /* generate work space needed by the factorization */
25 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
26 PetscCall(PetscArrayzero(rtmp, bs2 * n));
27
28 for (PetscInt i = 0; i < n; i++) {
29 /* zero rtmp */
30 /* L part */
31 PetscInt *pj;
32 PetscInt nzL, nz = bi[i + 1] - bi[i];
33 bjtmp = bj + bi[i];
34 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
35
36 /* U part */
37 nz = bdiag[i] - bdiag[i + 1];
38 bjtmp = bj + bdiag[i + 1] + 1;
39 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
40
41 /* load in initial (unfactored row) */
42 nz = ai[r[i] + 1] - ai[r[i]];
43 ajtmp = aj + ai[r[i]];
44 v = aa + bs2 * ai[r[i]];
45 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
46
47 /* elimination */
48 bjtmp = bj + bi[i];
49 nzL = bi[i + 1] - bi[i];
50 for (PetscInt k = 0; k < nzL; k++) {
51 PetscBool flg = PETSC_FALSE;
52 PetscInt row = bjtmp[k];
53
54 pc = rtmp + bs2 * row;
55 for (PetscInt j = 0; j < bs2; j++) {
56 if (pc[j] != (PetscScalar)0.0) {
57 flg = PETSC_TRUE;
58 break;
59 }
60 }
61 if (flg) {
62 pv = b->a + bs2 * bdiag[row];
63 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
64 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
65
66 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
67 pv = b->a + bs2 * (bdiag[row + 1] + 1);
68 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
69 for (PetscInt j = 0; j < nz; j++) {
70 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
71 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
72 v = rtmp + 4 * pj[j];
73 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
74 pv += 4;
75 }
76 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
77 }
78 }
79
80 /* finished row so stick it into b->a */
81 /* L part */
82 pv = b->a + bs2 * bi[i];
83 pj = b->j + bi[i];
84 nz = bi[i + 1] - bi[i];
85 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
86
87 /* Mark diagonal and invert diagonal for simpler triangular solves */
88 pv = b->a + bs2 * bdiag[i];
89 pj = b->j + bdiag[i];
90 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
91 {
92 PetscBool zeropivotdetected;
93
94 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
95 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
96 }
97
98 /* U part */
99 pv = b->a + bs2 * (bdiag[i + 1] + 1);
100 pj = b->j + bdiag[i + 1] + 1;
101 nz = bdiag[i] - bdiag[i + 1] - 1;
102 for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
103 }
104
105 PetscCall(PetscFree2(rtmp, mwork));
106 PetscCall(ISRestoreIndices(isicol, &ic));
107 PetscCall(ISRestoreIndices(isrow, &r));
108
109 C->ops->solve = MatSolve_SeqBAIJ_2;
110 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
111 C->assembled = PETSC_TRUE;
112
113 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
114 PetscFunctionReturn(PETSC_SUCCESS);
115 }
116
MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B,Mat A,const MatFactorInfo * info)117 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
118 {
119 Mat C = B;
120 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
121 PetscInt i, j, k, nz, nzL, row, *pj;
122 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
123 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
124 MatScalar *rtmp, *pc, *mwork, *pv;
125 MatScalar *aa = a->a, *v;
126 PetscInt flg;
127 PetscReal shift = info->shiftamount;
128 PetscBool allowzeropivot, zeropivotdetected;
129
130 PetscFunctionBegin;
131 allowzeropivot = PetscNot(A->erroriffailure);
132
133 /* generate work space needed by the factorization */
134 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
135 PetscCall(PetscArrayzero(rtmp, bs2 * n));
136
137 for (i = 0; i < n; i++) {
138 /* zero rtmp */
139 /* L part */
140 nz = bi[i + 1] - bi[i];
141 bjtmp = bj + bi[i];
142 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
143
144 /* U part */
145 nz = bdiag[i] - bdiag[i + 1];
146 bjtmp = bj + bdiag[i + 1] + 1;
147 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
148
149 /* load in initial (unfactored row) */
150 nz = ai[i + 1] - ai[i];
151 ajtmp = aj + ai[i];
152 v = aa + bs2 * ai[i];
153 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
154
155 /* elimination */
156 bjtmp = bj + bi[i];
157 nzL = bi[i + 1] - bi[i];
158 for (k = 0; k < nzL; k++) {
159 row = bjtmp[k];
160 pc = rtmp + bs2 * row;
161 for (flg = 0, j = 0; j < bs2; j++) {
162 if (pc[j] != (PetscScalar)0.0) {
163 flg = 1;
164 break;
165 }
166 }
167 if (flg) {
168 pv = b->a + bs2 * bdiag[row];
169 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
170 PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
171
172 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
173 pv = b->a + bs2 * (bdiag[row + 1] + 1);
174 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
175 for (j = 0; j < nz; j++) {
176 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
177 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
178 v = rtmp + 4 * pj[j];
179 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
180 pv += 4;
181 }
182 PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
183 }
184 }
185
186 /* finished row so stick it into b->a */
187 /* L part */
188 pv = b->a + bs2 * bi[i];
189 pj = b->j + bi[i];
190 nz = bi[i + 1] - bi[i];
191 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
192
193 /* Mark diagonal and invert diagonal for simpler triangular solves */
194 pv = b->a + bs2 * bdiag[i];
195 pj = b->j + bdiag[i];
196 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
197 PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
198 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
199
200 /* U part */
201 /*
202 pv = b->a + bs2*bi[2*n-i];
203 pj = b->j + bi[2*n-i];
204 nz = bi[2*n-i+1] - bi[2*n-i] - 1;
205 */
206 pv = b->a + bs2 * (bdiag[i + 1] + 1);
207 pj = b->j + bdiag[i + 1] + 1;
208 nz = bdiag[i] - bdiag[i + 1] - 1;
209 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
210 }
211 PetscCall(PetscFree2(rtmp, mwork));
212
213 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering;
214 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
215 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
216 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
217 C->assembled = PETSC_TRUE;
218
219 PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
220 PetscFunctionReturn(PETSC_SUCCESS);
221 }
222
MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B,Mat A,const MatFactorInfo * info)223 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
224 {
225 Mat C = B;
226 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
227 IS isrow = b->row, isicol = b->icol;
228 const PetscInt *r, *ic;
229 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
230 PetscInt *ajtmpold, *ajtmp, nz, row;
231 PetscInt idx, *ai = a->i, *aj = a->j, *pj;
232 const PetscInt *diag_offset;
233 MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
234 MatScalar p1, p2, p3, p4;
235 MatScalar *ba = b->a, *aa = a->a;
236 PetscReal shift = info->shiftamount;
237 PetscBool allowzeropivot, zeropivotdetected;
238
239 PetscFunctionBegin;
240 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
241 A->factortype = MAT_FACTOR_NONE;
242 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
243 A->factortype = MAT_FACTOR_ILU;
244 allowzeropivot = PetscNot(A->erroriffailure);
245 PetscCall(ISGetIndices(isrow, &r));
246 PetscCall(ISGetIndices(isicol, &ic));
247 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
248
249 for (i = 0; i < n; i++) {
250 nz = bi[i + 1] - bi[i];
251 ajtmp = bj + bi[i];
252 for (j = 0; j < nz; j++) {
253 x = rtmp + 4 * ajtmp[j];
254 x[0] = x[1] = x[2] = x[3] = 0.0;
255 }
256 /* load in initial (unfactored row) */
257 idx = r[i];
258 nz = ai[idx + 1] - ai[idx];
259 ajtmpold = aj + ai[idx];
260 v = aa + 4 * ai[idx];
261 for (j = 0; j < nz; j++) {
262 x = rtmp + 4 * ic[ajtmpold[j]];
263 x[0] = v[0];
264 x[1] = v[1];
265 x[2] = v[2];
266 x[3] = v[3];
267 v += 4;
268 }
269 row = *ajtmp++;
270 while (row < i) {
271 pc = rtmp + 4 * row;
272 p1 = pc[0];
273 p2 = pc[1];
274 p3 = pc[2];
275 p4 = pc[3];
276 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
277 pv = ba + 4 * diag_offset[row];
278 pj = bj + diag_offset[row] + 1;
279 x1 = pv[0];
280 x2 = pv[1];
281 x3 = pv[2];
282 x4 = pv[3];
283 pc[0] = m1 = p1 * x1 + p3 * x2;
284 pc[1] = m2 = p2 * x1 + p4 * x2;
285 pc[2] = m3 = p1 * x3 + p3 * x4;
286 pc[3] = m4 = p2 * x3 + p4 * x4;
287 nz = bi[row + 1] - diag_offset[row] - 1;
288 pv += 4;
289 for (j = 0; j < nz; j++) {
290 x1 = pv[0];
291 x2 = pv[1];
292 x3 = pv[2];
293 x4 = pv[3];
294 x = rtmp + 4 * pj[j];
295 x[0] -= m1 * x1 + m3 * x2;
296 x[1] -= m2 * x1 + m4 * x2;
297 x[2] -= m1 * x3 + m3 * x4;
298 x[3] -= m2 * x3 + m4 * x4;
299 pv += 4;
300 }
301 PetscCall(PetscLogFlops(16.0 * nz + 12.0));
302 }
303 row = *ajtmp++;
304 }
305 /* finished row so stick it into b->a */
306 pv = ba + 4 * bi[i];
307 pj = bj + bi[i];
308 nz = bi[i + 1] - bi[i];
309 for (j = 0; j < nz; j++) {
310 x = rtmp + 4 * pj[j];
311 pv[0] = x[0];
312 pv[1] = x[1];
313 pv[2] = x[2];
314 pv[3] = x[3];
315 pv += 4;
316 }
317 /* invert diagonal block */
318 w = ba + 4 * diag_offset[i];
319 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
320 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
321 }
322
323 PetscCall(PetscFree(rtmp));
324 PetscCall(ISRestoreIndices(isicol, &ic));
325 PetscCall(ISRestoreIndices(isrow, &r));
326
327 C->ops->solve = MatSolve_SeqBAIJ_2_inplace;
328 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
329 C->assembled = PETSC_TRUE;
330
331 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
332 PetscFunctionReturn(PETSC_SUCCESS);
333 }
334 /*
335 Version for when blocks are 2 by 2 Using natural ordering
336 */
MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo * info)337 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
338 {
339 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
340 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
341 PetscInt *ajtmpold, *ajtmp, nz, row;
342 PetscInt *ai = a->i, *aj = a->j, *pj;
343 const PetscInt *diag_offset;
344 MatScalar *pv, *v, *rtmp, *pc, *w, *x;
345 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
346 MatScalar *ba = b->a, *aa = a->a;
347 PetscReal shift = info->shiftamount;
348 PetscBool allowzeropivot, zeropivotdetected;
349
350 PetscFunctionBegin;
351 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
352 A->factortype = MAT_FACTOR_NONE;
353 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
354 A->factortype = MAT_FACTOR_ILU;
355 allowzeropivot = PetscNot(A->erroriffailure);
356 PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
357 for (i = 0; i < n; i++) {
358 nz = bi[i + 1] - bi[i];
359 ajtmp = bj + bi[i];
360 for (j = 0; j < nz; j++) {
361 x = rtmp + 4 * ajtmp[j];
362 x[0] = x[1] = x[2] = x[3] = 0.0;
363 }
364 /* load in initial (unfactored row) */
365 nz = ai[i + 1] - ai[i];
366 ajtmpold = aj + ai[i];
367 v = aa + 4 * ai[i];
368 for (j = 0; j < nz; j++) {
369 x = rtmp + 4 * ajtmpold[j];
370 x[0] = v[0];
371 x[1] = v[1];
372 x[2] = v[2];
373 x[3] = v[3];
374 v += 4;
375 }
376 row = *ajtmp++;
377 while (row < i) {
378 pc = rtmp + 4 * row;
379 p1 = pc[0];
380 p2 = pc[1];
381 p3 = pc[2];
382 p4 = pc[3];
383 if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
384 pv = ba + 4 * diag_offset[row];
385 pj = bj + diag_offset[row] + 1;
386 x1 = pv[0];
387 x2 = pv[1];
388 x3 = pv[2];
389 x4 = pv[3];
390 pc[0] = m1 = p1 * x1 + p3 * x2;
391 pc[1] = m2 = p2 * x1 + p4 * x2;
392 pc[2] = m3 = p1 * x3 + p3 * x4;
393 pc[3] = m4 = p2 * x3 + p4 * x4;
394 nz = bi[row + 1] - diag_offset[row] - 1;
395 pv += 4;
396 for (j = 0; j < nz; j++) {
397 x1 = pv[0];
398 x2 = pv[1];
399 x3 = pv[2];
400 x4 = pv[3];
401 x = rtmp + 4 * pj[j];
402 x[0] -= m1 * x1 + m3 * x2;
403 x[1] -= m2 * x1 + m4 * x2;
404 x[2] -= m1 * x3 + m3 * x4;
405 x[3] -= m2 * x3 + m4 * x4;
406 pv += 4;
407 }
408 PetscCall(PetscLogFlops(16.0 * nz + 12.0));
409 }
410 row = *ajtmp++;
411 }
412 /* finished row so stick it into b->a */
413 pv = ba + 4 * bi[i];
414 pj = bj + bi[i];
415 nz = bi[i + 1] - bi[i];
416 for (j = 0; j < nz; j++) {
417 x = rtmp + 4 * pj[j];
418 pv[0] = x[0];
419 pv[1] = x[1];
420 pv[2] = x[2];
421 pv[3] = x[3];
422 /*
423 printf(" col %d:",pj[j]);
424 PetscInt j1;
425 for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
426 printf("\n");
427 */
428 pv += 4;
429 }
430 /* invert diagonal block */
431 w = ba + 4 * diag_offset[i];
432 PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
433 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
434 }
435
436 PetscCall(PetscFree(rtmp));
437
438 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
439 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
440 C->assembled = PETSC_TRUE;
441
442 PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
443 PetscFunctionReturn(PETSC_SUCCESS);
444 }
445
446 /*
447 Version for when blocks are 1 by 1.
448 */
MatLUFactorNumeric_SeqBAIJ_1(Mat B,Mat A,const MatFactorInfo * info)449 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
450 {
451 Mat C = B;
452 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
453 IS isrow = b->row, isicol = b->icol;
454 const PetscInt *r, *ic, *ics;
455 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
456 PetscInt i, j, k, nz, nzL, row, *pj;
457 const PetscInt *ajtmp, *bjtmp;
458 MatScalar *rtmp, *pc, multiplier, *pv;
459 const MatScalar *aa = a->a, *v;
460 PetscBool row_identity, col_identity;
461 FactorShiftCtx sctx;
462 const PetscInt *ddiag;
463 PetscReal rs;
464 MatScalar d;
465
466 PetscFunctionBegin;
467 /* MatPivotSetUp(): initialize shift context sctx */
468 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
469
470 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
471 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &ddiag, NULL));
472 sctx.shift_top = info->zeropivot;
473 for (i = 0; i < n; i++) {
474 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
475 d = (aa)[ddiag[i]];
476 rs = -PetscAbsScalar(d) - PetscRealPart(d);
477 v = aa + ai[i];
478 nz = ai[i + 1] - ai[i];
479 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
480 if (rs > sctx.shift_top) sctx.shift_top = rs;
481 }
482 sctx.shift_top *= 1.1;
483 sctx.nshift_max = 5;
484 sctx.shift_lo = 0.;
485 sctx.shift_hi = 1.;
486 }
487
488 PetscCall(ISGetIndices(isrow, &r));
489 PetscCall(ISGetIndices(isicol, &ic));
490 PetscCall(PetscMalloc1(n + 1, &rtmp));
491 ics = ic;
492
493 do {
494 sctx.newshift = PETSC_FALSE;
495 for (i = 0; i < n; i++) {
496 /* zero rtmp */
497 /* L part */
498 nz = bi[i + 1] - bi[i];
499 bjtmp = bj + bi[i];
500 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
501
502 /* U part */
503 nz = bdiag[i] - bdiag[i + 1];
504 bjtmp = bj + bdiag[i + 1] + 1;
505 for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
506
507 /* load in initial (unfactored row) */
508 nz = ai[r[i] + 1] - ai[r[i]];
509 ajtmp = aj + ai[r[i]];
510 v = aa + ai[r[i]];
511 for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
512
513 /* ZeropivotApply() */
514 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
515
516 /* elimination */
517 bjtmp = bj + bi[i];
518 row = *bjtmp++;
519 nzL = bi[i + 1] - bi[i];
520 for (k = 0; k < nzL; k++) {
521 pc = rtmp + row;
522 if (*pc != (PetscScalar)0.0) {
523 pv = b->a + bdiag[row];
524 multiplier = *pc * (*pv);
525 *pc = multiplier;
526
527 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
528 pv = b->a + bdiag[row + 1] + 1;
529 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
530 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
531 PetscCall(PetscLogFlops(2.0 * nz));
532 }
533 row = *bjtmp++;
534 }
535
536 /* finished row so stick it into b->a */
537 rs = 0.0;
538 /* L part */
539 pv = b->a + bi[i];
540 pj = b->j + bi[i];
541 nz = bi[i + 1] - bi[i];
542 for (j = 0; j < nz; j++) {
543 pv[j] = rtmp[pj[j]];
544 rs += PetscAbsScalar(pv[j]);
545 }
546
547 /* U part */
548 pv = b->a + bdiag[i + 1] + 1;
549 pj = b->j + bdiag[i + 1] + 1;
550 nz = bdiag[i] - bdiag[i + 1] - 1;
551 for (j = 0; j < nz; j++) {
552 pv[j] = rtmp[pj[j]];
553 rs += PetscAbsScalar(pv[j]);
554 }
555
556 sctx.rs = rs;
557 sctx.pv = rtmp[i];
558 PetscCall(MatPivotCheck(B, A, info, &sctx, i));
559 if (sctx.newshift) break; /* break for-loop */
560 rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
561
562 /* Mark diagonal and invert diagonal for simpler triangular solves */
563 pv = b->a + bdiag[i];
564 *pv = (PetscScalar)1.0 / rtmp[i];
565
566 } /* endof for (i=0; i<n; i++) { */
567
568 /* MatPivotRefine() */
569 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
570 /*
571 * if no shift in this attempt & shifting & started shifting & can refine,
572 * then try lower shift
573 */
574 sctx.shift_hi = sctx.shift_fraction;
575 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
576 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
577 sctx.newshift = PETSC_TRUE;
578 sctx.nshift++;
579 }
580 } while (sctx.newshift);
581
582 PetscCall(PetscFree(rtmp));
583 PetscCall(ISRestoreIndices(isicol, &ic));
584 PetscCall(ISRestoreIndices(isrow, &r));
585
586 PetscCall(ISIdentity(isrow, &row_identity));
587 PetscCall(ISIdentity(isicol, &col_identity));
588 if (row_identity && col_identity) {
589 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering;
590 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
591 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
592 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
593 } else {
594 C->ops->solve = MatSolve_SeqBAIJ_1;
595 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
596 }
597 C->assembled = PETSC_TRUE;
598 PetscCall(PetscLogFlops(C->cmap->n));
599
600 /* MatShiftView(A,info,&sctx) */
601 if (sctx.nshift) {
602 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
603 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
604 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
605 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
606 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
607 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
608 }
609 }
610 PetscFunctionReturn(PETSC_SUCCESS);
611 }
612
MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C,Mat A,const MatFactorInfo * info)613 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
614 {
615 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
616 IS isrow = b->row, isicol = b->icol;
617 const PetscInt *r, *ic;
618 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
619 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
620 PetscInt diag, *pj;
621 const PetscInt *diag_offset;
622 MatScalar *pv, *v, *rtmp, multiplier, *pc;
623 MatScalar *ba = b->a, *aa = a->a;
624 PetscBool row_identity, col_identity;
625
626 PetscFunctionBegin;
627 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
628 A->factortype = MAT_FACTOR_NONE;
629 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
630 A->factortype = MAT_FACTOR_ILU;
631 PetscCall(ISGetIndices(isrow, &r));
632 PetscCall(ISGetIndices(isicol, &ic));
633 PetscCall(PetscMalloc1(n + 1, &rtmp));
634
635 for (i = 0; i < n; i++) {
636 nz = bi[i + 1] - bi[i];
637 ajtmp = bj + bi[i];
638 for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
639
640 /* load in initial (unfactored row) */
641 nz = ai[r[i] + 1] - ai[r[i]];
642 ajtmpold = aj + ai[r[i]];
643 v = aa + ai[r[i]];
644 for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
645
646 row = *ajtmp++;
647 while (row < i) {
648 pc = rtmp + row;
649 if (*pc != 0.0) {
650 pv = ba + diag_offset[row];
651 pj = bj + diag_offset[row] + 1;
652 multiplier = *pc * *pv++;
653 *pc = multiplier;
654 nz = bi[row + 1] - diag_offset[row] - 1;
655 for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
656 PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
657 }
658 row = *ajtmp++;
659 }
660 /* finished row so stick it into b->a */
661 pv = ba + bi[i];
662 pj = bj + bi[i];
663 nz = bi[i + 1] - bi[i];
664 for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
665 diag = diag_offset[i] - bi[i];
666 /* check pivot entry for current row */
667 PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
668 pv[diag] = 1.0 / pv[diag];
669 }
670
671 PetscCall(PetscFree(rtmp));
672 PetscCall(ISRestoreIndices(isicol, &ic));
673 PetscCall(ISRestoreIndices(isrow, &r));
674 PetscCall(ISIdentity(isrow, &row_identity));
675 PetscCall(ISIdentity(isicol, &col_identity));
676 if (row_identity && col_identity) {
677 C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
678 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
679 } else {
680 C->ops->solve = MatSolve_SeqBAIJ_1_inplace;
681 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
682 }
683 C->assembled = PETSC_TRUE;
684 PetscCall(PetscLogFlops(C->cmap->n));
685 PetscFunctionReturn(PETSC_SUCCESS);
686 }
687
MatFactorGetSolverType_petsc(Mat A,MatSolverType * type)688 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
689 {
690 PetscFunctionBegin;
691 *type = MATSOLVERPETSC;
692 PetscFunctionReturn(PETSC_SUCCESS);
693 }
694
MatGetFactor_seqbaij_petsc(Mat A,MatFactorType ftype,Mat * B)695 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
696 {
697 PetscInt n = A->rmap->n;
698
699 PetscFunctionBegin;
700 PetscCheck(!PetscDefined(USE_COMPLEX) || A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
701 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
702 PetscCall(MatSetSizes(*B, n, n, n, n));
703 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
704 PetscCall(MatSetType(*B, MATSEQBAIJ));
705
706 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ;
707 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
708 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
709 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
710 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
711 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
712 PetscCall(MatSetType(*B, MATSEQSBAIJ));
713 PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
714
715 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ;
716 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
717 /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
718 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
719 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
720 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
721 (*B)->factortype = ftype;
722 (*B)->canuseordering = PETSC_TRUE;
723
724 PetscCall(PetscFree((*B)->solvertype));
725 PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
726 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
727 PetscFunctionReturn(PETSC_SUCCESS);
728 }
729
MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,const MatFactorInfo * info)730 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
731 {
732 Mat C;
733
734 PetscFunctionBegin;
735 PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
736 PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
737 PetscCall(MatLUFactorNumeric(C, A, info));
738
739 A->ops->solve = C->ops->solve;
740 A->ops->solvetranspose = C->ops->solvetranspose;
741
742 PetscCall(MatHeaderMerge(A, &C));
743 PetscFunctionReturn(PETSC_SUCCESS);
744 }
745
746 #include <../src/mat/impls/sbaij/seq/sbaij.h>
MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C,Mat A,const MatFactorInfo * info)747 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
748 {
749 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
750 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
751 IS ip = b->row;
752 const PetscInt *rip;
753 PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
754 PetscInt *ai = a->i, *aj = a->j;
755 PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
756 MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
757 PetscReal rs;
758 FactorShiftCtx sctx;
759
760 PetscFunctionBegin;
761 if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
762 if (!a->sbaijMat) {
763 A->symmetric = PETSC_BOOL3_TRUE;
764 if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
765 PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
766 }
767 PetscCall(a->sbaijMat->ops->choleskyfactornumeric(C, a->sbaijMat, info));
768 PetscCall(MatDestroy(&a->sbaijMat));
769 PetscFunctionReturn(PETSC_SUCCESS);
770 }
771
772 /* MatPivotSetUp(): initialize shift context sctx */
773 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
774
775 PetscCall(ISGetIndices(ip, &rip));
776 PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
777
778 sctx.shift_amount = 0.;
779 sctx.nshift = 0;
780 do {
781 sctx.newshift = PETSC_FALSE;
782 for (i = 0; i < mbs; i++) {
783 rtmp[i] = 0.0;
784 jl[i] = mbs;
785 il[0] = 0;
786 }
787
788 for (k = 0; k < mbs; k++) {
789 bval = ba + bi[k];
790 /* initialize k-th row by the perm[k]-th row of A */
791 jmin = ai[rip[k]];
792 jmax = ai[rip[k] + 1];
793 for (j = jmin; j < jmax; j++) {
794 col = rip[aj[j]];
795 if (col >= k) { /* only take upper triangular entry */
796 rtmp[col] = aa[j];
797 *bval++ = 0.0; /* for in-place factorization */
798 }
799 }
800
801 /* shift the diagonal of the matrix */
802 if (sctx.nshift) rtmp[k] += sctx.shift_amount;
803
804 /* modify k-th row by adding in those rows i with U(i,k)!=0 */
805 dk = rtmp[k];
806 i = jl[k]; /* first row to be added to k_th row */
807
808 while (i < k) {
809 nexti = jl[i]; /* next row to be added to k_th row */
810
811 /* compute multiplier, update diag(k) and U(i,k) */
812 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
813 uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
814 dk += uikdi * ba[ili];
815 ba[ili] = uikdi; /* -U(i,k) */
816
817 /* add multiple of row i to k-th row */
818 jmin = ili + 1;
819 jmax = bi[i + 1];
820 if (jmin < jmax) {
821 for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
822 /* update il and jl for row i */
823 il[i] = jmin;
824 j = bj[jmin];
825 jl[i] = jl[j];
826 jl[j] = i;
827 }
828 i = nexti;
829 }
830
831 /* shift the diagonals when zero pivot is detected */
832 /* compute rs=sum of abs(off-diagonal) */
833 rs = 0.0;
834 jmin = bi[k] + 1;
835 nz = bi[k + 1] - jmin;
836 if (nz) {
837 bcol = bj + jmin;
838 while (nz--) {
839 rs += PetscAbsScalar(rtmp[*bcol]);
840 bcol++;
841 }
842 }
843
844 sctx.rs = rs;
845 sctx.pv = dk;
846 PetscCall(MatPivotCheck(C, A, info, &sctx, k));
847 if (sctx.newshift) break;
848 dk = sctx.pv;
849
850 /* copy data into U(k,:) */
851 ba[bi[k]] = 1.0 / dk; /* U(k,k) */
852 jmin = bi[k] + 1;
853 jmax = bi[k + 1];
854 if (jmin < jmax) {
855 for (j = jmin; j < jmax; j++) {
856 col = bj[j];
857 ba[j] = rtmp[col];
858 rtmp[col] = 0.0;
859 }
860 /* add the k-th row into il and jl */
861 il[k] = jmin;
862 i = bj[jmin];
863 jl[k] = jl[i];
864 jl[i] = k;
865 }
866 }
867 } while (sctx.newshift);
868 PetscCall(PetscFree3(rtmp, il, jl));
869
870 PetscCall(ISRestoreIndices(ip, &rip));
871
872 C->assembled = PETSC_TRUE;
873 C->preallocated = PETSC_TRUE;
874
875 PetscCall(PetscLogFlops(C->rmap->N));
876 if (sctx.nshift) {
877 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
878 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
879 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
880 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
881 }
882 }
883 PetscFunctionReturn(PETSC_SUCCESS);
884 }
885
MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo * info)886 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
887 {
888 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
889 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data;
890 PetscInt i, j, am = a->mbs;
891 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
892 PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
893 MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
894 PetscReal rs;
895 FactorShiftCtx sctx;
896
897 PetscFunctionBegin;
898 /* MatPivotSetUp(): initialize shift context sctx */
899 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
900
901 PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
902
903 do {
904 sctx.newshift = PETSC_FALSE;
905 for (i = 0; i < am; i++) {
906 rtmp[i] = 0.0;
907 jl[i] = am;
908 il[0] = 0;
909 }
910
911 for (k = 0; k < am; k++) {
912 /* initialize k-th row with elements nonzero in row perm(k) of A */
913 nz = ai[k + 1] - ai[k];
914 acol = aj + ai[k];
915 aval = aa + ai[k];
916 bval = ba + bi[k];
917 while (nz--) {
918 if (*acol < k) { /* skip lower triangular entries */
919 acol++;
920 aval++;
921 } else {
922 rtmp[*acol++] = *aval++;
923 *bval++ = 0.0; /* for in-place factorization */
924 }
925 }
926
927 /* shift the diagonal of the matrix */
928 if (sctx.nshift) rtmp[k] += sctx.shift_amount;
929
930 /* modify k-th row by adding in those rows i with U(i,k)!=0 */
931 dk = rtmp[k];
932 i = jl[k]; /* first row to be added to k_th row */
933
934 while (i < k) {
935 nexti = jl[i]; /* next row to be added to k_th row */
936 /* compute multiplier, update D(k) and U(i,k) */
937 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
938 uikdi = -ba[ili] * ba[bi[i]];
939 dk += uikdi * ba[ili];
940 ba[ili] = uikdi; /* -U(i,k) */
941
942 /* add multiple of row i to k-th row ... */
943 jmin = ili + 1;
944 nz = bi[i + 1] - jmin;
945 if (nz > 0) {
946 bcol = bj + jmin;
947 bval = ba + jmin;
948 while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
949 /* update il and jl for i-th row */
950 il[i] = jmin;
951 j = bj[jmin];
952 jl[i] = jl[j];
953 jl[j] = i;
954 }
955 i = nexti;
956 }
957
958 /* shift the diagonals when zero pivot is detected */
959 /* compute rs=sum of abs(off-diagonal) */
960 rs = 0.0;
961 jmin = bi[k] + 1;
962 nz = bi[k + 1] - jmin;
963 if (nz) {
964 bcol = bj + jmin;
965 while (nz--) {
966 rs += PetscAbsScalar(rtmp[*bcol]);
967 bcol++;
968 }
969 }
970
971 sctx.rs = rs;
972 sctx.pv = dk;
973 PetscCall(MatPivotCheck(C, A, info, &sctx, k));
974 if (sctx.newshift) break; /* sctx.shift_amount is updated */
975 dk = sctx.pv;
976
977 /* copy data into U(k,:) */
978 ba[bi[k]] = 1.0 / dk;
979 jmin = bi[k] + 1;
980 nz = bi[k + 1] - jmin;
981 if (nz) {
982 bcol = bj + jmin;
983 bval = ba + jmin;
984 while (nz--) {
985 *bval++ = rtmp[*bcol];
986 rtmp[*bcol++] = 0.0;
987 }
988 /* add k-th row into il and jl */
989 il[k] = jmin;
990 i = bj[jmin];
991 jl[k] = jl[i];
992 jl[i] = k;
993 }
994 }
995 } while (sctx.newshift);
996 PetscCall(PetscFree3(rtmp, il, jl));
997
998 C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
999 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1000 C->assembled = PETSC_TRUE;
1001 C->preallocated = PETSC_TRUE;
1002
1003 PetscCall(PetscLogFlops(C->rmap->N));
1004 if (sctx.nshift) {
1005 if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1006 PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1007 } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1008 PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1009 }
1010 }
1011 PetscFunctionReturn(PETSC_SUCCESS);
1012 }
1013
1014 #include <petscbt.h>
1015 #include <../src/mat/utils/freespace.h>
MatICCFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo * info)1016 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1017 {
1018 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1019 Mat_SeqSBAIJ *b;
1020 Mat B;
1021 PetscBool perm_identity;
1022 PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1023 const PetscInt *rip;
1024 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1025 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1026 PetscReal fill = info->fill, levels = info->levels;
1027 PetscFreeSpaceList free_space = NULL, current_space = NULL;
1028 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1029 PetscBT lnkbt;
1030 PetscBool diagDense;
1031 const PetscInt *adiag;
1032
1033 PetscFunctionBegin;
1034 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &adiag, &diagDense));
1035 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1036
1037 if (bs > 1) {
1038 if (!a->sbaijMat) {
1039 A->symmetric = PETSC_BOOL3_TRUE;
1040 if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
1041 PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1042 }
1043 fact->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1044
1045 PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1046 PetscFunctionReturn(PETSC_SUCCESS);
1047 }
1048
1049 PetscCall(ISIdentity(perm, &perm_identity));
1050 PetscCall(ISGetIndices(perm, &rip));
1051
1052 /* special case that simply copies fill pattern */
1053 if (!levels && perm_identity) {
1054 PetscCall(PetscMalloc1(am + 1, &ui));
1055 for (i = 0; i < am; i++) ui[i] = ai[i + 1] - adiag[i]; /* ui: rowlengths - changes when !perm_identity */
1056 B = fact;
1057 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1058
1059 b = (Mat_SeqSBAIJ *)B->data;
1060 uj = b->j;
1061 for (i = 0; i < am; i++) {
1062 aj = a->j + adiag[i];
1063 for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1064 b->ilen[i] = ui[i];
1065 }
1066 PetscCall(PetscFree(ui));
1067
1068 B->factortype = MAT_FACTOR_NONE;
1069
1070 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1071 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1072 B->factortype = MAT_FACTOR_ICC;
1073
1074 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1075 PetscFunctionReturn(PETSC_SUCCESS);
1076 }
1077
1078 /* initialization */
1079 PetscCall(PetscMalloc1(am + 1, &ui));
1080 ui[0] = 0;
1081 PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1082
1083 /* jl: linked list for storing indices of the pivot rows
1084 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1085 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1086 for (i = 0; i < am; i++) {
1087 jl[i] = am;
1088 il[i] = 0;
1089 }
1090
1091 /* create and initialize a linked list for storing column indices of the active row k */
1092 nlnk = am + 1;
1093 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1094
1095 /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1096 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1097
1098 current_space = free_space;
1099
1100 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1101 current_space_lvl = free_space_lvl;
1102
1103 for (k = 0; k < am; k++) { /* for each active row k */
1104 /* initialize lnk by the column indices of row rip[k] of A */
1105 nzk = 0;
1106 ncols = ai[rip[k] + 1] - ai[rip[k]];
1107 ncols_upper = 0;
1108 cols = cols_lvl + am;
1109 for (j = 0; j < ncols; j++) {
1110 i = rip[*(aj + ai[rip[k]] + j)];
1111 if (i >= k) { /* only take upper triangular entry */
1112 cols[ncols_upper] = i;
1113 cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1114 ncols_upper++;
1115 }
1116 }
1117 PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1118 nzk += nlnk;
1119
1120 /* update lnk by computing fill-in for each pivot row to be merged in */
1121 prow = jl[k]; /* 1st pivot row */
1122
1123 while (prow < k) {
1124 nextprow = jl[prow];
1125
1126 /* merge prow into k-th row */
1127 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1128 jmax = ui[prow + 1];
1129 ncols = jmax - jmin;
1130 i = jmin - ui[prow];
1131 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1132 for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1133 PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1134 nzk += nlnk;
1135
1136 /* update il and jl for prow */
1137 if (jmin < jmax) {
1138 il[prow] = jmin;
1139
1140 j = *cols;
1141 jl[prow] = jl[j];
1142 jl[j] = prow;
1143 }
1144 prow = nextprow;
1145 }
1146
1147 /* if free space is not available, make more free space */
1148 if (current_space->local_remaining < nzk) {
1149 i = am - k + 1; /* num of unfactored rows */
1150 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1151 PetscCall(PetscFreeSpaceGet(i, ¤t_space));
1152 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl));
1153 reallocs++;
1154 }
1155
1156 /* copy data into free_space and free_space_lvl, then initialize lnk */
1157 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1158
1159 /* add the k-th row into il and jl */
1160 if (nzk - 1 > 0) {
1161 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1162 jl[k] = jl[i];
1163 jl[i] = k;
1164 il[k] = ui[k] + 1;
1165 }
1166 uj_ptr[k] = current_space->array;
1167 uj_lvl_ptr[k] = current_space_lvl->array;
1168
1169 current_space->array += nzk;
1170 current_space->local_used += nzk;
1171 current_space->local_remaining -= nzk;
1172
1173 current_space_lvl->array += nzk;
1174 current_space_lvl->local_used += nzk;
1175 current_space_lvl->local_remaining -= nzk;
1176
1177 ui[k + 1] = ui[k] + nzk;
1178 }
1179
1180 PetscCall(ISRestoreIndices(perm, &rip));
1181 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1182 PetscCall(PetscFree(cols_lvl));
1183
1184 /* copy free_space into uj and free free_space; set uj in new datastructure; */
1185 PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1186 PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1187 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1188 PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1189
1190 /* put together the new matrix in MATSEQSBAIJ format */
1191 B = fact;
1192 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1193
1194 b = (Mat_SeqSBAIJ *)B->data;
1195 b->free_a = PETSC_TRUE;
1196 b->free_ij = PETSC_TRUE;
1197
1198 PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1199
1200 b->j = uj;
1201 b->i = ui;
1202 b->diag = NULL;
1203 b->ilen = NULL;
1204 b->imax = NULL;
1205 b->row = perm;
1206 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1207
1208 PetscCall(PetscObjectReference((PetscObject)perm));
1209
1210 b->icol = perm;
1211
1212 PetscCall(PetscObjectReference((PetscObject)perm));
1213 PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1214
1215 b->maxnz = b->nz = ui[am];
1216
1217 B->info.factor_mallocs = reallocs;
1218 B->info.fill_ratio_given = fill;
1219 if (ai[am] != 0.) {
1220 /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1221 B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1222 } else {
1223 B->info.fill_ratio_needed = 0.0;
1224 }
1225 #if defined(PETSC_USE_INFO)
1226 if (ai[am] != 0) {
1227 PetscReal af = B->info.fill_ratio_needed;
1228 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1229 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1230 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1231 } else {
1232 PetscCall(PetscInfo(A, "Empty matrix\n"));
1233 }
1234 #endif
1235 if (perm_identity) {
1236 B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1237 B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1238 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1239 } else {
1240 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1241 }
1242 PetscFunctionReturn(PETSC_SUCCESS);
1243 }
1244
MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo * info)1245 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1246 {
1247 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1248 Mat_SeqSBAIJ *b;
1249 Mat B;
1250 PetscBool perm_identity;
1251 PetscReal fill = info->fill;
1252 const PetscInt *rip;
1253 PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1254 PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1255 PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1256 PetscFreeSpaceList free_space = NULL, current_space = NULL;
1257 PetscBT lnkbt;
1258 PetscBool diagDense;
1259
1260 PetscFunctionBegin;
1261 if (bs > 1) { /* convert to seqsbaij */
1262 if (!a->sbaijMat) {
1263 A->symmetric = PETSC_BOOL3_TRUE;
1264 if (!PetscDefined(USE_COMPLEX)) A->hermitian = PETSC_BOOL3_TRUE;
1265 PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1266 }
1267 fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1268
1269 PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1270 PetscFunctionReturn(PETSC_SUCCESS);
1271 }
1272 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, NULL, &diagDense));
1273 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry");
1274
1275 /* check whether perm is the identity mapping */
1276 PetscCall(ISIdentity(perm, &perm_identity));
1277 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1278 PetscCall(ISGetIndices(perm, &rip));
1279
1280 /* initialization */
1281 PetscCall(PetscMalloc1(mbs + 1, &ui));
1282 ui[0] = 0;
1283
1284 /* jl: linked list for storing indices of the pivot rows
1285 il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1286 PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1287 for (i = 0; i < mbs; i++) {
1288 jl[i] = mbs;
1289 il[i] = 0;
1290 }
1291
1292 /* create and initialize a linked list for storing column indices of the active row k */
1293 nlnk = mbs + 1;
1294 PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1295
1296 /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1297 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1298
1299 current_space = free_space;
1300
1301 for (k = 0; k < mbs; k++) { /* for each active row k */
1302 /* initialize lnk by the column indices of row rip[k] of A */
1303 nzk = 0;
1304 ncols = ai[rip[k] + 1] - ai[rip[k]];
1305 ncols_upper = 0;
1306 for (j = 0; j < ncols; j++) {
1307 i = rip[*(aj + ai[rip[k]] + j)];
1308 if (i >= k) { /* only take upper triangular entry */
1309 cols[ncols_upper] = i;
1310 ncols_upper++;
1311 }
1312 }
1313 PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1314 nzk += nlnk;
1315
1316 /* update lnk by computing fill-in for each pivot row to be merged in */
1317 prow = jl[k]; /* 1st pivot row */
1318
1319 while (prow < k) {
1320 nextprow = jl[prow];
1321 /* merge prow into k-th row */
1322 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1323 jmax = ui[prow + 1];
1324 ncols = jmax - jmin;
1325 uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1326 PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1327 nzk += nlnk;
1328
1329 /* update il and jl for prow */
1330 if (jmin < jmax) {
1331 il[prow] = jmin;
1332 j = *uj_ptr;
1333 jl[prow] = jl[j];
1334 jl[j] = prow;
1335 }
1336 prow = nextprow;
1337 }
1338
1339 /* if free space is not available, make more free space */
1340 if (current_space->local_remaining < nzk) {
1341 i = mbs - k + 1; /* num of unfactored rows */
1342 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1343 PetscCall(PetscFreeSpaceGet(i, ¤t_space));
1344 reallocs++;
1345 }
1346
1347 /* copy data into free space, then initialize lnk */
1348 PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1349
1350 /* add the k-th row into il and jl */
1351 if (nzk - 1 > 0) {
1352 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1353 jl[k] = jl[i];
1354 jl[i] = k;
1355 il[k] = ui[k] + 1;
1356 }
1357 ui_ptr[k] = current_space->array;
1358 current_space->array += nzk;
1359 current_space->local_used += nzk;
1360 current_space->local_remaining -= nzk;
1361
1362 ui[k + 1] = ui[k] + nzk;
1363 }
1364
1365 PetscCall(ISRestoreIndices(perm, &rip));
1366 PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1367
1368 /* copy free_space into uj and free free_space; set uj in new datastructure; */
1369 PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1370 PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1371 PetscCall(PetscLLDestroy(lnk, lnkbt));
1372
1373 /* put together the new matrix in MATSEQSBAIJ format */
1374 B = fact;
1375 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1376
1377 b = (Mat_SeqSBAIJ *)B->data;
1378 b->free_a = PETSC_TRUE;
1379 b->free_ij = PETSC_TRUE;
1380
1381 PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1382
1383 b->j = uj;
1384 b->i = ui;
1385 b->diag = NULL;
1386 b->ilen = NULL;
1387 b->imax = NULL;
1388 b->row = perm;
1389 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1390
1391 PetscCall(PetscObjectReference((PetscObject)perm));
1392 b->icol = perm;
1393 PetscCall(PetscObjectReference((PetscObject)perm));
1394 PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1395 b->maxnz = b->nz = ui[mbs];
1396
1397 B->info.factor_mallocs = reallocs;
1398 B->info.fill_ratio_given = fill;
1399 if (ai[mbs] != 0.) {
1400 /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1401 B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1402 } else {
1403 B->info.fill_ratio_needed = 0.0;
1404 }
1405 #if defined(PETSC_USE_INFO)
1406 if (ai[mbs] != 0.) {
1407 PetscReal af = B->info.fill_ratio_needed;
1408 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1409 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1410 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1411 } else {
1412 PetscCall(PetscInfo(A, "Empty matrix\n"));
1413 }
1414 #endif
1415 if (perm_identity) {
1416 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1417 } else {
1418 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1419 }
1420 PetscFunctionReturn(PETSC_SUCCESS);
1421 }
1422
MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)1423 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1424 {
1425 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1426 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1427 PetscInt i, k, n = a->mbs;
1428 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1429 const MatScalar *aa = a->a, *v;
1430 PetscScalar *x, *s, *t, *ls;
1431 const PetscScalar *b;
1432
1433 PetscFunctionBegin;
1434 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1435 PetscCall(VecGetArrayRead(bb, &b));
1436 PetscCall(VecGetArray(xx, &x));
1437 t = a->solve_work;
1438
1439 /* forward solve the lower triangular */
1440 PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1441
1442 for (i = 1; i < n; i++) {
1443 v = aa + bs2 * ai[i];
1444 vi = aj + ai[i];
1445 nz = ai[i + 1] - ai[i];
1446 s = t + bs * i;
1447 PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1448 for (k = 0; k < nz; k++) {
1449 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1450 v += bs2;
1451 }
1452 }
1453
1454 /* backward solve the upper triangular */
1455 ls = a->solve_work + A->cmap->n;
1456 for (i = n - 1; i >= 0; i--) {
1457 v = aa + bs2 * (adiag[i + 1] + 1);
1458 vi = aj + adiag[i + 1] + 1;
1459 nz = adiag[i] - adiag[i + 1] - 1;
1460 PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1461 for (k = 0; k < nz; k++) {
1462 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1463 v += bs2;
1464 }
1465 PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1466 PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1467 }
1468
1469 PetscCall(VecRestoreArrayRead(bb, &b));
1470 PetscCall(VecRestoreArray(xx, &x));
1471 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1472 PetscFunctionReturn(PETSC_SUCCESS);
1473 }
1474
MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)1475 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1476 {
1477 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1478 IS iscol = a->col, isrow = a->row;
1479 const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1480 PetscInt i, m, n = a->mbs;
1481 PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2;
1482 const MatScalar *aa = a->a, *v;
1483 PetscScalar *x, *s, *t, *ls;
1484 const PetscScalar *b;
1485
1486 PetscFunctionBegin;
1487 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
1488 PetscCall(VecGetArrayRead(bb, &b));
1489 PetscCall(VecGetArray(xx, &x));
1490 t = a->solve_work;
1491
1492 PetscCall(ISGetIndices(isrow, &rout));
1493 r = rout;
1494 PetscCall(ISGetIndices(iscol, &cout));
1495 c = cout;
1496
1497 /* forward solve the lower triangular */
1498 PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1499 for (i = 1; i < n; i++) {
1500 v = aa + bs2 * ai[i];
1501 vi = aj + ai[i];
1502 nz = ai[i + 1] - ai[i];
1503 s = t + bs * i;
1504 PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1505 for (m = 0; m < nz; m++) {
1506 PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1507 v += bs2;
1508 }
1509 }
1510
1511 /* backward solve the upper triangular */
1512 ls = a->solve_work + A->cmap->n;
1513 for (i = n - 1; i >= 0; i--) {
1514 v = aa + bs2 * (adiag[i + 1] + 1);
1515 vi = aj + adiag[i + 1] + 1;
1516 nz = adiag[i] - adiag[i + 1] - 1;
1517 PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1518 for (m = 0; m < nz; m++) {
1519 PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1520 v += bs2;
1521 }
1522 PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1523 PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1524 }
1525 PetscCall(ISRestoreIndices(isrow, &rout));
1526 PetscCall(ISRestoreIndices(iscol, &cout));
1527 PetscCall(VecRestoreArrayRead(bb, &b));
1528 PetscCall(VecRestoreArray(xx, &x));
1529 PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1530 PetscFunctionReturn(PETSC_SUCCESS);
1531 }
1532