1 /*
2 Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6
7 /*
8 Version for when blocks are 3 by 3
9 */
MatILUFactorNumeric_SeqBAIJ_3_inplace(Mat C,Mat A,const MatFactorInfo * info)10 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info)
11 {
12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13 IS isrow = b->row, isicol = b->icol;
14 const PetscInt *r, *ic;
15 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16 PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
17 const PetscInt *diag_offset;
18 PetscInt idx, *pj;
19 MatScalar *pv, *v, *rtmp, *pc, *w, *x;
20 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
21 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
22 MatScalar *ba = b->a, *aa = a->a;
23 PetscReal shift = info->shiftamount;
24 PetscBool allowzeropivot, zeropivotdetected;
25
26 PetscFunctionBegin;
27 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
28 A->factortype = MAT_FACTOR_NONE;
29 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
30 A->factortype = MAT_FACTOR_ILU;
31 PetscCall(ISGetIndices(isrow, &r));
32 PetscCall(ISGetIndices(isicol, &ic));
33 PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
34 allowzeropivot = PetscNot(A->erroriffailure);
35
36 for (i = 0; i < n; i++) {
37 nz = bi[i + 1] - bi[i];
38 ajtmp = bj + bi[i];
39 for (j = 0; j < nz; j++) {
40 x = rtmp + 9 * ajtmp[j];
41 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
42 }
43 /* load in initial (unfactored row) */
44 idx = r[i];
45 nz = ai[idx + 1] - ai[idx];
46 ajtmpold = aj + ai[idx];
47 v = aa + 9 * ai[idx];
48 for (j = 0; j < nz; j++) {
49 x = rtmp + 9 * ic[ajtmpold[j]];
50 x[0] = v[0];
51 x[1] = v[1];
52 x[2] = v[2];
53 x[3] = v[3];
54 x[4] = v[4];
55 x[5] = v[5];
56 x[6] = v[6];
57 x[7] = v[7];
58 x[8] = v[8];
59 v += 9;
60 }
61 row = *ajtmp++;
62 while (row < i) {
63 pc = rtmp + 9 * row;
64 p1 = pc[0];
65 p2 = pc[1];
66 p3 = pc[2];
67 p4 = pc[3];
68 p5 = pc[4];
69 p6 = pc[5];
70 p7 = pc[6];
71 p8 = pc[7];
72 p9 = pc[8];
73 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
74 pv = ba + 9 * diag_offset[row];
75 pj = bj + diag_offset[row] + 1;
76 x1 = pv[0];
77 x2 = pv[1];
78 x3 = pv[2];
79 x4 = pv[3];
80 x5 = pv[4];
81 x6 = pv[5];
82 x7 = pv[6];
83 x8 = pv[7];
84 x9 = pv[8];
85 pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
86 pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
87 pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
88
89 pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
90 pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
91 pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
92
93 pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
94 pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
95 pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
96 nz = bi[row + 1] - diag_offset[row] - 1;
97 pv += 9;
98 for (j = 0; j < nz; j++) {
99 x1 = pv[0];
100 x2 = pv[1];
101 x3 = pv[2];
102 x4 = pv[3];
103 x5 = pv[4];
104 x6 = pv[5];
105 x7 = pv[6];
106 x8 = pv[7];
107 x9 = pv[8];
108 x = rtmp + 9 * pj[j];
109 x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
110 x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
111 x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
112
113 x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
114 x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
115 x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
116
117 x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
118 x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
119 x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
120 pv += 9;
121 }
122 PetscCall(PetscLogFlops(54.0 * nz + 36.0));
123 }
124 row = *ajtmp++;
125 }
126 /* finished row so stick it into b->a */
127 pv = ba + 9 * bi[i];
128 pj = bj + bi[i];
129 nz = bi[i + 1] - bi[i];
130 for (j = 0; j < nz; j++) {
131 x = rtmp + 9 * pj[j];
132 pv[0] = x[0];
133 pv[1] = x[1];
134 pv[2] = x[2];
135 pv[3] = x[3];
136 pv[4] = x[4];
137 pv[5] = x[5];
138 pv[6] = x[6];
139 pv[7] = x[7];
140 pv[8] = x[8];
141 pv += 9;
142 }
143 /* invert diagonal block */
144 w = ba + 9 * diag_offset[i];
145 PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
146 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
147 }
148
149 PetscCall(PetscFree(rtmp));
150 PetscCall(ISRestoreIndices(isicol, &ic));
151 PetscCall(ISRestoreIndices(isrow, &r));
152
153 C->ops->solve = MatSolve_SeqBAIJ_3_inplace;
154 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace;
155 C->assembled = PETSC_TRUE;
156
157 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
158 PetscFunctionReturn(PETSC_SUCCESS);
159 }
160
161 /* MatLUFactorNumeric_SeqBAIJ_3 -
162 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
163 PetscKernel_A_gets_A_times_B()
164 PetscKernel_A_gets_A_minus_B_times_C()
165 PetscKernel_A_gets_inverse_A()
166 */
MatLUFactorNumeric_SeqBAIJ_3(Mat B,Mat A,const MatFactorInfo * info)167 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info)
168 {
169 Mat C = B;
170 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
171 IS isrow = b->row, isicol = b->icol;
172 const PetscInt *r, *ic;
173 PetscInt i, j, k, nz, nzL, row;
174 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
175 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
176 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
177 PetscInt flg;
178 PetscReal shift = info->shiftamount;
179 PetscBool allowzeropivot, zeropivotdetected;
180
181 PetscFunctionBegin;
182 PetscCall(ISGetIndices(isrow, &r));
183 PetscCall(ISGetIndices(isicol, &ic));
184 allowzeropivot = PetscNot(A->erroriffailure);
185
186 /* generate work space needed by the factorization */
187 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
188 PetscCall(PetscArrayzero(rtmp, bs2 * n));
189
190 for (i = 0; i < n; i++) {
191 /* zero rtmp */
192 /* L part */
193 nz = bi[i + 1] - bi[i];
194 bjtmp = bj + bi[i];
195 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
196
197 /* U part */
198 nz = bdiag[i] - bdiag[i + 1];
199 bjtmp = bj + bdiag[i + 1] + 1;
200 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
201
202 /* load in initial (unfactored row) */
203 nz = ai[r[i] + 1] - ai[r[i]];
204 ajtmp = aj + ai[r[i]];
205 v = aa + bs2 * ai[r[i]];
206 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
207
208 /* elimination */
209 bjtmp = bj + bi[i];
210 nzL = bi[i + 1] - bi[i];
211 for (k = 0; k < nzL; k++) {
212 row = bjtmp[k];
213 pc = rtmp + bs2 * row;
214 for (flg = 0, j = 0; j < bs2; j++) {
215 if (pc[j] != 0.0) {
216 flg = 1;
217 break;
218 }
219 }
220 if (flg) {
221 pv = b->a + bs2 * bdiag[row];
222 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
223 PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
224
225 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
226 pv = b->a + bs2 * (bdiag[row + 1] + 1);
227 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
228 for (j = 0; j < nz; j++) {
229 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
230 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
231 v = rtmp + bs2 * pj[j];
232 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
233 pv += bs2;
234 }
235 PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
236 }
237 }
238
239 /* finished row so stick it into b->a */
240 /* L part */
241 pv = b->a + bs2 * bi[i];
242 pj = b->j + bi[i];
243 nz = bi[i + 1] - bi[i];
244 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
245
246 /* Mark diagonal and invert diagonal for simpler triangular solves */
247 pv = b->a + bs2 * bdiag[i];
248 pj = b->j + bdiag[i];
249 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
250 PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
251 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
252
253 /* U part */
254 pj = b->j + bdiag[i + 1] + 1;
255 pv = b->a + bs2 * (bdiag[i + 1] + 1);
256 nz = bdiag[i] - bdiag[i + 1] - 1;
257 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
258 }
259
260 PetscCall(PetscFree2(rtmp, mwork));
261 PetscCall(ISRestoreIndices(isicol, &ic));
262 PetscCall(ISRestoreIndices(isrow, &r));
263
264 C->ops->solve = MatSolve_SeqBAIJ_3;
265 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
266 C->assembled = PETSC_TRUE;
267
268 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
269 PetscFunctionReturn(PETSC_SUCCESS);
270 }
271
MatILUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo * info)272 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
273 {
274 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
275 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
276 PetscInt *ajtmpold, *ajtmp, nz, row;
277 const PetscInt *diag_offset;
278 PetscInt *ai = a->i, *aj = a->j, *pj;
279 MatScalar *pv, *v, *rtmp, *pc, *w, *x;
280 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
281 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
282 MatScalar *ba = b->a, *aa = a->a;
283 PetscReal shift = info->shiftamount;
284 PetscBool allowzeropivot, zeropivotdetected;
285
286 PetscFunctionBegin;
287 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
288 A->factortype = MAT_FACTOR_NONE;
289 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
290 A->factortype = MAT_FACTOR_ILU;
291 PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
292 allowzeropivot = PetscNot(A->erroriffailure);
293
294 for (i = 0; i < n; i++) {
295 nz = bi[i + 1] - bi[i];
296 ajtmp = bj + bi[i];
297 for (j = 0; j < nz; j++) {
298 x = rtmp + 9 * ajtmp[j];
299 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
300 }
301 /* load in initial (unfactored row) */
302 nz = ai[i + 1] - ai[i];
303 ajtmpold = aj + ai[i];
304 v = aa + 9 * ai[i];
305 for (j = 0; j < nz; j++) {
306 x = rtmp + 9 * ajtmpold[j];
307 x[0] = v[0];
308 x[1] = v[1];
309 x[2] = v[2];
310 x[3] = v[3];
311 x[4] = v[4];
312 x[5] = v[5];
313 x[6] = v[6];
314 x[7] = v[7];
315 x[8] = v[8];
316 v += 9;
317 }
318 row = *ajtmp++;
319 while (row < i) {
320 pc = rtmp + 9 * row;
321 p1 = pc[0];
322 p2 = pc[1];
323 p3 = pc[2];
324 p4 = pc[3];
325 p5 = pc[4];
326 p6 = pc[5];
327 p7 = pc[6];
328 p8 = pc[7];
329 p9 = pc[8];
330 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
331 pv = ba + 9 * diag_offset[row];
332 pj = bj + diag_offset[row] + 1;
333 x1 = pv[0];
334 x2 = pv[1];
335 x3 = pv[2];
336 x4 = pv[3];
337 x5 = pv[4];
338 x6 = pv[5];
339 x7 = pv[6];
340 x8 = pv[7];
341 x9 = pv[8];
342 pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
343 pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
344 pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
345
346 pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
347 pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
348 pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
349
350 pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
351 pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
352 pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
353
354 nz = bi[row + 1] - diag_offset[row] - 1;
355 pv += 9;
356 for (j = 0; j < nz; j++) {
357 x1 = pv[0];
358 x2 = pv[1];
359 x3 = pv[2];
360 x4 = pv[3];
361 x5 = pv[4];
362 x6 = pv[5];
363 x7 = pv[6];
364 x8 = pv[7];
365 x9 = pv[8];
366 x = rtmp + 9 * pj[j];
367 x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
368 x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
369 x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
370
371 x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
372 x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
373 x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
374
375 x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
376 x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
377 x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
378 pv += 9;
379 }
380 PetscCall(PetscLogFlops(54.0 * nz + 36.0));
381 }
382 row = *ajtmp++;
383 }
384 /* finished row so stick it into b->a */
385 pv = ba + 9 * bi[i];
386 pj = bj + bi[i];
387 nz = bi[i + 1] - bi[i];
388 for (j = 0; j < nz; j++) {
389 x = rtmp + 9 * pj[j];
390 pv[0] = x[0];
391 pv[1] = x[1];
392 pv[2] = x[2];
393 pv[3] = x[3];
394 pv[4] = x[4];
395 pv[5] = x[5];
396 pv[6] = x[6];
397 pv[7] = x[7];
398 pv[8] = x[8];
399 pv += 9;
400 }
401 /* invert diagonal block */
402 w = ba + 9 * diag_offset[i];
403 PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
404 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
405 }
406
407 PetscCall(PetscFree(rtmp));
408
409 C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace;
410 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace;
411 C->assembled = PETSC_TRUE;
412
413 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
414 PetscFunctionReturn(PETSC_SUCCESS);
415 }
416
417 /*
418 MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering -
419 copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace()
420 */
MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B,Mat A,const MatFactorInfo * info)421 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
422 {
423 Mat C = B;
424 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
425 PetscInt i, j, k, nz, nzL, row;
426 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
427 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
428 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
429 PetscInt flg;
430 PetscReal shift = info->shiftamount;
431 PetscBool allowzeropivot, zeropivotdetected;
432
433 PetscFunctionBegin;
434 allowzeropivot = PetscNot(A->erroriffailure);
435
436 /* generate work space needed by the factorization */
437 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
438 PetscCall(PetscArrayzero(rtmp, bs2 * n));
439
440 for (i = 0; i < n; i++) {
441 /* zero rtmp */
442 /* L part */
443 nz = bi[i + 1] - bi[i];
444 bjtmp = bj + bi[i];
445 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
446
447 /* U part */
448 nz = bdiag[i] - bdiag[i + 1];
449 bjtmp = bj + bdiag[i + 1] + 1;
450 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
451
452 /* load in initial (unfactored row) */
453 nz = ai[i + 1] - ai[i];
454 ajtmp = aj + ai[i];
455 v = aa + bs2 * ai[i];
456 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
457
458 /* elimination */
459 bjtmp = bj + bi[i];
460 nzL = bi[i + 1] - bi[i];
461 for (k = 0; k < nzL; k++) {
462 row = bjtmp[k];
463 pc = rtmp + bs2 * row;
464 for (flg = 0, j = 0; j < bs2; j++) {
465 if (pc[j] != 0.0) {
466 flg = 1;
467 break;
468 }
469 }
470 if (flg) {
471 pv = b->a + bs2 * bdiag[row];
472 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
473 PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
474
475 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
476 pv = b->a + bs2 * (bdiag[row + 1] + 1);
477 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
478 for (j = 0; j < nz; j++) {
479 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
480 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
481 v = rtmp + bs2 * pj[j];
482 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
483 pv += bs2;
484 }
485 PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
486 }
487 }
488
489 /* finished row so stick it into b->a */
490 /* L part */
491 pv = b->a + bs2 * bi[i];
492 pj = b->j + bi[i];
493 nz = bi[i + 1] - bi[i];
494 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
495
496 /* Mark diagonal and invert diagonal for simpler triangular solves */
497 pv = b->a + bs2 * bdiag[i];
498 pj = b->j + bdiag[i];
499 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
500 PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
501 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
502
503 /* U part */
504 pv = b->a + bs2 * (bdiag[i + 1] + 1);
505 pj = b->j + bdiag[i + 1] + 1;
506 nz = bdiag[i] - bdiag[i + 1] - 1;
507 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
508 }
509 PetscCall(PetscFree2(rtmp, mwork));
510
511 C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering;
512 C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_3_NaturalOrdering;
513 C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering;
514 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
515 C->assembled = PETSC_TRUE;
516
517 PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
518 PetscFunctionReturn(PETSC_SUCCESS);
519 }
520