1 /*
2 Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6
7 /*
8 Version for when blocks are 4 by 4
9 */
MatILUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo * info)10 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info)
11 {
12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13 IS isrow = b->row, isicol = b->icol;
14 const PetscInt *r, *ic;
15 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16 PetscInt *ajtmpold, *ajtmp, nz, row;
17 const PetscInt *diag_offset;
18 PetscInt idx, *ai = a->i, *aj = a->j, *pj;
19 MatScalar *pv, *v, *rtmp, *pc, *w, *x;
20 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
21 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
22 MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
23 MatScalar m13, m14, m15, m16;
24 MatScalar *ba = b->a, *aa = a->a;
25 PetscBool pivotinblocks = b->pivotinblocks;
26 PetscReal shift = info->shiftamount;
27 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
28
29 PetscFunctionBegin;
30 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
31 A->factortype = MAT_FACTOR_NONE;
32 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
33 A->factortype = MAT_FACTOR_ILU;
34 PetscCall(ISGetIndices(isrow, &r));
35 PetscCall(ISGetIndices(isicol, &ic));
36 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
37 allowzeropivot = PetscNot(A->erroriffailure);
38
39 for (i = 0; i < n; i++) {
40 nz = bi[i + 1] - bi[i];
41 ajtmp = bj + bi[i];
42 for (j = 0; j < nz; j++) {
43 x = rtmp + 16 * ajtmp[j];
44 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
45 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
46 }
47 /* load in initial (unfactored row) */
48 idx = r[i];
49 nz = ai[idx + 1] - ai[idx];
50 ajtmpold = aj + ai[idx];
51 v = aa + 16 * ai[idx];
52 for (j = 0; j < nz; j++) {
53 x = rtmp + 16 * ic[ajtmpold[j]];
54 x[0] = v[0];
55 x[1] = v[1];
56 x[2] = v[2];
57 x[3] = v[3];
58 x[4] = v[4];
59 x[5] = v[5];
60 x[6] = v[6];
61 x[7] = v[7];
62 x[8] = v[8];
63 x[9] = v[9];
64 x[10] = v[10];
65 x[11] = v[11];
66 x[12] = v[12];
67 x[13] = v[13];
68 x[14] = v[14];
69 x[15] = v[15];
70 v += 16;
71 }
72 row = *ajtmp++;
73 while (row < i) {
74 pc = rtmp + 16 * row;
75 p1 = pc[0];
76 p2 = pc[1];
77 p3 = pc[2];
78 p4 = pc[3];
79 p5 = pc[4];
80 p6 = pc[5];
81 p7 = pc[6];
82 p8 = pc[7];
83 p9 = pc[8];
84 p10 = pc[9];
85 p11 = pc[10];
86 p12 = pc[11];
87 p13 = pc[12];
88 p14 = pc[13];
89 p15 = pc[14];
90 p16 = pc[15];
91 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
92 pv = ba + 16 * diag_offset[row];
93 pj = bj + diag_offset[row] + 1;
94 x1 = pv[0];
95 x2 = pv[1];
96 x3 = pv[2];
97 x4 = pv[3];
98 x5 = pv[4];
99 x6 = pv[5];
100 x7 = pv[6];
101 x8 = pv[7];
102 x9 = pv[8];
103 x10 = pv[9];
104 x11 = pv[10];
105 x12 = pv[11];
106 x13 = pv[12];
107 x14 = pv[13];
108 x15 = pv[14];
109 x16 = pv[15];
110 pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
111 pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
112 pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
113 pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
114
115 pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
116 pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
117 pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
118 pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
119
120 pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
121 pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
122 pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
123 pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
124
125 pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
126 pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
127 pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
128 pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
129
130 nz = bi[row + 1] - diag_offset[row] - 1;
131 pv += 16;
132 for (j = 0; j < nz; j++) {
133 x1 = pv[0];
134 x2 = pv[1];
135 x3 = pv[2];
136 x4 = pv[3];
137 x5 = pv[4];
138 x6 = pv[5];
139 x7 = pv[6];
140 x8 = pv[7];
141 x9 = pv[8];
142 x10 = pv[9];
143 x11 = pv[10];
144 x12 = pv[11];
145 x13 = pv[12];
146 x14 = pv[13];
147 x15 = pv[14];
148 x16 = pv[15];
149 x = rtmp + 16 * pj[j];
150 x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
151 x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
152 x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
153 x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
154
155 x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
156 x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
157 x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
158 x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
159
160 x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
161 x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
162 x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
163 x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
164
165 x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
166 x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
167 x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
168 x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
169
170 pv += 16;
171 }
172 PetscCall(PetscLogFlops(128.0 * nz + 112.0));
173 }
174 row = *ajtmp++;
175 }
176 /* finished row so stick it into b->a */
177 pv = ba + 16 * bi[i];
178 pj = bj + bi[i];
179 nz = bi[i + 1] - bi[i];
180 for (j = 0; j < nz; j++) {
181 x = rtmp + 16 * pj[j];
182 pv[0] = x[0];
183 pv[1] = x[1];
184 pv[2] = x[2];
185 pv[3] = x[3];
186 pv[4] = x[4];
187 pv[5] = x[5];
188 pv[6] = x[6];
189 pv[7] = x[7];
190 pv[8] = x[8];
191 pv[9] = x[9];
192 pv[10] = x[10];
193 pv[11] = x[11];
194 pv[12] = x[12];
195 pv[13] = x[13];
196 pv[14] = x[14];
197 pv[15] = x[15];
198 pv += 16;
199 }
200 /* invert diagonal block */
201 w = ba + 16 * diag_offset[i];
202 if (pivotinblocks) {
203 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
204 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
205 } else {
206 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
207 }
208 }
209
210 PetscCall(PetscFree(rtmp));
211 PetscCall(ISRestoreIndices(isicol, &ic));
212 PetscCall(ISRestoreIndices(isrow, &r));
213
214 C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
215 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
216 C->assembled = PETSC_TRUE;
217
218 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
219 PetscFunctionReturn(PETSC_SUCCESS);
220 }
221
222 /* MatLUFactorNumeric_SeqBAIJ_4 -
223 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
224 PetscKernel_A_gets_A_times_B()
225 PetscKernel_A_gets_A_minus_B_times_C()
226 PetscKernel_A_gets_inverse_A()
227 */
228
MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo * info)229 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info)
230 {
231 Mat C = B;
232 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
233 IS isrow = b->row, isicol = b->icol;
234 const PetscInt *r, *ic;
235 PetscInt i, j, k, nz, nzL, row;
236 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
237 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
238 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
239 PetscInt flg;
240 PetscReal shift;
241 PetscBool allowzeropivot, zeropivotdetected;
242
243 PetscFunctionBegin;
244 allowzeropivot = PetscNot(A->erroriffailure);
245 PetscCall(ISGetIndices(isrow, &r));
246 PetscCall(ISGetIndices(isicol, &ic));
247
248 if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
249 shift = 0;
250 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
251 shift = info->shiftamount;
252 }
253
254 /* generate work space needed by the factorization */
255 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
256 PetscCall(PetscArrayzero(rtmp, bs2 * n));
257
258 for (i = 0; i < n; i++) {
259 /* zero rtmp */
260 /* L part */
261 nz = bi[i + 1] - bi[i];
262 bjtmp = bj + bi[i];
263 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
264
265 /* U part */
266 nz = bdiag[i] - bdiag[i + 1];
267 bjtmp = bj + bdiag[i + 1] + 1;
268 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
269
270 /* load in initial (unfactored row) */
271 nz = ai[r[i] + 1] - ai[r[i]];
272 ajtmp = aj + ai[r[i]];
273 v = aa + bs2 * ai[r[i]];
274 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
275
276 /* elimination */
277 bjtmp = bj + bi[i];
278 nzL = bi[i + 1] - bi[i];
279 for (k = 0; k < nzL; k++) {
280 row = bjtmp[k];
281 pc = rtmp + bs2 * row;
282 for (flg = 0, j = 0; j < bs2; j++) {
283 if (pc[j] != 0.0) {
284 flg = 1;
285 break;
286 }
287 }
288 if (flg) {
289 pv = b->a + bs2 * bdiag[row];
290 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
291 PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
292
293 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
294 pv = b->a + bs2 * (bdiag[row + 1] + 1);
295 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
296 for (j = 0; j < nz; j++) {
297 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
298 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
299 v = rtmp + bs2 * pj[j];
300 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
301 pv += bs2;
302 }
303 PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
304 }
305 }
306
307 /* finished row so stick it into b->a */
308 /* L part */
309 pv = b->a + bs2 * bi[i];
310 pj = b->j + bi[i];
311 nz = bi[i + 1] - bi[i];
312 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
313
314 /* Mark diagonal and invert diagonal for simpler triangular solves */
315 pv = b->a + bs2 * bdiag[i];
316 pj = b->j + bdiag[i];
317 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
318 PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
319 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
320
321 /* U part */
322 pv = b->a + bs2 * (bdiag[i + 1] + 1);
323 pj = b->j + bdiag[i + 1] + 1;
324 nz = bdiag[i] - bdiag[i + 1] - 1;
325 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
326 }
327
328 PetscCall(PetscFree2(rtmp, mwork));
329 PetscCall(ISRestoreIndices(isicol, &ic));
330 PetscCall(ISRestoreIndices(isrow, &r));
331
332 C->ops->solve = MatSolve_SeqBAIJ_4;
333 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
334 C->assembled = PETSC_TRUE;
335
336 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
337 PetscFunctionReturn(PETSC_SUCCESS);
338 }
339
MatILUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo * info)340 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
341 {
342 /*
343 Default Version for when blocks are 4 by 4 Using natural ordering
344 */
345 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
346 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
347 PetscInt *ajtmpold, *ajtmp, nz, row;
348 const PetscInt *diag_offset;
349 PetscInt *ai = a->i, *aj = a->j, *pj;
350 MatScalar *pv, *v, *rtmp, *pc, *w, *x;
351 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
352 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
353 MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12;
354 MatScalar m13, m14, m15, m16;
355 MatScalar *ba = b->a, *aa = a->a;
356 PetscBool pivotinblocks = b->pivotinblocks;
357 PetscReal shift = info->shiftamount;
358 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;
359
360 PetscFunctionBegin;
361 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
362 A->factortype = MAT_FACTOR_NONE;
363 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
364 A->factortype = MAT_FACTOR_ILU;
365 allowzeropivot = PetscNot(A->erroriffailure);
366 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
367
368 for (i = 0; i < n; i++) {
369 nz = bi[i + 1] - bi[i];
370 ajtmp = bj + bi[i];
371 for (j = 0; j < nz; j++) {
372 x = rtmp + 16 * ajtmp[j];
373 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
374 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
375 }
376 /* load in initial (unfactored row) */
377 nz = ai[i + 1] - ai[i];
378 ajtmpold = aj + ai[i];
379 v = aa + 16 * ai[i];
380 for (j = 0; j < nz; j++) {
381 x = rtmp + 16 * ajtmpold[j];
382 x[0] = v[0];
383 x[1] = v[1];
384 x[2] = v[2];
385 x[3] = v[3];
386 x[4] = v[4];
387 x[5] = v[5];
388 x[6] = v[6];
389 x[7] = v[7];
390 x[8] = v[8];
391 x[9] = v[9];
392 x[10] = v[10];
393 x[11] = v[11];
394 x[12] = v[12];
395 x[13] = v[13];
396 x[14] = v[14];
397 x[15] = v[15];
398 v += 16;
399 }
400 row = *ajtmp++;
401 while (row < i) {
402 pc = rtmp + 16 * row;
403 p1 = pc[0];
404 p2 = pc[1];
405 p3 = pc[2];
406 p4 = pc[3];
407 p5 = pc[4];
408 p6 = pc[5];
409 p7 = pc[6];
410 p8 = pc[7];
411 p9 = pc[8];
412 p10 = pc[9];
413 p11 = pc[10];
414 p12 = pc[11];
415 p13 = pc[12];
416 p14 = pc[13];
417 p15 = pc[14];
418 p16 = pc[15];
419 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) {
420 pv = ba + 16 * diag_offset[row];
421 pj = bj + diag_offset[row] + 1;
422 x1 = pv[0];
423 x2 = pv[1];
424 x3 = pv[2];
425 x4 = pv[3];
426 x5 = pv[4];
427 x6 = pv[5];
428 x7 = pv[6];
429 x8 = pv[7];
430 x9 = pv[8];
431 x10 = pv[9];
432 x11 = pv[10];
433 x12 = pv[11];
434 x13 = pv[12];
435 x14 = pv[13];
436 x15 = pv[14];
437 x16 = pv[15];
438 pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4;
439 pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4;
440 pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4;
441 pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4;
442
443 pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8;
444 pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8;
445 pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8;
446 pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8;
447
448 pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12;
449 pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12;
450 pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12;
451 pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12;
452
453 pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16;
454 pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16;
455 pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16;
456 pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16;
457 nz = bi[row + 1] - diag_offset[row] - 1;
458 pv += 16;
459 for (j = 0; j < nz; j++) {
460 x1 = pv[0];
461 x2 = pv[1];
462 x3 = pv[2];
463 x4 = pv[3];
464 x5 = pv[4];
465 x6 = pv[5];
466 x7 = pv[6];
467 x8 = pv[7];
468 x9 = pv[8];
469 x10 = pv[9];
470 x11 = pv[10];
471 x12 = pv[11];
472 x13 = pv[12];
473 x14 = pv[13];
474 x15 = pv[14];
475 x16 = pv[15];
476 x = rtmp + 16 * pj[j];
477 x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4;
478 x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4;
479 x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4;
480 x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4;
481
482 x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8;
483 x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8;
484 x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8;
485 x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8;
486
487 x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12;
488 x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12;
489 x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12;
490 x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12;
491
492 x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16;
493 x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16;
494 x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16;
495 x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16;
496
497 pv += 16;
498 }
499 PetscCall(PetscLogFlops(128.0 * nz + 112.0));
500 }
501 row = *ajtmp++;
502 }
503 /* finished row so stick it into b->a */
504 pv = ba + 16 * bi[i];
505 pj = bj + bi[i];
506 nz = bi[i + 1] - bi[i];
507 for (j = 0; j < nz; j++) {
508 x = rtmp + 16 * pj[j];
509 pv[0] = x[0];
510 pv[1] = x[1];
511 pv[2] = x[2];
512 pv[3] = x[3];
513 pv[4] = x[4];
514 pv[5] = x[5];
515 pv[6] = x[6];
516 pv[7] = x[7];
517 pv[8] = x[8];
518 pv[9] = x[9];
519 pv[10] = x[10];
520 pv[11] = x[11];
521 pv[12] = x[12];
522 pv[13] = x[13];
523 pv[14] = x[14];
524 pv[15] = x[15];
525 pv += 16;
526 }
527 /* invert diagonal block */
528 w = ba + 16 * diag_offset[i];
529 if (pivotinblocks) {
530 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
531 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
532 } else {
533 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
534 }
535 }
536
537 PetscCall(PetscFree(rtmp));
538
539 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
540 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
541 C->assembled = PETSC_TRUE;
542
543 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */
544 PetscFunctionReturn(PETSC_SUCCESS);
545 }
546
547 /*
548 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
549 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
550 */
MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo * info)551 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
552 {
553 Mat C = B;
554 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
555 PetscInt i, j, k, nz, nzL, row;
556 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
557 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
558 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
559 PetscInt flg;
560 PetscReal shift;
561 PetscBool allowzeropivot, zeropivotdetected;
562
563 PetscFunctionBegin;
564 allowzeropivot = PetscNot(A->erroriffailure);
565
566 /* generate work space needed by the factorization */
567 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
568 PetscCall(PetscArrayzero(rtmp, bs2 * n));
569
570 if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
571 shift = 0;
572 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
573 shift = info->shiftamount;
574 }
575
576 for (i = 0; i < n; i++) {
577 /* zero rtmp */
578 /* L part */
579 nz = bi[i + 1] - bi[i];
580 bjtmp = bj + bi[i];
581 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
582
583 /* U part */
584 nz = bdiag[i] - bdiag[i + 1];
585 bjtmp = bj + bdiag[i + 1] + 1;
586 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
587
588 /* load in initial (unfactored row) */
589 nz = ai[i + 1] - ai[i];
590 ajtmp = aj + ai[i];
591 v = aa + bs2 * ai[i];
592 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
593
594 /* elimination */
595 bjtmp = bj + bi[i];
596 nzL = bi[i + 1] - bi[i];
597 for (k = 0; k < nzL; k++) {
598 row = bjtmp[k];
599 pc = rtmp + bs2 * row;
600 for (flg = 0, j = 0; j < bs2; j++) {
601 if (pc[j] != 0.0) {
602 flg = 1;
603 break;
604 }
605 }
606 if (flg) {
607 pv = b->a + bs2 * bdiag[row];
608 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
609 PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));
610
611 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
612 pv = b->a + bs2 * (bdiag[row + 1] + 1);
613 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
614 for (j = 0; j < nz; j++) {
615 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
616 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
617 v = rtmp + bs2 * pj[j];
618 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
619 pv += bs2;
620 }
621 PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
622 }
623 }
624
625 /* finished row so stick it into b->a */
626 /* L part */
627 pv = b->a + bs2 * bi[i];
628 pj = b->j + bi[i];
629 nz = bi[i + 1] - bi[i];
630 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
631
632 /* Mark diagonal and invert diagonal for simpler triangular solves */
633 pv = b->a + bs2 * bdiag[i];
634 pj = b->j + bdiag[i];
635 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
636 PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
637 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
638
639 /* U part */
640 pv = b->a + bs2 * (bdiag[i + 1] + 1);
641 pj = b->j + bdiag[i + 1] + 1;
642 nz = bdiag[i] - bdiag[i + 1] - 1;
643 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
644 }
645 PetscCall(PetscFree2(rtmp, mwork));
646
647 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
648 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
649 C->assembled = PETSC_TRUE;
650
651 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
652 PetscFunctionReturn(PETSC_SUCCESS);
653 }
654