1 /*
2 Factorization code for BAIJ format.
3 */
4 #include <../src/mat/impls/baij/seq/baij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6
7 /*
8 Version for when blocks are 6 by 6
9 */
MatILUFactorNumeric_SeqBAIJ_6_inplace(Mat C,Mat A,const MatFactorInfo * info)10 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_6_inplace(Mat C, Mat A, const MatFactorInfo *info)
11 {
12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
13 IS isrow = b->row, isicol = b->icol;
14 const PetscInt *ajtmpold, *ajtmp, *diag_offset = b->diag, *r, *ic, *bi = b->i, *bj = b->j, *ai = a->i, *aj = a->j, *pj;
15 PetscInt nz, row, i, j, n = a->mbs, idx;
16 MatScalar *pv, *v, *rtmp, *pc, *w, *x;
17 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
18 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16;
19 MatScalar x17, x18, x19, x20, x21, x22, x23, x24, x25, p10, p11, p12, p13, p14;
20 MatScalar p15, p16, p17, p18, p19, p20, p21, p22, p23, p24, p25, m10, m11, m12;
21 MatScalar m13, m14, m15, m16, m17, m18, m19, m20, m21, m22, m23, m24, m25;
22 MatScalar p26, p27, p28, p29, p30, p31, p32, p33, p34, p35, p36;
23 MatScalar x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36;
24 MatScalar m26, m27, m28, m29, m30, m31, m32, m33, m34, m35, m36;
25 MatScalar *ba = b->a, *aa = a->a;
26 PetscReal shift = info->shiftamount;
27 PetscBool allowzeropivot, zeropivotdetected;
28
29 PetscFunctionBegin;
30 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
31 A->factortype = MAT_FACTOR_NONE;
32 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
33 A->factortype = MAT_FACTOR_ILU;
34 allowzeropivot = PetscNot(A->erroriffailure);
35 PetscCall(ISGetIndices(isrow, &r));
36 PetscCall(ISGetIndices(isicol, &ic));
37 PetscCall(PetscMalloc1(36 * (n + 1), &rtmp));
38
39 for (i = 0; i < n; i++) {
40 nz = bi[i + 1] - bi[i];
41 ajtmp = bj + bi[i];
42 for (j = 0; j < nz; j++) {
43 x = rtmp + 36 * ajtmp[j];
44 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
45 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
46 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0;
47 x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0;
48 x[34] = x[35] = 0.0;
49 }
50 /* load in initial (unfactored row) */
51 idx = r[i];
52 nz = ai[idx + 1] - ai[idx];
53 ajtmpold = aj + ai[idx];
54 v = aa + 36 * ai[idx];
55 for (j = 0; j < nz; j++) {
56 x = rtmp + 36 * ic[ajtmpold[j]];
57 x[0] = v[0];
58 x[1] = v[1];
59 x[2] = v[2];
60 x[3] = v[3];
61 x[4] = v[4];
62 x[5] = v[5];
63 x[6] = v[6];
64 x[7] = v[7];
65 x[8] = v[8];
66 x[9] = v[9];
67 x[10] = v[10];
68 x[11] = v[11];
69 x[12] = v[12];
70 x[13] = v[13];
71 x[14] = v[14];
72 x[15] = v[15];
73 x[16] = v[16];
74 x[17] = v[17];
75 x[18] = v[18];
76 x[19] = v[19];
77 x[20] = v[20];
78 x[21] = v[21];
79 x[22] = v[22];
80 x[23] = v[23];
81 x[24] = v[24];
82 x[25] = v[25];
83 x[26] = v[26];
84 x[27] = v[27];
85 x[28] = v[28];
86 x[29] = v[29];
87 x[30] = v[30];
88 x[31] = v[31];
89 x[32] = v[32];
90 x[33] = v[33];
91 x[34] = v[34];
92 x[35] = v[35];
93 v += 36;
94 }
95 row = *ajtmp++;
96 while (row < i) {
97 pc = rtmp + 36 * row;
98 p1 = pc[0];
99 p2 = pc[1];
100 p3 = pc[2];
101 p4 = pc[3];
102 p5 = pc[4];
103 p6 = pc[5];
104 p7 = pc[6];
105 p8 = pc[7];
106 p9 = pc[8];
107 p10 = pc[9];
108 p11 = pc[10];
109 p12 = pc[11];
110 p13 = pc[12];
111 p14 = pc[13];
112 p15 = pc[14];
113 p16 = pc[15];
114 p17 = pc[16];
115 p18 = pc[17];
116 p19 = pc[18];
117 p20 = pc[19];
118 p21 = pc[20];
119 p22 = pc[21];
120 p23 = pc[22];
121 p24 = pc[23];
122 p25 = pc[24];
123 p26 = pc[25];
124 p27 = pc[26];
125 p28 = pc[27];
126 p29 = pc[28];
127 p30 = pc[29];
128 p31 = pc[30];
129 p32 = pc[31];
130 p33 = pc[32];
131 p34 = pc[33];
132 p35 = pc[34];
133 p36 = pc[35];
134 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) {
135 pv = ba + 36 * diag_offset[row];
136 pj = bj + diag_offset[row] + 1;
137 x1 = pv[0];
138 x2 = pv[1];
139 x3 = pv[2];
140 x4 = pv[3];
141 x5 = pv[4];
142 x6 = pv[5];
143 x7 = pv[6];
144 x8 = pv[7];
145 x9 = pv[8];
146 x10 = pv[9];
147 x11 = pv[10];
148 x12 = pv[11];
149 x13 = pv[12];
150 x14 = pv[13];
151 x15 = pv[14];
152 x16 = pv[15];
153 x17 = pv[16];
154 x18 = pv[17];
155 x19 = pv[18];
156 x20 = pv[19];
157 x21 = pv[20];
158 x22 = pv[21];
159 x23 = pv[22];
160 x24 = pv[23];
161 x25 = pv[24];
162 x26 = pv[25];
163 x27 = pv[26];
164 x28 = pv[27];
165 x29 = pv[28];
166 x30 = pv[29];
167 x31 = pv[30];
168 x32 = pv[31];
169 x33 = pv[32];
170 x34 = pv[33];
171 x35 = pv[34];
172 x36 = pv[35];
173 pc[0] = m1 = p1 * x1 + p7 * x2 + p13 * x3 + p19 * x4 + p25 * x5 + p31 * x6;
174 pc[1] = m2 = p2 * x1 + p8 * x2 + p14 * x3 + p20 * x4 + p26 * x5 + p32 * x6;
175 pc[2] = m3 = p3 * x1 + p9 * x2 + p15 * x3 + p21 * x4 + p27 * x5 + p33 * x6;
176 pc[3] = m4 = p4 * x1 + p10 * x2 + p16 * x3 + p22 * x4 + p28 * x5 + p34 * x6;
177 pc[4] = m5 = p5 * x1 + p11 * x2 + p17 * x3 + p23 * x4 + p29 * x5 + p35 * x6;
178 pc[5] = m6 = p6 * x1 + p12 * x2 + p18 * x3 + p24 * x4 + p30 * x5 + p36 * x6;
179
180 pc[6] = m7 = p1 * x7 + p7 * x8 + p13 * x9 + p19 * x10 + p25 * x11 + p31 * x12;
181 pc[7] = m8 = p2 * x7 + p8 * x8 + p14 * x9 + p20 * x10 + p26 * x11 + p32 * x12;
182 pc[8] = m9 = p3 * x7 + p9 * x8 + p15 * x9 + p21 * x10 + p27 * x11 + p33 * x12;
183 pc[9] = m10 = p4 * x7 + p10 * x8 + p16 * x9 + p22 * x10 + p28 * x11 + p34 * x12;
184 pc[10] = m11 = p5 * x7 + p11 * x8 + p17 * x9 + p23 * x10 + p29 * x11 + p35 * x12;
185 pc[11] = m12 = p6 * x7 + p12 * x8 + p18 * x9 + p24 * x10 + p30 * x11 + p36 * x12;
186
187 pc[12] = m13 = p1 * x13 + p7 * x14 + p13 * x15 + p19 * x16 + p25 * x17 + p31 * x18;
188 pc[13] = m14 = p2 * x13 + p8 * x14 + p14 * x15 + p20 * x16 + p26 * x17 + p32 * x18;
189 pc[14] = m15 = p3 * x13 + p9 * x14 + p15 * x15 + p21 * x16 + p27 * x17 + p33 * x18;
190 pc[15] = m16 = p4 * x13 + p10 * x14 + p16 * x15 + p22 * x16 + p28 * x17 + p34 * x18;
191 pc[16] = m17 = p5 * x13 + p11 * x14 + p17 * x15 + p23 * x16 + p29 * x17 + p35 * x18;
192 pc[17] = m18 = p6 * x13 + p12 * x14 + p18 * x15 + p24 * x16 + p30 * x17 + p36 * x18;
193
194 pc[18] = m19 = p1 * x19 + p7 * x20 + p13 * x21 + p19 * x22 + p25 * x23 + p31 * x24;
195 pc[19] = m20 = p2 * x19 + p8 * x20 + p14 * x21 + p20 * x22 + p26 * x23 + p32 * x24;
196 pc[20] = m21 = p3 * x19 + p9 * x20 + p15 * x21 + p21 * x22 + p27 * x23 + p33 * x24;
197 pc[21] = m22 = p4 * x19 + p10 * x20 + p16 * x21 + p22 * x22 + p28 * x23 + p34 * x24;
198 pc[22] = m23 = p5 * x19 + p11 * x20 + p17 * x21 + p23 * x22 + p29 * x23 + p35 * x24;
199 pc[23] = m24 = p6 * x19 + p12 * x20 + p18 * x21 + p24 * x22 + p30 * x23 + p36 * x24;
200
201 pc[24] = m25 = p1 * x25 + p7 * x26 + p13 * x27 + p19 * x28 + p25 * x29 + p31 * x30;
202 pc[25] = m26 = p2 * x25 + p8 * x26 + p14 * x27 + p20 * x28 + p26 * x29 + p32 * x30;
203 pc[26] = m27 = p3 * x25 + p9 * x26 + p15 * x27 + p21 * x28 + p27 * x29 + p33 * x30;
204 pc[27] = m28 = p4 * x25 + p10 * x26 + p16 * x27 + p22 * x28 + p28 * x29 + p34 * x30;
205 pc[28] = m29 = p5 * x25 + p11 * x26 + p17 * x27 + p23 * x28 + p29 * x29 + p35 * x30;
206 pc[29] = m30 = p6 * x25 + p12 * x26 + p18 * x27 + p24 * x28 + p30 * x29 + p36 * x30;
207
208 pc[30] = m31 = p1 * x31 + p7 * x32 + p13 * x33 + p19 * x34 + p25 * x35 + p31 * x36;
209 pc[31] = m32 = p2 * x31 + p8 * x32 + p14 * x33 + p20 * x34 + p26 * x35 + p32 * x36;
210 pc[32] = m33 = p3 * x31 + p9 * x32 + p15 * x33 + p21 * x34 + p27 * x35 + p33 * x36;
211 pc[33] = m34 = p4 * x31 + p10 * x32 + p16 * x33 + p22 * x34 + p28 * x35 + p34 * x36;
212 pc[34] = m35 = p5 * x31 + p11 * x32 + p17 * x33 + p23 * x34 + p29 * x35 + p35 * x36;
213 pc[35] = m36 = p6 * x31 + p12 * x32 + p18 * x33 + p24 * x34 + p30 * x35 + p36 * x36;
214
215 nz = bi[row + 1] - diag_offset[row] - 1;
216 pv += 36;
217 for (j = 0; j < nz; j++) {
218 x1 = pv[0];
219 x2 = pv[1];
220 x3 = pv[2];
221 x4 = pv[3];
222 x5 = pv[4];
223 x6 = pv[5];
224 x7 = pv[6];
225 x8 = pv[7];
226 x9 = pv[8];
227 x10 = pv[9];
228 x11 = pv[10];
229 x12 = pv[11];
230 x13 = pv[12];
231 x14 = pv[13];
232 x15 = pv[14];
233 x16 = pv[15];
234 x17 = pv[16];
235 x18 = pv[17];
236 x19 = pv[18];
237 x20 = pv[19];
238 x21 = pv[20];
239 x22 = pv[21];
240 x23 = pv[22];
241 x24 = pv[23];
242 x25 = pv[24];
243 x26 = pv[25];
244 x27 = pv[26];
245 x28 = pv[27];
246 x29 = pv[28];
247 x30 = pv[29];
248 x31 = pv[30];
249 x32 = pv[31];
250 x33 = pv[32];
251 x34 = pv[33];
252 x35 = pv[34];
253 x36 = pv[35];
254 x = rtmp + 36 * pj[j];
255 x[0] -= m1 * x1 + m7 * x2 + m13 * x3 + m19 * x4 + m25 * x5 + m31 * x6;
256 x[1] -= m2 * x1 + m8 * x2 + m14 * x3 + m20 * x4 + m26 * x5 + m32 * x6;
257 x[2] -= m3 * x1 + m9 * x2 + m15 * x3 + m21 * x4 + m27 * x5 + m33 * x6;
258 x[3] -= m4 * x1 + m10 * x2 + m16 * x3 + m22 * x4 + m28 * x5 + m34 * x6;
259 x[4] -= m5 * x1 + m11 * x2 + m17 * x3 + m23 * x4 + m29 * x5 + m35 * x6;
260 x[5] -= m6 * x1 + m12 * x2 + m18 * x3 + m24 * x4 + m30 * x5 + m36 * x6;
261
262 x[6] -= m1 * x7 + m7 * x8 + m13 * x9 + m19 * x10 + m25 * x11 + m31 * x12;
263 x[7] -= m2 * x7 + m8 * x8 + m14 * x9 + m20 * x10 + m26 * x11 + m32 * x12;
264 x[8] -= m3 * x7 + m9 * x8 + m15 * x9 + m21 * x10 + m27 * x11 + m33 * x12;
265 x[9] -= m4 * x7 + m10 * x8 + m16 * x9 + m22 * x10 + m28 * x11 + m34 * x12;
266 x[10] -= m5 * x7 + m11 * x8 + m17 * x9 + m23 * x10 + m29 * x11 + m35 * x12;
267 x[11] -= m6 * x7 + m12 * x8 + m18 * x9 + m24 * x10 + m30 * x11 + m36 * x12;
268
269 x[12] -= m1 * x13 + m7 * x14 + m13 * x15 + m19 * x16 + m25 * x17 + m31 * x18;
270 x[13] -= m2 * x13 + m8 * x14 + m14 * x15 + m20 * x16 + m26 * x17 + m32 * x18;
271 x[14] -= m3 * x13 + m9 * x14 + m15 * x15 + m21 * x16 + m27 * x17 + m33 * x18;
272 x[15] -= m4 * x13 + m10 * x14 + m16 * x15 + m22 * x16 + m28 * x17 + m34 * x18;
273 x[16] -= m5 * x13 + m11 * x14 + m17 * x15 + m23 * x16 + m29 * x17 + m35 * x18;
274 x[17] -= m6 * x13 + m12 * x14 + m18 * x15 + m24 * x16 + m30 * x17 + m36 * x18;
275
276 x[18] -= m1 * x19 + m7 * x20 + m13 * x21 + m19 * x22 + m25 * x23 + m31 * x24;
277 x[19] -= m2 * x19 + m8 * x20 + m14 * x21 + m20 * x22 + m26 * x23 + m32 * x24;
278 x[20] -= m3 * x19 + m9 * x20 + m15 * x21 + m21 * x22 + m27 * x23 + m33 * x24;
279 x[21] -= m4 * x19 + m10 * x20 + m16 * x21 + m22 * x22 + m28 * x23 + m34 * x24;
280 x[22] -= m5 * x19 + m11 * x20 + m17 * x21 + m23 * x22 + m29 * x23 + m35 * x24;
281 x[23] -= m6 * x19 + m12 * x20 + m18 * x21 + m24 * x22 + m30 * x23 + m36 * x24;
282
283 x[24] -= m1 * x25 + m7 * x26 + m13 * x27 + m19 * x28 + m25 * x29 + m31 * x30;
284 x[25] -= m2 * x25 + m8 * x26 + m14 * x27 + m20 * x28 + m26 * x29 + m32 * x30;
285 x[26] -= m3 * x25 + m9 * x26 + m15 * x27 + m21 * x28 + m27 * x29 + m33 * x30;
286 x[27] -= m4 * x25 + m10 * x26 + m16 * x27 + m22 * x28 + m28 * x29 + m34 * x30;
287 x[28] -= m5 * x25 + m11 * x26 + m17 * x27 + m23 * x28 + m29 * x29 + m35 * x30;
288 x[29] -= m6 * x25 + m12 * x26 + m18 * x27 + m24 * x28 + m30 * x29 + m36 * x30;
289
290 x[30] -= m1 * x31 + m7 * x32 + m13 * x33 + m19 * x34 + m25 * x35 + m31 * x36;
291 x[31] -= m2 * x31 + m8 * x32 + m14 * x33 + m20 * x34 + m26 * x35 + m32 * x36;
292 x[32] -= m3 * x31 + m9 * x32 + m15 * x33 + m21 * x34 + m27 * x35 + m33 * x36;
293 x[33] -= m4 * x31 + m10 * x32 + m16 * x33 + m22 * x34 + m28 * x35 + m34 * x36;
294 x[34] -= m5 * x31 + m11 * x32 + m17 * x33 + m23 * x34 + m29 * x35 + m35 * x36;
295 x[35] -= m6 * x31 + m12 * x32 + m18 * x33 + m24 * x34 + m30 * x35 + m36 * x36;
296
297 pv += 36;
298 }
299 PetscCall(PetscLogFlops(432.0 * nz + 396.0));
300 }
301 row = *ajtmp++;
302 }
303 /* finished row so stick it into b->a */
304 pv = ba + 36 * bi[i];
305 pj = bj + bi[i];
306 nz = bi[i + 1] - bi[i];
307 for (j = 0; j < nz; j++) {
308 x = rtmp + 36 * pj[j];
309 pv[0] = x[0];
310 pv[1] = x[1];
311 pv[2] = x[2];
312 pv[3] = x[3];
313 pv[4] = x[4];
314 pv[5] = x[5];
315 pv[6] = x[6];
316 pv[7] = x[7];
317 pv[8] = x[8];
318 pv[9] = x[9];
319 pv[10] = x[10];
320 pv[11] = x[11];
321 pv[12] = x[12];
322 pv[13] = x[13];
323 pv[14] = x[14];
324 pv[15] = x[15];
325 pv[16] = x[16];
326 pv[17] = x[17];
327 pv[18] = x[18];
328 pv[19] = x[19];
329 pv[20] = x[20];
330 pv[21] = x[21];
331 pv[22] = x[22];
332 pv[23] = x[23];
333 pv[24] = x[24];
334 pv[25] = x[25];
335 pv[26] = x[26];
336 pv[27] = x[27];
337 pv[28] = x[28];
338 pv[29] = x[29];
339 pv[30] = x[30];
340 pv[31] = x[31];
341 pv[32] = x[32];
342 pv[33] = x[33];
343 pv[34] = x[34];
344 pv[35] = x[35];
345 pv += 36;
346 }
347 /* invert diagonal block */
348 w = ba + 36 * diag_offset[i];
349 PetscCall(PetscKernel_A_gets_inverse_A_6(w, shift, allowzeropivot, &zeropivotdetected));
350 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
351 }
352
353 PetscCall(PetscFree(rtmp));
354 PetscCall(ISRestoreIndices(isicol, &ic));
355 PetscCall(ISRestoreIndices(isrow, &r));
356
357 C->ops->solve = MatSolve_SeqBAIJ_6_inplace;
358 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_inplace;
359 C->assembled = PETSC_TRUE;
360
361 PetscCall(PetscLogFlops(1.333333333333 * 6 * 6 * 6 * b->mbs)); /* from inverting diagonal blocks */
362 PetscFunctionReturn(PETSC_SUCCESS);
363 }
364
MatLUFactorNumeric_SeqBAIJ_6(Mat B,Mat A,const MatFactorInfo * info)365 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_6(Mat B, Mat A, const MatFactorInfo *info)
366 {
367 Mat C = B;
368 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
369 IS isrow = b->row, isicol = b->icol;
370 const PetscInt *r, *ic;
371 PetscInt i, j, k, nz, nzL, row;
372 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
373 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
374 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
375 PetscInt flg;
376 PetscReal shift = info->shiftamount;
377 PetscBool allowzeropivot, zeropivotdetected;
378
379 PetscFunctionBegin;
380 allowzeropivot = PetscNot(A->erroriffailure);
381 PetscCall(ISGetIndices(isrow, &r));
382 PetscCall(ISGetIndices(isicol, &ic));
383
384 /* generate work space needed by the factorization */
385 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
386 PetscCall(PetscArrayzero(rtmp, bs2 * n));
387
388 for (i = 0; i < n; i++) {
389 /* zero rtmp */
390 /* L part */
391 nz = bi[i + 1] - bi[i];
392 bjtmp = bj + bi[i];
393 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
394
395 /* U part */
396 nz = bdiag[i] - bdiag[i + 1];
397 bjtmp = bj + bdiag[i + 1] + 1;
398 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
399
400 /* load in initial (unfactored row) */
401 nz = ai[r[i] + 1] - ai[r[i]];
402 ajtmp = aj + ai[r[i]];
403 v = aa + bs2 * ai[r[i]];
404 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
405
406 /* elimination */
407 bjtmp = bj + bi[i];
408 nzL = bi[i + 1] - bi[i];
409 for (k = 0; k < nzL; k++) {
410 row = bjtmp[k];
411 pc = rtmp + bs2 * row;
412 for (flg = 0, j = 0; j < bs2; j++) {
413 if (pc[j] != 0.0) {
414 flg = 1;
415 break;
416 }
417 }
418 if (flg) {
419 pv = b->a + bs2 * bdiag[row];
420 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
421 PetscCall(PetscKernel_A_gets_A_times_B_6(pc, pv, mwork));
422
423 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
424 pv = b->a + bs2 * (bdiag[row + 1] + 1);
425 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
426 for (j = 0; j < nz; j++) {
427 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
428 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
429 v = rtmp + bs2 * pj[j];
430 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_6(v, pc, pv));
431 pv += bs2;
432 }
433 PetscCall(PetscLogFlops(432.0 * nz + 396)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
434 }
435 }
436
437 /* finished row so stick it into b->a */
438 /* L part */
439 pv = b->a + bs2 * bi[i];
440 pj = b->j + bi[i];
441 nz = bi[i + 1] - bi[i];
442 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
443
444 /* Mark diagonal and invert diagonal for simpler triangular solves */
445 pv = b->a + bs2 * bdiag[i];
446 pj = b->j + bdiag[i];
447 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
448 PetscCall(PetscKernel_A_gets_inverse_A_6(pv, shift, allowzeropivot, &zeropivotdetected));
449 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
450
451 /* U part */
452 pv = b->a + bs2 * (bdiag[i + 1] + 1);
453 pj = b->j + bdiag[i + 1] + 1;
454 nz = bdiag[i] - bdiag[i + 1] - 1;
455 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
456 }
457
458 PetscCall(PetscFree2(rtmp, mwork));
459 PetscCall(ISRestoreIndices(isicol, &ic));
460 PetscCall(ISRestoreIndices(isrow, &r));
461
462 C->ops->solve = MatSolve_SeqBAIJ_6;
463 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6;
464 C->assembled = PETSC_TRUE;
465
466 PetscCall(PetscLogFlops(1.333333333333 * 6 * 6 * 6 * n)); /* from inverting diagonal blocks */
467 PetscFunctionReturn(PETSC_SUCCESS);
468 }
469
MatILUFactorNumeric_SeqBAIJ_6_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo * info)470 PetscErrorCode MatILUFactorNumeric_SeqBAIJ_6_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
471 {
472 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
473 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
474 PetscInt *ajtmpold, *ajtmp, nz, row;
475 PetscInt *ai = a->i, *aj = a->j, *pj;
476 const PetscInt *diag_offset;
477 MatScalar *pv, *v, *rtmp, *pc, *w, *x;
478 MatScalar x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
479 MatScalar x16, x17, x18, x19, x20, x21, x22, x23, x24, x25;
480 MatScalar p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15;
481 MatScalar p16, p17, p18, p19, p20, p21, p22, p23, p24, p25;
482 MatScalar m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
483 MatScalar m16, m17, m18, m19, m20, m21, m22, m23, m24, m25;
484 MatScalar p26, p27, p28, p29, p30, p31, p32, p33, p34, p35, p36;
485 MatScalar x26, x27, x28, x29, x30, x31, x32, x33, x34, x35, x36;
486 MatScalar m26, m27, m28, m29, m30, m31, m32, m33, m34, m35, m36;
487 MatScalar *ba = b->a, *aa = a->a;
488 PetscReal shift = info->shiftamount;
489 PetscBool allowzeropivot, zeropivotdetected;
490
491 PetscFunctionBegin;
492 /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
493 A->factortype = MAT_FACTOR_NONE;
494 PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
495 A->factortype = MAT_FACTOR_ILU;
496 allowzeropivot = PetscNot(A->erroriffailure);
497 PetscCall(PetscMalloc1(36 * (n + 1), &rtmp));
498 for (i = 0; i < n; i++) {
499 nz = bi[i + 1] - bi[i];
500 ajtmp = bj + bi[i];
501 for (j = 0; j < nz; j++) {
502 x = rtmp + 36 * ajtmp[j];
503 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
504 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
505 x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0;
506 x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0;
507 x[34] = x[35] = 0.0;
508 }
509 /* load in initial (unfactored row) */
510 nz = ai[i + 1] - ai[i];
511 ajtmpold = aj + ai[i];
512 v = aa + 36 * ai[i];
513 for (j = 0; j < nz; j++) {
514 x = rtmp + 36 * ajtmpold[j];
515 x[0] = v[0];
516 x[1] = v[1];
517 x[2] = v[2];
518 x[3] = v[3];
519 x[4] = v[4];
520 x[5] = v[5];
521 x[6] = v[6];
522 x[7] = v[7];
523 x[8] = v[8];
524 x[9] = v[9];
525 x[10] = v[10];
526 x[11] = v[11];
527 x[12] = v[12];
528 x[13] = v[13];
529 x[14] = v[14];
530 x[15] = v[15];
531 x[16] = v[16];
532 x[17] = v[17];
533 x[18] = v[18];
534 x[19] = v[19];
535 x[20] = v[20];
536 x[21] = v[21];
537 x[22] = v[22];
538 x[23] = v[23];
539 x[24] = v[24];
540 x[25] = v[25];
541 x[26] = v[26];
542 x[27] = v[27];
543 x[28] = v[28];
544 x[29] = v[29];
545 x[30] = v[30];
546 x[31] = v[31];
547 x[32] = v[32];
548 x[33] = v[33];
549 x[34] = v[34];
550 x[35] = v[35];
551 v += 36;
552 }
553 row = *ajtmp++;
554 while (row < i) {
555 pc = rtmp + 36 * row;
556 p1 = pc[0];
557 p2 = pc[1];
558 p3 = pc[2];
559 p4 = pc[3];
560 p5 = pc[4];
561 p6 = pc[5];
562 p7 = pc[6];
563 p8 = pc[7];
564 p9 = pc[8];
565 p10 = pc[9];
566 p11 = pc[10];
567 p12 = pc[11];
568 p13 = pc[12];
569 p14 = pc[13];
570 p15 = pc[14];
571 p16 = pc[15];
572 p17 = pc[16];
573 p18 = pc[17];
574 p19 = pc[18];
575 p20 = pc[19];
576 p21 = pc[20];
577 p22 = pc[21];
578 p23 = pc[22];
579 p24 = pc[23];
580 p25 = pc[24];
581 p26 = pc[25];
582 p27 = pc[26];
583 p28 = pc[27];
584 p29 = pc[28];
585 p30 = pc[29];
586 p31 = pc[30];
587 p32 = pc[31];
588 p33 = pc[32];
589 p34 = pc[33];
590 p35 = pc[34];
591 p36 = pc[35];
592 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 || p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 || p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) {
593 pv = ba + 36 * diag_offset[row];
594 pj = bj + diag_offset[row] + 1;
595 x1 = pv[0];
596 x2 = pv[1];
597 x3 = pv[2];
598 x4 = pv[3];
599 x5 = pv[4];
600 x6 = pv[5];
601 x7 = pv[6];
602 x8 = pv[7];
603 x9 = pv[8];
604 x10 = pv[9];
605 x11 = pv[10];
606 x12 = pv[11];
607 x13 = pv[12];
608 x14 = pv[13];
609 x15 = pv[14];
610 x16 = pv[15];
611 x17 = pv[16];
612 x18 = pv[17];
613 x19 = pv[18];
614 x20 = pv[19];
615 x21 = pv[20];
616 x22 = pv[21];
617 x23 = pv[22];
618 x24 = pv[23];
619 x25 = pv[24];
620 x26 = pv[25];
621 x27 = pv[26];
622 x28 = pv[27];
623 x29 = pv[28];
624 x30 = pv[29];
625 x31 = pv[30];
626 x32 = pv[31];
627 x33 = pv[32];
628 x34 = pv[33];
629 x35 = pv[34];
630 x36 = pv[35];
631 pc[0] = m1 = p1 * x1 + p7 * x2 + p13 * x3 + p19 * x4 + p25 * x5 + p31 * x6;
632 pc[1] = m2 = p2 * x1 + p8 * x2 + p14 * x3 + p20 * x4 + p26 * x5 + p32 * x6;
633 pc[2] = m3 = p3 * x1 + p9 * x2 + p15 * x3 + p21 * x4 + p27 * x5 + p33 * x6;
634 pc[3] = m4 = p4 * x1 + p10 * x2 + p16 * x3 + p22 * x4 + p28 * x5 + p34 * x6;
635 pc[4] = m5 = p5 * x1 + p11 * x2 + p17 * x3 + p23 * x4 + p29 * x5 + p35 * x6;
636 pc[5] = m6 = p6 * x1 + p12 * x2 + p18 * x3 + p24 * x4 + p30 * x5 + p36 * x6;
637
638 pc[6] = m7 = p1 * x7 + p7 * x8 + p13 * x9 + p19 * x10 + p25 * x11 + p31 * x12;
639 pc[7] = m8 = p2 * x7 + p8 * x8 + p14 * x9 + p20 * x10 + p26 * x11 + p32 * x12;
640 pc[8] = m9 = p3 * x7 + p9 * x8 + p15 * x9 + p21 * x10 + p27 * x11 + p33 * x12;
641 pc[9] = m10 = p4 * x7 + p10 * x8 + p16 * x9 + p22 * x10 + p28 * x11 + p34 * x12;
642 pc[10] = m11 = p5 * x7 + p11 * x8 + p17 * x9 + p23 * x10 + p29 * x11 + p35 * x12;
643 pc[11] = m12 = p6 * x7 + p12 * x8 + p18 * x9 + p24 * x10 + p30 * x11 + p36 * x12;
644
645 pc[12] = m13 = p1 * x13 + p7 * x14 + p13 * x15 + p19 * x16 + p25 * x17 + p31 * x18;
646 pc[13] = m14 = p2 * x13 + p8 * x14 + p14 * x15 + p20 * x16 + p26 * x17 + p32 * x18;
647 pc[14] = m15 = p3 * x13 + p9 * x14 + p15 * x15 + p21 * x16 + p27 * x17 + p33 * x18;
648 pc[15] = m16 = p4 * x13 + p10 * x14 + p16 * x15 + p22 * x16 + p28 * x17 + p34 * x18;
649 pc[16] = m17 = p5 * x13 + p11 * x14 + p17 * x15 + p23 * x16 + p29 * x17 + p35 * x18;
650 pc[17] = m18 = p6 * x13 + p12 * x14 + p18 * x15 + p24 * x16 + p30 * x17 + p36 * x18;
651
652 pc[18] = m19 = p1 * x19 + p7 * x20 + p13 * x21 + p19 * x22 + p25 * x23 + p31 * x24;
653 pc[19] = m20 = p2 * x19 + p8 * x20 + p14 * x21 + p20 * x22 + p26 * x23 + p32 * x24;
654 pc[20] = m21 = p3 * x19 + p9 * x20 + p15 * x21 + p21 * x22 + p27 * x23 + p33 * x24;
655 pc[21] = m22 = p4 * x19 + p10 * x20 + p16 * x21 + p22 * x22 + p28 * x23 + p34 * x24;
656 pc[22] = m23 = p5 * x19 + p11 * x20 + p17 * x21 + p23 * x22 + p29 * x23 + p35 * x24;
657 pc[23] = m24 = p6 * x19 + p12 * x20 + p18 * x21 + p24 * x22 + p30 * x23 + p36 * x24;
658
659 pc[24] = m25 = p1 * x25 + p7 * x26 + p13 * x27 + p19 * x28 + p25 * x29 + p31 * x30;
660 pc[25] = m26 = p2 * x25 + p8 * x26 + p14 * x27 + p20 * x28 + p26 * x29 + p32 * x30;
661 pc[26] = m27 = p3 * x25 + p9 * x26 + p15 * x27 + p21 * x28 + p27 * x29 + p33 * x30;
662 pc[27] = m28 = p4 * x25 + p10 * x26 + p16 * x27 + p22 * x28 + p28 * x29 + p34 * x30;
663 pc[28] = m29 = p5 * x25 + p11 * x26 + p17 * x27 + p23 * x28 + p29 * x29 + p35 * x30;
664 pc[29] = m30 = p6 * x25 + p12 * x26 + p18 * x27 + p24 * x28 + p30 * x29 + p36 * x30;
665
666 pc[30] = m31 = p1 * x31 + p7 * x32 + p13 * x33 + p19 * x34 + p25 * x35 + p31 * x36;
667 pc[31] = m32 = p2 * x31 + p8 * x32 + p14 * x33 + p20 * x34 + p26 * x35 + p32 * x36;
668 pc[32] = m33 = p3 * x31 + p9 * x32 + p15 * x33 + p21 * x34 + p27 * x35 + p33 * x36;
669 pc[33] = m34 = p4 * x31 + p10 * x32 + p16 * x33 + p22 * x34 + p28 * x35 + p34 * x36;
670 pc[34] = m35 = p5 * x31 + p11 * x32 + p17 * x33 + p23 * x34 + p29 * x35 + p35 * x36;
671 pc[35] = m36 = p6 * x31 + p12 * x32 + p18 * x33 + p24 * x34 + p30 * x35 + p36 * x36;
672
673 nz = bi[row + 1] - diag_offset[row] - 1;
674 pv += 36;
675 for (j = 0; j < nz; j++) {
676 x1 = pv[0];
677 x2 = pv[1];
678 x3 = pv[2];
679 x4 = pv[3];
680 x5 = pv[4];
681 x6 = pv[5];
682 x7 = pv[6];
683 x8 = pv[7];
684 x9 = pv[8];
685 x10 = pv[9];
686 x11 = pv[10];
687 x12 = pv[11];
688 x13 = pv[12];
689 x14 = pv[13];
690 x15 = pv[14];
691 x16 = pv[15];
692 x17 = pv[16];
693 x18 = pv[17];
694 x19 = pv[18];
695 x20 = pv[19];
696 x21 = pv[20];
697 x22 = pv[21];
698 x23 = pv[22];
699 x24 = pv[23];
700 x25 = pv[24];
701 x26 = pv[25];
702 x27 = pv[26];
703 x28 = pv[27];
704 x29 = pv[28];
705 x30 = pv[29];
706 x31 = pv[30];
707 x32 = pv[31];
708 x33 = pv[32];
709 x34 = pv[33];
710 x35 = pv[34];
711 x36 = pv[35];
712 x = rtmp + 36 * pj[j];
713 x[0] -= m1 * x1 + m7 * x2 + m13 * x3 + m19 * x4 + m25 * x5 + m31 * x6;
714 x[1] -= m2 * x1 + m8 * x2 + m14 * x3 + m20 * x4 + m26 * x5 + m32 * x6;
715 x[2] -= m3 * x1 + m9 * x2 + m15 * x3 + m21 * x4 + m27 * x5 + m33 * x6;
716 x[3] -= m4 * x1 + m10 * x2 + m16 * x3 + m22 * x4 + m28 * x5 + m34 * x6;
717 x[4] -= m5 * x1 + m11 * x2 + m17 * x3 + m23 * x4 + m29 * x5 + m35 * x6;
718 x[5] -= m6 * x1 + m12 * x2 + m18 * x3 + m24 * x4 + m30 * x5 + m36 * x6;
719
720 x[6] -= m1 * x7 + m7 * x8 + m13 * x9 + m19 * x10 + m25 * x11 + m31 * x12;
721 x[7] -= m2 * x7 + m8 * x8 + m14 * x9 + m20 * x10 + m26 * x11 + m32 * x12;
722 x[8] -= m3 * x7 + m9 * x8 + m15 * x9 + m21 * x10 + m27 * x11 + m33 * x12;
723 x[9] -= m4 * x7 + m10 * x8 + m16 * x9 + m22 * x10 + m28 * x11 + m34 * x12;
724 x[10] -= m5 * x7 + m11 * x8 + m17 * x9 + m23 * x10 + m29 * x11 + m35 * x12;
725 x[11] -= m6 * x7 + m12 * x8 + m18 * x9 + m24 * x10 + m30 * x11 + m36 * x12;
726
727 x[12] -= m1 * x13 + m7 * x14 + m13 * x15 + m19 * x16 + m25 * x17 + m31 * x18;
728 x[13] -= m2 * x13 + m8 * x14 + m14 * x15 + m20 * x16 + m26 * x17 + m32 * x18;
729 x[14] -= m3 * x13 + m9 * x14 + m15 * x15 + m21 * x16 + m27 * x17 + m33 * x18;
730 x[15] -= m4 * x13 + m10 * x14 + m16 * x15 + m22 * x16 + m28 * x17 + m34 * x18;
731 x[16] -= m5 * x13 + m11 * x14 + m17 * x15 + m23 * x16 + m29 * x17 + m35 * x18;
732 x[17] -= m6 * x13 + m12 * x14 + m18 * x15 + m24 * x16 + m30 * x17 + m36 * x18;
733
734 x[18] -= m1 * x19 + m7 * x20 + m13 * x21 + m19 * x22 + m25 * x23 + m31 * x24;
735 x[19] -= m2 * x19 + m8 * x20 + m14 * x21 + m20 * x22 + m26 * x23 + m32 * x24;
736 x[20] -= m3 * x19 + m9 * x20 + m15 * x21 + m21 * x22 + m27 * x23 + m33 * x24;
737 x[21] -= m4 * x19 + m10 * x20 + m16 * x21 + m22 * x22 + m28 * x23 + m34 * x24;
738 x[22] -= m5 * x19 + m11 * x20 + m17 * x21 + m23 * x22 + m29 * x23 + m35 * x24;
739 x[23] -= m6 * x19 + m12 * x20 + m18 * x21 + m24 * x22 + m30 * x23 + m36 * x24;
740
741 x[24] -= m1 * x25 + m7 * x26 + m13 * x27 + m19 * x28 + m25 * x29 + m31 * x30;
742 x[25] -= m2 * x25 + m8 * x26 + m14 * x27 + m20 * x28 + m26 * x29 + m32 * x30;
743 x[26] -= m3 * x25 + m9 * x26 + m15 * x27 + m21 * x28 + m27 * x29 + m33 * x30;
744 x[27] -= m4 * x25 + m10 * x26 + m16 * x27 + m22 * x28 + m28 * x29 + m34 * x30;
745 x[28] -= m5 * x25 + m11 * x26 + m17 * x27 + m23 * x28 + m29 * x29 + m35 * x30;
746 x[29] -= m6 * x25 + m12 * x26 + m18 * x27 + m24 * x28 + m30 * x29 + m36 * x30;
747
748 x[30] -= m1 * x31 + m7 * x32 + m13 * x33 + m19 * x34 + m25 * x35 + m31 * x36;
749 x[31] -= m2 * x31 + m8 * x32 + m14 * x33 + m20 * x34 + m26 * x35 + m32 * x36;
750 x[32] -= m3 * x31 + m9 * x32 + m15 * x33 + m21 * x34 + m27 * x35 + m33 * x36;
751 x[33] -= m4 * x31 + m10 * x32 + m16 * x33 + m22 * x34 + m28 * x35 + m34 * x36;
752 x[34] -= m5 * x31 + m11 * x32 + m17 * x33 + m23 * x34 + m29 * x35 + m35 * x36;
753 x[35] -= m6 * x31 + m12 * x32 + m18 * x33 + m24 * x34 + m30 * x35 + m36 * x36;
754
755 pv += 36;
756 }
757 PetscCall(PetscLogFlops(432.0 * nz + 396.0));
758 }
759 row = *ajtmp++;
760 }
761 /* finished row so stick it into b->a */
762 pv = ba + 36 * bi[i];
763 pj = bj + bi[i];
764 nz = bi[i + 1] - bi[i];
765 for (j = 0; j < nz; j++) {
766 x = rtmp + 36 * pj[j];
767 pv[0] = x[0];
768 pv[1] = x[1];
769 pv[2] = x[2];
770 pv[3] = x[3];
771 pv[4] = x[4];
772 pv[5] = x[5];
773 pv[6] = x[6];
774 pv[7] = x[7];
775 pv[8] = x[8];
776 pv[9] = x[9];
777 pv[10] = x[10];
778 pv[11] = x[11];
779 pv[12] = x[12];
780 pv[13] = x[13];
781 pv[14] = x[14];
782 pv[15] = x[15];
783 pv[16] = x[16];
784 pv[17] = x[17];
785 pv[18] = x[18];
786 pv[19] = x[19];
787 pv[20] = x[20];
788 pv[21] = x[21];
789 pv[22] = x[22];
790 pv[23] = x[23];
791 pv[24] = x[24];
792 pv[25] = x[25];
793 pv[26] = x[26];
794 pv[27] = x[27];
795 pv[28] = x[28];
796 pv[29] = x[29];
797 pv[30] = x[30];
798 pv[31] = x[31];
799 pv[32] = x[32];
800 pv[33] = x[33];
801 pv[34] = x[34];
802 pv[35] = x[35];
803 pv += 36;
804 }
805 /* invert diagonal block */
806 w = ba + 36 * diag_offset[i];
807 PetscCall(PetscKernel_A_gets_inverse_A_6(w, shift, allowzeropivot, &zeropivotdetected));
808 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
809 }
810
811 PetscCall(PetscFree(rtmp));
812
813 C->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_inplace;
814 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace;
815 C->assembled = PETSC_TRUE;
816
817 PetscCall(PetscLogFlops(1.333333333333 * 6 * 6 * 6 * b->mbs)); /* from inverting diagonal blocks */
818 PetscFunctionReturn(PETSC_SUCCESS);
819 }
820
MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering(Mat B,Mat A,const MatFactorInfo * info)821 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
822 {
823 Mat C = B;
824 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
825 PetscInt i, j, k, nz, nzL, row;
826 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
827 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
828 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
829 PetscInt flg;
830 PetscReal shift = info->shiftamount;
831 PetscBool allowzeropivot, zeropivotdetected;
832
833 PetscFunctionBegin;
834 allowzeropivot = PetscNot(A->erroriffailure);
835
836 /* generate work space needed by the factorization */
837 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
838 PetscCall(PetscArrayzero(rtmp, bs2 * n));
839
840 for (i = 0; i < n; i++) {
841 /* zero rtmp */
842 /* L part */
843 nz = bi[i + 1] - bi[i];
844 bjtmp = bj + bi[i];
845 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
846
847 /* U part */
848 nz = bdiag[i] - bdiag[i + 1];
849 bjtmp = bj + bdiag[i + 1] + 1;
850 for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
851
852 /* load in initial (unfactored row) */
853 nz = ai[i + 1] - ai[i];
854 ajtmp = aj + ai[i];
855 v = aa + bs2 * ai[i];
856 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
857
858 /* elimination */
859 bjtmp = bj + bi[i];
860 nzL = bi[i + 1] - bi[i];
861 for (k = 0; k < nzL; k++) {
862 row = bjtmp[k];
863 pc = rtmp + bs2 * row;
864 for (flg = 0, j = 0; j < bs2; j++) {
865 if (pc[j] != 0.0) {
866 flg = 1;
867 break;
868 }
869 }
870 if (flg) {
871 pv = b->a + bs2 * bdiag[row];
872 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
873 PetscCall(PetscKernel_A_gets_A_times_B_6(pc, pv, mwork));
874
875 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
876 pv = b->a + bs2 * (bdiag[row + 1] + 1);
877 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
878 for (j = 0; j < nz; j++) {
879 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
880 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
881 v = rtmp + bs2 * pj[j];
882 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_6(v, pc, pv));
883 pv += bs2;
884 }
885 PetscCall(PetscLogFlops(432.0 * nz + 396)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
886 }
887 }
888
889 /* finished row so stick it into b->a */
890 /* L part */
891 pv = b->a + bs2 * bi[i];
892 pj = b->j + bi[i];
893 nz = bi[i + 1] - bi[i];
894 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
895
896 /* Mark diagonal and invert diagonal for simpler triangular solves */
897 pv = b->a + bs2 * bdiag[i];
898 pj = b->j + bdiag[i];
899 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
900 PetscCall(PetscKernel_A_gets_inverse_A_6(pv, shift, allowzeropivot, &zeropivotdetected));
901 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
902
903 /* U part */
904 pv = b->a + bs2 * (bdiag[i + 1] + 1);
905 pj = b->j + bdiag[i + 1] + 1;
906 nz = bdiag[i] - bdiag[i + 1] - 1;
907 for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
908 }
909 PetscCall(PetscFree2(rtmp, mwork));
910
911 C->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering;
912 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
913 C->assembled = PETSC_TRUE;
914
915 PetscCall(PetscLogFlops(1.333333333333 * 6 * 6 * 6 * n)); /* from inverting diagonal blocks */
916 PetscFunctionReturn(PETSC_SUCCESS);
917 }
918