1 /*
2 This file provides high performance routines for the Inode format (compressed sparse row)
3 by taking advantage of rows with identical nonzero structure (I-nodes).
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #if defined(PETSC_HAVE_XMMINTRIN_H)
7 #include <xmmintrin.h>
8 #endif
9
MatCreateColInode_Private(Mat A,PetscInt * size,PetscInt ** ns)10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11 {
12 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col;
14
15 PetscFunctionBegin;
16 n = A->cmap->n;
17 m = A->rmap->n;
18 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19 ns_row = a->inode.size_csr;
20
21 min_mn = (m < n) ? m : n;
22 if (!ns) {
23 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24 for (; count + 1 < n; count++, i++);
25 if (count < n) i++;
26 *size = i;
27 PetscFunctionReturn(PETSC_SUCCESS);
28 }
29 PetscCall(PetscMalloc1(n + 1, &ns_col));
30 ns_col[0] = 0;
31
32 /* Use the same row structure wherever feasible. */
33 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
34
35 /* if m < n; pad up the remainder with inode_limit */
36 for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37 /* The last node is the odd ball. pad it up with the remaining rows; */
38 if (count < n) {
39 ns_col[i + 1] = ns_col[i] + (n - count);
40 i++;
41 } else if (count > n) {
42 /* Adjust for the over estimation */
43 ns_col[i] += n - count;
44 }
45 *size = i;
46 *ns = ns_col;
47 PetscFunctionReturn(PETSC_SUCCESS);
48 }
49
50 /*
51 This builds symmetric version of nonzero structure,
52 */
MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)53 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54 {
55 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
56 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
57 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
58 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
59
60 PetscFunctionBegin;
61 nslim_row = a->inode.node_count;
62 m = A->rmap->n;
63 n = A->cmap->n;
64 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
65 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
66
67 /* Use the row_inode as column_inode */
68 nslim_col = nslim_row;
69 ns_col = ns_row;
70
71 /* allocate space for reformatted inode structure */
72 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
73 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
74
75 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
76 nsz = ns_col[i1 + 1] - ns_col[i1];
77 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
78 }
79 /* allocate space for row pointers */
80 PetscCall(PetscCalloc1(nslim_row + 1, &ia));
81 *iia = ia;
82 PetscCall(PetscMalloc1(nslim_row + 1, &work));
83
84 /* determine the number of columns in each row */
85 ia[0] = oshift;
86 for (i1 = 0; i1 < nslim_row; i1++) {
87 row = ns_row[i1];
88 j = aj + ai[row] + ishift;
89 jmax = aj + ai[row + 1] + ishift;
90 if (j == jmax) continue; /* empty row */
91 col = *j++ + ishift;
92 i2 = tvc[col];
93 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94 ia[i1 + 1]++;
95 ia[i2 + 1]++;
96 i2++; /* Start col of next node */
97 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98 i2 = tvc[col];
99 }
100 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101 }
102
103 /* shift ia[i] to point to next row */
104 for (i1 = 1; i1 < nslim_row + 1; i1++) {
105 row = ia[i1 - 1];
106 ia[i1] += row;
107 work[i1 - 1] = row - oshift;
108 }
109
110 /* allocate space for column pointers */
111 nz = ia[nslim_row] + (!ishift);
112 PetscCall(PetscMalloc1(nz, &ja));
113 *jja = ja;
114
115 /* loop over lower triangular part putting into ja */
116 for (i1 = 0; i1 < nslim_row; i1++) {
117 row = ns_row[i1];
118 j = aj + ai[row] + ishift;
119 jmax = aj + ai[row + 1] + ishift;
120 if (j == jmax) continue; /* empty row */
121 col = *j++ + ishift;
122 i2 = tvc[col];
123 while (i2 < i1 && j < jmax) {
124 ja[work[i2]++] = i1 + oshift;
125 ja[work[i1]++] = i2 + oshift;
126 ++i2;
127 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128 i2 = tvc[col];
129 }
130 if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131 }
132 PetscCall(PetscFree(work));
133 PetscCall(PetscFree2(tns, tvc));
134 PetscFunctionReturn(PETSC_SUCCESS);
135 }
136
137 /*
138 This builds nonsymmetric version of nonzero structure,
139 */
MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141 {
142 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
143 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144 PetscInt *tns, *tvc, nsz, i1, i2;
145 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
146
147 PetscFunctionBegin;
148 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149 nslim_row = a->inode.node_count;
150 n = A->cmap->n;
151
152 /* Create The column_inode for this matrix */
153 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
154
155 /* allocate space for reformatted column_inode structure */
156 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
158
159 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160 nsz = ns_col[i1 + 1] - ns_col[i1];
161 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162 }
163 /* allocate space for row pointers */
164 PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165 *iia = ia;
166 PetscCall(PetscMalloc1(nslim_row + 1, &work));
167
168 /* determine the number of columns in each row */
169 ia[0] = oshift;
170 for (i1 = 0; i1 < nslim_row; i1++) {
171 row = ns_row[i1];
172 j = aj + ai[row] + ishift;
173 nz = ai[row + 1] - ai[row];
174 if (!nz) continue; /* empty row */
175 col = *j++ + ishift;
176 i2 = tvc[col];
177 while (nz-- > 0) { /* off-diagonal elements */
178 ia[i1 + 1]++;
179 i2++; /* Start col of next node */
180 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
181 if (nz > 0) i2 = tvc[col];
182 }
183 }
184
185 /* shift ia[i] to point to next row */
186 for (i1 = 1; i1 < nslim_row + 1; i1++) {
187 row = ia[i1 - 1];
188 ia[i1] += row;
189 work[i1 - 1] = row - oshift;
190 }
191
192 /* allocate space for column pointers */
193 nz = ia[nslim_row] + (!ishift);
194 PetscCall(PetscMalloc1(nz, &ja));
195 *jja = ja;
196
197 /* loop over matrix putting into ja */
198 for (i1 = 0; i1 < nslim_row; i1++) {
199 row = ns_row[i1];
200 j = aj + ai[row] + ishift;
201 nz = ai[row + 1] - ai[row];
202 if (!nz) continue; /* empty row */
203 col = *j++ + ishift;
204 i2 = tvc[col];
205 while (nz-- > 0) {
206 ja[work[i1]++] = i2 + oshift;
207 ++i2;
208 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
209 if (nz > 0) i2 = tvc[col];
210 }
211 }
212 PetscCall(PetscFree(ns_col));
213 PetscCall(PetscFree(work));
214 PetscCall(PetscFree2(tns, tvc));
215 PetscFunctionReturn(PETSC_SUCCESS);
216 }
217
MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)218 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219 {
220 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
221
222 PetscFunctionBegin;
223 if (n) *n = a->inode.node_count;
224 if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
225 if (!blockcompressed) {
226 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
227 } else if (symmetric) {
228 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
229 } else {
230 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
231 }
232 PetscFunctionReturn(PETSC_SUCCESS);
233 }
234
MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)235 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236 {
237 PetscFunctionBegin;
238 if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
239
240 if (!blockcompressed) {
241 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
242 } else {
243 PetscCall(PetscFree(*ia));
244 PetscCall(PetscFree(*ja));
245 }
246 PetscFunctionReturn(PETSC_SUCCESS);
247 }
248
MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)249 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250 {
251 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
254
255 PetscFunctionBegin;
256 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
257 nslim_row = a->inode.node_count;
258 n = A->cmap->n;
259
260 /* Create The column_inode for this matrix */
261 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
262
263 /* allocate space for reformatted column_inode structure */
264 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
266
267 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268 nsz = ns_col[i1 + 1] - ns_col[i1];
269 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
270 }
271 /* allocate space for column pointers */
272 PetscCall(PetscCalloc1(nslim_col + 1, &ia));
273 *iia = ia;
274 PetscCall(PetscMalloc1(nslim_col + 1, &work));
275
276 /* determine the number of columns in each row */
277 ia[0] = oshift;
278 for (i1 = 0; i1 < nslim_row; i1++) {
279 row = ns_row[i1];
280 j = aj + ai[row] + ishift;
281 col = *j++ + ishift;
282 i2 = tvc[col];
283 nz = ai[row + 1] - ai[row];
284 while (nz-- > 0) { /* off-diagonal elements */
285 /* ia[i1+1]++; */
286 ia[i2 + 1]++;
287 i2++;
288 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
289 if (nz > 0) i2 = tvc[col];
290 }
291 }
292
293 /* shift ia[i] to point to next col */
294 for (i1 = 1; i1 < nslim_col + 1; i1++) {
295 col = ia[i1 - 1];
296 ia[i1] += col;
297 work[i1 - 1] = col - oshift;
298 }
299
300 /* allocate space for column pointers */
301 nz = ia[nslim_col] + (!ishift);
302 PetscCall(PetscMalloc1(nz, &ja));
303 *jja = ja;
304
305 /* loop over matrix putting into ja */
306 for (i1 = 0; i1 < nslim_row; i1++) {
307 row = ns_row[i1];
308 j = aj + ai[row] + ishift;
309 col = *j++ + ishift;
310 i2 = tvc[col];
311 nz = ai[row + 1] - ai[row];
312 while (nz-- > 0) {
313 /* ja[work[i1]++] = i2 + oshift; */
314 ja[work[i2]++] = i1 + oshift;
315 i2++;
316 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
317 if (nz > 0) i2 = tvc[col];
318 }
319 }
320 PetscCall(PetscFree(ns_col));
321 PetscCall(PetscFree(work));
322 PetscCall(PetscFree2(tns, tvc));
323 PetscFunctionReturn(PETSC_SUCCESS);
324 }
325
MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)326 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327 {
328 PetscFunctionBegin;
329 PetscCall(MatCreateColInode_Private(A, n, NULL));
330 if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
331
332 if (!blockcompressed) {
333 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
334 } else if (symmetric) {
335 /* Since the indices are symmetric it doesn't matter */
336 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
337 } else {
338 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
339 }
340 PetscFunctionReturn(PETSC_SUCCESS);
341 }
342
MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)343 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344 {
345 PetscFunctionBegin;
346 if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
347 if (!blockcompressed) {
348 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
349 } else {
350 PetscCall(PetscFree(*ia));
351 PetscCall(PetscFree(*ja));
352 }
353 PetscFunctionReturn(PETSC_SUCCESS);
354 }
355
MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)356 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357 {
358 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
359 PetscScalar *y;
360 const PetscScalar *x;
361 PetscInt row, node_max, nonzerorow = 0;
362 PetscInt *ns;
363
364 PetscFunctionBegin;
365 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
366 node_max = a->inode.node_count;
367 ns = a->inode.size_csr; /* Node Size array */
368 PetscCall(VecGetArrayRead(xx, &x));
369 PetscCall(VecGetArray(yy, &y));
370
371 PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372 for (PetscInt i = 0; i < node_max; ++i) {
373 PetscInt i1, i2, nsz, n, sz;
374 const MatScalar *v1, *v2, *v3, *v4, *v5;
375 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376 const PetscInt *idx;
377
378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380 #endif
381 row = ns[i];
382 nsz = ns[i + 1] - ns[i];
383 n = a->i[row + 1] - a->i[row];
384 nonzerorow += (n > 0) * nsz;
385
386 idx = &a->j[a->i[row]];
387 v1 = &a->a[a->i[row]];
388 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
389 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
390 sz = n; /* No of non zeros in this row */
391 /* Switch on the size of Node */
392 switch (nsz) { /* Each loop in 'case' is unrolled */
393 case 1:
394 sum1 = 0.;
395
396 for (n = 0; n < sz - 1; n += 2) {
397 i1 = idx[0]; /* The instructions are ordered to */
398 i2 = idx[1]; /* make the compiler's job easy */
399 idx += 2;
400 tmp0 = x[i1];
401 tmp1 = x[i2];
402 sum1 += v1[0] * tmp0 + v1[1] * tmp1;
403 v1 += 2;
404 }
405
406 if (n == sz - 1) { /* Take care of the last nonzero */
407 tmp0 = x[*idx++];
408 sum1 += *v1++ * tmp0;
409 }
410 y[row++] = sum1;
411 break;
412 case 2:
413 sum1 = 0.;
414 sum2 = 0.;
415 v2 = v1 + n;
416
417 for (n = 0; n < sz - 1; n += 2) {
418 i1 = idx[0];
419 i2 = idx[1];
420 idx += 2;
421 tmp0 = x[i1];
422 tmp1 = x[i2];
423 sum1 += v1[0] * tmp0 + v1[1] * tmp1;
424 v1 += 2;
425 sum2 += v2[0] * tmp0 + v2[1] * tmp1;
426 v2 += 2;
427 }
428 if (n == sz - 1) {
429 tmp0 = x[*idx++];
430 sum1 += *v1++ * tmp0;
431 sum2 += *v2++ * tmp0;
432 }
433 y[row++] = sum1;
434 y[row++] = sum2;
435 v1 = v2; /* Since the next block to be processed starts there*/
436 idx += sz;
437 break;
438 case 3:
439 sum1 = 0.;
440 sum2 = 0.;
441 sum3 = 0.;
442 v2 = v1 + n;
443 v3 = v2 + n;
444
445 for (n = 0; n < sz - 1; n += 2) {
446 i1 = idx[0];
447 i2 = idx[1];
448 idx += 2;
449 tmp0 = x[i1];
450 tmp1 = x[i2];
451 sum1 += v1[0] * tmp0 + v1[1] * tmp1;
452 v1 += 2;
453 sum2 += v2[0] * tmp0 + v2[1] * tmp1;
454 v2 += 2;
455 sum3 += v3[0] * tmp0 + v3[1] * tmp1;
456 v3 += 2;
457 }
458 if (n == sz - 1) {
459 tmp0 = x[*idx++];
460 sum1 += *v1++ * tmp0;
461 sum2 += *v2++ * tmp0;
462 sum3 += *v3++ * tmp0;
463 }
464 y[row++] = sum1;
465 y[row++] = sum2;
466 y[row++] = sum3;
467 v1 = v3; /* Since the next block to be processed starts there*/
468 idx += 2 * sz;
469 break;
470 case 4:
471 sum1 = 0.;
472 sum2 = 0.;
473 sum3 = 0.;
474 sum4 = 0.;
475 v2 = v1 + n;
476 v3 = v2 + n;
477 v4 = v3 + n;
478
479 for (n = 0; n < sz - 1; n += 2) {
480 i1 = idx[0];
481 i2 = idx[1];
482 idx += 2;
483 tmp0 = x[i1];
484 tmp1 = x[i2];
485 sum1 += v1[0] * tmp0 + v1[1] * tmp1;
486 v1 += 2;
487 sum2 += v2[0] * tmp0 + v2[1] * tmp1;
488 v2 += 2;
489 sum3 += v3[0] * tmp0 + v3[1] * tmp1;
490 v3 += 2;
491 sum4 += v4[0] * tmp0 + v4[1] * tmp1;
492 v4 += 2;
493 }
494 if (n == sz - 1) {
495 tmp0 = x[*idx++];
496 sum1 += *v1++ * tmp0;
497 sum2 += *v2++ * tmp0;
498 sum3 += *v3++ * tmp0;
499 sum4 += *v4++ * tmp0;
500 }
501 y[row++] = sum1;
502 y[row++] = sum2;
503 y[row++] = sum3;
504 y[row++] = sum4;
505 v1 = v4; /* Since the next block to be processed starts there*/
506 idx += 3 * sz;
507 break;
508 case 5:
509 sum1 = 0.;
510 sum2 = 0.;
511 sum3 = 0.;
512 sum4 = 0.;
513 sum5 = 0.;
514 v2 = v1 + n;
515 v3 = v2 + n;
516 v4 = v3 + n;
517 v5 = v4 + n;
518
519 for (n = 0; n < sz - 1; n += 2) {
520 i1 = idx[0];
521 i2 = idx[1];
522 idx += 2;
523 tmp0 = x[i1];
524 tmp1 = x[i2];
525 sum1 += v1[0] * tmp0 + v1[1] * tmp1;
526 v1 += 2;
527 sum2 += v2[0] * tmp0 + v2[1] * tmp1;
528 v2 += 2;
529 sum3 += v3[0] * tmp0 + v3[1] * tmp1;
530 v3 += 2;
531 sum4 += v4[0] * tmp0 + v4[1] * tmp1;
532 v4 += 2;
533 sum5 += v5[0] * tmp0 + v5[1] * tmp1;
534 v5 += 2;
535 }
536 if (n == sz - 1) {
537 tmp0 = x[*idx++];
538 sum1 += *v1++ * tmp0;
539 sum2 += *v2++ * tmp0;
540 sum3 += *v3++ * tmp0;
541 sum4 += *v4++ * tmp0;
542 sum5 += *v5++ * tmp0;
543 }
544 y[row++] = sum1;
545 y[row++] = sum2;
546 y[row++] = sum3;
547 y[row++] = sum4;
548 y[row++] = sum5;
549 v1 = v5; /* Since the next block to be processed starts there */
550 idx += 4 * sz;
551 break;
552 default:
553 SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
554 }
555 }
556 PetscCall(VecRestoreArrayRead(xx, &x));
557 PetscCall(VecRestoreArray(yy, &y));
558 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
559 PetscFunctionReturn(PETSC_SUCCESS);
560 }
561
562 /* Almost same code as the MatMult_SeqAIJ_Inode() */
MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)563 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564 {
565 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
566 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
567 const MatScalar *v1, *v2, *v3, *v4, *v5;
568 const PetscScalar *x;
569 PetscScalar *y, *z, *zt;
570 PetscInt i1, i2, n, i, row, node_max, nsz, sz;
571 const PetscInt *idx, *ns, *ii;
572
573 PetscFunctionBegin;
574 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
575 node_max = a->inode.node_count;
576 ns = a->inode.size_csr; /* Node Size array */
577
578 PetscCall(VecGetArrayRead(xx, &x));
579 PetscCall(VecGetArrayPair(zz, yy, &z, &y));
580 zt = z;
581
582 idx = a->j;
583 v1 = a->a;
584 ii = a->i;
585
586 for (i = 0; i < node_max; ++i) {
587 row = ns[i];
588 nsz = ns[i + 1] - ns[i];
589 n = ii[1] - ii[0];
590 ii += nsz;
591 sz = n; /* No of non zeros in this row */
592 /* Switch on the size of Node */
593 switch (nsz) { /* Each loop in 'case' is unrolled */
594 case 1:
595 sum1 = *zt++;
596
597 for (n = 0; n < sz - 1; n += 2) {
598 i1 = idx[0]; /* The instructions are ordered to */
599 i2 = idx[1]; /* make the compiler's job easy */
600 idx += 2;
601 tmp0 = x[i1];
602 tmp1 = x[i2];
603 sum1 += v1[0] * tmp0 + v1[1] * tmp1;
604 v1 += 2;
605 }
606
607 if (n == sz - 1) { /* Take care of the last nonzero */
608 tmp0 = x[*idx++];
609 sum1 += *v1++ * tmp0;
610 }
611 y[row++] = sum1;
612 break;
613 case 2:
614 sum1 = *zt++;
615 sum2 = *zt++;
616 v2 = v1 + n;
617
618 for (n = 0; n < sz - 1; n += 2) {
619 i1 = idx[0];
620 i2 = idx[1];
621 idx += 2;
622 tmp0 = x[i1];
623 tmp1 = x[i2];
624 sum1 += v1[0] * tmp0 + v1[1] * tmp1;
625 v1 += 2;
626 sum2 += v2[0] * tmp0 + v2[1] * tmp1;
627 v2 += 2;
628 }
629 if (n == sz - 1) {
630 tmp0 = x[*idx++];
631 sum1 += *v1++ * tmp0;
632 sum2 += *v2++ * tmp0;
633 }
634 y[row++] = sum1;
635 y[row++] = sum2;
636 v1 = v2; /* Since the next block to be processed starts there*/
637 idx += sz;
638 break;
639 case 3:
640 sum1 = *zt++;
641 sum2 = *zt++;
642 sum3 = *zt++;
643 v2 = v1 + n;
644 v3 = v2 + n;
645
646 for (n = 0; n < sz - 1; n += 2) {
647 i1 = idx[0];
648 i2 = idx[1];
649 idx += 2;
650 tmp0 = x[i1];
651 tmp1 = x[i2];
652 sum1 += v1[0] * tmp0 + v1[1] * tmp1;
653 v1 += 2;
654 sum2 += v2[0] * tmp0 + v2[1] * tmp1;
655 v2 += 2;
656 sum3 += v3[0] * tmp0 + v3[1] * tmp1;
657 v3 += 2;
658 }
659 if (n == sz - 1) {
660 tmp0 = x[*idx++];
661 sum1 += *v1++ * tmp0;
662 sum2 += *v2++ * tmp0;
663 sum3 += *v3++ * tmp0;
664 }
665 y[row++] = sum1;
666 y[row++] = sum2;
667 y[row++] = sum3;
668 v1 = v3; /* Since the next block to be processed starts there*/
669 idx += 2 * sz;
670 break;
671 case 4:
672 sum1 = *zt++;
673 sum2 = *zt++;
674 sum3 = *zt++;
675 sum4 = *zt++;
676 v2 = v1 + n;
677 v3 = v2 + n;
678 v4 = v3 + n;
679
680 for (n = 0; n < sz - 1; n += 2) {
681 i1 = idx[0];
682 i2 = idx[1];
683 idx += 2;
684 tmp0 = x[i1];
685 tmp1 = x[i2];
686 sum1 += v1[0] * tmp0 + v1[1] * tmp1;
687 v1 += 2;
688 sum2 += v2[0] * tmp0 + v2[1] * tmp1;
689 v2 += 2;
690 sum3 += v3[0] * tmp0 + v3[1] * tmp1;
691 v3 += 2;
692 sum4 += v4[0] * tmp0 + v4[1] * tmp1;
693 v4 += 2;
694 }
695 if (n == sz - 1) {
696 tmp0 = x[*idx++];
697 sum1 += *v1++ * tmp0;
698 sum2 += *v2++ * tmp0;
699 sum3 += *v3++ * tmp0;
700 sum4 += *v4++ * tmp0;
701 }
702 y[row++] = sum1;
703 y[row++] = sum2;
704 y[row++] = sum3;
705 y[row++] = sum4;
706 v1 = v4; /* Since the next block to be processed starts there*/
707 idx += 3 * sz;
708 break;
709 case 5:
710 sum1 = *zt++;
711 sum2 = *zt++;
712 sum3 = *zt++;
713 sum4 = *zt++;
714 sum5 = *zt++;
715 v2 = v1 + n;
716 v3 = v2 + n;
717 v4 = v3 + n;
718 v5 = v4 + n;
719
720 for (n = 0; n < sz - 1; n += 2) {
721 i1 = idx[0];
722 i2 = idx[1];
723 idx += 2;
724 tmp0 = x[i1];
725 tmp1 = x[i2];
726 sum1 += v1[0] * tmp0 + v1[1] * tmp1;
727 v1 += 2;
728 sum2 += v2[0] * tmp0 + v2[1] * tmp1;
729 v2 += 2;
730 sum3 += v3[0] * tmp0 + v3[1] * tmp1;
731 v3 += 2;
732 sum4 += v4[0] * tmp0 + v4[1] * tmp1;
733 v4 += 2;
734 sum5 += v5[0] * tmp0 + v5[1] * tmp1;
735 v5 += 2;
736 }
737 if (n == sz - 1) {
738 tmp0 = x[*idx++];
739 sum1 += *v1++ * tmp0;
740 sum2 += *v2++ * tmp0;
741 sum3 += *v3++ * tmp0;
742 sum4 += *v4++ * tmp0;
743 sum5 += *v5++ * tmp0;
744 }
745 y[row++] = sum1;
746 y[row++] = sum2;
747 y[row++] = sum3;
748 y[row++] = sum4;
749 y[row++] = sum5;
750 v1 = v5; /* Since the next block to be processed starts there */
751 idx += 4 * sz;
752 break;
753 default:
754 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
755 }
756 }
757 PetscCall(VecRestoreArrayRead(xx, &x));
758 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
759 PetscCall(PetscLogFlops(2.0 * a->nz));
760 PetscFunctionReturn(PETSC_SUCCESS);
761 }
762
MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)763 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764 {
765 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
766 IS iscol = a->col, isrow = a->row;
767 const PetscInt *r, *c, *rout, *cout;
768 PetscInt i, j, n = A->rmap->n, nz;
769 PetscInt node_max, *ns, row, nsz, aii, i0, i1;
770 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771 PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
772 PetscScalar sum1, sum2, sum3, sum4, sum5;
773 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774 const PetscScalar *b;
775
776 PetscFunctionBegin;
777 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
778 node_max = a->inode.node_count;
779 ns = a->inode.size_csr; /* Node Size array */
780
781 PetscCall(VecGetArrayRead(bb, &b));
782 PetscCall(VecGetArrayWrite(xx, &x));
783 tmp = a->solve_work;
784
785 PetscCall(ISGetIndices(isrow, &rout));
786 r = rout;
787 PetscCall(ISGetIndices(iscol, &cout));
788 c = cout + (n - 1);
789
790 /* forward solve the lower triangular */
791 tmps = tmp;
792 aa = a_a;
793 aj = a_j;
794 ad = a->diag;
795
796 for (i = 0, row = 0; i < node_max; ++i) {
797 row = ns[i];
798 nsz = ns[i + 1] - ns[i];
799 aii = ai[row];
800 v1 = aa + aii;
801 vi = aj + aii;
802 nz = ad[row] - aii;
803 if (i < node_max - 1) {
804 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
805 * but our indexing to determine its size could. */
806 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
807 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
808 PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
809 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
810 }
811
812 switch (nsz) { /* Each loop in 'case' is unrolled */
813 case 1:
814 sum1 = b[*r++];
815 for (j = 0; j < nz - 1; j += 2) {
816 i0 = vi[0];
817 i1 = vi[1];
818 vi += 2;
819 tmp0 = tmps[i0];
820 tmp1 = tmps[i1];
821 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
822 v1 += 2;
823 }
824 if (j == nz - 1) {
825 tmp0 = tmps[*vi++];
826 sum1 -= *v1++ * tmp0;
827 }
828 tmp[row++] = sum1;
829 break;
830 case 2:
831 sum1 = b[*r++];
832 sum2 = b[*r++];
833 v2 = aa + ai[row + 1];
834
835 for (j = 0; j < nz - 1; j += 2) {
836 i0 = vi[0];
837 i1 = vi[1];
838 vi += 2;
839 tmp0 = tmps[i0];
840 tmp1 = tmps[i1];
841 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
842 v1 += 2;
843 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
844 v2 += 2;
845 }
846 if (j == nz - 1) {
847 tmp0 = tmps[*vi++];
848 sum1 -= *v1++ * tmp0;
849 sum2 -= *v2++ * tmp0;
850 }
851 sum2 -= *v2++ * sum1;
852 tmp[row++] = sum1;
853 tmp[row++] = sum2;
854 break;
855 case 3:
856 sum1 = b[*r++];
857 sum2 = b[*r++];
858 sum3 = b[*r++];
859 v2 = aa + ai[row + 1];
860 v3 = aa + ai[row + 2];
861
862 for (j = 0; j < nz - 1; j += 2) {
863 i0 = vi[0];
864 i1 = vi[1];
865 vi += 2;
866 tmp0 = tmps[i0];
867 tmp1 = tmps[i1];
868 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
869 v1 += 2;
870 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
871 v2 += 2;
872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
873 v3 += 2;
874 }
875 if (j == nz - 1) {
876 tmp0 = tmps[*vi++];
877 sum1 -= *v1++ * tmp0;
878 sum2 -= *v2++ * tmp0;
879 sum3 -= *v3++ * tmp0;
880 }
881 sum2 -= *v2++ * sum1;
882 sum3 -= *v3++ * sum1;
883 sum3 -= *v3++ * sum2;
884
885 tmp[row++] = sum1;
886 tmp[row++] = sum2;
887 tmp[row++] = sum3;
888 break;
889
890 case 4:
891 sum1 = b[*r++];
892 sum2 = b[*r++];
893 sum3 = b[*r++];
894 sum4 = b[*r++];
895 v2 = aa + ai[row + 1];
896 v3 = aa + ai[row + 2];
897 v4 = aa + ai[row + 3];
898
899 for (j = 0; j < nz - 1; j += 2) {
900 i0 = vi[0];
901 i1 = vi[1];
902 vi += 2;
903 tmp0 = tmps[i0];
904 tmp1 = tmps[i1];
905 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
906 v1 += 2;
907 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
908 v2 += 2;
909 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
910 v3 += 2;
911 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
912 v4 += 2;
913 }
914 if (j == nz - 1) {
915 tmp0 = tmps[*vi++];
916 sum1 -= *v1++ * tmp0;
917 sum2 -= *v2++ * tmp0;
918 sum3 -= *v3++ * tmp0;
919 sum4 -= *v4++ * tmp0;
920 }
921 sum2 -= *v2++ * sum1;
922 sum3 -= *v3++ * sum1;
923 sum4 -= *v4++ * sum1;
924 sum3 -= *v3++ * sum2;
925 sum4 -= *v4++ * sum2;
926 sum4 -= *v4++ * sum3;
927
928 tmp[row++] = sum1;
929 tmp[row++] = sum2;
930 tmp[row++] = sum3;
931 tmp[row++] = sum4;
932 break;
933 case 5:
934 sum1 = b[*r++];
935 sum2 = b[*r++];
936 sum3 = b[*r++];
937 sum4 = b[*r++];
938 sum5 = b[*r++];
939 v2 = aa + ai[row + 1];
940 v3 = aa + ai[row + 2];
941 v4 = aa + ai[row + 3];
942 v5 = aa + ai[row + 4];
943
944 for (j = 0; j < nz - 1; j += 2) {
945 i0 = vi[0];
946 i1 = vi[1];
947 vi += 2;
948 tmp0 = tmps[i0];
949 tmp1 = tmps[i1];
950 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
951 v1 += 2;
952 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
953 v2 += 2;
954 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
955 v3 += 2;
956 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
957 v4 += 2;
958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
959 v5 += 2;
960 }
961 if (j == nz - 1) {
962 tmp0 = tmps[*vi++];
963 sum1 -= *v1++ * tmp0;
964 sum2 -= *v2++ * tmp0;
965 sum3 -= *v3++ * tmp0;
966 sum4 -= *v4++ * tmp0;
967 sum5 -= *v5++ * tmp0;
968 }
969
970 sum2 -= *v2++ * sum1;
971 sum3 -= *v3++ * sum1;
972 sum4 -= *v4++ * sum1;
973 sum5 -= *v5++ * sum1;
974 sum3 -= *v3++ * sum2;
975 sum4 -= *v4++ * sum2;
976 sum5 -= *v5++ * sum2;
977 sum4 -= *v4++ * sum3;
978 sum5 -= *v5++ * sum3;
979 sum5 -= *v5++ * sum4;
980
981 tmp[row++] = sum1;
982 tmp[row++] = sum2;
983 tmp[row++] = sum3;
984 tmp[row++] = sum4;
985 tmp[row++] = sum5;
986 break;
987 default:
988 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
989 }
990 }
991 /* backward solve the upper triangular */
992 for (i = node_max - 1; i >= 0; i--) {
993 row = ns[i + 1];
994 nsz = ns[i + 1] - ns[i];
995 aii = ai[row + 1] - 1;
996 v1 = aa + aii;
997 vi = aj + aii;
998 nz = aii - ad[row];
999 switch (nsz) { /* Each loop in 'case' is unrolled */
1000 case 1:
1001 sum1 = tmp[row];
1002
1003 for (j = nz; j > 1; j -= 2) {
1004 vi -= 2;
1005 i0 = vi[2];
1006 i1 = vi[1];
1007 tmp0 = tmps[i0];
1008 tmp1 = tmps[i1];
1009 v1 -= 2;
1010 sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1011 }
1012 if (j == 1) {
1013 tmp0 = tmps[*vi--];
1014 sum1 -= *v1-- * tmp0;
1015 }
1016 x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1017 row--;
1018 break;
1019 case 2:
1020 sum1 = tmp[row];
1021 sum2 = tmp[row - 1];
1022 v2 = aa + ai[row] - 1;
1023 for (j = nz; j > 1; j -= 2) {
1024 vi -= 2;
1025 i0 = vi[2];
1026 i1 = vi[1];
1027 tmp0 = tmps[i0];
1028 tmp1 = tmps[i1];
1029 v1 -= 2;
1030 v2 -= 2;
1031 sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1032 sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1033 }
1034 if (j == 1) {
1035 tmp0 = tmps[*vi--];
1036 sum1 -= *v1-- * tmp0;
1037 sum2 -= *v2-- * tmp0;
1038 }
1039
1040 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1041 row--;
1042 sum2 -= *v2-- * tmp0;
1043 x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1044 row--;
1045 break;
1046 case 3:
1047 sum1 = tmp[row];
1048 sum2 = tmp[row - 1];
1049 sum3 = tmp[row - 2];
1050 v2 = aa + ai[row] - 1;
1051 v3 = aa + ai[row - 1] - 1;
1052 for (j = nz; j > 1; j -= 2) {
1053 vi -= 2;
1054 i0 = vi[2];
1055 i1 = vi[1];
1056 tmp0 = tmps[i0];
1057 tmp1 = tmps[i1];
1058 v1 -= 2;
1059 v2 -= 2;
1060 v3 -= 2;
1061 sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062 sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063 sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064 }
1065 if (j == 1) {
1066 tmp0 = tmps[*vi--];
1067 sum1 -= *v1-- * tmp0;
1068 sum2 -= *v2-- * tmp0;
1069 sum3 -= *v3-- * tmp0;
1070 }
1071 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1072 row--;
1073 sum2 -= *v2-- * tmp0;
1074 sum3 -= *v3-- * tmp0;
1075 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1076 row--;
1077 sum3 -= *v3-- * tmp0;
1078 x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1079 row--;
1080
1081 break;
1082 case 4:
1083 sum1 = tmp[row];
1084 sum2 = tmp[row - 1];
1085 sum3 = tmp[row - 2];
1086 sum4 = tmp[row - 3];
1087 v2 = aa + ai[row] - 1;
1088 v3 = aa + ai[row - 1] - 1;
1089 v4 = aa + ai[row - 2] - 1;
1090
1091 for (j = nz; j > 1; j -= 2) {
1092 vi -= 2;
1093 i0 = vi[2];
1094 i1 = vi[1];
1095 tmp0 = tmps[i0];
1096 tmp1 = tmps[i1];
1097 v1 -= 2;
1098 v2 -= 2;
1099 v3 -= 2;
1100 v4 -= 2;
1101 sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1102 sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1103 sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1104 sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1105 }
1106 if (j == 1) {
1107 tmp0 = tmps[*vi--];
1108 sum1 -= *v1-- * tmp0;
1109 sum2 -= *v2-- * tmp0;
1110 sum3 -= *v3-- * tmp0;
1111 sum4 -= *v4-- * tmp0;
1112 }
1113
1114 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1115 row--;
1116 sum2 -= *v2-- * tmp0;
1117 sum3 -= *v3-- * tmp0;
1118 sum4 -= *v4-- * tmp0;
1119 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1120 row--;
1121 sum3 -= *v3-- * tmp0;
1122 sum4 -= *v4-- * tmp0;
1123 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1124 row--;
1125 sum4 -= *v4-- * tmp0;
1126 x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1127 row--;
1128 break;
1129 case 5:
1130 sum1 = tmp[row];
1131 sum2 = tmp[row - 1];
1132 sum3 = tmp[row - 2];
1133 sum4 = tmp[row - 3];
1134 sum5 = tmp[row - 4];
1135 v2 = aa + ai[row] - 1;
1136 v3 = aa + ai[row - 1] - 1;
1137 v4 = aa + ai[row - 2] - 1;
1138 v5 = aa + ai[row - 3] - 1;
1139 for (j = nz; j > 1; j -= 2) {
1140 vi -= 2;
1141 i0 = vi[2];
1142 i1 = vi[1];
1143 tmp0 = tmps[i0];
1144 tmp1 = tmps[i1];
1145 v1 -= 2;
1146 v2 -= 2;
1147 v3 -= 2;
1148 v4 -= 2;
1149 v5 -= 2;
1150 sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1151 sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1152 sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1153 sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1154 sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1155 }
1156 if (j == 1) {
1157 tmp0 = tmps[*vi--];
1158 sum1 -= *v1-- * tmp0;
1159 sum2 -= *v2-- * tmp0;
1160 sum3 -= *v3-- * tmp0;
1161 sum4 -= *v4-- * tmp0;
1162 sum5 -= *v5-- * tmp0;
1163 }
1164
1165 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1166 row--;
1167 sum2 -= *v2-- * tmp0;
1168 sum3 -= *v3-- * tmp0;
1169 sum4 -= *v4-- * tmp0;
1170 sum5 -= *v5-- * tmp0;
1171 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1172 row--;
1173 sum3 -= *v3-- * tmp0;
1174 sum4 -= *v4-- * tmp0;
1175 sum5 -= *v5-- * tmp0;
1176 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1177 row--;
1178 sum4 -= *v4-- * tmp0;
1179 sum5 -= *v5-- * tmp0;
1180 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1181 row--;
1182 sum5 -= *v5-- * tmp0;
1183 x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1184 row--;
1185 break;
1186 default:
1187 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1188 }
1189 }
1190 PetscCall(ISRestoreIndices(isrow, &rout));
1191 PetscCall(ISRestoreIndices(iscol, &cout));
1192 PetscCall(VecRestoreArrayRead(bb, &b));
1193 PetscCall(VecRestoreArrayWrite(xx, &x));
1194 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1195 PetscFunctionReturn(PETSC_SUCCESS);
1196 }
1197
MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo * info)1198 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199 {
1200 Mat C = B;
1201 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1202 IS isrow = b->row, isicol = b->icol;
1203 const PetscInt *r, *ic, *ics;
1204 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1205 PetscInt i, j, k, nz, nzL, row, *pj;
1206 const PetscInt *ajtmp, *bjtmp;
1207 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1208 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1209 FactorShiftCtx sctx;
1210 const PetscInt *ddiag;
1211 PetscReal rs;
1212 MatScalar d;
1213 PetscInt inod, nodesz, node_max, col;
1214 const PetscInt *ns;
1215 PetscInt *tmp_vec1, *tmp_vec2, *nsmap;
1216
1217 PetscFunctionBegin;
1218 /* MatPivotSetUp(): initialize shift context sctx */
1219 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1220
1221 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1222 ddiag = a->diag;
1223 sctx.shift_top = info->zeropivot;
1224 for (i = 0; i < n; i++) {
1225 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1226 d = (aa)[ddiag[i]];
1227 rs = -PetscAbsScalar(d) - PetscRealPart(d);
1228 v = aa + ai[i];
1229 nz = ai[i + 1] - ai[i];
1230 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1231 if (rs > sctx.shift_top) sctx.shift_top = rs;
1232 }
1233 sctx.shift_top *= 1.1;
1234 sctx.nshift_max = 5;
1235 sctx.shift_lo = 0.;
1236 sctx.shift_hi = 1.;
1237 }
1238
1239 PetscCall(ISGetIndices(isrow, &r));
1240 PetscCall(ISGetIndices(isicol, &ic));
1241
1242 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1243 ics = ic;
1244
1245 node_max = a->inode.node_count;
1246 ns = a->inode.size_csr;
1247 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1248
1249 /* If max inode size > 4, split it into two inodes.*/
1250 /* also map the inode sizes according to the ordering */
1251 PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1252 for (i = 0, j = 0; i < node_max; ++i, ++j) {
1253 nodesz = ns[i + 1] - ns[i];
1254 if (nodesz > 4) {
1255 tmp_vec1[j] = 4;
1256 ++j;
1257 tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
1258 } else {
1259 tmp_vec1[j] = nodesz;
1260 }
1261 }
1262 /* Use the correct node_max */
1263 node_max = j;
1264
1265 /* Now reorder the inode info based on mat re-ordering info */
1266 /* First create a row -> inode_size_array_index map */
1267 PetscCall(PetscMalloc1(n + 1, &nsmap));
1268 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1269 tmp_vec2[0] = 0;
1270 for (i = 0, row = 0; i < node_max; i++) {
1271 nodesz = tmp_vec1[i];
1272 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1273 }
1274 /* Using nsmap, create a reordered ns structure */
1275 for (i = 0, j = 0; i < node_max; i++) {
1276 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1277 tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
1278 j += nodesz;
1279 }
1280 PetscCall(PetscFree(nsmap));
1281 PetscCall(PetscFree(tmp_vec1));
1282
1283 /* Now use the correct ns */
1284 ns = tmp_vec2;
1285
1286 do {
1287 sctx.newshift = PETSC_FALSE;
1288 /* Now loop over each block-row, and do the factorization */
1289 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1290 nodesz = ns[inod + 1] - ns[inod];
1291
1292 switch (nodesz) {
1293 case 1:
1294 /* zero rtmp1 */
1295 /* L part */
1296 nz = bi[i + 1] - bi[i];
1297 bjtmp = bj + bi[i];
1298 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1299
1300 /* U part */
1301 nz = bdiag[i] - bdiag[i + 1];
1302 bjtmp = bj + bdiag[i + 1] + 1;
1303 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1304
1305 /* load in initial (unfactored row) */
1306 nz = ai[r[i] + 1] - ai[r[i]];
1307 ajtmp = aj + ai[r[i]];
1308 v = aa + ai[r[i]];
1309 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1310
1311 /* ZeropivotApply() */
1312 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1313
1314 /* elimination */
1315 bjtmp = bj + bi[i];
1316 row = *bjtmp++;
1317 nzL = bi[i + 1] - bi[i];
1318 for (k = 0; k < nzL; k++) {
1319 pc = rtmp1 + row;
1320 if (*pc != 0.0) {
1321 pv = b->a + bdiag[row];
1322 mul1 = *pc * (*pv);
1323 *pc = mul1;
1324 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1325 pv = b->a + bdiag[row + 1] + 1;
1326 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1328 PetscCall(PetscLogFlops(1 + 2.0 * nz));
1329 }
1330 row = *bjtmp++;
1331 }
1332
1333 /* finished row so stick it into b->a */
1334 rs = 0.0;
1335 /* L part */
1336 pv = b->a + bi[i];
1337 pj = b->j + bi[i];
1338 nz = bi[i + 1] - bi[i];
1339 for (j = 0; j < nz; j++) {
1340 pv[j] = rtmp1[pj[j]];
1341 rs += PetscAbsScalar(pv[j]);
1342 }
1343
1344 /* U part */
1345 pv = b->a + bdiag[i + 1] + 1;
1346 pj = b->j + bdiag[i + 1] + 1;
1347 nz = bdiag[i] - bdiag[i + 1] - 1;
1348 for (j = 0; j < nz; j++) {
1349 pv[j] = rtmp1[pj[j]];
1350 rs += PetscAbsScalar(pv[j]);
1351 }
1352
1353 /* Check zero pivot */
1354 sctx.rs = rs;
1355 sctx.pv = rtmp1[i];
1356 PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1357 if (sctx.newshift) break;
1358
1359 /* Mark diagonal and invert diagonal for simpler triangular solves */
1360 pv = b->a + bdiag[i];
1361 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1362 break;
1363
1364 case 2:
1365 /* zero rtmp1 and rtmp2 */
1366 /* L part */
1367 nz = bi[i + 1] - bi[i];
1368 bjtmp = bj + bi[i];
1369 for (j = 0; j < nz; j++) {
1370 col = bjtmp[j];
1371 rtmp1[col] = 0.0;
1372 rtmp2[col] = 0.0;
1373 }
1374
1375 /* U part */
1376 nz = bdiag[i] - bdiag[i + 1];
1377 bjtmp = bj + bdiag[i + 1] + 1;
1378 for (j = 0; j < nz; j++) {
1379 col = bjtmp[j];
1380 rtmp1[col] = 0.0;
1381 rtmp2[col] = 0.0;
1382 }
1383
1384 /* load in initial (unfactored row) */
1385 nz = ai[r[i] + 1] - ai[r[i]];
1386 ajtmp = aj + ai[r[i]];
1387 v1 = aa + ai[r[i]];
1388 v2 = aa + ai[r[i + 1]];
1389 for (j = 0; j < nz; j++) {
1390 col = ics[ajtmp[j]];
1391 rtmp1[col] = v1[j];
1392 rtmp2[col] = v2[j];
1393 }
1394 /* ZeropivotApply(): shift the diagonal of the matrix */
1395 rtmp1[i] += sctx.shift_amount;
1396 rtmp2[i + 1] += sctx.shift_amount;
1397
1398 /* elimination */
1399 bjtmp = bj + bi[i];
1400 row = *bjtmp++; /* pivot row */
1401 nzL = bi[i + 1] - bi[i];
1402 for (k = 0; k < nzL; k++) {
1403 pc1 = rtmp1 + row;
1404 pc2 = rtmp2 + row;
1405 if (*pc1 != 0.0 || *pc2 != 0.0) {
1406 pv = b->a + bdiag[row];
1407 mul1 = *pc1 * (*pv);
1408 mul2 = *pc2 * (*pv);
1409 *pc1 = mul1;
1410 *pc2 = mul2;
1411
1412 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1413 pv = b->a + bdiag[row + 1] + 1;
1414 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1415 for (j = 0; j < nz; j++) {
1416 col = pj[j];
1417 rtmp1[col] -= mul1 * pv[j];
1418 rtmp2[col] -= mul2 * pv[j];
1419 }
1420 PetscCall(PetscLogFlops(2 + 4.0 * nz));
1421 }
1422 row = *bjtmp++;
1423 }
1424
1425 /* finished row i; check zero pivot, then stick row i into b->a */
1426 rs = 0.0;
1427 /* L part */
1428 pc1 = b->a + bi[i];
1429 pj = b->j + bi[i];
1430 nz = bi[i + 1] - bi[i];
1431 for (j = 0; j < nz; j++) {
1432 col = pj[j];
1433 pc1[j] = rtmp1[col];
1434 rs += PetscAbsScalar(pc1[j]);
1435 }
1436 /* U part */
1437 pc1 = b->a + bdiag[i + 1] + 1;
1438 pj = b->j + bdiag[i + 1] + 1;
1439 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1440 for (j = 0; j < nz; j++) {
1441 col = pj[j];
1442 pc1[j] = rtmp1[col];
1443 rs += PetscAbsScalar(pc1[j]);
1444 }
1445
1446 sctx.rs = rs;
1447 sctx.pv = rtmp1[i];
1448 PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1449 if (sctx.newshift) break;
1450 pc1 = b->a + bdiag[i]; /* Mark diagonal */
1451 *pc1 = 1.0 / sctx.pv;
1452
1453 /* Now take care of diagonal 2x2 block. */
1454 pc2 = rtmp2 + i;
1455 if (*pc2 != 0.0) {
1456 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */
1457 *pc2 = mul1; /* insert L entry */
1458 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1459 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460 for (j = 0; j < nz; j++) {
1461 col = pj[j];
1462 rtmp2[col] -= mul1 * rtmp1[col];
1463 }
1464 PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465 }
1466
1467 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468 rs = 0.0;
1469 /* L part */
1470 pc2 = b->a + bi[i + 1];
1471 pj = b->j + bi[i + 1];
1472 nz = bi[i + 2] - bi[i + 1];
1473 for (j = 0; j < nz; j++) {
1474 col = pj[j];
1475 pc2[j] = rtmp2[col];
1476 rs += PetscAbsScalar(pc2[j]);
1477 }
1478 /* U part */
1479 pc2 = b->a + bdiag[i + 2] + 1;
1480 pj = b->j + bdiag[i + 2] + 1;
1481 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482 for (j = 0; j < nz; j++) {
1483 col = pj[j];
1484 pc2[j] = rtmp2[col];
1485 rs += PetscAbsScalar(pc2[j]);
1486 }
1487
1488 sctx.rs = rs;
1489 sctx.pv = rtmp2[i + 1];
1490 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1491 if (sctx.newshift) break;
1492 pc2 = b->a + bdiag[i + 1];
1493 *pc2 = 1.0 / sctx.pv;
1494 break;
1495
1496 case 3:
1497 /* zero rtmp */
1498 /* L part */
1499 nz = bi[i + 1] - bi[i];
1500 bjtmp = bj + bi[i];
1501 for (j = 0; j < nz; j++) {
1502 col = bjtmp[j];
1503 rtmp1[col] = 0.0;
1504 rtmp2[col] = 0.0;
1505 rtmp3[col] = 0.0;
1506 }
1507
1508 /* U part */
1509 nz = bdiag[i] - bdiag[i + 1];
1510 bjtmp = bj + bdiag[i + 1] + 1;
1511 for (j = 0; j < nz; j++) {
1512 col = bjtmp[j];
1513 rtmp1[col] = 0.0;
1514 rtmp2[col] = 0.0;
1515 rtmp3[col] = 0.0;
1516 }
1517
1518 /* load in initial (unfactored row) */
1519 nz = ai[r[i] + 1] - ai[r[i]];
1520 ajtmp = aj + ai[r[i]];
1521 v1 = aa + ai[r[i]];
1522 v2 = aa + ai[r[i + 1]];
1523 v3 = aa + ai[r[i + 2]];
1524 for (j = 0; j < nz; j++) {
1525 col = ics[ajtmp[j]];
1526 rtmp1[col] = v1[j];
1527 rtmp2[col] = v2[j];
1528 rtmp3[col] = v3[j];
1529 }
1530 /* ZeropivotApply(): shift the diagonal of the matrix */
1531 rtmp1[i] += sctx.shift_amount;
1532 rtmp2[i + 1] += sctx.shift_amount;
1533 rtmp3[i + 2] += sctx.shift_amount;
1534
1535 /* elimination */
1536 bjtmp = bj + bi[i];
1537 row = *bjtmp++; /* pivot row */
1538 nzL = bi[i + 1] - bi[i];
1539 for (k = 0; k < nzL; k++) {
1540 pc1 = rtmp1 + row;
1541 pc2 = rtmp2 + row;
1542 pc3 = rtmp3 + row;
1543 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1544 pv = b->a + bdiag[row];
1545 mul1 = *pc1 * (*pv);
1546 mul2 = *pc2 * (*pv);
1547 mul3 = *pc3 * (*pv);
1548 *pc1 = mul1;
1549 *pc2 = mul2;
1550 *pc3 = mul3;
1551
1552 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1553 pv = b->a + bdiag[row + 1] + 1;
1554 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1555 for (j = 0; j < nz; j++) {
1556 col = pj[j];
1557 rtmp1[col] -= mul1 * pv[j];
1558 rtmp2[col] -= mul2 * pv[j];
1559 rtmp3[col] -= mul3 * pv[j];
1560 }
1561 PetscCall(PetscLogFlops(3 + 6.0 * nz));
1562 }
1563 row = *bjtmp++;
1564 }
1565
1566 /* finished row i; check zero pivot, then stick row i into b->a */
1567 rs = 0.0;
1568 /* L part */
1569 pc1 = b->a + bi[i];
1570 pj = b->j + bi[i];
1571 nz = bi[i + 1] - bi[i];
1572 for (j = 0; j < nz; j++) {
1573 col = pj[j];
1574 pc1[j] = rtmp1[col];
1575 rs += PetscAbsScalar(pc1[j]);
1576 }
1577 /* U part */
1578 pc1 = b->a + bdiag[i + 1] + 1;
1579 pj = b->j + bdiag[i + 1] + 1;
1580 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581 for (j = 0; j < nz; j++) {
1582 col = pj[j];
1583 pc1[j] = rtmp1[col];
1584 rs += PetscAbsScalar(pc1[j]);
1585 }
1586
1587 sctx.rs = rs;
1588 sctx.pv = rtmp1[i];
1589 PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1590 if (sctx.newshift) break;
1591 pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1592 *pc1 = 1.0 / sctx.pv;
1593
1594 /* Now take care of 1st column of diagonal 3x3 block. */
1595 pc2 = rtmp2 + i;
1596 pc3 = rtmp3 + i;
1597 if (*pc2 != 0.0 || *pc3 != 0.0) {
1598 mul2 = (*pc2) * (*pc1);
1599 *pc2 = mul2;
1600 mul3 = (*pc3) * (*pc1);
1601 *pc3 = mul3;
1602 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1603 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1604 for (j = 0; j < nz; j++) {
1605 col = pj[j];
1606 rtmp2[col] -= mul2 * rtmp1[col];
1607 rtmp3[col] -= mul3 * rtmp1[col];
1608 }
1609 PetscCall(PetscLogFlops(2 + 4.0 * nz));
1610 }
1611
1612 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613 rs = 0.0;
1614 /* L part */
1615 pc2 = b->a + bi[i + 1];
1616 pj = b->j + bi[i + 1];
1617 nz = bi[i + 2] - bi[i + 1];
1618 for (j = 0; j < nz; j++) {
1619 col = pj[j];
1620 pc2[j] = rtmp2[col];
1621 rs += PetscAbsScalar(pc2[j]);
1622 }
1623 /* U part */
1624 pc2 = b->a + bdiag[i + 2] + 1;
1625 pj = b->j + bdiag[i + 2] + 1;
1626 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627 for (j = 0; j < nz; j++) {
1628 col = pj[j];
1629 pc2[j] = rtmp2[col];
1630 rs += PetscAbsScalar(pc2[j]);
1631 }
1632
1633 sctx.rs = rs;
1634 sctx.pv = rtmp2[i + 1];
1635 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1636 if (sctx.newshift) break;
1637 pc2 = b->a + bdiag[i + 1];
1638 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1639
1640 /* Now take care of 2nd column of diagonal 3x3 block. */
1641 pc3 = rtmp3 + i + 1;
1642 if (*pc3 != 0.0) {
1643 mul3 = (*pc3) * (*pc2);
1644 *pc3 = mul3;
1645 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1646 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1647 for (j = 0; j < nz; j++) {
1648 col = pj[j];
1649 rtmp3[col] -= mul3 * rtmp2[col];
1650 }
1651 PetscCall(PetscLogFlops(1 + 2.0 * nz));
1652 }
1653
1654 /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1655 rs = 0.0;
1656 /* L part */
1657 pc3 = b->a + bi[i + 2];
1658 pj = b->j + bi[i + 2];
1659 nz = bi[i + 3] - bi[i + 2];
1660 for (j = 0; j < nz; j++) {
1661 col = pj[j];
1662 pc3[j] = rtmp3[col];
1663 rs += PetscAbsScalar(pc3[j]);
1664 }
1665 /* U part */
1666 pc3 = b->a + bdiag[i + 3] + 1;
1667 pj = b->j + bdiag[i + 3] + 1;
1668 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1669 for (j = 0; j < nz; j++) {
1670 col = pj[j];
1671 pc3[j] = rtmp3[col];
1672 rs += PetscAbsScalar(pc3[j]);
1673 }
1674
1675 sctx.rs = rs;
1676 sctx.pv = rtmp3[i + 2];
1677 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1678 if (sctx.newshift) break;
1679 pc3 = b->a + bdiag[i + 2];
1680 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1681 break;
1682 case 4:
1683 /* zero rtmp */
1684 /* L part */
1685 nz = bi[i + 1] - bi[i];
1686 bjtmp = bj + bi[i];
1687 for (j = 0; j < nz; j++) {
1688 col = bjtmp[j];
1689 rtmp1[col] = 0.0;
1690 rtmp2[col] = 0.0;
1691 rtmp3[col] = 0.0;
1692 rtmp4[col] = 0.0;
1693 }
1694
1695 /* U part */
1696 nz = bdiag[i] - bdiag[i + 1];
1697 bjtmp = bj + bdiag[i + 1] + 1;
1698 for (j = 0; j < nz; j++) {
1699 col = bjtmp[j];
1700 rtmp1[col] = 0.0;
1701 rtmp2[col] = 0.0;
1702 rtmp3[col] = 0.0;
1703 rtmp4[col] = 0.0;
1704 }
1705
1706 /* load in initial (unfactored row) */
1707 nz = ai[r[i] + 1] - ai[r[i]];
1708 ajtmp = aj + ai[r[i]];
1709 v1 = aa + ai[r[i]];
1710 v2 = aa + ai[r[i + 1]];
1711 v3 = aa + ai[r[i + 2]];
1712 v4 = aa + ai[r[i + 3]];
1713 for (j = 0; j < nz; j++) {
1714 col = ics[ajtmp[j]];
1715 rtmp1[col] = v1[j];
1716 rtmp2[col] = v2[j];
1717 rtmp3[col] = v3[j];
1718 rtmp4[col] = v4[j];
1719 }
1720 /* ZeropivotApply(): shift the diagonal of the matrix */
1721 rtmp1[i] += sctx.shift_amount;
1722 rtmp2[i + 1] += sctx.shift_amount;
1723 rtmp3[i + 2] += sctx.shift_amount;
1724 rtmp4[i + 3] += sctx.shift_amount;
1725
1726 /* elimination */
1727 bjtmp = bj + bi[i];
1728 row = *bjtmp++; /* pivot row */
1729 nzL = bi[i + 1] - bi[i];
1730 for (k = 0; k < nzL; k++) {
1731 pc1 = rtmp1 + row;
1732 pc2 = rtmp2 + row;
1733 pc3 = rtmp3 + row;
1734 pc4 = rtmp4 + row;
1735 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1736 pv = b->a + bdiag[row];
1737 mul1 = *pc1 * (*pv);
1738 mul2 = *pc2 * (*pv);
1739 mul3 = *pc3 * (*pv);
1740 mul4 = *pc4 * (*pv);
1741 *pc1 = mul1;
1742 *pc2 = mul2;
1743 *pc3 = mul3;
1744 *pc4 = mul4;
1745
1746 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1747 pv = b->a + bdiag[row + 1] + 1;
1748 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1749 for (j = 0; j < nz; j++) {
1750 col = pj[j];
1751 rtmp1[col] -= mul1 * pv[j];
1752 rtmp2[col] -= mul2 * pv[j];
1753 rtmp3[col] -= mul3 * pv[j];
1754 rtmp4[col] -= mul4 * pv[j];
1755 }
1756 PetscCall(PetscLogFlops(4 + 8.0 * nz));
1757 }
1758 row = *bjtmp++;
1759 }
1760
1761 /* finished row i; check zero pivot, then stick row i into b->a */
1762 rs = 0.0;
1763 /* L part */
1764 pc1 = b->a + bi[i];
1765 pj = b->j + bi[i];
1766 nz = bi[i + 1] - bi[i];
1767 for (j = 0; j < nz; j++) {
1768 col = pj[j];
1769 pc1[j] = rtmp1[col];
1770 rs += PetscAbsScalar(pc1[j]);
1771 }
1772 /* U part */
1773 pc1 = b->a + bdiag[i + 1] + 1;
1774 pj = b->j + bdiag[i + 1] + 1;
1775 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1776 for (j = 0; j < nz; j++) {
1777 col = pj[j];
1778 pc1[j] = rtmp1[col];
1779 rs += PetscAbsScalar(pc1[j]);
1780 }
1781
1782 sctx.rs = rs;
1783 sctx.pv = rtmp1[i];
1784 PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1785 if (sctx.newshift) break;
1786 pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1787 *pc1 = 1.0 / sctx.pv;
1788
1789 /* Now take care of 1st column of diagonal 4x4 block. */
1790 pc2 = rtmp2 + i;
1791 pc3 = rtmp3 + i;
1792 pc4 = rtmp4 + i;
1793 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1794 mul2 = (*pc2) * (*pc1);
1795 *pc2 = mul2;
1796 mul3 = (*pc3) * (*pc1);
1797 *pc3 = mul3;
1798 mul4 = (*pc4) * (*pc1);
1799 *pc4 = mul4;
1800 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1801 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1802 for (j = 0; j < nz; j++) {
1803 col = pj[j];
1804 rtmp2[col] -= mul2 * rtmp1[col];
1805 rtmp3[col] -= mul3 * rtmp1[col];
1806 rtmp4[col] -= mul4 * rtmp1[col];
1807 }
1808 PetscCall(PetscLogFlops(3 + 6.0 * nz));
1809 }
1810
1811 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1812 rs = 0.0;
1813 /* L part */
1814 pc2 = b->a + bi[i + 1];
1815 pj = b->j + bi[i + 1];
1816 nz = bi[i + 2] - bi[i + 1];
1817 for (j = 0; j < nz; j++) {
1818 col = pj[j];
1819 pc2[j] = rtmp2[col];
1820 rs += PetscAbsScalar(pc2[j]);
1821 }
1822 /* U part */
1823 pc2 = b->a + bdiag[i + 2] + 1;
1824 pj = b->j + bdiag[i + 2] + 1;
1825 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1826 for (j = 0; j < nz; j++) {
1827 col = pj[j];
1828 pc2[j] = rtmp2[col];
1829 rs += PetscAbsScalar(pc2[j]);
1830 }
1831
1832 sctx.rs = rs;
1833 sctx.pv = rtmp2[i + 1];
1834 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1835 if (sctx.newshift) break;
1836 pc2 = b->a + bdiag[i + 1];
1837 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1838
1839 /* Now take care of 2nd column of diagonal 4x4 block. */
1840 pc3 = rtmp3 + i + 1;
1841 pc4 = rtmp4 + i + 1;
1842 if (*pc3 != 0.0 || *pc4 != 0.0) {
1843 mul3 = (*pc3) * (*pc2);
1844 *pc3 = mul3;
1845 mul4 = (*pc4) * (*pc2);
1846 *pc4 = mul4;
1847 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
1848 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1849 for (j = 0; j < nz; j++) {
1850 col = pj[j];
1851 rtmp3[col] -= mul3 * rtmp2[col];
1852 rtmp4[col] -= mul4 * rtmp2[col];
1853 }
1854 PetscCall(PetscLogFlops(4.0 * nz));
1855 }
1856
1857 /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1858 rs = 0.0;
1859 /* L part */
1860 pc3 = b->a + bi[i + 2];
1861 pj = b->j + bi[i + 2];
1862 nz = bi[i + 3] - bi[i + 2];
1863 for (j = 0; j < nz; j++) {
1864 col = pj[j];
1865 pc3[j] = rtmp3[col];
1866 rs += PetscAbsScalar(pc3[j]);
1867 }
1868 /* U part */
1869 pc3 = b->a + bdiag[i + 3] + 1;
1870 pj = b->j + bdiag[i + 3] + 1;
1871 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1872 for (j = 0; j < nz; j++) {
1873 col = pj[j];
1874 pc3[j] = rtmp3[col];
1875 rs += PetscAbsScalar(pc3[j]);
1876 }
1877
1878 sctx.rs = rs;
1879 sctx.pv = rtmp3[i + 2];
1880 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1881 if (sctx.newshift) break;
1882 pc3 = b->a + bdiag[i + 2];
1883 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1884
1885 /* Now take care of 3rd column of diagonal 4x4 block. */
1886 pc4 = rtmp4 + i + 2;
1887 if (*pc4 != 0.0) {
1888 mul4 = (*pc4) * (*pc3);
1889 *pc4 = mul4;
1890 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */
1891 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1892 for (j = 0; j < nz; j++) {
1893 col = pj[j];
1894 rtmp4[col] -= mul4 * rtmp3[col];
1895 }
1896 PetscCall(PetscLogFlops(1 + 2.0 * nz));
1897 }
1898
1899 /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1900 rs = 0.0;
1901 /* L part */
1902 pc4 = b->a + bi[i + 3];
1903 pj = b->j + bi[i + 3];
1904 nz = bi[i + 4] - bi[i + 3];
1905 for (j = 0; j < nz; j++) {
1906 col = pj[j];
1907 pc4[j] = rtmp4[col];
1908 rs += PetscAbsScalar(pc4[j]);
1909 }
1910 /* U part */
1911 pc4 = b->a + bdiag[i + 4] + 1;
1912 pj = b->j + bdiag[i + 4] + 1;
1913 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1914 for (j = 0; j < nz; j++) {
1915 col = pj[j];
1916 pc4[j] = rtmp4[col];
1917 rs += PetscAbsScalar(pc4[j]);
1918 }
1919
1920 sctx.rs = rs;
1921 sctx.pv = rtmp4[i + 3];
1922 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1923 if (sctx.newshift) break;
1924 pc4 = b->a + bdiag[i + 3];
1925 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1926 break;
1927
1928 default:
1929 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1930 }
1931 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1932 i += nodesz; /* Update the row */
1933 }
1934
1935 /* MatPivotRefine() */
1936 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1937 /*
1938 * if no shift in this attempt & shifting & started shifting & can refine,
1939 * then try lower shift
1940 */
1941 sctx.shift_hi = sctx.shift_fraction;
1942 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1943 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1944 sctx.newshift = PETSC_TRUE;
1945 sctx.nshift++;
1946 }
1947 } while (sctx.newshift);
1948
1949 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1950 PetscCall(PetscFree(tmp_vec2));
1951 PetscCall(ISRestoreIndices(isicol, &ic));
1952 PetscCall(ISRestoreIndices(isrow, &r));
1953
1954 if (b->inode.size_csr) {
1955 C->ops->solve = MatSolve_SeqAIJ_Inode;
1956 } else {
1957 C->ops->solve = MatSolve_SeqAIJ;
1958 }
1959 C->ops->solveadd = MatSolveAdd_SeqAIJ;
1960 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1961 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1962 C->ops->matsolve = MatMatSolve_SeqAIJ;
1963 C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1964 C->assembled = PETSC_TRUE;
1965 C->preallocated = PETSC_TRUE;
1966
1967 PetscCall(PetscLogFlops(C->cmap->n));
1968
1969 /* MatShiftView(A,info,&sctx) */
1970 if (sctx.nshift) {
1971 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1972 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1974 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1976 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1977 }
1978 }
1979 PetscFunctionReturn(PETSC_SUCCESS);
1980 }
1981
MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)1982 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
1983 {
1984 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1985 IS iscol = a->col, isrow = a->row;
1986 const PetscInt *r, *c, *rout, *cout;
1987 PetscInt i, j;
1988 PetscInt node_max, row, nsz, aii, i0, i1, nz;
1989 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
1990 PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
1991 PetscScalar sum1, sum2, sum3, sum4, sum5;
1992 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
1993 const PetscScalar *b;
1994
1995 PetscFunctionBegin;
1996 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
1997 node_max = a->inode.node_count;
1998 ns = a->inode.size_csr; /* Node Size array */
1999
2000 PetscCall(VecGetArrayRead(bb, &b));
2001 PetscCall(VecGetArrayWrite(xx, &x));
2002 tmp = a->solve_work;
2003
2004 PetscCall(ISGetIndices(isrow, &rout));
2005 r = rout;
2006 PetscCall(ISGetIndices(iscol, &cout));
2007 c = cout;
2008
2009 /* forward solve the lower triangular */
2010 tmps = tmp;
2011 aa = a_a;
2012 aj = a_j;
2013 ad = a->diag;
2014
2015 for (i = 0; i < node_max; ++i) {
2016 row = ns[i];
2017 nsz = ns[i + 1] - ns[i];
2018 aii = ai[row];
2019 v1 = aa + aii;
2020 vi = aj + aii;
2021 nz = ai[row + 1] - ai[row];
2022
2023 if (i < node_max - 1) {
2024 /* Prefetch the indices for the next block */
2025 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2026 /* Prefetch the data for the next block */
2027 PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2028 }
2029
2030 switch (nsz) { /* Each loop in 'case' is unrolled */
2031 case 1:
2032 sum1 = b[r[row]];
2033 for (j = 0; j < nz - 1; j += 2) {
2034 i0 = vi[j];
2035 i1 = vi[j + 1];
2036 tmp0 = tmps[i0];
2037 tmp1 = tmps[i1];
2038 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2039 }
2040 if (j == nz - 1) {
2041 tmp0 = tmps[vi[j]];
2042 sum1 -= v1[j] * tmp0;
2043 }
2044 tmp[row++] = sum1;
2045 break;
2046 case 2:
2047 sum1 = b[r[row]];
2048 sum2 = b[r[row + 1]];
2049 v2 = aa + ai[row + 1];
2050
2051 for (j = 0; j < nz - 1; j += 2) {
2052 i0 = vi[j];
2053 i1 = vi[j + 1];
2054 tmp0 = tmps[i0];
2055 tmp1 = tmps[i1];
2056 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2057 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2058 }
2059 if (j == nz - 1) {
2060 tmp0 = tmps[vi[j]];
2061 sum1 -= v1[j] * tmp0;
2062 sum2 -= v2[j] * tmp0;
2063 }
2064 sum2 -= v2[nz] * sum1;
2065 tmp[row++] = sum1;
2066 tmp[row++] = sum2;
2067 break;
2068 case 3:
2069 sum1 = b[r[row]];
2070 sum2 = b[r[row + 1]];
2071 sum3 = b[r[row + 2]];
2072 v2 = aa + ai[row + 1];
2073 v3 = aa + ai[row + 2];
2074
2075 for (j = 0; j < nz - 1; j += 2) {
2076 i0 = vi[j];
2077 i1 = vi[j + 1];
2078 tmp0 = tmps[i0];
2079 tmp1 = tmps[i1];
2080 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2081 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2082 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2083 }
2084 if (j == nz - 1) {
2085 tmp0 = tmps[vi[j]];
2086 sum1 -= v1[j] * tmp0;
2087 sum2 -= v2[j] * tmp0;
2088 sum3 -= v3[j] * tmp0;
2089 }
2090 sum2 -= v2[nz] * sum1;
2091 sum3 -= v3[nz] * sum1;
2092 sum3 -= v3[nz + 1] * sum2;
2093 tmp[row++] = sum1;
2094 tmp[row++] = sum2;
2095 tmp[row++] = sum3;
2096 break;
2097
2098 case 4:
2099 sum1 = b[r[row]];
2100 sum2 = b[r[row + 1]];
2101 sum3 = b[r[row + 2]];
2102 sum4 = b[r[row + 3]];
2103 v2 = aa + ai[row + 1];
2104 v3 = aa + ai[row + 2];
2105 v4 = aa + ai[row + 3];
2106
2107 for (j = 0; j < nz - 1; j += 2) {
2108 i0 = vi[j];
2109 i1 = vi[j + 1];
2110 tmp0 = tmps[i0];
2111 tmp1 = tmps[i1];
2112 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2113 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2114 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2115 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2116 }
2117 if (j == nz - 1) {
2118 tmp0 = tmps[vi[j]];
2119 sum1 -= v1[j] * tmp0;
2120 sum2 -= v2[j] * tmp0;
2121 sum3 -= v3[j] * tmp0;
2122 sum4 -= v4[j] * tmp0;
2123 }
2124 sum2 -= v2[nz] * sum1;
2125 sum3 -= v3[nz] * sum1;
2126 sum4 -= v4[nz] * sum1;
2127 sum3 -= v3[nz + 1] * sum2;
2128 sum4 -= v4[nz + 1] * sum2;
2129 sum4 -= v4[nz + 2] * sum3;
2130
2131 tmp[row++] = sum1;
2132 tmp[row++] = sum2;
2133 tmp[row++] = sum3;
2134 tmp[row++] = sum4;
2135 break;
2136 case 5:
2137 sum1 = b[r[row]];
2138 sum2 = b[r[row + 1]];
2139 sum3 = b[r[row + 2]];
2140 sum4 = b[r[row + 3]];
2141 sum5 = b[r[row + 4]];
2142 v2 = aa + ai[row + 1];
2143 v3 = aa + ai[row + 2];
2144 v4 = aa + ai[row + 3];
2145 v5 = aa + ai[row + 4];
2146
2147 for (j = 0; j < nz - 1; j += 2) {
2148 i0 = vi[j];
2149 i1 = vi[j + 1];
2150 tmp0 = tmps[i0];
2151 tmp1 = tmps[i1];
2152 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2153 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2154 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2155 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2156 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2157 }
2158 if (j == nz - 1) {
2159 tmp0 = tmps[vi[j]];
2160 sum1 -= v1[j] * tmp0;
2161 sum2 -= v2[j] * tmp0;
2162 sum3 -= v3[j] * tmp0;
2163 sum4 -= v4[j] * tmp0;
2164 sum5 -= v5[j] * tmp0;
2165 }
2166
2167 sum2 -= v2[nz] * sum1;
2168 sum3 -= v3[nz] * sum1;
2169 sum4 -= v4[nz] * sum1;
2170 sum5 -= v5[nz] * sum1;
2171 sum3 -= v3[nz + 1] * sum2;
2172 sum4 -= v4[nz + 1] * sum2;
2173 sum5 -= v5[nz + 1] * sum2;
2174 sum4 -= v4[nz + 2] * sum3;
2175 sum5 -= v5[nz + 2] * sum3;
2176 sum5 -= v5[nz + 3] * sum4;
2177
2178 tmp[row++] = sum1;
2179 tmp[row++] = sum2;
2180 tmp[row++] = sum3;
2181 tmp[row++] = sum4;
2182 tmp[row++] = sum5;
2183 break;
2184 default:
2185 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2186 }
2187 }
2188 /* backward solve the upper triangular */
2189 for (i = node_max - 1; i >= 0; i--) {
2190 row = ns[i + 1] - 1;
2191 nsz = ns[i + 1] - ns[i];
2192 aii = ad[row + 1] + 1;
2193 v1 = aa + aii;
2194 vi = aj + aii;
2195 nz = ad[row] - ad[row + 1] - 1;
2196
2197 if (i > 0) {
2198 /* Prefetch the indices for the next block */
2199 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2200 /* Prefetch the data for the next block */
2201 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2202 }
2203
2204 switch (nsz) { /* Each loop in 'case' is unrolled */
2205 case 1:
2206 sum1 = tmp[row];
2207
2208 for (j = 0; j < nz - 1; j += 2) {
2209 i0 = vi[j];
2210 i1 = vi[j + 1];
2211 tmp0 = tmps[i0];
2212 tmp1 = tmps[i1];
2213 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2214 }
2215 if (j == nz - 1) {
2216 tmp0 = tmps[vi[j]];
2217 sum1 -= v1[j] * tmp0;
2218 }
2219 x[c[row]] = tmp[row] = sum1 * v1[nz];
2220 row--;
2221 break;
2222 case 2:
2223 sum1 = tmp[row];
2224 sum2 = tmp[row - 1];
2225 v2 = aa + ad[row] + 1;
2226 for (j = 0; j < nz - 1; j += 2) {
2227 i0 = vi[j];
2228 i1 = vi[j + 1];
2229 tmp0 = tmps[i0];
2230 tmp1 = tmps[i1];
2231 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2232 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2233 }
2234 if (j == nz - 1) {
2235 tmp0 = tmps[vi[j]];
2236 sum1 -= v1[j] * tmp0;
2237 sum2 -= v2[j + 1] * tmp0;
2238 }
2239
2240 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2241 row--;
2242 sum2 -= v2[0] * tmp0;
2243 x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2244 row--;
2245 break;
2246 case 3:
2247 sum1 = tmp[row];
2248 sum2 = tmp[row - 1];
2249 sum3 = tmp[row - 2];
2250 v2 = aa + ad[row] + 1;
2251 v3 = aa + ad[row - 1] + 1;
2252 for (j = 0; j < nz - 1; j += 2) {
2253 i0 = vi[j];
2254 i1 = vi[j + 1];
2255 tmp0 = tmps[i0];
2256 tmp1 = tmps[i1];
2257 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2258 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2259 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2260 }
2261 if (j == nz - 1) {
2262 tmp0 = tmps[vi[j]];
2263 sum1 -= v1[j] * tmp0;
2264 sum2 -= v2[j + 1] * tmp0;
2265 sum3 -= v3[j + 2] * tmp0;
2266 }
2267 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2268 row--;
2269 sum2 -= v2[0] * tmp0;
2270 sum3 -= v3[1] * tmp0;
2271 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2272 row--;
2273 sum3 -= v3[0] * tmp0;
2274 x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2275 row--;
2276
2277 break;
2278 case 4:
2279 sum1 = tmp[row];
2280 sum2 = tmp[row - 1];
2281 sum3 = tmp[row - 2];
2282 sum4 = tmp[row - 3];
2283 v2 = aa + ad[row] + 1;
2284 v3 = aa + ad[row - 1] + 1;
2285 v4 = aa + ad[row - 2] + 1;
2286
2287 for (j = 0; j < nz - 1; j += 2) {
2288 i0 = vi[j];
2289 i1 = vi[j + 1];
2290 tmp0 = tmps[i0];
2291 tmp1 = tmps[i1];
2292 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2293 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2294 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2295 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2296 }
2297 if (j == nz - 1) {
2298 tmp0 = tmps[vi[j]];
2299 sum1 -= v1[j] * tmp0;
2300 sum2 -= v2[j + 1] * tmp0;
2301 sum3 -= v3[j + 2] * tmp0;
2302 sum4 -= v4[j + 3] * tmp0;
2303 }
2304
2305 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2306 row--;
2307 sum2 -= v2[0] * tmp0;
2308 sum3 -= v3[1] * tmp0;
2309 sum4 -= v4[2] * tmp0;
2310 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2311 row--;
2312 sum3 -= v3[0] * tmp0;
2313 sum4 -= v4[1] * tmp0;
2314 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2315 row--;
2316 sum4 -= v4[0] * tmp0;
2317 x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2318 row--;
2319 break;
2320 case 5:
2321 sum1 = tmp[row];
2322 sum2 = tmp[row - 1];
2323 sum3 = tmp[row - 2];
2324 sum4 = tmp[row - 3];
2325 sum5 = tmp[row - 4];
2326 v2 = aa + ad[row] + 1;
2327 v3 = aa + ad[row - 1] + 1;
2328 v4 = aa + ad[row - 2] + 1;
2329 v5 = aa + ad[row - 3] + 1;
2330 for (j = 0; j < nz - 1; j += 2) {
2331 i0 = vi[j];
2332 i1 = vi[j + 1];
2333 tmp0 = tmps[i0];
2334 tmp1 = tmps[i1];
2335 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2336 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2337 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2338 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2339 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2340 }
2341 if (j == nz - 1) {
2342 tmp0 = tmps[vi[j]];
2343 sum1 -= v1[j] * tmp0;
2344 sum2 -= v2[j + 1] * tmp0;
2345 sum3 -= v3[j + 2] * tmp0;
2346 sum4 -= v4[j + 3] * tmp0;
2347 sum5 -= v5[j + 4] * tmp0;
2348 }
2349
2350 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2351 row--;
2352 sum2 -= v2[0] * tmp0;
2353 sum3 -= v3[1] * tmp0;
2354 sum4 -= v4[2] * tmp0;
2355 sum5 -= v5[3] * tmp0;
2356 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2357 row--;
2358 sum3 -= v3[0] * tmp0;
2359 sum4 -= v4[1] * tmp0;
2360 sum5 -= v5[2] * tmp0;
2361 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2362 row--;
2363 sum4 -= v4[0] * tmp0;
2364 sum5 -= v5[1] * tmp0;
2365 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2366 row--;
2367 sum5 -= v5[0] * tmp0;
2368 x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2369 row--;
2370 break;
2371 default:
2372 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2373 }
2374 }
2375 PetscCall(ISRestoreIndices(isrow, &rout));
2376 PetscCall(ISRestoreIndices(iscol, &cout));
2377 PetscCall(VecRestoreArrayRead(bb, &b));
2378 PetscCall(VecRestoreArrayWrite(xx, &x));
2379 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2380 PetscFunctionReturn(PETSC_SUCCESS);
2381 }
2382
2383 /*
2384 Makes a longer coloring[] array and calls the usual code with that
2385 */
MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring * iscoloring)2386 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2387 {
2388 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data;
2389 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
2390 PetscInt *colorused, i;
2391 ISColoringValue *newcolor;
2392
2393 PetscFunctionBegin;
2394 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2395 PetscCall(PetscMalloc1(n + 1, &newcolor));
2396 /* loop over inodes, marking a color for each column*/
2397 row = 0;
2398 for (i = 0; i < m; i++) {
2399 for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2400 }
2401
2402 /* eliminate unneeded colors */
2403 PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2404 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2405
2406 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2407 ncolors = colorused[5 * ncolors - 1];
2408 for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2409 PetscCall(PetscFree(colorused));
2410 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2411 PetscCall(PetscFree(coloring));
2412 PetscFunctionReturn(PETSC_SUCCESS);
2413 }
2414
2415 #include <petsc/private/kernels/blockinvert.h>
2416
2417 /*
2418 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
2419 */
MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A,PetscScalar omega,PetscScalar fshift)2420 static PetscErrorCode MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A, PetscScalar omega, PetscScalar fshift)
2421 {
2422 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2423 MatScalar *ibdiag, *bdiag, work[25];
2424 const MatScalar *v = a->a;
2425 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2426 PetscInt m = a->inode.node_count, cnt = 0, i, j, row, nodesz;
2427 PetscInt k, ipvt[5];
2428 PetscBool allowzeropivot = PetscNot(A->erroriffailure), zeropivotdetected;
2429 const PetscInt *sizes = a->inode.size_csr, *diag;
2430
2431 PetscFunctionBegin;
2432 if (a->idiagState == ((PetscObject)A)->state) PetscFunctionReturn(PETSC_SUCCESS);
2433 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &diag, NULL));
2434 if (!a->inode.ibdiag) {
2435 /* calculate space needed for diagonal blocks */
2436 for (i = 0; i < m; i++) {
2437 nodesz = sizes[i + 1] - sizes[i];
2438 cnt += nodesz * nodesz;
2439 }
2440 a->inode.bdiagsize = cnt;
2441 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2442 }
2443
2444 /* copy over the diagonal blocks and invert them */
2445 ibdiag = a->inode.ibdiag;
2446 bdiag = a->inode.bdiag;
2447 cnt = 0;
2448 for (i = 0, row = 0; i < m; i++) {
2449 nodesz = sizes[i + 1] - sizes[i];
2450 for (j = 0; j < nodesz; j++) {
2451 for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
2452 }
2453 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
2454
2455 switch (nodesz) {
2456 case 1:
2457 /* Create matrix data structure */
2458 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2459 PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2460 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2461 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2462 A->factorerror_zeropivot_row = row;
2463 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2464 }
2465 ibdiag[cnt] = 1.0 / ibdiag[cnt];
2466 break;
2467 case 2:
2468 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2469 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2470 break;
2471 case 3:
2472 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2473 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2474 break;
2475 case 4:
2476 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2477 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2478 break;
2479 case 5:
2480 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2481 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2482 break;
2483 default:
2484 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2485 }
2486 cnt += nodesz * nodesz;
2487 row += nodesz;
2488 }
2489 a->inode.ibdiagState = ((PetscObject)A)->state;
2490 PetscFunctionReturn(PETSC_SUCCESS);
2491 }
2492
MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)2493 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2494 {
2495 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2496 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2497 MatScalar *ibdiag, *bdiag, *t;
2498 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2499 const MatScalar *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2500 const PetscScalar *xb, *b;
2501 PetscInt n, m = a->inode.node_count, cnt = 0, i, row, i1, i2, nodesz;
2502 PetscInt sz;
2503 const PetscInt *sizes = a->inode.size_csr, *idx, *diag, *ii = a->i;
2504
2505 PetscFunctionBegin;
2506 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2507 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2508 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2509 PetscCall(MatInvertDiagonalForSOR_SeqAIJ_Inode(A, omega, fshift));
2510 diag = a->diag;
2511
2512 ibdiag = a->inode.ibdiag;
2513 bdiag = a->inode.bdiag;
2514 t = a->inode.ssor_work;
2515
2516 PetscCall(VecGetArray(xx, &x));
2517 PetscCall(VecGetArrayRead(bb, &b));
2518 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2519 if (flag & SOR_ZERO_INITIAL_GUESS) {
2520 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2521 for (i = 0, row = 0; i < m; i++) {
2522 sz = diag[row] - ii[row];
2523 v1 = a->a + ii[row];
2524 idx = a->j + ii[row];
2525
2526 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2527 nodesz = sizes[i + 1] - sizes[i];
2528 switch (nodesz) {
2529 case 1:
2530
2531 sum1 = b[row];
2532 for (n = 0; n < sz - 1; n += 2) {
2533 i1 = idx[0];
2534 i2 = idx[1];
2535 idx += 2;
2536 tmp0 = x[i1];
2537 tmp1 = x[i2];
2538 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2539 v1 += 2;
2540 }
2541
2542 if (n == sz - 1) {
2543 tmp0 = x[*idx];
2544 sum1 -= *v1 * tmp0;
2545 }
2546 t[row] = sum1;
2547 x[row++] = sum1 * (*ibdiag++);
2548 break;
2549 case 2:
2550 v2 = a->a + ii[row + 1];
2551 sum1 = b[row];
2552 sum2 = b[row + 1];
2553 for (n = 0; n < sz - 1; n += 2) {
2554 i1 = idx[0];
2555 i2 = idx[1];
2556 idx += 2;
2557 tmp0 = x[i1];
2558 tmp1 = x[i2];
2559 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2560 v1 += 2;
2561 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2562 v2 += 2;
2563 }
2564
2565 if (n == sz - 1) {
2566 tmp0 = x[*idx];
2567 sum1 -= v1[0] * tmp0;
2568 sum2 -= v2[0] * tmp0;
2569 }
2570 t[row] = sum1;
2571 t[row + 1] = sum2;
2572 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2573 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2574 ibdiag += 4;
2575 break;
2576 case 3:
2577 v2 = a->a + ii[row + 1];
2578 v3 = a->a + ii[row + 2];
2579 sum1 = b[row];
2580 sum2 = b[row + 1];
2581 sum3 = b[row + 2];
2582 for (n = 0; n < sz - 1; n += 2) {
2583 i1 = idx[0];
2584 i2 = idx[1];
2585 idx += 2;
2586 tmp0 = x[i1];
2587 tmp1 = x[i2];
2588 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2589 v1 += 2;
2590 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2591 v2 += 2;
2592 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2593 v3 += 2;
2594 }
2595
2596 if (n == sz - 1) {
2597 tmp0 = x[*idx];
2598 sum1 -= v1[0] * tmp0;
2599 sum2 -= v2[0] * tmp0;
2600 sum3 -= v3[0] * tmp0;
2601 }
2602 t[row] = sum1;
2603 t[row + 1] = sum2;
2604 t[row + 2] = sum3;
2605 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2606 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2607 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2608 ibdiag += 9;
2609 break;
2610 case 4:
2611 v2 = a->a + ii[row + 1];
2612 v3 = a->a + ii[row + 2];
2613 v4 = a->a + ii[row + 3];
2614 sum1 = b[row];
2615 sum2 = b[row + 1];
2616 sum3 = b[row + 2];
2617 sum4 = b[row + 3];
2618 for (n = 0; n < sz - 1; n += 2) {
2619 i1 = idx[0];
2620 i2 = idx[1];
2621 idx += 2;
2622 tmp0 = x[i1];
2623 tmp1 = x[i2];
2624 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2625 v1 += 2;
2626 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2627 v2 += 2;
2628 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2629 v3 += 2;
2630 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2631 v4 += 2;
2632 }
2633
2634 if (n == sz - 1) {
2635 tmp0 = x[*idx];
2636 sum1 -= v1[0] * tmp0;
2637 sum2 -= v2[0] * tmp0;
2638 sum3 -= v3[0] * tmp0;
2639 sum4 -= v4[0] * tmp0;
2640 }
2641 t[row] = sum1;
2642 t[row + 1] = sum2;
2643 t[row + 2] = sum3;
2644 t[row + 3] = sum4;
2645 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
2646 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
2647 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
2648 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
2649 ibdiag += 16;
2650 break;
2651 case 5:
2652 v2 = a->a + ii[row + 1];
2653 v3 = a->a + ii[row + 2];
2654 v4 = a->a + ii[row + 3];
2655 v5 = a->a + ii[row + 4];
2656 sum1 = b[row];
2657 sum2 = b[row + 1];
2658 sum3 = b[row + 2];
2659 sum4 = b[row + 3];
2660 sum5 = b[row + 4];
2661 for (n = 0; n < sz - 1; n += 2) {
2662 i1 = idx[0];
2663 i2 = idx[1];
2664 idx += 2;
2665 tmp0 = x[i1];
2666 tmp1 = x[i2];
2667 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2668 v1 += 2;
2669 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2670 v2 += 2;
2671 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2672 v3 += 2;
2673 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2674 v4 += 2;
2675 sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
2676 v5 += 2;
2677 }
2678
2679 if (n == sz - 1) {
2680 tmp0 = x[*idx];
2681 sum1 -= v1[0] * tmp0;
2682 sum2 -= v2[0] * tmp0;
2683 sum3 -= v3[0] * tmp0;
2684 sum4 -= v4[0] * tmp0;
2685 sum5 -= v5[0] * tmp0;
2686 }
2687 t[row] = sum1;
2688 t[row + 1] = sum2;
2689 t[row + 2] = sum3;
2690 t[row + 3] = sum4;
2691 t[row + 4] = sum5;
2692 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
2693 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
2694 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
2695 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
2696 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
2697 ibdiag += 25;
2698 break;
2699 default:
2700 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2701 }
2702 }
2703
2704 xb = t;
2705 PetscCall(PetscLogFlops(a->nz));
2706 } else xb = b;
2707 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2708 ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
2709 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
2710 nodesz = sizes[i + 1] - sizes[i];
2711 ibdiag -= nodesz * nodesz;
2712 sz = ii[row + 1] - diag[row] - 1;
2713 v1 = a->a + diag[row] + 1;
2714 idx = a->j + diag[row] + 1;
2715
2716 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2717 switch (nodesz) {
2718 case 1:
2719
2720 sum1 = xb[row];
2721 for (n = 0; n < sz - 1; n += 2) {
2722 i1 = idx[0];
2723 i2 = idx[1];
2724 idx += 2;
2725 tmp0 = x[i1];
2726 tmp1 = x[i2];
2727 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2728 v1 += 2;
2729 }
2730
2731 if (n == sz - 1) {
2732 tmp0 = x[*idx];
2733 sum1 -= *v1 * tmp0;
2734 }
2735 x[row--] = sum1 * (*ibdiag);
2736 break;
2737
2738 case 2:
2739
2740 sum1 = xb[row];
2741 sum2 = xb[row - 1];
2742 /* note that sum1 is associated with the second of the two rows */
2743 v2 = a->a + diag[row - 1] + 2;
2744 for (n = 0; n < sz - 1; n += 2) {
2745 i1 = idx[0];
2746 i2 = idx[1];
2747 idx += 2;
2748 tmp0 = x[i1];
2749 tmp1 = x[i2];
2750 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2751 v1 += 2;
2752 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2753 v2 += 2;
2754 }
2755
2756 if (n == sz - 1) {
2757 tmp0 = x[*idx];
2758 sum1 -= *v1 * tmp0;
2759 sum2 -= *v2 * tmp0;
2760 }
2761 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
2762 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
2763 break;
2764 case 3:
2765
2766 sum1 = xb[row];
2767 sum2 = xb[row - 1];
2768 sum3 = xb[row - 2];
2769 v2 = a->a + diag[row - 1] + 2;
2770 v3 = a->a + diag[row - 2] + 3;
2771 for (n = 0; n < sz - 1; n += 2) {
2772 i1 = idx[0];
2773 i2 = idx[1];
2774 idx += 2;
2775 tmp0 = x[i1];
2776 tmp1 = x[i2];
2777 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2778 v1 += 2;
2779 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2780 v2 += 2;
2781 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2782 v3 += 2;
2783 }
2784
2785 if (n == sz - 1) {
2786 tmp0 = x[*idx];
2787 sum1 -= *v1 * tmp0;
2788 sum2 -= *v2 * tmp0;
2789 sum3 -= *v3 * tmp0;
2790 }
2791 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
2792 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
2793 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
2794 break;
2795 case 4:
2796
2797 sum1 = xb[row];
2798 sum2 = xb[row - 1];
2799 sum3 = xb[row - 2];
2800 sum4 = xb[row - 3];
2801 v2 = a->a + diag[row - 1] + 2;
2802 v3 = a->a + diag[row - 2] + 3;
2803 v4 = a->a + diag[row - 3] + 4;
2804 for (n = 0; n < sz - 1; n += 2) {
2805 i1 = idx[0];
2806 i2 = idx[1];
2807 idx += 2;
2808 tmp0 = x[i1];
2809 tmp1 = x[i2];
2810 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2811 v1 += 2;
2812 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2813 v2 += 2;
2814 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2815 v3 += 2;
2816 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2817 v4 += 2;
2818 }
2819
2820 if (n == sz - 1) {
2821 tmp0 = x[*idx];
2822 sum1 -= *v1 * tmp0;
2823 sum2 -= *v2 * tmp0;
2824 sum3 -= *v3 * tmp0;
2825 sum4 -= *v4 * tmp0;
2826 }
2827 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
2828 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
2829 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
2830 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
2831 break;
2832 case 5:
2833
2834 sum1 = xb[row];
2835 sum2 = xb[row - 1];
2836 sum3 = xb[row - 2];
2837 sum4 = xb[row - 3];
2838 sum5 = xb[row - 4];
2839 v2 = a->a + diag[row - 1] + 2;
2840 v3 = a->a + diag[row - 2] + 3;
2841 v4 = a->a + diag[row - 3] + 4;
2842 v5 = a->a + diag[row - 4] + 5;
2843 for (n = 0; n < sz - 1; n += 2) {
2844 i1 = idx[0];
2845 i2 = idx[1];
2846 idx += 2;
2847 tmp0 = x[i1];
2848 tmp1 = x[i2];
2849 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2850 v1 += 2;
2851 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2852 v2 += 2;
2853 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2854 v3 += 2;
2855 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2856 v4 += 2;
2857 sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
2858 v5 += 2;
2859 }
2860
2861 if (n == sz - 1) {
2862 tmp0 = x[*idx];
2863 sum1 -= *v1 * tmp0;
2864 sum2 -= *v2 * tmp0;
2865 sum3 -= *v3 * tmp0;
2866 sum4 -= *v4 * tmp0;
2867 sum5 -= *v5 * tmp0;
2868 }
2869 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
2870 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
2871 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
2872 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
2873 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
2874 break;
2875 default:
2876 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2877 }
2878 }
2879
2880 PetscCall(PetscLogFlops(a->nz));
2881 }
2882 its--;
2883 }
2884 while (its--) {
2885 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2886 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
2887 nodesz = sizes[i + 1] - sizes[i];
2888 sz = diag[row] - ii[row];
2889 v1 = a->a + ii[row];
2890 idx = a->j + ii[row];
2891 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2892 switch (nodesz) {
2893 case 1:
2894 sum1 = b[row];
2895 for (n = 0; n < sz - 1; n += 2) {
2896 i1 = idx[0];
2897 i2 = idx[1];
2898 idx += 2;
2899 tmp0 = x[i1];
2900 tmp1 = x[i2];
2901 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2902 v1 += 2;
2903 }
2904 if (n == sz - 1) {
2905 tmp0 = x[*idx++];
2906 sum1 -= *v1 * tmp0;
2907 v1++;
2908 }
2909 t[row] = sum1;
2910 sz = ii[row + 1] - diag[row] - 1;
2911 idx = a->j + diag[row] + 1;
2912 v1 += 1;
2913 for (n = 0; n < sz - 1; n += 2) {
2914 i1 = idx[0];
2915 i2 = idx[1];
2916 idx += 2;
2917 tmp0 = x[i1];
2918 tmp1 = x[i2];
2919 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2920 v1 += 2;
2921 }
2922 if (n == sz - 1) {
2923 tmp0 = x[*idx++];
2924 sum1 -= *v1 * tmp0;
2925 }
2926 /* in MatSOR_SeqAIJ this line would be
2927 *
2928 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
2929 *
2930 * but omega == 1, so this becomes
2931 *
2932 * x[row] = sum1*(*ibdiag++);
2933 *
2934 */
2935 x[row] = sum1 * (*ibdiag);
2936 break;
2937 case 2:
2938 v2 = a->a + ii[row + 1];
2939 sum1 = b[row];
2940 sum2 = b[row + 1];
2941 for (n = 0; n < sz - 1; n += 2) {
2942 i1 = idx[0];
2943 i2 = idx[1];
2944 idx += 2;
2945 tmp0 = x[i1];
2946 tmp1 = x[i2];
2947 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2948 v1 += 2;
2949 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2950 v2 += 2;
2951 }
2952 if (n == sz - 1) {
2953 tmp0 = x[*idx++];
2954 sum1 -= v1[0] * tmp0;
2955 sum2 -= v2[0] * tmp0;
2956 v1++;
2957 v2++;
2958 }
2959 t[row] = sum1;
2960 t[row + 1] = sum2;
2961 sz = ii[row + 1] - diag[row] - 2;
2962 idx = a->j + diag[row] + 2;
2963 v1 += 2;
2964 v2 += 2;
2965 for (n = 0; n < sz - 1; n += 2) {
2966 i1 = idx[0];
2967 i2 = idx[1];
2968 idx += 2;
2969 tmp0 = x[i1];
2970 tmp1 = x[i2];
2971 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2972 v1 += 2;
2973 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2974 v2 += 2;
2975 }
2976 if (n == sz - 1) {
2977 tmp0 = x[*idx];
2978 sum1 -= v1[0] * tmp0;
2979 sum2 -= v2[0] * tmp0;
2980 }
2981 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2982 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2983 break;
2984 case 3:
2985 v2 = a->a + ii[row + 1];
2986 v3 = a->a + ii[row + 2];
2987 sum1 = b[row];
2988 sum2 = b[row + 1];
2989 sum3 = b[row + 2];
2990 for (n = 0; n < sz - 1; n += 2) {
2991 i1 = idx[0];
2992 i2 = idx[1];
2993 idx += 2;
2994 tmp0 = x[i1];
2995 tmp1 = x[i2];
2996 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2997 v1 += 2;
2998 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2999 v2 += 2;
3000 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3001 v3 += 2;
3002 }
3003 if (n == sz - 1) {
3004 tmp0 = x[*idx++];
3005 sum1 -= v1[0] * tmp0;
3006 sum2 -= v2[0] * tmp0;
3007 sum3 -= v3[0] * tmp0;
3008 v1++;
3009 v2++;
3010 v3++;
3011 }
3012 t[row] = sum1;
3013 t[row + 1] = sum2;
3014 t[row + 2] = sum3;
3015 sz = ii[row + 1] - diag[row] - 3;
3016 idx = a->j + diag[row] + 3;
3017 v1 += 3;
3018 v2 += 3;
3019 v3 += 3;
3020 for (n = 0; n < sz - 1; n += 2) {
3021 i1 = idx[0];
3022 i2 = idx[1];
3023 idx += 2;
3024 tmp0 = x[i1];
3025 tmp1 = x[i2];
3026 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3027 v1 += 2;
3028 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3029 v2 += 2;
3030 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3031 v3 += 2;
3032 }
3033 if (n == sz - 1) {
3034 tmp0 = x[*idx];
3035 sum1 -= v1[0] * tmp0;
3036 sum2 -= v2[0] * tmp0;
3037 sum3 -= v3[0] * tmp0;
3038 }
3039 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3040 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3041 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3042 break;
3043 case 4:
3044 v2 = a->a + ii[row + 1];
3045 v3 = a->a + ii[row + 2];
3046 v4 = a->a + ii[row + 3];
3047 sum1 = b[row];
3048 sum2 = b[row + 1];
3049 sum3 = b[row + 2];
3050 sum4 = b[row + 3];
3051 for (n = 0; n < sz - 1; n += 2) {
3052 i1 = idx[0];
3053 i2 = idx[1];
3054 idx += 2;
3055 tmp0 = x[i1];
3056 tmp1 = x[i2];
3057 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3058 v1 += 2;
3059 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3060 v2 += 2;
3061 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3062 v3 += 2;
3063 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3064 v4 += 2;
3065 }
3066 if (n == sz - 1) {
3067 tmp0 = x[*idx++];
3068 sum1 -= v1[0] * tmp0;
3069 sum2 -= v2[0] * tmp0;
3070 sum3 -= v3[0] * tmp0;
3071 sum4 -= v4[0] * tmp0;
3072 v1++;
3073 v2++;
3074 v3++;
3075 v4++;
3076 }
3077 t[row] = sum1;
3078 t[row + 1] = sum2;
3079 t[row + 2] = sum3;
3080 t[row + 3] = sum4;
3081 sz = ii[row + 1] - diag[row] - 4;
3082 idx = a->j + diag[row] + 4;
3083 v1 += 4;
3084 v2 += 4;
3085 v3 += 4;
3086 v4 += 4;
3087 for (n = 0; n < sz - 1; n += 2) {
3088 i1 = idx[0];
3089 i2 = idx[1];
3090 idx += 2;
3091 tmp0 = x[i1];
3092 tmp1 = x[i2];
3093 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3094 v1 += 2;
3095 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3096 v2 += 2;
3097 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3098 v3 += 2;
3099 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3100 v4 += 2;
3101 }
3102 if (n == sz - 1) {
3103 tmp0 = x[*idx];
3104 sum1 -= v1[0] * tmp0;
3105 sum2 -= v2[0] * tmp0;
3106 sum3 -= v3[0] * tmp0;
3107 sum4 -= v4[0] * tmp0;
3108 }
3109 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3110 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3111 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3112 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3113 break;
3114 case 5:
3115 v2 = a->a + ii[row + 1];
3116 v3 = a->a + ii[row + 2];
3117 v4 = a->a + ii[row + 3];
3118 v5 = a->a + ii[row + 4];
3119 sum1 = b[row];
3120 sum2 = b[row + 1];
3121 sum3 = b[row + 2];
3122 sum4 = b[row + 3];
3123 sum5 = b[row + 4];
3124 for (n = 0; n < sz - 1; n += 2) {
3125 i1 = idx[0];
3126 i2 = idx[1];
3127 idx += 2;
3128 tmp0 = x[i1];
3129 tmp1 = x[i2];
3130 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3131 v1 += 2;
3132 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3133 v2 += 2;
3134 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3135 v3 += 2;
3136 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3137 v4 += 2;
3138 sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3139 v5 += 2;
3140 }
3141 if (n == sz - 1) {
3142 tmp0 = x[*idx++];
3143 sum1 -= v1[0] * tmp0;
3144 sum2 -= v2[0] * tmp0;
3145 sum3 -= v3[0] * tmp0;
3146 sum4 -= v4[0] * tmp0;
3147 sum5 -= v5[0] * tmp0;
3148 v1++;
3149 v2++;
3150 v3++;
3151 v4++;
3152 v5++;
3153 }
3154 t[row] = sum1;
3155 t[row + 1] = sum2;
3156 t[row + 2] = sum3;
3157 t[row + 3] = sum4;
3158 t[row + 4] = sum5;
3159 sz = ii[row + 1] - diag[row] - 5;
3160 idx = a->j + diag[row] + 5;
3161 v1 += 5;
3162 v2 += 5;
3163 v3 += 5;
3164 v4 += 5;
3165 v5 += 5;
3166 for (n = 0; n < sz - 1; n += 2) {
3167 i1 = idx[0];
3168 i2 = idx[1];
3169 idx += 2;
3170 tmp0 = x[i1];
3171 tmp1 = x[i2];
3172 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3173 v1 += 2;
3174 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3175 v2 += 2;
3176 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3177 v3 += 2;
3178 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3179 v4 += 2;
3180 sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3181 v5 += 2;
3182 }
3183 if (n == sz - 1) {
3184 tmp0 = x[*idx];
3185 sum1 -= v1[0] * tmp0;
3186 sum2 -= v2[0] * tmp0;
3187 sum3 -= v3[0] * tmp0;
3188 sum4 -= v4[0] * tmp0;
3189 sum5 -= v5[0] * tmp0;
3190 }
3191 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3192 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3193 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3194 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3195 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3196 break;
3197 default:
3198 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3199 }
3200 }
3201 xb = t;
3202 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3203 } else xb = b;
3204
3205 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3206 ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3207 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3208 nodesz = sizes[i + 1] - sizes[i];
3209 ibdiag -= nodesz * nodesz;
3210
3211 /* set RHS */
3212 if (xb == b) {
3213 /* whole (old way) */
3214 sz = ii[row + 1] - ii[row];
3215 idx = a->j + ii[row];
3216 switch (nodesz) {
3217 case 5:
3218 v5 = a->a + ii[row - 4]; /* fall through */
3219 case 4:
3220 v4 = a->a + ii[row - 3]; /* fall through */
3221 case 3:
3222 v3 = a->a + ii[row - 2]; /* fall through */
3223 case 2:
3224 v2 = a->a + ii[row - 1]; /* fall through */
3225 case 1:
3226 v1 = a->a + ii[row];
3227 break;
3228 default:
3229 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3230 }
3231 } else {
3232 /* upper, no diag */
3233 sz = ii[row + 1] - diag[row] - 1;
3234 idx = a->j + diag[row] + 1;
3235 switch (nodesz) {
3236 case 5:
3237 v5 = a->a + diag[row - 4] + 5; /* fall through */
3238 case 4:
3239 v4 = a->a + diag[row - 3] + 4; /* fall through */
3240 case 3:
3241 v3 = a->a + diag[row - 2] + 3; /* fall through */
3242 case 2:
3243 v2 = a->a + diag[row - 1] + 2; /* fall through */
3244 case 1:
3245 v1 = a->a + diag[row] + 1;
3246 }
3247 }
3248 /* set sum */
3249 switch (nodesz) {
3250 case 5:
3251 sum5 = xb[row - 4]; /* fall through */
3252 case 4:
3253 sum4 = xb[row - 3]; /* fall through */
3254 case 3:
3255 sum3 = xb[row - 2]; /* fall through */
3256 case 2:
3257 sum2 = xb[row - 1]; /* fall through */
3258 case 1:
3259 /* note that sum1 is associated with the last row */
3260 sum1 = xb[row];
3261 }
3262 /* do sums */
3263 for (n = 0; n < sz - 1; n += 2) {
3264 i1 = idx[0];
3265 i2 = idx[1];
3266 idx += 2;
3267 tmp0 = x[i1];
3268 tmp1 = x[i2];
3269 switch (nodesz) {
3270 case 5:
3271 sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3272 v5 += 2; /* fall through */
3273 case 4:
3274 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3275 v4 += 2; /* fall through */
3276 case 3:
3277 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3278 v3 += 2; /* fall through */
3279 case 2:
3280 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3281 v2 += 2; /* fall through */
3282 case 1:
3283 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3284 v1 += 2;
3285 }
3286 }
3287 /* ragged edge */
3288 if (n == sz - 1) {
3289 tmp0 = x[*idx];
3290 switch (nodesz) {
3291 case 5:
3292 sum5 -= *v5 * tmp0; /* fall through */
3293 case 4:
3294 sum4 -= *v4 * tmp0; /* fall through */
3295 case 3:
3296 sum3 -= *v3 * tmp0; /* fall through */
3297 case 2:
3298 sum2 -= *v2 * tmp0; /* fall through */
3299 case 1:
3300 sum1 -= *v1 * tmp0;
3301 }
3302 }
3303 /* update */
3304 if (xb == b) {
3305 /* whole (old way) w/ diag */
3306 switch (nodesz) {
3307 case 5:
3308 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3309 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3310 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3311 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3312 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3313 break;
3314 case 4:
3315 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3316 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3317 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3318 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3319 break;
3320 case 3:
3321 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3322 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3323 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3324 break;
3325 case 2:
3326 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3327 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3328 break;
3329 case 1:
3330 x[row--] += sum1 * (*ibdiag);
3331 break;
3332 }
3333 } else {
3334 /* no diag so set = */
3335 switch (nodesz) {
3336 case 5:
3337 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3338 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3339 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3340 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3341 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3342 break;
3343 case 4:
3344 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3345 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3346 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3347 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3348 break;
3349 case 3:
3350 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3351 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3352 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3353 break;
3354 case 2:
3355 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3356 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3357 break;
3358 case 1:
3359 x[row--] = sum1 * (*ibdiag);
3360 break;
3361 }
3362 }
3363 }
3364 if (xb == b) {
3365 PetscCall(PetscLogFlops(2.0 * a->nz));
3366 } else {
3367 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3368 }
3369 }
3370 }
3371 if (flag & SOR_EISENSTAT) {
3372 /*
3373 Apply (U + D)^-1 where D is now the block diagonal
3374 */
3375 ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3376 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3377 nodesz = sizes[i + 1] - sizes[i];
3378 ibdiag -= nodesz * nodesz;
3379 sz = ii[row + 1] - diag[row] - 1;
3380 v1 = a->a + diag[row] + 1;
3381 idx = a->j + diag[row] + 1;
3382 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3383 switch (nodesz) {
3384 case 1:
3385
3386 sum1 = b[row];
3387 for (n = 0; n < sz - 1; n += 2) {
3388 i1 = idx[0];
3389 i2 = idx[1];
3390 idx += 2;
3391 tmp0 = x[i1];
3392 tmp1 = x[i2];
3393 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3394 v1 += 2;
3395 }
3396
3397 if (n == sz - 1) {
3398 tmp0 = x[*idx];
3399 sum1 -= *v1 * tmp0;
3400 }
3401 x[row] = sum1 * (*ibdiag);
3402 row--;
3403 break;
3404
3405 case 2:
3406
3407 sum1 = b[row];
3408 sum2 = b[row - 1];
3409 /* note that sum1 is associated with the second of the two rows */
3410 v2 = a->a + diag[row - 1] + 2;
3411 for (n = 0; n < sz - 1; n += 2) {
3412 i1 = idx[0];
3413 i2 = idx[1];
3414 idx += 2;
3415 tmp0 = x[i1];
3416 tmp1 = x[i2];
3417 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3418 v1 += 2;
3419 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3420 v2 += 2;
3421 }
3422
3423 if (n == sz - 1) {
3424 tmp0 = x[*idx];
3425 sum1 -= *v1 * tmp0;
3426 sum2 -= *v2 * tmp0;
3427 }
3428 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3429 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3430 row -= 2;
3431 break;
3432 case 3:
3433
3434 sum1 = b[row];
3435 sum2 = b[row - 1];
3436 sum3 = b[row - 2];
3437 v2 = a->a + diag[row - 1] + 2;
3438 v3 = a->a + diag[row - 2] + 3;
3439 for (n = 0; n < sz - 1; n += 2) {
3440 i1 = idx[0];
3441 i2 = idx[1];
3442 idx += 2;
3443 tmp0 = x[i1];
3444 tmp1 = x[i2];
3445 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3446 v1 += 2;
3447 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3448 v2 += 2;
3449 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3450 v3 += 2;
3451 }
3452
3453 if (n == sz - 1) {
3454 tmp0 = x[*idx];
3455 sum1 -= *v1 * tmp0;
3456 sum2 -= *v2 * tmp0;
3457 sum3 -= *v3 * tmp0;
3458 }
3459 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3460 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3461 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3462 row -= 3;
3463 break;
3464 case 4:
3465
3466 sum1 = b[row];
3467 sum2 = b[row - 1];
3468 sum3 = b[row - 2];
3469 sum4 = b[row - 3];
3470 v2 = a->a + diag[row - 1] + 2;
3471 v3 = a->a + diag[row - 2] + 3;
3472 v4 = a->a + diag[row - 3] + 4;
3473 for (n = 0; n < sz - 1; n += 2) {
3474 i1 = idx[0];
3475 i2 = idx[1];
3476 idx += 2;
3477 tmp0 = x[i1];
3478 tmp1 = x[i2];
3479 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3480 v1 += 2;
3481 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3482 v2 += 2;
3483 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3484 v3 += 2;
3485 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3486 v4 += 2;
3487 }
3488
3489 if (n == sz - 1) {
3490 tmp0 = x[*idx];
3491 sum1 -= *v1 * tmp0;
3492 sum2 -= *v2 * tmp0;
3493 sum3 -= *v3 * tmp0;
3494 sum4 -= *v4 * tmp0;
3495 }
3496 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3497 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3498 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3499 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3500 row -= 4;
3501 break;
3502 case 5:
3503
3504 sum1 = b[row];
3505 sum2 = b[row - 1];
3506 sum3 = b[row - 2];
3507 sum4 = b[row - 3];
3508 sum5 = b[row - 4];
3509 v2 = a->a + diag[row - 1] + 2;
3510 v3 = a->a + diag[row - 2] + 3;
3511 v4 = a->a + diag[row - 3] + 4;
3512 v5 = a->a + diag[row - 4] + 5;
3513 for (n = 0; n < sz - 1; n += 2) {
3514 i1 = idx[0];
3515 i2 = idx[1];
3516 idx += 2;
3517 tmp0 = x[i1];
3518 tmp1 = x[i2];
3519 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3520 v1 += 2;
3521 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3522 v2 += 2;
3523 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3524 v3 += 2;
3525 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3526 v4 += 2;
3527 sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3528 v5 += 2;
3529 }
3530
3531 if (n == sz - 1) {
3532 tmp0 = x[*idx];
3533 sum1 -= *v1 * tmp0;
3534 sum2 -= *v2 * tmp0;
3535 sum3 -= *v3 * tmp0;
3536 sum4 -= *v4 * tmp0;
3537 sum5 -= *v5 * tmp0;
3538 }
3539 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3540 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3541 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3542 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3543 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3544 row -= 5;
3545 break;
3546 default:
3547 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3548 }
3549 }
3550 PetscCall(PetscLogFlops(a->nz));
3551
3552 /*
3553 t = b - D x where D is the block diagonal
3554 */
3555 cnt = 0;
3556 for (i = 0, row = 0; i < m; i++) {
3557 nodesz = sizes[i + 1] - sizes[i];
3558 switch (nodesz) {
3559 case 1:
3560 t[row] = b[row] - bdiag[cnt++] * x[row];
3561 row++;
3562 break;
3563 case 2:
3564 x1 = x[row];
3565 x2 = x[row + 1];
3566 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3567 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3568 t[row] = b[row] - tmp1;
3569 t[row + 1] = b[row + 1] - tmp2;
3570 row += 2;
3571 cnt += 4;
3572 break;
3573 case 3:
3574 x1 = x[row];
3575 x2 = x[row + 1];
3576 x3 = x[row + 2];
3577 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3578 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3579 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3580 t[row] = b[row] - tmp1;
3581 t[row + 1] = b[row + 1] - tmp2;
3582 t[row + 2] = b[row + 2] - tmp3;
3583 row += 3;
3584 cnt += 9;
3585 break;
3586 case 4:
3587 x1 = x[row];
3588 x2 = x[row + 1];
3589 x3 = x[row + 2];
3590 x4 = x[row + 3];
3591 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3592 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3593 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3594 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3595 t[row] = b[row] - tmp1;
3596 t[row + 1] = b[row + 1] - tmp2;
3597 t[row + 2] = b[row + 2] - tmp3;
3598 t[row + 3] = b[row + 3] - tmp4;
3599 row += 4;
3600 cnt += 16;
3601 break;
3602 case 5:
3603 x1 = x[row];
3604 x2 = x[row + 1];
3605 x3 = x[row + 2];
3606 x4 = x[row + 3];
3607 x5 = x[row + 4];
3608 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3609 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3610 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3611 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3612 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3613 t[row] = b[row] - tmp1;
3614 t[row + 1] = b[row + 1] - tmp2;
3615 t[row + 2] = b[row + 2] - tmp3;
3616 t[row + 3] = b[row + 3] - tmp4;
3617 t[row + 4] = b[row + 4] - tmp5;
3618 row += 5;
3619 cnt += 25;
3620 break;
3621 default:
3622 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3623 }
3624 }
3625 PetscCall(PetscLogFlops(m));
3626
3627 /*
3628 Apply (L + D)^-1 where D is the block diagonal
3629 */
3630 for (i = 0, row = 0; i < m; i++) {
3631 nodesz = sizes[i + 1] - sizes[i];
3632 sz = diag[row] - ii[row];
3633 v1 = a->a + ii[row];
3634 idx = a->j + ii[row];
3635 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3636 switch (nodesz) {
3637 case 1:
3638
3639 sum1 = t[row];
3640 for (n = 0; n < sz - 1; n += 2) {
3641 i1 = idx[0];
3642 i2 = idx[1];
3643 idx += 2;
3644 tmp0 = t[i1];
3645 tmp1 = t[i2];
3646 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3647 v1 += 2;
3648 }
3649
3650 if (n == sz - 1) {
3651 tmp0 = t[*idx];
3652 sum1 -= *v1 * tmp0;
3653 }
3654 x[row] += t[row] = sum1 * (*ibdiag++);
3655 row++;
3656 break;
3657 case 2:
3658 v2 = a->a + ii[row + 1];
3659 sum1 = t[row];
3660 sum2 = t[row + 1];
3661 for (n = 0; n < sz - 1; n += 2) {
3662 i1 = idx[0];
3663 i2 = idx[1];
3664 idx += 2;
3665 tmp0 = t[i1];
3666 tmp1 = t[i2];
3667 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3668 v1 += 2;
3669 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3670 v2 += 2;
3671 }
3672
3673 if (n == sz - 1) {
3674 tmp0 = t[*idx];
3675 sum1 -= v1[0] * tmp0;
3676 sum2 -= v2[0] * tmp0;
3677 }
3678 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3679 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3680 ibdiag += 4;
3681 row += 2;
3682 break;
3683 case 3:
3684 v2 = a->a + ii[row + 1];
3685 v3 = a->a + ii[row + 2];
3686 sum1 = t[row];
3687 sum2 = t[row + 1];
3688 sum3 = t[row + 2];
3689 for (n = 0; n < sz - 1; n += 2) {
3690 i1 = idx[0];
3691 i2 = idx[1];
3692 idx += 2;
3693 tmp0 = t[i1];
3694 tmp1 = t[i2];
3695 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3696 v1 += 2;
3697 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3698 v2 += 2;
3699 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3700 v3 += 2;
3701 }
3702
3703 if (n == sz - 1) {
3704 tmp0 = t[*idx];
3705 sum1 -= v1[0] * tmp0;
3706 sum2 -= v2[0] * tmp0;
3707 sum3 -= v3[0] * tmp0;
3708 }
3709 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3710 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3711 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3712 ibdiag += 9;
3713 row += 3;
3714 break;
3715 case 4:
3716 v2 = a->a + ii[row + 1];
3717 v3 = a->a + ii[row + 2];
3718 v4 = a->a + ii[row + 3];
3719 sum1 = t[row];
3720 sum2 = t[row + 1];
3721 sum3 = t[row + 2];
3722 sum4 = t[row + 3];
3723 for (n = 0; n < sz - 1; n += 2) {
3724 i1 = idx[0];
3725 i2 = idx[1];
3726 idx += 2;
3727 tmp0 = t[i1];
3728 tmp1 = t[i2];
3729 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3730 v1 += 2;
3731 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3732 v2 += 2;
3733 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3734 v3 += 2;
3735 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3736 v4 += 2;
3737 }
3738
3739 if (n == sz - 1) {
3740 tmp0 = t[*idx];
3741 sum1 -= v1[0] * tmp0;
3742 sum2 -= v2[0] * tmp0;
3743 sum3 -= v3[0] * tmp0;
3744 sum4 -= v4[0] * tmp0;
3745 }
3746 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3747 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3748 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3749 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3750 ibdiag += 16;
3751 row += 4;
3752 break;
3753 case 5:
3754 v2 = a->a + ii[row + 1];
3755 v3 = a->a + ii[row + 2];
3756 v4 = a->a + ii[row + 3];
3757 v5 = a->a + ii[row + 4];
3758 sum1 = t[row];
3759 sum2 = t[row + 1];
3760 sum3 = t[row + 2];
3761 sum4 = t[row + 3];
3762 sum5 = t[row + 4];
3763 for (n = 0; n < sz - 1; n += 2) {
3764 i1 = idx[0];
3765 i2 = idx[1];
3766 idx += 2;
3767 tmp0 = t[i1];
3768 tmp1 = t[i2];
3769 sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3770 v1 += 2;
3771 sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3772 v2 += 2;
3773 sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3774 v3 += 2;
3775 sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3776 v4 += 2;
3777 sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3778 v5 += 2;
3779 }
3780
3781 if (n == sz - 1) {
3782 tmp0 = t[*idx];
3783 sum1 -= v1[0] * tmp0;
3784 sum2 -= v2[0] * tmp0;
3785 sum3 -= v3[0] * tmp0;
3786 sum4 -= v4[0] * tmp0;
3787 sum5 -= v5[0] * tmp0;
3788 }
3789 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3790 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3791 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3792 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3793 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3794 ibdiag += 25;
3795 row += 5;
3796 break;
3797 default:
3798 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3799 }
3800 }
3801 PetscCall(PetscLogFlops(a->nz));
3802 }
3803 PetscCall(VecRestoreArray(xx, &x));
3804 PetscCall(VecRestoreArrayRead(bb, &b));
3805 PetscFunctionReturn(PETSC_SUCCESS);
3806 }
3807
MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)3808 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
3809 {
3810 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3811 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
3812 const MatScalar *bdiag = a->inode.bdiag;
3813 const PetscScalar *b;
3814 PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz;
3815 const PetscInt *sizes = a->inode.size_csr;
3816
3817 PetscFunctionBegin;
3818 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
3819 PetscCall(VecGetArray(xx, &x));
3820 PetscCall(VecGetArrayRead(bb, &b));
3821 cnt = 0;
3822 for (i = 0, row = 0; i < m; i++) {
3823 nodesz = sizes[i + 1] - sizes[i];
3824 switch (nodesz) {
3825 case 1:
3826 x[row] = b[row] * bdiag[cnt++];
3827 row++;
3828 break;
3829 case 2:
3830 x1 = b[row];
3831 x2 = b[row + 1];
3832 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3833 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3834 x[row++] = tmp1;
3835 x[row++] = tmp2;
3836 cnt += 4;
3837 break;
3838 case 3:
3839 x1 = b[row];
3840 x2 = b[row + 1];
3841 x3 = b[row + 2];
3842 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3843 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3844 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3845 x[row++] = tmp1;
3846 x[row++] = tmp2;
3847 x[row++] = tmp3;
3848 cnt += 9;
3849 break;
3850 case 4:
3851 x1 = b[row];
3852 x2 = b[row + 1];
3853 x3 = b[row + 2];
3854 x4 = b[row + 3];
3855 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3856 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3857 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3858 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3859 x[row++] = tmp1;
3860 x[row++] = tmp2;
3861 x[row++] = tmp3;
3862 x[row++] = tmp4;
3863 cnt += 16;
3864 break;
3865 case 5:
3866 x1 = b[row];
3867 x2 = b[row + 1];
3868 x3 = b[row + 2];
3869 x4 = b[row + 3];
3870 x5 = b[row + 4];
3871 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3872 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3873 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3874 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3875 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3876 x[row++] = tmp1;
3877 x[row++] = tmp2;
3878 x[row++] = tmp3;
3879 x[row++] = tmp4;
3880 x[row++] = tmp5;
3881 cnt += 25;
3882 break;
3883 default:
3884 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3885 }
3886 }
3887 PetscCall(PetscLogFlops(2.0 * cnt));
3888 PetscCall(VecRestoreArray(xx, &x));
3889 PetscCall(VecRestoreArrayRead(bb, &b));
3890 PetscFunctionReturn(PETSC_SUCCESS);
3891 }
3892
MatSeqAIJ_Inode_ResetOps(Mat A)3893 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
3894 {
3895 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3896
3897 PetscFunctionBegin;
3898 a->inode.node_count = 0;
3899 a->inode.use = PETSC_FALSE;
3900 a->inode.checked = PETSC_FALSE;
3901 a->inode.mat_nonzerostate = -1;
3902 A->ops->getrowij = MatGetRowIJ_SeqAIJ;
3903 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
3904 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
3905 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
3906 A->ops->coloringpatch = NULL;
3907 A->ops->multdiagonalblock = NULL;
3908 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
3909 PetscFunctionReturn(PETSC_SUCCESS);
3910 }
3911
3912 /*
3913 samestructure indicates that the matrix has not changed its nonzero structure so we
3914 do not need to recompute the inodes
3915 */
MatSeqAIJCheckInode(Mat A)3916 PetscErrorCode MatSeqAIJCheckInode(Mat A)
3917 {
3918 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3919 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size;
3920 PetscBool flag;
3921 const PetscInt *idx, *idy, *ii;
3922
3923 PetscFunctionBegin;
3924 if (!a->inode.use) {
3925 PetscCall(MatSeqAIJ_Inode_ResetOps(A));
3926 PetscCall(PetscFree(a->inode.size_csr));
3927 PetscFunctionReturn(PETSC_SUCCESS);
3928 }
3929 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
3930
3931 m = A->rmap->n;
3932 if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
3933 ns = a->inode.size_csr;
3934 ns[0] = 0;
3935
3936 i = 0;
3937 node_count = 0;
3938 idx = a->j;
3939 ii = a->i;
3940 if (idx) {
3941 while (i < m) { /* For each row */
3942 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
3943 /* Limits the number of elements in a node to 'a->inode.limit' */
3944 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
3945 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
3946 if (nzy != nzx) break;
3947 idy += nzx; /* Same nonzero pattern */
3948 PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
3949 if (!flag) break;
3950 }
3951 ns[node_count + 1] = ns[node_count] + blk_size;
3952 node_count++;
3953 idx += blk_size * nzx;
3954 i = j;
3955 }
3956 }
3957 /* If not enough inodes found,, do not use inode version of the routines */
3958 if (!m || !idx || node_count > .8 * m) {
3959 PetscCall(MatSeqAIJ_Inode_ResetOps(A));
3960 PetscCall(PetscFree(a->inode.size_csr));
3961 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
3962 } else {
3963 if (!A->factortype) {
3964 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3965 if (A->rmap->n == A->cmap->n) {
3966 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
3967 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
3968 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
3969 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
3970 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
3971 }
3972 } else {
3973 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
3974 }
3975 a->inode.node_count = node_count;
3976 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
3977 }
3978 a->inode.checked = PETSC_TRUE;
3979 a->inode.mat_nonzerostate = A->nonzerostate;
3980 PetscFunctionReturn(PETSC_SUCCESS);
3981 }
3982
MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat * C)3983 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
3984 {
3985 Mat B = *C;
3986 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
3987 PetscInt m = A->rmap->n;
3988
3989 PetscFunctionBegin;
3990 c->inode.use = a->inode.use;
3991 c->inode.limit = a->inode.limit;
3992 c->inode.max_limit = a->inode.max_limit;
3993 c->inode.checked = PETSC_FALSE;
3994 c->inode.size_csr = NULL;
3995 c->inode.node_count = 0;
3996 c->inode.ibdiag = NULL;
3997 c->inode.bdiag = NULL;
3998 c->inode.mat_nonzerostate = -1;
3999 if (a->inode.use) {
4000 if (a->inode.checked && a->inode.size_csr) {
4001 PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4002 PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4003
4004 c->inode.checked = PETSC_TRUE;
4005 c->inode.node_count = a->inode.node_count;
4006 c->inode.mat_nonzerostate = (*C)->nonzerostate;
4007 }
4008 /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4009 if (!B->factortype) {
4010 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4011 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4012 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4013 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4014 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4015 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4016 } else {
4017 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4018 }
4019 }
4020 PetscFunctionReturn(PETSC_SUCCESS);
4021 }
4022
MatGetRow_FactoredLU(PetscInt * cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt * ai,const PetscInt * aj,const PetscInt * adiag,PetscInt row)4023 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4024 {
4025 PetscInt k;
4026 const PetscInt *vi;
4027
4028 PetscFunctionBegin;
4029 vi = aj + ai[row];
4030 for (k = 0; k < nzl; k++) cols[k] = vi[k];
4031 vi = aj + adiag[row];
4032 cols[nzl] = vi[0];
4033 vi = aj + adiag[row + 1] + 1;
4034 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4035 PetscFunctionReturn(PETSC_SUCCESS);
4036 }
4037 /*
4038 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4039 Modified from MatSeqAIJCheckInode().
4040
4041 Input Parameters:
4042 . Mat A - ILU or LU matrix factor
4043
4044 */
MatSeqAIJCheckInode_FactorLU(Mat A)4045 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4046 {
4047 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4048 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4049 PetscInt *cols1, *cols2, *ns;
4050 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4051 PetscBool flag;
4052
4053 PetscFunctionBegin;
4054 if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4055 if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4056
4057 m = A->rmap->n;
4058 if (a->inode.size_csr) ns = a->inode.size_csr;
4059 else PetscCall(PetscMalloc1(m + 1, &ns));
4060 ns[0] = 0;
4061
4062 i = 0;
4063 node_count = 0;
4064 PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4065 while (i < m) { /* For each row */
4066 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */
4067 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4068 nzx = nzl1 + nzu1 + 1;
4069 PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4070
4071 /* Limits the number of elements in a node to 'a->inode.limit' */
4072 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4073 nzl2 = ai[j + 1] - ai[j];
4074 nzu2 = adiag[j] - adiag[j + 1] - 1;
4075 nzy = nzl2 + nzu2 + 1;
4076 if (nzy != nzx) break;
4077 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4078 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4079 if (!flag) break;
4080 }
4081 ns[node_count + 1] = ns[node_count] + blk_size;
4082 node_count++;
4083 i = j;
4084 }
4085 PetscCall(PetscFree2(cols1, cols2));
4086 /* If not enough inodes found,, do not use inode version of the routines */
4087 if (!m || node_count > .8 * m) {
4088 PetscCall(PetscFree(ns));
4089
4090 a->inode.node_count = 0;
4091 a->inode.size_csr = NULL;
4092 a->inode.use = PETSC_FALSE;
4093
4094 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4095 } else {
4096 A->ops->mult = NULL;
4097 A->ops->sor = NULL;
4098 A->ops->multadd = NULL;
4099 A->ops->getrowij = NULL;
4100 A->ops->restorerowij = NULL;
4101 A->ops->getcolumnij = NULL;
4102 A->ops->restorecolumnij = NULL;
4103 A->ops->coloringpatch = NULL;
4104 A->ops->multdiagonalblock = NULL;
4105 a->inode.node_count = node_count;
4106 a->inode.size_csr = ns;
4107 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4108 }
4109 a->inode.checked = PETSC_TRUE;
4110 PetscFunctionReturn(PETSC_SUCCESS);
4111 }
4112
4113 /*
4114 This is really ugly. if inodes are used this replaces the
4115 permutations with ones that correspond to rows/cols of the matrix
4116 rather than inode blocks
4117 */
MatInodeAdjustForInodes(Mat A,IS * rperm,IS * cperm)4118 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4119 {
4120 PetscFunctionBegin;
4121 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4122 PetscFunctionReturn(PETSC_SUCCESS);
4123 }
4124
MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS * rperm,IS * cperm)4125 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4126 {
4127 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4128 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4129 const PetscInt *ridx, *cidx;
4130 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
4131 PetscInt nslim_col, *ns_col;
4132 IS ris = *rperm, cis = *cperm;
4133
4134 PetscFunctionBegin;
4135 if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */
4136 if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4137
4138 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4139 PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4140 PetscCall(PetscMalloc2(m, &permr, n, &permc));
4141
4142 PetscCall(ISGetIndices(ris, &ridx));
4143 PetscCall(ISGetIndices(cis, &cidx));
4144
4145 /* Form the inode structure for the rows of permuted matrix using inv perm*/
4146 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
4147
4148 /* Construct the permutations for rows*/
4149 for (i = 0, row = 0; i < nslim_row; ++i) {
4150 indx = ridx[i];
4151 start_val = tns[indx];
4152 end_val = tns[indx + 1];
4153 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4154 }
4155
4156 /* Form the inode structure for the columns of permuted matrix using inv perm*/
4157 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
4158
4159 /* Construct permutations for columns */
4160 for (i = 0, col = 0; i < nslim_col; ++i) {
4161 indx = cidx[i];
4162 start_val = tns[indx];
4163 end_val = tns[indx + 1];
4164 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4165 }
4166
4167 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4168 PetscCall(ISSetPermutation(*rperm));
4169 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4170 PetscCall(ISSetPermutation(*cperm));
4171
4172 PetscCall(ISRestoreIndices(ris, &ridx));
4173 PetscCall(ISRestoreIndices(cis, &cidx));
4174
4175 PetscCall(PetscFree(ns_col));
4176 PetscCall(PetscFree2(permr, permc));
4177 PetscCall(ISDestroy(&cis));
4178 PetscCall(ISDestroy(&ris));
4179 PetscCall(PetscFree(tns));
4180 PetscFunctionReturn(PETSC_SUCCESS);
4181 }
4182
4183 /*@C
4184 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4185
4186 Not Collective
4187
4188 Input Parameter:
4189 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4190
4191 Output Parameters:
4192 + node_count - no of inodes present in the matrix.
4193 . sizes - an array of size `node_count`, with the sizes of each inode.
4194 - limit - the max size used to generate the inodes.
4195
4196 Level: advanced
4197
4198 Note:
4199 It should be called after the matrix is assembled.
4200 The contents of the sizes[] array should not be changed.
4201 `NULL` may be passed for information not needed
4202
4203 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4204 @*/
MatInodeGetInodeSizes(Mat A,PetscInt * node_count,PetscInt * sizes[],PetscInt * limit)4205 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4206 {
4207 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4208
4209 PetscFunctionBegin;
4210 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4211 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4212 if (f) PetscCall((*f)(A, node_count, sizes, limit));
4213 PetscFunctionReturn(PETSC_SUCCESS);
4214 }
4215
MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt * node_count,PetscInt * sizes[],PetscInt * limit)4216 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4217 {
4218 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4219
4220 PetscFunctionBegin;
4221 if (node_count) *node_count = a->inode.node_count;
4222 if (sizes) *sizes = a->inode.size_csr;
4223 if (limit) *limit = a->inode.limit;
4224 PetscFunctionReturn(PETSC_SUCCESS);
4225 }
4226