xref: /petsc/src/mat/impls/aij/seq/inode.c (revision fbf9dbe564678ed6eff1806adbc4c4f01b9743f4)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 #if defined(PETSC_HAVE_XMMINTRIN_H)
8   #include <xmmintrin.h>
9 #endif
10 
11 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
12 {
13   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
14   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
15 
16   PetscFunctionBegin;
17   n = A->cmap->n;
18   m = A->rmap->n;
19   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
25       ;
26     for (; count + 1 < n; count++, i++)
27       ;
28     if (count < n) i++;
29     *size = i;
30     PetscFunctionReturn(PETSC_SUCCESS);
31   }
32   PetscCall(PetscMalloc1(n + 1, &ns_col));
33 
34   /* Use the same row structure wherever feasible. */
35   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
36 
37   /* if m < n; pad up the remainder with inode_limit */
38   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
39   /* The last node is the odd ball. pad it up with the remaining rows; */
40   if (count < n) {
41     ns_col[i] = n - count;
42     i++;
43   } else if (count > n) {
44     /* Adjust for the over estimation */
45     ns_col[i - 1] += n - count;
46   }
47   *size = i;
48   *ns   = ns_col;
49   PetscFunctionReturn(PETSC_SUCCESS);
50 }
51 
52 /*
53       This builds symmetric version of nonzero structure,
54 */
55 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
56 {
57   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
58   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
59   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
60   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
61 
62   PetscFunctionBegin;
63   nslim_row = a->inode.node_count;
64   m         = A->rmap->n;
65   n         = A->cmap->n;
66   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
67   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
68 
69   /* Use the row_inode as column_inode */
70   nslim_col = nslim_row;
71   ns_col    = ns_row;
72 
73   /* allocate space for reformatted inode structure */
74   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
75   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
76 
77   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
78     nsz = ns_col[i1];
79     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
80   }
81   /* allocate space for row pointers */
82   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
83   *iia = ia;
84   PetscCall(PetscMalloc1(nslim_row + 1, &work));
85 
86   /* determine the number of columns in each row */
87   ia[0] = oshift;
88   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
89     j    = aj + ai[row] + ishift;
90     jmax = aj + ai[row + 1] + ishift;
91     if (j == jmax) continue; /* empty row */
92     col = *j++ + ishift;
93     i2  = tvc[col];
94     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
95       ia[i1 + 1]++;
96       ia[i2 + 1]++;
97       i2++; /* Start col of next node */
98       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
99       i2 = tvc[col];
100     }
101     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
102   }
103 
104   /* shift ia[i] to point to next row */
105   for (i1 = 1; i1 < nslim_row + 1; i1++) {
106     row = ia[i1 - 1];
107     ia[i1] += row;
108     work[i1 - 1] = row - oshift;
109   }
110 
111   /* allocate space for column pointers */
112   nz = ia[nslim_row] + (!ishift);
113   PetscCall(PetscMalloc1(nz, &ja));
114   *jja = ja;
115 
116   /* loop over lower triangular part putting into ja */
117   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
118     j    = aj + ai[row] + ishift;
119     jmax = aj + ai[row + 1] + ishift;
120     if (j == jmax) continue; /* empty row */
121     col = *j++ + ishift;
122     i2  = tvc[col];
123     while (i2 < i1 && j < jmax) {
124       ja[work[i2]++] = i1 + oshift;
125       ja[work[i1]++] = i2 + oshift;
126       ++i2;
127       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128       i2 = tvc[col];
129     }
130     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131   }
132   PetscCall(PetscFree(work));
133   PetscCall(PetscFree2(tns, tvc));
134   PetscFunctionReturn(PETSC_SUCCESS);
135 }
136 
137 /*
138       This builds nonsymmetric version of nonzero structure,
139 */
140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141 {
142   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
143   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144   PetscInt       *tns, *tvc, nsz, i1, i2;
145   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
146 
147   PetscFunctionBegin;
148   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149   nslim_row = a->inode.node_count;
150   n         = A->cmap->n;
151 
152   /* Create The column_inode for this matrix */
153   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
154 
155   /* allocate space for reformatted column_inode structure */
156   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
158 
159   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160     nsz = ns_col[i1];
161     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162   }
163   /* allocate space for row pointers */
164   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165   *iia = ia;
166   PetscCall(PetscMalloc1(nslim_row + 1, &work));
167 
168   /* determine the number of columns in each row */
169   ia[0] = oshift;
170   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
171     j  = aj + ai[row] + ishift;
172     nz = ai[row + 1] - ai[row];
173     if (!nz) continue; /* empty row */
174     col = *j++ + ishift;
175     i2  = tvc[col];
176     while (nz-- > 0) { /* off-diagonal elements */
177       ia[i1 + 1]++;
178       i2++; /* Start col of next node */
179       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
180       if (nz > 0) i2 = tvc[col];
181     }
182   }
183 
184   /* shift ia[i] to point to next row */
185   for (i1 = 1; i1 < nslim_row + 1; i1++) {
186     row = ia[i1 - 1];
187     ia[i1] += row;
188     work[i1 - 1] = row - oshift;
189   }
190 
191   /* allocate space for column pointers */
192   nz = ia[nslim_row] + (!ishift);
193   PetscCall(PetscMalloc1(nz, &ja));
194   *jja = ja;
195 
196   /* loop over matrix putting into ja */
197   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
198     j  = aj + ai[row] + ishift;
199     nz = ai[row + 1] - ai[row];
200     if (!nz) continue; /* empty row */
201     col = *j++ + ishift;
202     i2  = tvc[col];
203     while (nz-- > 0) {
204       ja[work[i1]++] = i2 + oshift;
205       ++i2;
206       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
207       if (nz > 0) i2 = tvc[col];
208     }
209   }
210   PetscCall(PetscFree(ns_col));
211   PetscCall(PetscFree(work));
212   PetscCall(PetscFree2(tns, tvc));
213   PetscFunctionReturn(PETSC_SUCCESS);
214 }
215 
216 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
217 {
218   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
219 
220   PetscFunctionBegin;
221   if (n) *n = a->inode.node_count;
222   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
223   if (!blockcompressed) {
224     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
225   } else if (symmetric) {
226     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
227   } else {
228     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
229   }
230   PetscFunctionReturn(PETSC_SUCCESS);
231 }
232 
233 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
234 {
235   PetscFunctionBegin;
236   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
237 
238   if (!blockcompressed) {
239     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
240   } else {
241     PetscCall(PetscFree(*ia));
242     PetscCall(PetscFree(*ja));
243   }
244   PetscFunctionReturn(PETSC_SUCCESS);
245 }
246 
247 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
248 {
249   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
250   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
251   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
252 
253   PetscFunctionBegin;
254   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
255   nslim_row = a->inode.node_count;
256   n         = A->cmap->n;
257 
258   /* Create The column_inode for this matrix */
259   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
260 
261   /* allocate space for reformatted column_inode structure */
262   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
263   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
264 
265   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
266     nsz = ns_col[i1];
267     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
268   }
269   /* allocate space for column pointers */
270   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
271   *iia = ia;
272   PetscCall(PetscMalloc1(nslim_col + 1, &work));
273 
274   /* determine the number of columns in each row */
275   ia[0] = oshift;
276   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
277     j   = aj + ai[row] + ishift;
278     col = *j++ + ishift;
279     i2  = tvc[col];
280     nz  = ai[row + 1] - ai[row];
281     while (nz-- > 0) { /* off-diagonal elements */
282       /* ia[i1+1]++; */
283       ia[i2 + 1]++;
284       i2++;
285       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
286       if (nz > 0) i2 = tvc[col];
287     }
288   }
289 
290   /* shift ia[i] to point to next col */
291   for (i1 = 1; i1 < nslim_col + 1; i1++) {
292     col = ia[i1 - 1];
293     ia[i1] += col;
294     work[i1 - 1] = col - oshift;
295   }
296 
297   /* allocate space for column pointers */
298   nz = ia[nslim_col] + (!ishift);
299   PetscCall(PetscMalloc1(nz, &ja));
300   *jja = ja;
301 
302   /* loop over matrix putting into ja */
303   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
304     j   = aj + ai[row] + ishift;
305     col = *j++ + ishift;
306     i2  = tvc[col];
307     nz  = ai[row + 1] - ai[row];
308     while (nz-- > 0) {
309       /* ja[work[i1]++] = i2 + oshift; */
310       ja[work[i2]++] = i1 + oshift;
311       i2++;
312       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
313       if (nz > 0) i2 = tvc[col];
314     }
315   }
316   PetscCall(PetscFree(ns_col));
317   PetscCall(PetscFree(work));
318   PetscCall(PetscFree2(tns, tvc));
319   PetscFunctionReturn(PETSC_SUCCESS);
320 }
321 
322 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
323 {
324   PetscFunctionBegin;
325   PetscCall(MatCreateColInode_Private(A, n, NULL));
326   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
327 
328   if (!blockcompressed) {
329     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
330   } else if (symmetric) {
331     /* Since the indices are symmetric it doesn't matter */
332     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
333   } else {
334     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
335   }
336   PetscFunctionReturn(PETSC_SUCCESS);
337 }
338 
339 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
340 {
341   PetscFunctionBegin;
342   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
343   if (!blockcompressed) {
344     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
345   } else {
346     PetscCall(PetscFree(*ia));
347     PetscCall(PetscFree(*ja));
348   }
349   PetscFunctionReturn(PETSC_SUCCESS);
350 }
351 
352 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
353 {
354   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
355   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
356   PetscScalar       *y;
357   const PetscScalar *x;
358   const MatScalar   *v1, *v2, *v3, *v4, *v5;
359   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
360   const PetscInt    *idx, *ns, *ii;
361 
362 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
363   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
364 #endif
365 
366   PetscFunctionBegin;
367   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
368   node_max = a->inode.node_count;
369   ns       = a->inode.size; /* Node Size array */
370   PetscCall(VecGetArrayRead(xx, &x));
371   PetscCall(VecGetArray(yy, &y));
372   idx = a->j;
373   v1  = a->a;
374   ii  = a->i;
375 
376   for (i = 0, row = 0; i < node_max; ++i) {
377     nsz = ns[i];
378     n   = ii[1] - ii[0];
379     nonzerorow += (n > 0) * nsz;
380     ii += nsz;
381     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
382     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
383     sz = n;                                                                /* No of non zeros in this row */
384                                                                            /* Switch on the size of Node */
385     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
386     case 1:
387       sum1 = 0.;
388 
389       for (n = 0; n < sz - 1; n += 2) {
390         i1 = idx[0]; /* The instructions are ordered to */
391         i2 = idx[1]; /* make the compiler's job easy */
392         idx += 2;
393         tmp0 = x[i1];
394         tmp1 = x[i2];
395         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
396         v1 += 2;
397       }
398 
399       if (n == sz - 1) { /* Take care of the last nonzero  */
400         tmp0 = x[*idx++];
401         sum1 += *v1++ * tmp0;
402       }
403       y[row++] = sum1;
404       break;
405     case 2:
406       sum1 = 0.;
407       sum2 = 0.;
408       v2   = v1 + n;
409 
410       for (n = 0; n < sz - 1; n += 2) {
411         i1 = idx[0];
412         i2 = idx[1];
413         idx += 2;
414         tmp0 = x[i1];
415         tmp1 = x[i2];
416         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
417         v1 += 2;
418         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
419         v2 += 2;
420       }
421       if (n == sz - 1) {
422         tmp0 = x[*idx++];
423         sum1 += *v1++ * tmp0;
424         sum2 += *v2++ * tmp0;
425       }
426       y[row++] = sum1;
427       y[row++] = sum2;
428       v1       = v2; /* Since the next block to be processed starts there*/
429       idx += sz;
430       break;
431     case 3:
432       sum1 = 0.;
433       sum2 = 0.;
434       sum3 = 0.;
435       v2   = v1 + n;
436       v3   = v2 + n;
437 
438       for (n = 0; n < sz - 1; n += 2) {
439         i1 = idx[0];
440         i2 = idx[1];
441         idx += 2;
442         tmp0 = x[i1];
443         tmp1 = x[i2];
444         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
445         v1 += 2;
446         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
447         v2 += 2;
448         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
449         v3 += 2;
450       }
451       if (n == sz - 1) {
452         tmp0 = x[*idx++];
453         sum1 += *v1++ * tmp0;
454         sum2 += *v2++ * tmp0;
455         sum3 += *v3++ * tmp0;
456       }
457       y[row++] = sum1;
458       y[row++] = sum2;
459       y[row++] = sum3;
460       v1       = v3; /* Since the next block to be processed starts there*/
461       idx += 2 * sz;
462       break;
463     case 4:
464       sum1 = 0.;
465       sum2 = 0.;
466       sum3 = 0.;
467       sum4 = 0.;
468       v2   = v1 + n;
469       v3   = v2 + n;
470       v4   = v3 + n;
471 
472       for (n = 0; n < sz - 1; n += 2) {
473         i1 = idx[0];
474         i2 = idx[1];
475         idx += 2;
476         tmp0 = x[i1];
477         tmp1 = x[i2];
478         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
479         v1 += 2;
480         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
481         v2 += 2;
482         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
483         v3 += 2;
484         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
485         v4 += 2;
486       }
487       if (n == sz - 1) {
488         tmp0 = x[*idx++];
489         sum1 += *v1++ * tmp0;
490         sum2 += *v2++ * tmp0;
491         sum3 += *v3++ * tmp0;
492         sum4 += *v4++ * tmp0;
493       }
494       y[row++] = sum1;
495       y[row++] = sum2;
496       y[row++] = sum3;
497       y[row++] = sum4;
498       v1       = v4; /* Since the next block to be processed starts there*/
499       idx += 3 * sz;
500       break;
501     case 5:
502       sum1 = 0.;
503       sum2 = 0.;
504       sum3 = 0.;
505       sum4 = 0.;
506       sum5 = 0.;
507       v2   = v1 + n;
508       v3   = v2 + n;
509       v4   = v3 + n;
510       v5   = v4 + n;
511 
512       for (n = 0; n < sz - 1; n += 2) {
513         i1 = idx[0];
514         i2 = idx[1];
515         idx += 2;
516         tmp0 = x[i1];
517         tmp1 = x[i2];
518         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
519         v1 += 2;
520         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
521         v2 += 2;
522         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
523         v3 += 2;
524         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
525         v4 += 2;
526         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
527         v5 += 2;
528       }
529       if (n == sz - 1) {
530         tmp0 = x[*idx++];
531         sum1 += *v1++ * tmp0;
532         sum2 += *v2++ * tmp0;
533         sum3 += *v3++ * tmp0;
534         sum4 += *v4++ * tmp0;
535         sum5 += *v5++ * tmp0;
536       }
537       y[row++] = sum1;
538       y[row++] = sum2;
539       y[row++] = sum3;
540       y[row++] = sum4;
541       y[row++] = sum5;
542       v1       = v5; /* Since the next block to be processed starts there */
543       idx += 4 * sz;
544       break;
545     default:
546       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
547     }
548   }
549   PetscCall(VecRestoreArrayRead(xx, &x));
550   PetscCall(VecRestoreArray(yy, &y));
551   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
552   PetscFunctionReturn(PETSC_SUCCESS);
553 }
554 
555 /* Almost same code as the MatMult_SeqAIJ_Inode() */
556 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
557 {
558   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
559   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
560   const MatScalar   *v1, *v2, *v3, *v4, *v5;
561   const PetscScalar *x;
562   PetscScalar       *y, *z, *zt;
563   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
564   const PetscInt    *idx, *ns, *ii;
565 
566   PetscFunctionBegin;
567   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
568   node_max = a->inode.node_count;
569   ns       = a->inode.size; /* Node Size array */
570 
571   PetscCall(VecGetArrayRead(xx, &x));
572   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
573   zt = z;
574 
575   idx = a->j;
576   v1  = a->a;
577   ii  = a->i;
578 
579   for (i = 0, row = 0; i < node_max; ++i) {
580     nsz = ns[i];
581     n   = ii[1] - ii[0];
582     ii += nsz;
583     sz = n;        /* No of non zeros in this row */
584                    /* Switch on the size of Node */
585     switch (nsz) { /* Each loop in 'case' is unrolled */
586     case 1:
587       sum1 = *zt++;
588 
589       for (n = 0; n < sz - 1; n += 2) {
590         i1 = idx[0]; /* The instructions are ordered to */
591         i2 = idx[1]; /* make the compiler's job easy */
592         idx += 2;
593         tmp0 = x[i1];
594         tmp1 = x[i2];
595         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
596         v1 += 2;
597       }
598 
599       if (n == sz - 1) { /* Take care of the last nonzero  */
600         tmp0 = x[*idx++];
601         sum1 += *v1++ * tmp0;
602       }
603       y[row++] = sum1;
604       break;
605     case 2:
606       sum1 = *zt++;
607       sum2 = *zt++;
608       v2   = v1 + n;
609 
610       for (n = 0; n < sz - 1; n += 2) {
611         i1 = idx[0];
612         i2 = idx[1];
613         idx += 2;
614         tmp0 = x[i1];
615         tmp1 = x[i2];
616         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
617         v1 += 2;
618         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
619         v2 += 2;
620       }
621       if (n == sz - 1) {
622         tmp0 = x[*idx++];
623         sum1 += *v1++ * tmp0;
624         sum2 += *v2++ * tmp0;
625       }
626       y[row++] = sum1;
627       y[row++] = sum2;
628       v1       = v2; /* Since the next block to be processed starts there*/
629       idx += sz;
630       break;
631     case 3:
632       sum1 = *zt++;
633       sum2 = *zt++;
634       sum3 = *zt++;
635       v2   = v1 + n;
636       v3   = v2 + n;
637 
638       for (n = 0; n < sz - 1; n += 2) {
639         i1 = idx[0];
640         i2 = idx[1];
641         idx += 2;
642         tmp0 = x[i1];
643         tmp1 = x[i2];
644         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
645         v1 += 2;
646         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
647         v2 += 2;
648         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
649         v3 += 2;
650       }
651       if (n == sz - 1) {
652         tmp0 = x[*idx++];
653         sum1 += *v1++ * tmp0;
654         sum2 += *v2++ * tmp0;
655         sum3 += *v3++ * tmp0;
656       }
657       y[row++] = sum1;
658       y[row++] = sum2;
659       y[row++] = sum3;
660       v1       = v3; /* Since the next block to be processed starts there*/
661       idx += 2 * sz;
662       break;
663     case 4:
664       sum1 = *zt++;
665       sum2 = *zt++;
666       sum3 = *zt++;
667       sum4 = *zt++;
668       v2   = v1 + n;
669       v3   = v2 + n;
670       v4   = v3 + n;
671 
672       for (n = 0; n < sz - 1; n += 2) {
673         i1 = idx[0];
674         i2 = idx[1];
675         idx += 2;
676         tmp0 = x[i1];
677         tmp1 = x[i2];
678         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
679         v1 += 2;
680         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
681         v2 += 2;
682         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
683         v3 += 2;
684         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
685         v4 += 2;
686       }
687       if (n == sz - 1) {
688         tmp0 = x[*idx++];
689         sum1 += *v1++ * tmp0;
690         sum2 += *v2++ * tmp0;
691         sum3 += *v3++ * tmp0;
692         sum4 += *v4++ * tmp0;
693       }
694       y[row++] = sum1;
695       y[row++] = sum2;
696       y[row++] = sum3;
697       y[row++] = sum4;
698       v1       = v4; /* Since the next block to be processed starts there*/
699       idx += 3 * sz;
700       break;
701     case 5:
702       sum1 = *zt++;
703       sum2 = *zt++;
704       sum3 = *zt++;
705       sum4 = *zt++;
706       sum5 = *zt++;
707       v2   = v1 + n;
708       v3   = v2 + n;
709       v4   = v3 + n;
710       v5   = v4 + n;
711 
712       for (n = 0; n < sz - 1; n += 2) {
713         i1 = idx[0];
714         i2 = idx[1];
715         idx += 2;
716         tmp0 = x[i1];
717         tmp1 = x[i2];
718         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
719         v1 += 2;
720         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
721         v2 += 2;
722         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
723         v3 += 2;
724         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
725         v4 += 2;
726         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
727         v5 += 2;
728       }
729       if (n == sz - 1) {
730         tmp0 = x[*idx++];
731         sum1 += *v1++ * tmp0;
732         sum2 += *v2++ * tmp0;
733         sum3 += *v3++ * tmp0;
734         sum4 += *v4++ * tmp0;
735         sum5 += *v5++ * tmp0;
736       }
737       y[row++] = sum1;
738       y[row++] = sum2;
739       y[row++] = sum3;
740       y[row++] = sum4;
741       y[row++] = sum5;
742       v1       = v5; /* Since the next block to be processed starts there */
743       idx += 4 * sz;
744       break;
745     default:
746       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
747     }
748   }
749   PetscCall(VecRestoreArrayRead(xx, &x));
750   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
751   PetscCall(PetscLogFlops(2.0 * a->nz));
752   PetscFunctionReturn(PETSC_SUCCESS);
753 }
754 
755 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
756 {
757   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
758   IS                 iscol = a->col, isrow = a->row;
759   const PetscInt    *r, *c, *rout, *cout;
760   PetscInt           i, j, n = A->rmap->n, nz;
761   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
762   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
763   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
764   PetscScalar        sum1, sum2, sum3, sum4, sum5;
765   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
766   const PetscScalar *b;
767 
768   PetscFunctionBegin;
769   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
770   node_max = a->inode.node_count;
771   ns       = a->inode.size; /* Node Size array */
772 
773   PetscCall(VecGetArrayRead(bb, &b));
774   PetscCall(VecGetArrayWrite(xx, &x));
775   tmp = a->solve_work;
776 
777   PetscCall(ISGetIndices(isrow, &rout));
778   r = rout;
779   PetscCall(ISGetIndices(iscol, &cout));
780   c = cout + (n - 1);
781 
782   /* forward solve the lower triangular */
783   tmps = tmp;
784   aa   = a_a;
785   aj   = a_j;
786   ad   = a->diag;
787 
788   for (i = 0, row = 0; i < node_max; ++i) {
789     nsz = ns[i];
790     aii = ai[row];
791     v1  = aa + aii;
792     vi  = aj + aii;
793     nz  = ad[row] - aii;
794     if (i < node_max - 1) {
795       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
796       * but our indexing to determine its size could. */
797       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
798       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
799       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
800       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
801     }
802 
803     switch (nsz) { /* Each loop in 'case' is unrolled */
804     case 1:
805       sum1 = b[*r++];
806       for (j = 0; j < nz - 1; j += 2) {
807         i0 = vi[0];
808         i1 = vi[1];
809         vi += 2;
810         tmp0 = tmps[i0];
811         tmp1 = tmps[i1];
812         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
813         v1 += 2;
814       }
815       if (j == nz - 1) {
816         tmp0 = tmps[*vi++];
817         sum1 -= *v1++ * tmp0;
818       }
819       tmp[row++] = sum1;
820       break;
821     case 2:
822       sum1 = b[*r++];
823       sum2 = b[*r++];
824       v2   = aa + ai[row + 1];
825 
826       for (j = 0; j < nz - 1; j += 2) {
827         i0 = vi[0];
828         i1 = vi[1];
829         vi += 2;
830         tmp0 = tmps[i0];
831         tmp1 = tmps[i1];
832         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
833         v1 += 2;
834         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
835         v2 += 2;
836       }
837       if (j == nz - 1) {
838         tmp0 = tmps[*vi++];
839         sum1 -= *v1++ * tmp0;
840         sum2 -= *v2++ * tmp0;
841       }
842       sum2 -= *v2++ * sum1;
843       tmp[row++] = sum1;
844       tmp[row++] = sum2;
845       break;
846     case 3:
847       sum1 = b[*r++];
848       sum2 = b[*r++];
849       sum3 = b[*r++];
850       v2   = aa + ai[row + 1];
851       v3   = aa + ai[row + 2];
852 
853       for (j = 0; j < nz - 1; j += 2) {
854         i0 = vi[0];
855         i1 = vi[1];
856         vi += 2;
857         tmp0 = tmps[i0];
858         tmp1 = tmps[i1];
859         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
860         v1 += 2;
861         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
862         v2 += 2;
863         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
864         v3 += 2;
865       }
866       if (j == nz - 1) {
867         tmp0 = tmps[*vi++];
868         sum1 -= *v1++ * tmp0;
869         sum2 -= *v2++ * tmp0;
870         sum3 -= *v3++ * tmp0;
871       }
872       sum2 -= *v2++ * sum1;
873       sum3 -= *v3++ * sum1;
874       sum3 -= *v3++ * sum2;
875 
876       tmp[row++] = sum1;
877       tmp[row++] = sum2;
878       tmp[row++] = sum3;
879       break;
880 
881     case 4:
882       sum1 = b[*r++];
883       sum2 = b[*r++];
884       sum3 = b[*r++];
885       sum4 = b[*r++];
886       v2   = aa + ai[row + 1];
887       v3   = aa + ai[row + 2];
888       v4   = aa + ai[row + 3];
889 
890       for (j = 0; j < nz - 1; j += 2) {
891         i0 = vi[0];
892         i1 = vi[1];
893         vi += 2;
894         tmp0 = tmps[i0];
895         tmp1 = tmps[i1];
896         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
897         v1 += 2;
898         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
899         v2 += 2;
900         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
901         v3 += 2;
902         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
903         v4 += 2;
904       }
905       if (j == nz - 1) {
906         tmp0 = tmps[*vi++];
907         sum1 -= *v1++ * tmp0;
908         sum2 -= *v2++ * tmp0;
909         sum3 -= *v3++ * tmp0;
910         sum4 -= *v4++ * tmp0;
911       }
912       sum2 -= *v2++ * sum1;
913       sum3 -= *v3++ * sum1;
914       sum4 -= *v4++ * sum1;
915       sum3 -= *v3++ * sum2;
916       sum4 -= *v4++ * sum2;
917       sum4 -= *v4++ * sum3;
918 
919       tmp[row++] = sum1;
920       tmp[row++] = sum2;
921       tmp[row++] = sum3;
922       tmp[row++] = sum4;
923       break;
924     case 5:
925       sum1 = b[*r++];
926       sum2 = b[*r++];
927       sum3 = b[*r++];
928       sum4 = b[*r++];
929       sum5 = b[*r++];
930       v2   = aa + ai[row + 1];
931       v3   = aa + ai[row + 2];
932       v4   = aa + ai[row + 3];
933       v5   = aa + ai[row + 4];
934 
935       for (j = 0; j < nz - 1; j += 2) {
936         i0 = vi[0];
937         i1 = vi[1];
938         vi += 2;
939         tmp0 = tmps[i0];
940         tmp1 = tmps[i1];
941         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
942         v1 += 2;
943         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
944         v2 += 2;
945         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
946         v3 += 2;
947         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
948         v4 += 2;
949         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
950         v5 += 2;
951       }
952       if (j == nz - 1) {
953         tmp0 = tmps[*vi++];
954         sum1 -= *v1++ * tmp0;
955         sum2 -= *v2++ * tmp0;
956         sum3 -= *v3++ * tmp0;
957         sum4 -= *v4++ * tmp0;
958         sum5 -= *v5++ * tmp0;
959       }
960 
961       sum2 -= *v2++ * sum1;
962       sum3 -= *v3++ * sum1;
963       sum4 -= *v4++ * sum1;
964       sum5 -= *v5++ * sum1;
965       sum3 -= *v3++ * sum2;
966       sum4 -= *v4++ * sum2;
967       sum5 -= *v5++ * sum2;
968       sum4 -= *v4++ * sum3;
969       sum5 -= *v5++ * sum3;
970       sum5 -= *v5++ * sum4;
971 
972       tmp[row++] = sum1;
973       tmp[row++] = sum2;
974       tmp[row++] = sum3;
975       tmp[row++] = sum4;
976       tmp[row++] = sum5;
977       break;
978     default:
979       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
980     }
981   }
982   /* backward solve the upper triangular */
983   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
984     nsz = ns[i];
985     aii = ai[row + 1] - 1;
986     v1  = aa + aii;
987     vi  = aj + aii;
988     nz  = aii - ad[row];
989     switch (nsz) { /* Each loop in 'case' is unrolled */
990     case 1:
991       sum1 = tmp[row];
992 
993       for (j = nz; j > 1; j -= 2) {
994         vi -= 2;
995         i0   = vi[2];
996         i1   = vi[1];
997         tmp0 = tmps[i0];
998         tmp1 = tmps[i1];
999         v1 -= 2;
1000         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1001       }
1002       if (j == 1) {
1003         tmp0 = tmps[*vi--];
1004         sum1 -= *v1-- * tmp0;
1005       }
1006       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1007       row--;
1008       break;
1009     case 2:
1010       sum1 = tmp[row];
1011       sum2 = tmp[row - 1];
1012       v2   = aa + ai[row] - 1;
1013       for (j = nz; j > 1; j -= 2) {
1014         vi -= 2;
1015         i0   = vi[2];
1016         i1   = vi[1];
1017         tmp0 = tmps[i0];
1018         tmp1 = tmps[i1];
1019         v1 -= 2;
1020         v2 -= 2;
1021         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1022         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1023       }
1024       if (j == 1) {
1025         tmp0 = tmps[*vi--];
1026         sum1 -= *v1-- * tmp0;
1027         sum2 -= *v2-- * tmp0;
1028       }
1029 
1030       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1031       row--;
1032       sum2 -= *v2-- * tmp0;
1033       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1034       row--;
1035       break;
1036     case 3:
1037       sum1 = tmp[row];
1038       sum2 = tmp[row - 1];
1039       sum3 = tmp[row - 2];
1040       v2   = aa + ai[row] - 1;
1041       v3   = aa + ai[row - 1] - 1;
1042       for (j = nz; j > 1; j -= 2) {
1043         vi -= 2;
1044         i0   = vi[2];
1045         i1   = vi[1];
1046         tmp0 = tmps[i0];
1047         tmp1 = tmps[i1];
1048         v1 -= 2;
1049         v2 -= 2;
1050         v3 -= 2;
1051         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1052         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1053         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1054       }
1055       if (j == 1) {
1056         tmp0 = tmps[*vi--];
1057         sum1 -= *v1-- * tmp0;
1058         sum2 -= *v2-- * tmp0;
1059         sum3 -= *v3-- * tmp0;
1060       }
1061       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1062       row--;
1063       sum2 -= *v2-- * tmp0;
1064       sum3 -= *v3-- * tmp0;
1065       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1066       row--;
1067       sum3 -= *v3-- * tmp0;
1068       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1069       row--;
1070 
1071       break;
1072     case 4:
1073       sum1 = tmp[row];
1074       sum2 = tmp[row - 1];
1075       sum3 = tmp[row - 2];
1076       sum4 = tmp[row - 3];
1077       v2   = aa + ai[row] - 1;
1078       v3   = aa + ai[row - 1] - 1;
1079       v4   = aa + ai[row - 2] - 1;
1080 
1081       for (j = nz; j > 1; j -= 2) {
1082         vi -= 2;
1083         i0   = vi[2];
1084         i1   = vi[1];
1085         tmp0 = tmps[i0];
1086         tmp1 = tmps[i1];
1087         v1 -= 2;
1088         v2 -= 2;
1089         v3 -= 2;
1090         v4 -= 2;
1091         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1092         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1093         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1094         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1095       }
1096       if (j == 1) {
1097         tmp0 = tmps[*vi--];
1098         sum1 -= *v1-- * tmp0;
1099         sum2 -= *v2-- * tmp0;
1100         sum3 -= *v3-- * tmp0;
1101         sum4 -= *v4-- * tmp0;
1102       }
1103 
1104       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1105       row--;
1106       sum2 -= *v2-- * tmp0;
1107       sum3 -= *v3-- * tmp0;
1108       sum4 -= *v4-- * tmp0;
1109       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1110       row--;
1111       sum3 -= *v3-- * tmp0;
1112       sum4 -= *v4-- * tmp0;
1113       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1114       row--;
1115       sum4 -= *v4-- * tmp0;
1116       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1117       row--;
1118       break;
1119     case 5:
1120       sum1 = tmp[row];
1121       sum2 = tmp[row - 1];
1122       sum3 = tmp[row - 2];
1123       sum4 = tmp[row - 3];
1124       sum5 = tmp[row - 4];
1125       v2   = aa + ai[row] - 1;
1126       v3   = aa + ai[row - 1] - 1;
1127       v4   = aa + ai[row - 2] - 1;
1128       v5   = aa + ai[row - 3] - 1;
1129       for (j = nz; j > 1; j -= 2) {
1130         vi -= 2;
1131         i0   = vi[2];
1132         i1   = vi[1];
1133         tmp0 = tmps[i0];
1134         tmp1 = tmps[i1];
1135         v1 -= 2;
1136         v2 -= 2;
1137         v3 -= 2;
1138         v4 -= 2;
1139         v5 -= 2;
1140         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1141         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1142         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1143         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1144         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1145       }
1146       if (j == 1) {
1147         tmp0 = tmps[*vi--];
1148         sum1 -= *v1-- * tmp0;
1149         sum2 -= *v2-- * tmp0;
1150         sum3 -= *v3-- * tmp0;
1151         sum4 -= *v4-- * tmp0;
1152         sum5 -= *v5-- * tmp0;
1153       }
1154 
1155       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1156       row--;
1157       sum2 -= *v2-- * tmp0;
1158       sum3 -= *v3-- * tmp0;
1159       sum4 -= *v4-- * tmp0;
1160       sum5 -= *v5-- * tmp0;
1161       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1162       row--;
1163       sum3 -= *v3-- * tmp0;
1164       sum4 -= *v4-- * tmp0;
1165       sum5 -= *v5-- * tmp0;
1166       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1167       row--;
1168       sum4 -= *v4-- * tmp0;
1169       sum5 -= *v5-- * tmp0;
1170       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1171       row--;
1172       sum5 -= *v5-- * tmp0;
1173       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1174       row--;
1175       break;
1176     default:
1177       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1178     }
1179   }
1180   PetscCall(ISRestoreIndices(isrow, &rout));
1181   PetscCall(ISRestoreIndices(iscol, &cout));
1182   PetscCall(VecRestoreArrayRead(bb, &b));
1183   PetscCall(VecRestoreArrayWrite(xx, &x));
1184   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1185   PetscFunctionReturn(PETSC_SUCCESS);
1186 }
1187 
1188 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1189 {
1190   Mat              C = B;
1191   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1192   IS               isrow = b->row, isicol = b->icol;
1193   const PetscInt  *r, *ic, *ics;
1194   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1195   PetscInt         i, j, k, nz, nzL, row, *pj;
1196   const PetscInt  *ajtmp, *bjtmp;
1197   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1198   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1199   FactorShiftCtx   sctx;
1200   const PetscInt  *ddiag;
1201   PetscReal        rs;
1202   MatScalar        d;
1203   PetscInt         inod, nodesz, node_max, col;
1204   const PetscInt  *ns;
1205   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1206 
1207   PetscFunctionBegin;
1208   /* MatPivotSetUp(): initialize shift context sctx */
1209   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1210 
1211   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1212     ddiag          = a->diag;
1213     sctx.shift_top = info->zeropivot;
1214     for (i = 0; i < n; i++) {
1215       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1216       d  = (aa)[ddiag[i]];
1217       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1218       v  = aa + ai[i];
1219       nz = ai[i + 1] - ai[i];
1220       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1221       if (rs > sctx.shift_top) sctx.shift_top = rs;
1222     }
1223     sctx.shift_top *= 1.1;
1224     sctx.nshift_max = 5;
1225     sctx.shift_lo   = 0.;
1226     sctx.shift_hi   = 1.;
1227   }
1228 
1229   PetscCall(ISGetIndices(isrow, &r));
1230   PetscCall(ISGetIndices(isicol, &ic));
1231 
1232   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1233   ics = ic;
1234 
1235   node_max = a->inode.node_count;
1236   ns       = a->inode.size;
1237   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1238 
1239   /* If max inode size > 4, split it into two inodes.*/
1240   /* also map the inode sizes according to the ordering */
1241   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1242   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1243     if (ns[i] > 4) {
1244       tmp_vec1[j] = 4;
1245       ++j;
1246       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1247     } else {
1248       tmp_vec1[j] = ns[i];
1249     }
1250   }
1251   /* Use the correct node_max */
1252   node_max = j;
1253 
1254   /* Now reorder the inode info based on mat re-ordering info */
1255   /* First create a row -> inode_size_array_index map */
1256   PetscCall(PetscMalloc1(n + 1, &nsmap));
1257   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1258   for (i = 0, row = 0; i < node_max; i++) {
1259     nodesz = tmp_vec1[i];
1260     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1261   }
1262   /* Using nsmap, create a reordered ns structure */
1263   for (i = 0, j = 0; i < node_max; i++) {
1264     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1265     tmp_vec2[i] = nodesz;
1266     j += nodesz;
1267   }
1268   PetscCall(PetscFree(nsmap));
1269   PetscCall(PetscFree(tmp_vec1));
1270 
1271   /* Now use the correct ns */
1272   ns = tmp_vec2;
1273 
1274   do {
1275     sctx.newshift = PETSC_FALSE;
1276     /* Now loop over each block-row, and do the factorization */
1277     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1278       nodesz = ns[inod];
1279 
1280       switch (nodesz) {
1281       case 1:
1282         /* zero rtmp1 */
1283         /* L part */
1284         nz    = bi[i + 1] - bi[i];
1285         bjtmp = bj + bi[i];
1286         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1287 
1288         /* U part */
1289         nz    = bdiag[i] - bdiag[i + 1];
1290         bjtmp = bj + bdiag[i + 1] + 1;
1291         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1292 
1293         /* load in initial (unfactored row) */
1294         nz    = ai[r[i] + 1] - ai[r[i]];
1295         ajtmp = aj + ai[r[i]];
1296         v     = aa + ai[r[i]];
1297         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1298 
1299         /* ZeropivotApply() */
1300         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1301 
1302         /* elimination */
1303         bjtmp = bj + bi[i];
1304         row   = *bjtmp++;
1305         nzL   = bi[i + 1] - bi[i];
1306         for (k = 0; k < nzL; k++) {
1307           pc = rtmp1 + row;
1308           if (*pc != 0.0) {
1309             pv   = b->a + bdiag[row];
1310             mul1 = *pc * (*pv);
1311             *pc  = mul1;
1312             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1313             pv   = b->a + bdiag[row + 1] + 1;
1314             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1315             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1316             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1317           }
1318           row = *bjtmp++;
1319         }
1320 
1321         /* finished row so stick it into b->a */
1322         rs = 0.0;
1323         /* L part */
1324         pv = b->a + bi[i];
1325         pj = b->j + bi[i];
1326         nz = bi[i + 1] - bi[i];
1327         for (j = 0; j < nz; j++) {
1328           pv[j] = rtmp1[pj[j]];
1329           rs += PetscAbsScalar(pv[j]);
1330         }
1331 
1332         /* U part */
1333         pv = b->a + bdiag[i + 1] + 1;
1334         pj = b->j + bdiag[i + 1] + 1;
1335         nz = bdiag[i] - bdiag[i + 1] - 1;
1336         for (j = 0; j < nz; j++) {
1337           pv[j] = rtmp1[pj[j]];
1338           rs += PetscAbsScalar(pv[j]);
1339         }
1340 
1341         /* Check zero pivot */
1342         sctx.rs = rs;
1343         sctx.pv = rtmp1[i];
1344         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1345         if (sctx.newshift) break;
1346 
1347         /* Mark diagonal and invert diagonal for simpler triangular solves */
1348         pv  = b->a + bdiag[i];
1349         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1350         break;
1351 
1352       case 2:
1353         /* zero rtmp1 and rtmp2 */
1354         /* L part */
1355         nz    = bi[i + 1] - bi[i];
1356         bjtmp = bj + bi[i];
1357         for (j = 0; j < nz; j++) {
1358           col        = bjtmp[j];
1359           rtmp1[col] = 0.0;
1360           rtmp2[col] = 0.0;
1361         }
1362 
1363         /* U part */
1364         nz    = bdiag[i] - bdiag[i + 1];
1365         bjtmp = bj + bdiag[i + 1] + 1;
1366         for (j = 0; j < nz; j++) {
1367           col        = bjtmp[j];
1368           rtmp1[col] = 0.0;
1369           rtmp2[col] = 0.0;
1370         }
1371 
1372         /* load in initial (unfactored row) */
1373         nz    = ai[r[i] + 1] - ai[r[i]];
1374         ajtmp = aj + ai[r[i]];
1375         v1    = aa + ai[r[i]];
1376         v2    = aa + ai[r[i] + 1];
1377         for (j = 0; j < nz; j++) {
1378           col        = ics[ajtmp[j]];
1379           rtmp1[col] = v1[j];
1380           rtmp2[col] = v2[j];
1381         }
1382         /* ZeropivotApply(): shift the diagonal of the matrix  */
1383         rtmp1[i] += sctx.shift_amount;
1384         rtmp2[i + 1] += sctx.shift_amount;
1385 
1386         /* elimination */
1387         bjtmp = bj + bi[i];
1388         row   = *bjtmp++; /* pivot row */
1389         nzL   = bi[i + 1] - bi[i];
1390         for (k = 0; k < nzL; k++) {
1391           pc1 = rtmp1 + row;
1392           pc2 = rtmp2 + row;
1393           if (*pc1 != 0.0 || *pc2 != 0.0) {
1394             pv   = b->a + bdiag[row];
1395             mul1 = *pc1 * (*pv);
1396             mul2 = *pc2 * (*pv);
1397             *pc1 = mul1;
1398             *pc2 = mul2;
1399 
1400             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1401             pv = b->a + bdiag[row + 1] + 1;
1402             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1403             for (j = 0; j < nz; j++) {
1404               col = pj[j];
1405               rtmp1[col] -= mul1 * pv[j];
1406               rtmp2[col] -= mul2 * pv[j];
1407             }
1408             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1409           }
1410           row = *bjtmp++;
1411         }
1412 
1413         /* finished row i; check zero pivot, then stick row i into b->a */
1414         rs = 0.0;
1415         /* L part */
1416         pc1 = b->a + bi[i];
1417         pj  = b->j + bi[i];
1418         nz  = bi[i + 1] - bi[i];
1419         for (j = 0; j < nz; j++) {
1420           col    = pj[j];
1421           pc1[j] = rtmp1[col];
1422           rs += PetscAbsScalar(pc1[j]);
1423         }
1424         /* U part */
1425         pc1 = b->a + bdiag[i + 1] + 1;
1426         pj  = b->j + bdiag[i + 1] + 1;
1427         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1428         for (j = 0; j < nz; j++) {
1429           col    = pj[j];
1430           pc1[j] = rtmp1[col];
1431           rs += PetscAbsScalar(pc1[j]);
1432         }
1433 
1434         sctx.rs = rs;
1435         sctx.pv = rtmp1[i];
1436         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1437         if (sctx.newshift) break;
1438         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1439         *pc1 = 1.0 / sctx.pv;
1440 
1441         /* Now take care of diagonal 2x2 block. */
1442         pc2 = rtmp2 + i;
1443         if (*pc2 != 0.0) {
1444           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1445           *pc2 = mul1;                        /* insert L entry */
1446           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1447           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1448           for (j = 0; j < nz; j++) {
1449             col = pj[j];
1450             rtmp2[col] -= mul1 * rtmp1[col];
1451           }
1452           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1453         }
1454 
1455         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1456         rs = 0.0;
1457         /* L part */
1458         pc2 = b->a + bi[i + 1];
1459         pj  = b->j + bi[i + 1];
1460         nz  = bi[i + 2] - bi[i + 1];
1461         for (j = 0; j < nz; j++) {
1462           col    = pj[j];
1463           pc2[j] = rtmp2[col];
1464           rs += PetscAbsScalar(pc2[j]);
1465         }
1466         /* U part */
1467         pc2 = b->a + bdiag[i + 2] + 1;
1468         pj  = b->j + bdiag[i + 2] + 1;
1469         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1470         for (j = 0; j < nz; j++) {
1471           col    = pj[j];
1472           pc2[j] = rtmp2[col];
1473           rs += PetscAbsScalar(pc2[j]);
1474         }
1475 
1476         sctx.rs = rs;
1477         sctx.pv = rtmp2[i + 1];
1478         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1479         if (sctx.newshift) break;
1480         pc2  = b->a + bdiag[i + 1];
1481         *pc2 = 1.0 / sctx.pv;
1482         break;
1483 
1484       case 3:
1485         /* zero rtmp */
1486         /* L part */
1487         nz    = bi[i + 1] - bi[i];
1488         bjtmp = bj + bi[i];
1489         for (j = 0; j < nz; j++) {
1490           col        = bjtmp[j];
1491           rtmp1[col] = 0.0;
1492           rtmp2[col] = 0.0;
1493           rtmp3[col] = 0.0;
1494         }
1495 
1496         /* U part */
1497         nz    = bdiag[i] - bdiag[i + 1];
1498         bjtmp = bj + bdiag[i + 1] + 1;
1499         for (j = 0; j < nz; j++) {
1500           col        = bjtmp[j];
1501           rtmp1[col] = 0.0;
1502           rtmp2[col] = 0.0;
1503           rtmp3[col] = 0.0;
1504         }
1505 
1506         /* load in initial (unfactored row) */
1507         nz    = ai[r[i] + 1] - ai[r[i]];
1508         ajtmp = aj + ai[r[i]];
1509         v1    = aa + ai[r[i]];
1510         v2    = aa + ai[r[i] + 1];
1511         v3    = aa + ai[r[i] + 2];
1512         for (j = 0; j < nz; j++) {
1513           col        = ics[ajtmp[j]];
1514           rtmp1[col] = v1[j];
1515           rtmp2[col] = v2[j];
1516           rtmp3[col] = v3[j];
1517         }
1518         /* ZeropivotApply(): shift the diagonal of the matrix  */
1519         rtmp1[i] += sctx.shift_amount;
1520         rtmp2[i + 1] += sctx.shift_amount;
1521         rtmp3[i + 2] += sctx.shift_amount;
1522 
1523         /* elimination */
1524         bjtmp = bj + bi[i];
1525         row   = *bjtmp++; /* pivot row */
1526         nzL   = bi[i + 1] - bi[i];
1527         for (k = 0; k < nzL; k++) {
1528           pc1 = rtmp1 + row;
1529           pc2 = rtmp2 + row;
1530           pc3 = rtmp3 + row;
1531           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1532             pv   = b->a + bdiag[row];
1533             mul1 = *pc1 * (*pv);
1534             mul2 = *pc2 * (*pv);
1535             mul3 = *pc3 * (*pv);
1536             *pc1 = mul1;
1537             *pc2 = mul2;
1538             *pc3 = mul3;
1539 
1540             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1541             pv = b->a + bdiag[row + 1] + 1;
1542             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1543             for (j = 0; j < nz; j++) {
1544               col = pj[j];
1545               rtmp1[col] -= mul1 * pv[j];
1546               rtmp2[col] -= mul2 * pv[j];
1547               rtmp3[col] -= mul3 * pv[j];
1548             }
1549             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1550           }
1551           row = *bjtmp++;
1552         }
1553 
1554         /* finished row i; check zero pivot, then stick row i into b->a */
1555         rs = 0.0;
1556         /* L part */
1557         pc1 = b->a + bi[i];
1558         pj  = b->j + bi[i];
1559         nz  = bi[i + 1] - bi[i];
1560         for (j = 0; j < nz; j++) {
1561           col    = pj[j];
1562           pc1[j] = rtmp1[col];
1563           rs += PetscAbsScalar(pc1[j]);
1564         }
1565         /* U part */
1566         pc1 = b->a + bdiag[i + 1] + 1;
1567         pj  = b->j + bdiag[i + 1] + 1;
1568         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1569         for (j = 0; j < nz; j++) {
1570           col    = pj[j];
1571           pc1[j] = rtmp1[col];
1572           rs += PetscAbsScalar(pc1[j]);
1573         }
1574 
1575         sctx.rs = rs;
1576         sctx.pv = rtmp1[i];
1577         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1578         if (sctx.newshift) break;
1579         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1580         *pc1 = 1.0 / sctx.pv;
1581 
1582         /* Now take care of 1st column of diagonal 3x3 block. */
1583         pc2 = rtmp2 + i;
1584         pc3 = rtmp3 + i;
1585         if (*pc2 != 0.0 || *pc3 != 0.0) {
1586           mul2 = (*pc2) * (*pc1);
1587           *pc2 = mul2;
1588           mul3 = (*pc3) * (*pc1);
1589           *pc3 = mul3;
1590           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1591           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1592           for (j = 0; j < nz; j++) {
1593             col = pj[j];
1594             rtmp2[col] -= mul2 * rtmp1[col];
1595             rtmp3[col] -= mul3 * rtmp1[col];
1596           }
1597           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1598         }
1599 
1600         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1601         rs = 0.0;
1602         /* L part */
1603         pc2 = b->a + bi[i + 1];
1604         pj  = b->j + bi[i + 1];
1605         nz  = bi[i + 2] - bi[i + 1];
1606         for (j = 0; j < nz; j++) {
1607           col    = pj[j];
1608           pc2[j] = rtmp2[col];
1609           rs += PetscAbsScalar(pc2[j]);
1610         }
1611         /* U part */
1612         pc2 = b->a + bdiag[i + 2] + 1;
1613         pj  = b->j + bdiag[i + 2] + 1;
1614         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1615         for (j = 0; j < nz; j++) {
1616           col    = pj[j];
1617           pc2[j] = rtmp2[col];
1618           rs += PetscAbsScalar(pc2[j]);
1619         }
1620 
1621         sctx.rs = rs;
1622         sctx.pv = rtmp2[i + 1];
1623         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1624         if (sctx.newshift) break;
1625         pc2  = b->a + bdiag[i + 1];
1626         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1627 
1628         /* Now take care of 2nd column of diagonal 3x3 block. */
1629         pc3 = rtmp3 + i + 1;
1630         if (*pc3 != 0.0) {
1631           mul3 = (*pc3) * (*pc2);
1632           *pc3 = mul3;
1633           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1634           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1635           for (j = 0; j < nz; j++) {
1636             col = pj[j];
1637             rtmp3[col] -= mul3 * rtmp2[col];
1638           }
1639           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1640         }
1641 
1642         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1643         rs = 0.0;
1644         /* L part */
1645         pc3 = b->a + bi[i + 2];
1646         pj  = b->j + bi[i + 2];
1647         nz  = bi[i + 3] - bi[i + 2];
1648         for (j = 0; j < nz; j++) {
1649           col    = pj[j];
1650           pc3[j] = rtmp3[col];
1651           rs += PetscAbsScalar(pc3[j]);
1652         }
1653         /* U part */
1654         pc3 = b->a + bdiag[i + 3] + 1;
1655         pj  = b->j + bdiag[i + 3] + 1;
1656         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1657         for (j = 0; j < nz; j++) {
1658           col    = pj[j];
1659           pc3[j] = rtmp3[col];
1660           rs += PetscAbsScalar(pc3[j]);
1661         }
1662 
1663         sctx.rs = rs;
1664         sctx.pv = rtmp3[i + 2];
1665         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1666         if (sctx.newshift) break;
1667         pc3  = b->a + bdiag[i + 2];
1668         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1669         break;
1670       case 4:
1671         /* zero rtmp */
1672         /* L part */
1673         nz    = bi[i + 1] - bi[i];
1674         bjtmp = bj + bi[i];
1675         for (j = 0; j < nz; j++) {
1676           col        = bjtmp[j];
1677           rtmp1[col] = 0.0;
1678           rtmp2[col] = 0.0;
1679           rtmp3[col] = 0.0;
1680           rtmp4[col] = 0.0;
1681         }
1682 
1683         /* U part */
1684         nz    = bdiag[i] - bdiag[i + 1];
1685         bjtmp = bj + bdiag[i + 1] + 1;
1686         for (j = 0; j < nz; j++) {
1687           col        = bjtmp[j];
1688           rtmp1[col] = 0.0;
1689           rtmp2[col] = 0.0;
1690           rtmp3[col] = 0.0;
1691           rtmp4[col] = 0.0;
1692         }
1693 
1694         /* load in initial (unfactored row) */
1695         nz    = ai[r[i] + 1] - ai[r[i]];
1696         ajtmp = aj + ai[r[i]];
1697         v1    = aa + ai[r[i]];
1698         v2    = aa + ai[r[i] + 1];
1699         v3    = aa + ai[r[i] + 2];
1700         v4    = aa + ai[r[i] + 3];
1701         for (j = 0; j < nz; j++) {
1702           col        = ics[ajtmp[j]];
1703           rtmp1[col] = v1[j];
1704           rtmp2[col] = v2[j];
1705           rtmp3[col] = v3[j];
1706           rtmp4[col] = v4[j];
1707         }
1708         /* ZeropivotApply(): shift the diagonal of the matrix  */
1709         rtmp1[i] += sctx.shift_amount;
1710         rtmp2[i + 1] += sctx.shift_amount;
1711         rtmp3[i + 2] += sctx.shift_amount;
1712         rtmp4[i + 3] += sctx.shift_amount;
1713 
1714         /* elimination */
1715         bjtmp = bj + bi[i];
1716         row   = *bjtmp++; /* pivot row */
1717         nzL   = bi[i + 1] - bi[i];
1718         for (k = 0; k < nzL; k++) {
1719           pc1 = rtmp1 + row;
1720           pc2 = rtmp2 + row;
1721           pc3 = rtmp3 + row;
1722           pc4 = rtmp4 + row;
1723           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1724             pv   = b->a + bdiag[row];
1725             mul1 = *pc1 * (*pv);
1726             mul2 = *pc2 * (*pv);
1727             mul3 = *pc3 * (*pv);
1728             mul4 = *pc4 * (*pv);
1729             *pc1 = mul1;
1730             *pc2 = mul2;
1731             *pc3 = mul3;
1732             *pc4 = mul4;
1733 
1734             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1735             pv = b->a + bdiag[row + 1] + 1;
1736             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1737             for (j = 0; j < nz; j++) {
1738               col = pj[j];
1739               rtmp1[col] -= mul1 * pv[j];
1740               rtmp2[col] -= mul2 * pv[j];
1741               rtmp3[col] -= mul3 * pv[j];
1742               rtmp4[col] -= mul4 * pv[j];
1743             }
1744             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1745           }
1746           row = *bjtmp++;
1747         }
1748 
1749         /* finished row i; check zero pivot, then stick row i into b->a */
1750         rs = 0.0;
1751         /* L part */
1752         pc1 = b->a + bi[i];
1753         pj  = b->j + bi[i];
1754         nz  = bi[i + 1] - bi[i];
1755         for (j = 0; j < nz; j++) {
1756           col    = pj[j];
1757           pc1[j] = rtmp1[col];
1758           rs += PetscAbsScalar(pc1[j]);
1759         }
1760         /* U part */
1761         pc1 = b->a + bdiag[i + 1] + 1;
1762         pj  = b->j + bdiag[i + 1] + 1;
1763         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1764         for (j = 0; j < nz; j++) {
1765           col    = pj[j];
1766           pc1[j] = rtmp1[col];
1767           rs += PetscAbsScalar(pc1[j]);
1768         }
1769 
1770         sctx.rs = rs;
1771         sctx.pv = rtmp1[i];
1772         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1773         if (sctx.newshift) break;
1774         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1775         *pc1 = 1.0 / sctx.pv;
1776 
1777         /* Now take care of 1st column of diagonal 4x4 block. */
1778         pc2 = rtmp2 + i;
1779         pc3 = rtmp3 + i;
1780         pc4 = rtmp4 + i;
1781         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1782           mul2 = (*pc2) * (*pc1);
1783           *pc2 = mul2;
1784           mul3 = (*pc3) * (*pc1);
1785           *pc3 = mul3;
1786           mul4 = (*pc4) * (*pc1);
1787           *pc4 = mul4;
1788           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1789           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1790           for (j = 0; j < nz; j++) {
1791             col = pj[j];
1792             rtmp2[col] -= mul2 * rtmp1[col];
1793             rtmp3[col] -= mul3 * rtmp1[col];
1794             rtmp4[col] -= mul4 * rtmp1[col];
1795           }
1796           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1797         }
1798 
1799         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1800         rs = 0.0;
1801         /* L part */
1802         pc2 = b->a + bi[i + 1];
1803         pj  = b->j + bi[i + 1];
1804         nz  = bi[i + 2] - bi[i + 1];
1805         for (j = 0; j < nz; j++) {
1806           col    = pj[j];
1807           pc2[j] = rtmp2[col];
1808           rs += PetscAbsScalar(pc2[j]);
1809         }
1810         /* U part */
1811         pc2 = b->a + bdiag[i + 2] + 1;
1812         pj  = b->j + bdiag[i + 2] + 1;
1813         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1814         for (j = 0; j < nz; j++) {
1815           col    = pj[j];
1816           pc2[j] = rtmp2[col];
1817           rs += PetscAbsScalar(pc2[j]);
1818         }
1819 
1820         sctx.rs = rs;
1821         sctx.pv = rtmp2[i + 1];
1822         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1823         if (sctx.newshift) break;
1824         pc2  = b->a + bdiag[i + 1];
1825         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1826 
1827         /* Now take care of 2nd column of diagonal 4x4 block. */
1828         pc3 = rtmp3 + i + 1;
1829         pc4 = rtmp4 + i + 1;
1830         if (*pc3 != 0.0 || *pc4 != 0.0) {
1831           mul3 = (*pc3) * (*pc2);
1832           *pc3 = mul3;
1833           mul4 = (*pc4) * (*pc2);
1834           *pc4 = mul4;
1835           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1836           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1837           for (j = 0; j < nz; j++) {
1838             col = pj[j];
1839             rtmp3[col] -= mul3 * rtmp2[col];
1840             rtmp4[col] -= mul4 * rtmp2[col];
1841           }
1842           PetscCall(PetscLogFlops(4.0 * nz));
1843         }
1844 
1845         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1846         rs = 0.0;
1847         /* L part */
1848         pc3 = b->a + bi[i + 2];
1849         pj  = b->j + bi[i + 2];
1850         nz  = bi[i + 3] - bi[i + 2];
1851         for (j = 0; j < nz; j++) {
1852           col    = pj[j];
1853           pc3[j] = rtmp3[col];
1854           rs += PetscAbsScalar(pc3[j]);
1855         }
1856         /* U part */
1857         pc3 = b->a + bdiag[i + 3] + 1;
1858         pj  = b->j + bdiag[i + 3] + 1;
1859         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1860         for (j = 0; j < nz; j++) {
1861           col    = pj[j];
1862           pc3[j] = rtmp3[col];
1863           rs += PetscAbsScalar(pc3[j]);
1864         }
1865 
1866         sctx.rs = rs;
1867         sctx.pv = rtmp3[i + 2];
1868         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1869         if (sctx.newshift) break;
1870         pc3  = b->a + bdiag[i + 2];
1871         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1872 
1873         /* Now take care of 3rd column of diagonal 4x4 block. */
1874         pc4 = rtmp4 + i + 2;
1875         if (*pc4 != 0.0) {
1876           mul4 = (*pc4) * (*pc3);
1877           *pc4 = mul4;
1878           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1879           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1880           for (j = 0; j < nz; j++) {
1881             col = pj[j];
1882             rtmp4[col] -= mul4 * rtmp3[col];
1883           }
1884           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1885         }
1886 
1887         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1888         rs = 0.0;
1889         /* L part */
1890         pc4 = b->a + bi[i + 3];
1891         pj  = b->j + bi[i + 3];
1892         nz  = bi[i + 4] - bi[i + 3];
1893         for (j = 0; j < nz; j++) {
1894           col    = pj[j];
1895           pc4[j] = rtmp4[col];
1896           rs += PetscAbsScalar(pc4[j]);
1897         }
1898         /* U part */
1899         pc4 = b->a + bdiag[i + 4] + 1;
1900         pj  = b->j + bdiag[i + 4] + 1;
1901         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1902         for (j = 0; j < nz; j++) {
1903           col    = pj[j];
1904           pc4[j] = rtmp4[col];
1905           rs += PetscAbsScalar(pc4[j]);
1906         }
1907 
1908         sctx.rs = rs;
1909         sctx.pv = rtmp4[i + 3];
1910         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1911         if (sctx.newshift) break;
1912         pc4  = b->a + bdiag[i + 3];
1913         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1914         break;
1915 
1916       default:
1917         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1918       }
1919       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1920       i += nodesz;              /* Update the row */
1921     }
1922 
1923     /* MatPivotRefine() */
1924     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1925       /*
1926        * if no shift in this attempt & shifting & started shifting & can refine,
1927        * then try lower shift
1928        */
1929       sctx.shift_hi       = sctx.shift_fraction;
1930       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1931       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1932       sctx.newshift       = PETSC_TRUE;
1933       sctx.nshift++;
1934     }
1935   } while (sctx.newshift);
1936 
1937   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1938   PetscCall(PetscFree(tmp_vec2));
1939   PetscCall(ISRestoreIndices(isicol, &ic));
1940   PetscCall(ISRestoreIndices(isrow, &r));
1941 
1942   if (b->inode.size) {
1943     C->ops->solve = MatSolve_SeqAIJ_Inode;
1944   } else {
1945     C->ops->solve = MatSolve_SeqAIJ;
1946   }
1947   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1948   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1949   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1950   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1951   C->assembled              = PETSC_TRUE;
1952   C->preallocated           = PETSC_TRUE;
1953 
1954   PetscCall(PetscLogFlops(C->cmap->n));
1955 
1956   /* MatShiftView(A,info,&sctx) */
1957   if (sctx.nshift) {
1958     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1959       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1960     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1961       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1962     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1963       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1964     }
1965   }
1966   PetscFunctionReturn(PETSC_SUCCESS);
1967 }
1968 
1969 #if 0
1970 // unused
1971 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1972 {
1973   Mat              C = B;
1974   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1975   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1976   const PetscInt  *r, *ic, *c, *ics;
1977   PetscInt         n = A->rmap->n, *bi = b->i;
1978   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1979   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1980   PetscInt        *ai = a->i, *aj = a->j;
1981   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1982   PetscScalar      mul1, mul2, mul3, tmp;
1983   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1984   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1985   PetscReal        rs = 0.0;
1986   FactorShiftCtx   sctx;
1987 
1988   PetscFunctionBegin;
1989   sctx.shift_top      = 0;
1990   sctx.nshift_max     = 0;
1991   sctx.shift_lo       = 0;
1992   sctx.shift_hi       = 0;
1993   sctx.shift_fraction = 0;
1994 
1995   /* if both shift schemes are chosen by user, only use info->shiftpd */
1996   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1997     sctx.shift_top = 0;
1998     for (i = 0; i < n; i++) {
1999       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2000       rs    = 0.0;
2001       ajtmp = aj + ai[i];
2002       rtmp1 = aa + ai[i];
2003       nz    = ai[i + 1] - ai[i];
2004       for (j = 0; j < nz; j++) {
2005         if (*ajtmp != i) {
2006           rs += PetscAbsScalar(*rtmp1++);
2007         } else {
2008           rs -= PetscRealPart(*rtmp1++);
2009         }
2010         ajtmp++;
2011       }
2012       if (rs > sctx.shift_top) sctx.shift_top = rs;
2013     }
2014     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2015     sctx.shift_top *= 1.1;
2016     sctx.nshift_max = 5;
2017     sctx.shift_lo   = 0.;
2018     sctx.shift_hi   = 1.;
2019   }
2020   sctx.shift_amount = 0;
2021   sctx.nshift       = 0;
2022 
2023   PetscCall(ISGetIndices(isrow, &r));
2024   PetscCall(ISGetIndices(iscol, &c));
2025   PetscCall(ISGetIndices(isicol, &ic));
2026   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2027   ics = ic;
2028 
2029   node_max = a->inode.node_count;
2030   ns       = a->inode.size;
2031   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2032 
2033   /* If max inode size > 3, split it into two inodes.*/
2034   /* also map the inode sizes according to the ordering */
2035   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2036   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2037     if (ns[i] > 3) {
2038       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2039       ++j;
2040       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2041     } else {
2042       tmp_vec1[j] = ns[i];
2043     }
2044   }
2045   /* Use the correct node_max */
2046   node_max = j;
2047 
2048   /* Now reorder the inode info based on mat re-ordering info */
2049   /* First create a row -> inode_size_array_index map */
2050   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2051   for (i = 0, row = 0; i < node_max; i++) {
2052     nodesz = tmp_vec1[i];
2053     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2054   }
2055   /* Using nsmap, create a reordered ns structure */
2056   for (i = 0, j = 0; i < node_max; i++) {
2057     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2058     tmp_vec2[i] = nodesz;
2059     j += nodesz;
2060   }
2061   PetscCall(PetscFree2(nsmap, tmp_vec1));
2062   /* Now use the correct ns */
2063   ns = tmp_vec2;
2064 
2065   do {
2066     sctx.newshift = PETSC_FALSE;
2067     /* Now loop over each block-row, and do the factorization */
2068     for (i = 0, row = 0; i < node_max; i++) {
2069       nodesz = ns[i];
2070       nz     = bi[row + 1] - bi[row];
2071       bjtmp  = bj + bi[row];
2072 
2073       switch (nodesz) {
2074       case 1:
2075         for (j = 0; j < nz; j++) {
2076           idx         = bjtmp[j];
2077           rtmp11[idx] = 0.0;
2078         }
2079 
2080         /* load in initial (unfactored row) */
2081         idx    = r[row];
2082         nz_tmp = ai[idx + 1] - ai[idx];
2083         ajtmp  = aj + ai[idx];
2084         v1     = aa + ai[idx];
2085 
2086         for (j = 0; j < nz_tmp; j++) {
2087           idx         = ics[ajtmp[j]];
2088           rtmp11[idx] = v1[j];
2089         }
2090         rtmp11[ics[r[row]]] += sctx.shift_amount;
2091 
2092         prow = *bjtmp++;
2093         while (prow < row) {
2094           pc1 = rtmp11 + prow;
2095           if (*pc1 != 0.0) {
2096             pv     = ba + bd[prow];
2097             pj     = nbj + bd[prow];
2098             mul1   = *pc1 * *pv++;
2099             *pc1   = mul1;
2100             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2101             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2102             for (j = 0; j < nz_tmp; j++) {
2103               tmp = pv[j];
2104               idx = pj[j];
2105               rtmp11[idx] -= mul1 * tmp;
2106             }
2107           }
2108           prow = *bjtmp++;
2109         }
2110         pj  = bj + bi[row];
2111         pc1 = ba + bi[row];
2112 
2113         sctx.pv     = rtmp11[row];
2114         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2115         rs          = 0.0;
2116         for (j = 0; j < nz; j++) {
2117           idx    = pj[j];
2118           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2119           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2120         }
2121         sctx.rs = rs;
2122         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2123         if (sctx.newshift) goto endofwhile;
2124         break;
2125 
2126       case 2:
2127         for (j = 0; j < nz; j++) {
2128           idx         = bjtmp[j];
2129           rtmp11[idx] = 0.0;
2130           rtmp22[idx] = 0.0;
2131         }
2132 
2133         /* load in initial (unfactored row) */
2134         idx    = r[row];
2135         nz_tmp = ai[idx + 1] - ai[idx];
2136         ajtmp  = aj + ai[idx];
2137         v1     = aa + ai[idx];
2138         v2     = aa + ai[idx + 1];
2139         for (j = 0; j < nz_tmp; j++) {
2140           idx         = ics[ajtmp[j]];
2141           rtmp11[idx] = v1[j];
2142           rtmp22[idx] = v2[j];
2143         }
2144         rtmp11[ics[r[row]]] += sctx.shift_amount;
2145         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2146 
2147         prow = *bjtmp++;
2148         while (prow < row) {
2149           pc1 = rtmp11 + prow;
2150           pc2 = rtmp22 + prow;
2151           if (*pc1 != 0.0 || *pc2 != 0.0) {
2152             pv   = ba + bd[prow];
2153             pj   = nbj + bd[prow];
2154             mul1 = *pc1 * *pv;
2155             mul2 = *pc2 * *pv;
2156             ++pv;
2157             *pc1 = mul1;
2158             *pc2 = mul2;
2159 
2160             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2161             for (j = 0; j < nz_tmp; j++) {
2162               tmp = pv[j];
2163               idx = pj[j];
2164               rtmp11[idx] -= mul1 * tmp;
2165               rtmp22[idx] -= mul2 * tmp;
2166             }
2167             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2168           }
2169           prow = *bjtmp++;
2170         }
2171 
2172         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2173         pc1 = rtmp11 + prow;
2174         pc2 = rtmp22 + prow;
2175 
2176         sctx.pv = *pc1;
2177         pj      = bj + bi[prow];
2178         rs      = 0.0;
2179         for (j = 0; j < nz; j++) {
2180           idx = pj[j];
2181           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2182         }
2183         sctx.rs = rs;
2184         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2185         if (sctx.newshift) goto endofwhile;
2186 
2187         if (*pc2 != 0.0) {
2188           pj     = nbj + bd[prow];
2189           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2190           *pc2   = mul2;
2191           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2192           for (j = 0; j < nz_tmp; j++) {
2193             idx = pj[j];
2194             tmp = rtmp11[idx];
2195             rtmp22[idx] -= mul2 * tmp;
2196           }
2197           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2198         }
2199 
2200         pj  = bj + bi[row];
2201         pc1 = ba + bi[row];
2202         pc2 = ba + bi[row + 1];
2203 
2204         sctx.pv         = rtmp22[row + 1];
2205         rs              = 0.0;
2206         rtmp11[row]     = 1.0 / rtmp11[row];
2207         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2208         /* copy row entries from dense representation to sparse */
2209         for (j = 0; j < nz; j++) {
2210           idx    = pj[j];
2211           pc1[j] = rtmp11[idx];
2212           pc2[j] = rtmp22[idx];
2213           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2214         }
2215         sctx.rs = rs;
2216         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2217         if (sctx.newshift) goto endofwhile;
2218         break;
2219 
2220       case 3:
2221         for (j = 0; j < nz; j++) {
2222           idx         = bjtmp[j];
2223           rtmp11[idx] = 0.0;
2224           rtmp22[idx] = 0.0;
2225           rtmp33[idx] = 0.0;
2226         }
2227         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2228         idx    = r[row];
2229         nz_tmp = ai[idx + 1] - ai[idx];
2230         ajtmp  = aj + ai[idx];
2231         v1     = aa + ai[idx];
2232         v2     = aa + ai[idx + 1];
2233         v3     = aa + ai[idx + 2];
2234         for (j = 0; j < nz_tmp; j++) {
2235           idx         = ics[ajtmp[j]];
2236           rtmp11[idx] = v1[j];
2237           rtmp22[idx] = v2[j];
2238           rtmp33[idx] = v3[j];
2239         }
2240         rtmp11[ics[r[row]]] += sctx.shift_amount;
2241         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2242         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2243 
2244         /* loop over all pivot row blocks above this row block */
2245         prow = *bjtmp++;
2246         while (prow < row) {
2247           pc1 = rtmp11 + prow;
2248           pc2 = rtmp22 + prow;
2249           pc3 = rtmp33 + prow;
2250           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2251             pv   = ba + bd[prow];
2252             pj   = nbj + bd[prow];
2253             mul1 = *pc1 * *pv;
2254             mul2 = *pc2 * *pv;
2255             mul3 = *pc3 * *pv;
2256             ++pv;
2257             *pc1 = mul1;
2258             *pc2 = mul2;
2259             *pc3 = mul3;
2260 
2261             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2262             /* update this row based on pivot row */
2263             for (j = 0; j < nz_tmp; j++) {
2264               tmp = pv[j];
2265               idx = pj[j];
2266               rtmp11[idx] -= mul1 * tmp;
2267               rtmp22[idx] -= mul2 * tmp;
2268               rtmp33[idx] -= mul3 * tmp;
2269             }
2270             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2271           }
2272           prow = *bjtmp++;
2273         }
2274 
2275         /* Now take care of diagonal 3x3 block in this set of rows */
2276         /* note: prow = row here */
2277         pc1 = rtmp11 + prow;
2278         pc2 = rtmp22 + prow;
2279         pc3 = rtmp33 + prow;
2280 
2281         sctx.pv = *pc1;
2282         pj      = bj + bi[prow];
2283         rs      = 0.0;
2284         for (j = 0; j < nz; j++) {
2285           idx = pj[j];
2286           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2287         }
2288         sctx.rs = rs;
2289         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2290         if (sctx.newshift) goto endofwhile;
2291 
2292         if (*pc2 != 0.0 || *pc3 != 0.0) {
2293           mul2   = (*pc2) / (*pc1);
2294           mul3   = (*pc3) / (*pc1);
2295           *pc2   = mul2;
2296           *pc3   = mul3;
2297           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2298           pj     = nbj + bd[prow];
2299           for (j = 0; j < nz_tmp; j++) {
2300             idx = pj[j];
2301             tmp = rtmp11[idx];
2302             rtmp22[idx] -= mul2 * tmp;
2303             rtmp33[idx] -= mul3 * tmp;
2304           }
2305           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2306         }
2307         ++prow;
2308 
2309         pc2     = rtmp22 + prow;
2310         pc3     = rtmp33 + prow;
2311         sctx.pv = *pc2;
2312         pj      = bj + bi[prow];
2313         rs      = 0.0;
2314         for (j = 0; j < nz; j++) {
2315           idx = pj[j];
2316           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2317         }
2318         sctx.rs = rs;
2319         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2320         if (sctx.newshift) goto endofwhile;
2321 
2322         if (*pc3 != 0.0) {
2323           mul3   = (*pc3) / (*pc2);
2324           *pc3   = mul3;
2325           pj     = nbj + bd[prow];
2326           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2327           for (j = 0; j < nz_tmp; j++) {
2328             idx = pj[j];
2329             tmp = rtmp22[idx];
2330             rtmp33[idx] -= mul3 * tmp;
2331           }
2332           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2333         }
2334 
2335         pj  = bj + bi[row];
2336         pc1 = ba + bi[row];
2337         pc2 = ba + bi[row + 1];
2338         pc3 = ba + bi[row + 2];
2339 
2340         sctx.pv         = rtmp33[row + 2];
2341         rs              = 0.0;
2342         rtmp11[row]     = 1.0 / rtmp11[row];
2343         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2344         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2345         /* copy row entries from dense representation to sparse */
2346         for (j = 0; j < nz; j++) {
2347           idx    = pj[j];
2348           pc1[j] = rtmp11[idx];
2349           pc2[j] = rtmp22[idx];
2350           pc3[j] = rtmp33[idx];
2351           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2352         }
2353 
2354         sctx.rs = rs;
2355         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2356         if (sctx.newshift) goto endofwhile;
2357         break;
2358 
2359       default:
2360         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2361       }
2362       row += nodesz; /* Update the row */
2363     }
2364   endofwhile:;
2365   } while (sctx.newshift);
2366   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2367   PetscCall(PetscFree(tmp_vec2));
2368   PetscCall(ISRestoreIndices(isicol, &ic));
2369   PetscCall(ISRestoreIndices(isrow, &r));
2370   PetscCall(ISRestoreIndices(iscol, &c));
2371 
2372   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2373   /* do not set solve add, since MatSolve_Inode + Add is faster */
2374   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2375   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2376   C->assembled              = PETSC_TRUE;
2377   C->preallocated           = PETSC_TRUE;
2378   if (sctx.nshift) {
2379     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2380       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2381     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2382       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2383     }
2384   }
2385   PetscCall(PetscLogFlops(C->cmap->n));
2386   PetscCall(MatSeqAIJCheckInode(C));
2387   PetscFunctionReturn(PETSC_SUCCESS);
2388 }
2389 #endif
2390 
2391 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2392 {
2393   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2394   IS                 iscol = a->col, isrow = a->row;
2395   const PetscInt    *r, *c, *rout, *cout;
2396   PetscInt           i, j, n = A->rmap->n;
2397   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2398   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2399   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2400   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2401   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2402   const PetscScalar *b;
2403 
2404   PetscFunctionBegin;
2405   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2406   node_max = a->inode.node_count;
2407   ns       = a->inode.size; /* Node Size array */
2408 
2409   PetscCall(VecGetArrayRead(bb, &b));
2410   PetscCall(VecGetArrayWrite(xx, &x));
2411   tmp = a->solve_work;
2412 
2413   PetscCall(ISGetIndices(isrow, &rout));
2414   r = rout;
2415   PetscCall(ISGetIndices(iscol, &cout));
2416   c = cout;
2417 
2418   /* forward solve the lower triangular */
2419   tmps = tmp;
2420   aa   = a_a;
2421   aj   = a_j;
2422   ad   = a->diag;
2423 
2424   for (i = 0, row = 0; i < node_max; ++i) {
2425     nsz = ns[i];
2426     aii = ai[row];
2427     v1  = aa + aii;
2428     vi  = aj + aii;
2429     nz  = ai[row + 1] - ai[row];
2430 
2431     if (i < node_max - 1) {
2432       /* Prefetch the indices for the next block */
2433       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2434       /* Prefetch the data for the next block */
2435       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2436     }
2437 
2438     switch (nsz) { /* Each loop in 'case' is unrolled */
2439     case 1:
2440       sum1 = b[r[row]];
2441       for (j = 0; j < nz - 1; j += 2) {
2442         i0   = vi[j];
2443         i1   = vi[j + 1];
2444         tmp0 = tmps[i0];
2445         tmp1 = tmps[i1];
2446         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2447       }
2448       if (j == nz - 1) {
2449         tmp0 = tmps[vi[j]];
2450         sum1 -= v1[j] * tmp0;
2451       }
2452       tmp[row++] = sum1;
2453       break;
2454     case 2:
2455       sum1 = b[r[row]];
2456       sum2 = b[r[row + 1]];
2457       v2   = aa + ai[row + 1];
2458 
2459       for (j = 0; j < nz - 1; j += 2) {
2460         i0   = vi[j];
2461         i1   = vi[j + 1];
2462         tmp0 = tmps[i0];
2463         tmp1 = tmps[i1];
2464         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2465         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2466       }
2467       if (j == nz - 1) {
2468         tmp0 = tmps[vi[j]];
2469         sum1 -= v1[j] * tmp0;
2470         sum2 -= v2[j] * tmp0;
2471       }
2472       sum2 -= v2[nz] * sum1;
2473       tmp[row++] = sum1;
2474       tmp[row++] = sum2;
2475       break;
2476     case 3:
2477       sum1 = b[r[row]];
2478       sum2 = b[r[row + 1]];
2479       sum3 = b[r[row + 2]];
2480       v2   = aa + ai[row + 1];
2481       v3   = aa + ai[row + 2];
2482 
2483       for (j = 0; j < nz - 1; j += 2) {
2484         i0   = vi[j];
2485         i1   = vi[j + 1];
2486         tmp0 = tmps[i0];
2487         tmp1 = tmps[i1];
2488         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2489         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2490         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2491       }
2492       if (j == nz - 1) {
2493         tmp0 = tmps[vi[j]];
2494         sum1 -= v1[j] * tmp0;
2495         sum2 -= v2[j] * tmp0;
2496         sum3 -= v3[j] * tmp0;
2497       }
2498       sum2 -= v2[nz] * sum1;
2499       sum3 -= v3[nz] * sum1;
2500       sum3 -= v3[nz + 1] * sum2;
2501       tmp[row++] = sum1;
2502       tmp[row++] = sum2;
2503       tmp[row++] = sum3;
2504       break;
2505 
2506     case 4:
2507       sum1 = b[r[row]];
2508       sum2 = b[r[row + 1]];
2509       sum3 = b[r[row + 2]];
2510       sum4 = b[r[row + 3]];
2511       v2   = aa + ai[row + 1];
2512       v3   = aa + ai[row + 2];
2513       v4   = aa + ai[row + 3];
2514 
2515       for (j = 0; j < nz - 1; j += 2) {
2516         i0   = vi[j];
2517         i1   = vi[j + 1];
2518         tmp0 = tmps[i0];
2519         tmp1 = tmps[i1];
2520         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2521         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2522         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2523         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2524       }
2525       if (j == nz - 1) {
2526         tmp0 = tmps[vi[j]];
2527         sum1 -= v1[j] * tmp0;
2528         sum2 -= v2[j] * tmp0;
2529         sum3 -= v3[j] * tmp0;
2530         sum4 -= v4[j] * tmp0;
2531       }
2532       sum2 -= v2[nz] * sum1;
2533       sum3 -= v3[nz] * sum1;
2534       sum4 -= v4[nz] * sum1;
2535       sum3 -= v3[nz + 1] * sum2;
2536       sum4 -= v4[nz + 1] * sum2;
2537       sum4 -= v4[nz + 2] * sum3;
2538 
2539       tmp[row++] = sum1;
2540       tmp[row++] = sum2;
2541       tmp[row++] = sum3;
2542       tmp[row++] = sum4;
2543       break;
2544     case 5:
2545       sum1 = b[r[row]];
2546       sum2 = b[r[row + 1]];
2547       sum3 = b[r[row + 2]];
2548       sum4 = b[r[row + 3]];
2549       sum5 = b[r[row + 4]];
2550       v2   = aa + ai[row + 1];
2551       v3   = aa + ai[row + 2];
2552       v4   = aa + ai[row + 3];
2553       v5   = aa + ai[row + 4];
2554 
2555       for (j = 0; j < nz - 1; j += 2) {
2556         i0   = vi[j];
2557         i1   = vi[j + 1];
2558         tmp0 = tmps[i0];
2559         tmp1 = tmps[i1];
2560         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2561         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2562         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2563         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2564         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2565       }
2566       if (j == nz - 1) {
2567         tmp0 = tmps[vi[j]];
2568         sum1 -= v1[j] * tmp0;
2569         sum2 -= v2[j] * tmp0;
2570         sum3 -= v3[j] * tmp0;
2571         sum4 -= v4[j] * tmp0;
2572         sum5 -= v5[j] * tmp0;
2573       }
2574 
2575       sum2 -= v2[nz] * sum1;
2576       sum3 -= v3[nz] * sum1;
2577       sum4 -= v4[nz] * sum1;
2578       sum5 -= v5[nz] * sum1;
2579       sum3 -= v3[nz + 1] * sum2;
2580       sum4 -= v4[nz + 1] * sum2;
2581       sum5 -= v5[nz + 1] * sum2;
2582       sum4 -= v4[nz + 2] * sum3;
2583       sum5 -= v5[nz + 2] * sum3;
2584       sum5 -= v5[nz + 3] * sum4;
2585 
2586       tmp[row++] = sum1;
2587       tmp[row++] = sum2;
2588       tmp[row++] = sum3;
2589       tmp[row++] = sum4;
2590       tmp[row++] = sum5;
2591       break;
2592     default:
2593       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2594     }
2595   }
2596   /* backward solve the upper triangular */
2597   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2598     nsz = ns[i];
2599     aii = ad[row + 1] + 1;
2600     v1  = aa + aii;
2601     vi  = aj + aii;
2602     nz  = ad[row] - ad[row + 1] - 1;
2603 
2604     if (i > 0) {
2605       /* Prefetch the indices for the next block */
2606       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2607       /* Prefetch the data for the next block */
2608       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2609     }
2610 
2611     switch (nsz) { /* Each loop in 'case' is unrolled */
2612     case 1:
2613       sum1 = tmp[row];
2614 
2615       for (j = 0; j < nz - 1; j += 2) {
2616         i0   = vi[j];
2617         i1   = vi[j + 1];
2618         tmp0 = tmps[i0];
2619         tmp1 = tmps[i1];
2620         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2621       }
2622       if (j == nz - 1) {
2623         tmp0 = tmps[vi[j]];
2624         sum1 -= v1[j] * tmp0;
2625       }
2626       x[c[row]] = tmp[row] = sum1 * v1[nz];
2627       row--;
2628       break;
2629     case 2:
2630       sum1 = tmp[row];
2631       sum2 = tmp[row - 1];
2632       v2   = aa + ad[row] + 1;
2633       for (j = 0; j < nz - 1; j += 2) {
2634         i0   = vi[j];
2635         i1   = vi[j + 1];
2636         tmp0 = tmps[i0];
2637         tmp1 = tmps[i1];
2638         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2639         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2640       }
2641       if (j == nz - 1) {
2642         tmp0 = tmps[vi[j]];
2643         sum1 -= v1[j] * tmp0;
2644         sum2 -= v2[j + 1] * tmp0;
2645       }
2646 
2647       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2648       row--;
2649       sum2 -= v2[0] * tmp0;
2650       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2651       row--;
2652       break;
2653     case 3:
2654       sum1 = tmp[row];
2655       sum2 = tmp[row - 1];
2656       sum3 = tmp[row - 2];
2657       v2   = aa + ad[row] + 1;
2658       v3   = aa + ad[row - 1] + 1;
2659       for (j = 0; j < nz - 1; j += 2) {
2660         i0   = vi[j];
2661         i1   = vi[j + 1];
2662         tmp0 = tmps[i0];
2663         tmp1 = tmps[i1];
2664         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2665         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2666         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2667       }
2668       if (j == nz - 1) {
2669         tmp0 = tmps[vi[j]];
2670         sum1 -= v1[j] * tmp0;
2671         sum2 -= v2[j + 1] * tmp0;
2672         sum3 -= v3[j + 2] * tmp0;
2673       }
2674       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2675       row--;
2676       sum2 -= v2[0] * tmp0;
2677       sum3 -= v3[1] * tmp0;
2678       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2679       row--;
2680       sum3 -= v3[0] * tmp0;
2681       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2682       row--;
2683 
2684       break;
2685     case 4:
2686       sum1 = tmp[row];
2687       sum2 = tmp[row - 1];
2688       sum3 = tmp[row - 2];
2689       sum4 = tmp[row - 3];
2690       v2   = aa + ad[row] + 1;
2691       v3   = aa + ad[row - 1] + 1;
2692       v4   = aa + ad[row - 2] + 1;
2693 
2694       for (j = 0; j < nz - 1; j += 2) {
2695         i0   = vi[j];
2696         i1   = vi[j + 1];
2697         tmp0 = tmps[i0];
2698         tmp1 = tmps[i1];
2699         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2700         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2701         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2702         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2703       }
2704       if (j == nz - 1) {
2705         tmp0 = tmps[vi[j]];
2706         sum1 -= v1[j] * tmp0;
2707         sum2 -= v2[j + 1] * tmp0;
2708         sum3 -= v3[j + 2] * tmp0;
2709         sum4 -= v4[j + 3] * tmp0;
2710       }
2711 
2712       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2713       row--;
2714       sum2 -= v2[0] * tmp0;
2715       sum3 -= v3[1] * tmp0;
2716       sum4 -= v4[2] * tmp0;
2717       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2718       row--;
2719       sum3 -= v3[0] * tmp0;
2720       sum4 -= v4[1] * tmp0;
2721       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2722       row--;
2723       sum4 -= v4[0] * tmp0;
2724       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2725       row--;
2726       break;
2727     case 5:
2728       sum1 = tmp[row];
2729       sum2 = tmp[row - 1];
2730       sum3 = tmp[row - 2];
2731       sum4 = tmp[row - 3];
2732       sum5 = tmp[row - 4];
2733       v2   = aa + ad[row] + 1;
2734       v3   = aa + ad[row - 1] + 1;
2735       v4   = aa + ad[row - 2] + 1;
2736       v5   = aa + ad[row - 3] + 1;
2737       for (j = 0; j < nz - 1; j += 2) {
2738         i0   = vi[j];
2739         i1   = vi[j + 1];
2740         tmp0 = tmps[i0];
2741         tmp1 = tmps[i1];
2742         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2743         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2744         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2745         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2746         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2747       }
2748       if (j == nz - 1) {
2749         tmp0 = tmps[vi[j]];
2750         sum1 -= v1[j] * tmp0;
2751         sum2 -= v2[j + 1] * tmp0;
2752         sum3 -= v3[j + 2] * tmp0;
2753         sum4 -= v4[j + 3] * tmp0;
2754         sum5 -= v5[j + 4] * tmp0;
2755       }
2756 
2757       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2758       row--;
2759       sum2 -= v2[0] * tmp0;
2760       sum3 -= v3[1] * tmp0;
2761       sum4 -= v4[2] * tmp0;
2762       sum5 -= v5[3] * tmp0;
2763       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2764       row--;
2765       sum3 -= v3[0] * tmp0;
2766       sum4 -= v4[1] * tmp0;
2767       sum5 -= v5[2] * tmp0;
2768       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2769       row--;
2770       sum4 -= v4[0] * tmp0;
2771       sum5 -= v5[1] * tmp0;
2772       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2773       row--;
2774       sum5 -= v5[0] * tmp0;
2775       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2776       row--;
2777       break;
2778     default:
2779       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2780     }
2781   }
2782   PetscCall(ISRestoreIndices(isrow, &rout));
2783   PetscCall(ISRestoreIndices(iscol, &cout));
2784   PetscCall(VecRestoreArrayRead(bb, &b));
2785   PetscCall(VecRestoreArrayWrite(xx, &x));
2786   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2787   PetscFunctionReturn(PETSC_SUCCESS);
2788 }
2789 
2790 /*
2791      Makes a longer coloring[] array and calls the usual code with that
2792 */
2793 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2794 {
2795   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2796   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2797   PetscInt        *colorused, i;
2798   ISColoringValue *newcolor;
2799 
2800   PetscFunctionBegin;
2801   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2802   PetscCall(PetscMalloc1(n + 1, &newcolor));
2803   /* loop over inodes, marking a color for each column*/
2804   row = 0;
2805   for (i = 0; i < m; i++) {
2806     for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2807   }
2808 
2809   /* eliminate unneeded colors */
2810   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2811   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2812 
2813   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2814   ncolors = colorused[5 * ncolors - 1];
2815   for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2816   PetscCall(PetscFree(colorused));
2817   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2818   PetscCall(PetscFree(coloring));
2819   PetscFunctionReturn(PETSC_SUCCESS);
2820 }
2821 
2822 #include <petsc/private/kernels/blockinvert.h>
2823 
2824 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2825 {
2826   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2827   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2828   MatScalar         *ibdiag, *bdiag, work[25], *t;
2829   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2830   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2831   const PetscScalar *xb, *b;
2832   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2833   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2834   PetscInt           sz, k, ipvt[5];
2835   PetscBool          allowzeropivot, zeropivotdetected;
2836   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
2837 
2838   PetscFunctionBegin;
2839   allowzeropivot = PetscNot(A->erroriffailure);
2840   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2841   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2842   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2843 
2844   if (!a->inode.ibdiagvalid) {
2845     if (!a->inode.ibdiag) {
2846       /* calculate space needed for diagonal blocks */
2847       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2848       a->inode.bdiagsize = cnt;
2849 
2850       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2851     }
2852 
2853     /* copy over the diagonal blocks and invert them */
2854     ibdiag = a->inode.ibdiag;
2855     bdiag  = a->inode.bdiag;
2856     cnt    = 0;
2857     for (i = 0, row = 0; i < m; i++) {
2858       for (j = 0; j < sizes[i]; j++) {
2859         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2860       }
2861       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
2862 
2863       switch (sizes[i]) {
2864       case 1:
2865         /* Create matrix data structure */
2866         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2867           if (allowzeropivot) {
2868             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2869             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2870             A->factorerror_zeropivot_row   = row;
2871             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2872           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2873         }
2874         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2875         break;
2876       case 2:
2877         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2878         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2879         break;
2880       case 3:
2881         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2882         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2883         break;
2884       case 4:
2885         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2886         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2887         break;
2888       case 5:
2889         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2890         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2891         break;
2892       default:
2893         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2894       }
2895       cnt += sizes[i] * sizes[i];
2896       row += sizes[i];
2897     }
2898     a->inode.ibdiagvalid = PETSC_TRUE;
2899   }
2900   ibdiag = a->inode.ibdiag;
2901   bdiag  = a->inode.bdiag;
2902   t      = a->inode.ssor_work;
2903 
2904   PetscCall(VecGetArray(xx, &x));
2905   PetscCall(VecGetArrayRead(bb, &b));
2906   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2907   if (flag & SOR_ZERO_INITIAL_GUESS) {
2908     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2909       for (i = 0, row = 0; i < m; i++) {
2910         sz  = diag[row] - ii[row];
2911         v1  = a->a + ii[row];
2912         idx = a->j + ii[row];
2913 
2914         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2915         switch (sizes[i]) {
2916         case 1:
2917 
2918           sum1 = b[row];
2919           for (n = 0; n < sz - 1; n += 2) {
2920             i1 = idx[0];
2921             i2 = idx[1];
2922             idx += 2;
2923             tmp0 = x[i1];
2924             tmp1 = x[i2];
2925             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2926             v1 += 2;
2927           }
2928 
2929           if (n == sz - 1) {
2930             tmp0 = x[*idx];
2931             sum1 -= *v1 * tmp0;
2932           }
2933           t[row]   = sum1;
2934           x[row++] = sum1 * (*ibdiag++);
2935           break;
2936         case 2:
2937           v2   = a->a + ii[row + 1];
2938           sum1 = b[row];
2939           sum2 = b[row + 1];
2940           for (n = 0; n < sz - 1; n += 2) {
2941             i1 = idx[0];
2942             i2 = idx[1];
2943             idx += 2;
2944             tmp0 = x[i1];
2945             tmp1 = x[i2];
2946             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2947             v1 += 2;
2948             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2949             v2 += 2;
2950           }
2951 
2952           if (n == sz - 1) {
2953             tmp0 = x[*idx];
2954             sum1 -= v1[0] * tmp0;
2955             sum2 -= v2[0] * tmp0;
2956           }
2957           t[row]     = sum1;
2958           t[row + 1] = sum2;
2959           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2960           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2961           ibdiag += 4;
2962           break;
2963         case 3:
2964           v2   = a->a + ii[row + 1];
2965           v3   = a->a + ii[row + 2];
2966           sum1 = b[row];
2967           sum2 = b[row + 1];
2968           sum3 = b[row + 2];
2969           for (n = 0; n < sz - 1; n += 2) {
2970             i1 = idx[0];
2971             i2 = idx[1];
2972             idx += 2;
2973             tmp0 = x[i1];
2974             tmp1 = x[i2];
2975             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2976             v1 += 2;
2977             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2978             v2 += 2;
2979             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2980             v3 += 2;
2981           }
2982 
2983           if (n == sz - 1) {
2984             tmp0 = x[*idx];
2985             sum1 -= v1[0] * tmp0;
2986             sum2 -= v2[0] * tmp0;
2987             sum3 -= v3[0] * tmp0;
2988           }
2989           t[row]     = sum1;
2990           t[row + 1] = sum2;
2991           t[row + 2] = sum3;
2992           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2993           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2994           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2995           ibdiag += 9;
2996           break;
2997         case 4:
2998           v2   = a->a + ii[row + 1];
2999           v3   = a->a + ii[row + 2];
3000           v4   = a->a + ii[row + 3];
3001           sum1 = b[row];
3002           sum2 = b[row + 1];
3003           sum3 = b[row + 2];
3004           sum4 = b[row + 3];
3005           for (n = 0; n < sz - 1; n += 2) {
3006             i1 = idx[0];
3007             i2 = idx[1];
3008             idx += 2;
3009             tmp0 = x[i1];
3010             tmp1 = x[i2];
3011             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3012             v1 += 2;
3013             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3014             v2 += 2;
3015             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3016             v3 += 2;
3017             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3018             v4 += 2;
3019           }
3020 
3021           if (n == sz - 1) {
3022             tmp0 = x[*idx];
3023             sum1 -= v1[0] * tmp0;
3024             sum2 -= v2[0] * tmp0;
3025             sum3 -= v3[0] * tmp0;
3026             sum4 -= v4[0] * tmp0;
3027           }
3028           t[row]     = sum1;
3029           t[row + 1] = sum2;
3030           t[row + 2] = sum3;
3031           t[row + 3] = sum4;
3032           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3033           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3034           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3035           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3036           ibdiag += 16;
3037           break;
3038         case 5:
3039           v2   = a->a + ii[row + 1];
3040           v3   = a->a + ii[row + 2];
3041           v4   = a->a + ii[row + 3];
3042           v5   = a->a + ii[row + 4];
3043           sum1 = b[row];
3044           sum2 = b[row + 1];
3045           sum3 = b[row + 2];
3046           sum4 = b[row + 3];
3047           sum5 = b[row + 4];
3048           for (n = 0; n < sz - 1; n += 2) {
3049             i1 = idx[0];
3050             i2 = idx[1];
3051             idx += 2;
3052             tmp0 = x[i1];
3053             tmp1 = x[i2];
3054             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3055             v1 += 2;
3056             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3057             v2 += 2;
3058             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3059             v3 += 2;
3060             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3061             v4 += 2;
3062             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3063             v5 += 2;
3064           }
3065 
3066           if (n == sz - 1) {
3067             tmp0 = x[*idx];
3068             sum1 -= v1[0] * tmp0;
3069             sum2 -= v2[0] * tmp0;
3070             sum3 -= v3[0] * tmp0;
3071             sum4 -= v4[0] * tmp0;
3072             sum5 -= v5[0] * tmp0;
3073           }
3074           t[row]     = sum1;
3075           t[row + 1] = sum2;
3076           t[row + 2] = sum3;
3077           t[row + 3] = sum4;
3078           t[row + 4] = sum5;
3079           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3080           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3081           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3082           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3083           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3084           ibdiag += 25;
3085           break;
3086         default:
3087           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3088         }
3089       }
3090 
3091       xb = t;
3092       PetscCall(PetscLogFlops(a->nz));
3093     } else xb = b;
3094     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3095       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3096       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3097         ibdiag -= sizes[i] * sizes[i];
3098         sz  = ii[row + 1] - diag[row] - 1;
3099         v1  = a->a + diag[row] + 1;
3100         idx = a->j + diag[row] + 1;
3101 
3102         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3103         switch (sizes[i]) {
3104         case 1:
3105 
3106           sum1 = xb[row];
3107           for (n = 0; n < sz - 1; n += 2) {
3108             i1 = idx[0];
3109             i2 = idx[1];
3110             idx += 2;
3111             tmp0 = x[i1];
3112             tmp1 = x[i2];
3113             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3114             v1 += 2;
3115           }
3116 
3117           if (n == sz - 1) {
3118             tmp0 = x[*idx];
3119             sum1 -= *v1 * tmp0;
3120           }
3121           x[row--] = sum1 * (*ibdiag);
3122           break;
3123 
3124         case 2:
3125 
3126           sum1 = xb[row];
3127           sum2 = xb[row - 1];
3128           /* note that sum1 is associated with the second of the two rows */
3129           v2 = a->a + diag[row - 1] + 2;
3130           for (n = 0; n < sz - 1; n += 2) {
3131             i1 = idx[0];
3132             i2 = idx[1];
3133             idx += 2;
3134             tmp0 = x[i1];
3135             tmp1 = x[i2];
3136             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3137             v1 += 2;
3138             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3139             v2 += 2;
3140           }
3141 
3142           if (n == sz - 1) {
3143             tmp0 = x[*idx];
3144             sum1 -= *v1 * tmp0;
3145             sum2 -= *v2 * tmp0;
3146           }
3147           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3148           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3149           break;
3150         case 3:
3151 
3152           sum1 = xb[row];
3153           sum2 = xb[row - 1];
3154           sum3 = xb[row - 2];
3155           v2   = a->a + diag[row - 1] + 2;
3156           v3   = a->a + diag[row - 2] + 3;
3157           for (n = 0; n < sz - 1; n += 2) {
3158             i1 = idx[0];
3159             i2 = idx[1];
3160             idx += 2;
3161             tmp0 = x[i1];
3162             tmp1 = x[i2];
3163             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3164             v1 += 2;
3165             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3166             v2 += 2;
3167             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3168             v3 += 2;
3169           }
3170 
3171           if (n == sz - 1) {
3172             tmp0 = x[*idx];
3173             sum1 -= *v1 * tmp0;
3174             sum2 -= *v2 * tmp0;
3175             sum3 -= *v3 * tmp0;
3176           }
3177           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3178           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3179           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3180           break;
3181         case 4:
3182 
3183           sum1 = xb[row];
3184           sum2 = xb[row - 1];
3185           sum3 = xb[row - 2];
3186           sum4 = xb[row - 3];
3187           v2   = a->a + diag[row - 1] + 2;
3188           v3   = a->a + diag[row - 2] + 3;
3189           v4   = a->a + diag[row - 3] + 4;
3190           for (n = 0; n < sz - 1; n += 2) {
3191             i1 = idx[0];
3192             i2 = idx[1];
3193             idx += 2;
3194             tmp0 = x[i1];
3195             tmp1 = x[i2];
3196             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3197             v1 += 2;
3198             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3199             v2 += 2;
3200             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3201             v3 += 2;
3202             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3203             v4 += 2;
3204           }
3205 
3206           if (n == sz - 1) {
3207             tmp0 = x[*idx];
3208             sum1 -= *v1 * tmp0;
3209             sum2 -= *v2 * tmp0;
3210             sum3 -= *v3 * tmp0;
3211             sum4 -= *v4 * tmp0;
3212           }
3213           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3214           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3215           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3216           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3217           break;
3218         case 5:
3219 
3220           sum1 = xb[row];
3221           sum2 = xb[row - 1];
3222           sum3 = xb[row - 2];
3223           sum4 = xb[row - 3];
3224           sum5 = xb[row - 4];
3225           v2   = a->a + diag[row - 1] + 2;
3226           v3   = a->a + diag[row - 2] + 3;
3227           v4   = a->a + diag[row - 3] + 4;
3228           v5   = a->a + diag[row - 4] + 5;
3229           for (n = 0; n < sz - 1; n += 2) {
3230             i1 = idx[0];
3231             i2 = idx[1];
3232             idx += 2;
3233             tmp0 = x[i1];
3234             tmp1 = x[i2];
3235             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3236             v1 += 2;
3237             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3238             v2 += 2;
3239             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3240             v3 += 2;
3241             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3242             v4 += 2;
3243             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3244             v5 += 2;
3245           }
3246 
3247           if (n == sz - 1) {
3248             tmp0 = x[*idx];
3249             sum1 -= *v1 * tmp0;
3250             sum2 -= *v2 * tmp0;
3251             sum3 -= *v3 * tmp0;
3252             sum4 -= *v4 * tmp0;
3253             sum5 -= *v5 * tmp0;
3254           }
3255           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3256           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3257           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3258           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3259           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3260           break;
3261         default:
3262           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3263         }
3264       }
3265 
3266       PetscCall(PetscLogFlops(a->nz));
3267     }
3268     its--;
3269   }
3270   while (its--) {
3271     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3272       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3273         sz  = diag[row] - ii[row];
3274         v1  = a->a + ii[row];
3275         idx = a->j + ii[row];
3276         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3277         switch (sizes[i]) {
3278         case 1:
3279           sum1 = b[row];
3280           for (n = 0; n < sz - 1; n += 2) {
3281             i1 = idx[0];
3282             i2 = idx[1];
3283             idx += 2;
3284             tmp0 = x[i1];
3285             tmp1 = x[i2];
3286             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3287             v1 += 2;
3288           }
3289           if (n == sz - 1) {
3290             tmp0 = x[*idx++];
3291             sum1 -= *v1 * tmp0;
3292             v1++;
3293           }
3294           t[row] = sum1;
3295           sz     = ii[row + 1] - diag[row] - 1;
3296           idx    = a->j + diag[row] + 1;
3297           v1 += 1;
3298           for (n = 0; n < sz - 1; n += 2) {
3299             i1 = idx[0];
3300             i2 = idx[1];
3301             idx += 2;
3302             tmp0 = x[i1];
3303             tmp1 = x[i2];
3304             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3305             v1 += 2;
3306           }
3307           if (n == sz - 1) {
3308             tmp0 = x[*idx++];
3309             sum1 -= *v1 * tmp0;
3310           }
3311           /* in MatSOR_SeqAIJ this line would be
3312            *
3313            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3314            *
3315            * but omega == 1, so this becomes
3316            *
3317            * x[row] = sum1*(*ibdiag++);
3318            *
3319            */
3320           x[row] = sum1 * (*ibdiag);
3321           break;
3322         case 2:
3323           v2   = a->a + ii[row + 1];
3324           sum1 = b[row];
3325           sum2 = b[row + 1];
3326           for (n = 0; n < sz - 1; n += 2) {
3327             i1 = idx[0];
3328             i2 = idx[1];
3329             idx += 2;
3330             tmp0 = x[i1];
3331             tmp1 = x[i2];
3332             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3333             v1 += 2;
3334             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3335             v2 += 2;
3336           }
3337           if (n == sz - 1) {
3338             tmp0 = x[*idx++];
3339             sum1 -= v1[0] * tmp0;
3340             sum2 -= v2[0] * tmp0;
3341             v1++;
3342             v2++;
3343           }
3344           t[row]     = sum1;
3345           t[row + 1] = sum2;
3346           sz         = ii[row + 1] - diag[row] - 2;
3347           idx        = a->j + diag[row] + 2;
3348           v1 += 2;
3349           v2 += 2;
3350           for (n = 0; n < sz - 1; n += 2) {
3351             i1 = idx[0];
3352             i2 = idx[1];
3353             idx += 2;
3354             tmp0 = x[i1];
3355             tmp1 = x[i2];
3356             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3357             v1 += 2;
3358             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3359             v2 += 2;
3360           }
3361           if (n == sz - 1) {
3362             tmp0 = x[*idx];
3363             sum1 -= v1[0] * tmp0;
3364             sum2 -= v2[0] * tmp0;
3365           }
3366           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3367           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3368           break;
3369         case 3:
3370           v2   = a->a + ii[row + 1];
3371           v3   = a->a + ii[row + 2];
3372           sum1 = b[row];
3373           sum2 = b[row + 1];
3374           sum3 = b[row + 2];
3375           for (n = 0; n < sz - 1; n += 2) {
3376             i1 = idx[0];
3377             i2 = idx[1];
3378             idx += 2;
3379             tmp0 = x[i1];
3380             tmp1 = x[i2];
3381             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3382             v1 += 2;
3383             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3384             v2 += 2;
3385             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3386             v3 += 2;
3387           }
3388           if (n == sz - 1) {
3389             tmp0 = x[*idx++];
3390             sum1 -= v1[0] * tmp0;
3391             sum2 -= v2[0] * tmp0;
3392             sum3 -= v3[0] * tmp0;
3393             v1++;
3394             v2++;
3395             v3++;
3396           }
3397           t[row]     = sum1;
3398           t[row + 1] = sum2;
3399           t[row + 2] = sum3;
3400           sz         = ii[row + 1] - diag[row] - 3;
3401           idx        = a->j + diag[row] + 3;
3402           v1 += 3;
3403           v2 += 3;
3404           v3 += 3;
3405           for (n = 0; n < sz - 1; n += 2) {
3406             i1 = idx[0];
3407             i2 = idx[1];
3408             idx += 2;
3409             tmp0 = x[i1];
3410             tmp1 = x[i2];
3411             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3412             v1 += 2;
3413             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3414             v2 += 2;
3415             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3416             v3 += 2;
3417           }
3418           if (n == sz - 1) {
3419             tmp0 = x[*idx];
3420             sum1 -= v1[0] * tmp0;
3421             sum2 -= v2[0] * tmp0;
3422             sum3 -= v3[0] * tmp0;
3423           }
3424           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3425           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3426           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3427           break;
3428         case 4:
3429           v2   = a->a + ii[row + 1];
3430           v3   = a->a + ii[row + 2];
3431           v4   = a->a + ii[row + 3];
3432           sum1 = b[row];
3433           sum2 = b[row + 1];
3434           sum3 = b[row + 2];
3435           sum4 = b[row + 3];
3436           for (n = 0; n < sz - 1; n += 2) {
3437             i1 = idx[0];
3438             i2 = idx[1];
3439             idx += 2;
3440             tmp0 = x[i1];
3441             tmp1 = x[i2];
3442             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3443             v1 += 2;
3444             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3445             v2 += 2;
3446             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3447             v3 += 2;
3448             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3449             v4 += 2;
3450           }
3451           if (n == sz - 1) {
3452             tmp0 = x[*idx++];
3453             sum1 -= v1[0] * tmp0;
3454             sum2 -= v2[0] * tmp0;
3455             sum3 -= v3[0] * tmp0;
3456             sum4 -= v4[0] * tmp0;
3457             v1++;
3458             v2++;
3459             v3++;
3460             v4++;
3461           }
3462           t[row]     = sum1;
3463           t[row + 1] = sum2;
3464           t[row + 2] = sum3;
3465           t[row + 3] = sum4;
3466           sz         = ii[row + 1] - diag[row] - 4;
3467           idx        = a->j + diag[row] + 4;
3468           v1 += 4;
3469           v2 += 4;
3470           v3 += 4;
3471           v4 += 4;
3472           for (n = 0; n < sz - 1; n += 2) {
3473             i1 = idx[0];
3474             i2 = idx[1];
3475             idx += 2;
3476             tmp0 = x[i1];
3477             tmp1 = x[i2];
3478             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3479             v1 += 2;
3480             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3481             v2 += 2;
3482             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3483             v3 += 2;
3484             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3485             v4 += 2;
3486           }
3487           if (n == sz - 1) {
3488             tmp0 = x[*idx];
3489             sum1 -= v1[0] * tmp0;
3490             sum2 -= v2[0] * tmp0;
3491             sum3 -= v3[0] * tmp0;
3492             sum4 -= v4[0] * tmp0;
3493           }
3494           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3495           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3496           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3497           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3498           break;
3499         case 5:
3500           v2   = a->a + ii[row + 1];
3501           v3   = a->a + ii[row + 2];
3502           v4   = a->a + ii[row + 3];
3503           v5   = a->a + ii[row + 4];
3504           sum1 = b[row];
3505           sum2 = b[row + 1];
3506           sum3 = b[row + 2];
3507           sum4 = b[row + 3];
3508           sum5 = b[row + 4];
3509           for (n = 0; n < sz - 1; n += 2) {
3510             i1 = idx[0];
3511             i2 = idx[1];
3512             idx += 2;
3513             tmp0 = x[i1];
3514             tmp1 = x[i2];
3515             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3516             v1 += 2;
3517             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3518             v2 += 2;
3519             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3520             v3 += 2;
3521             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3522             v4 += 2;
3523             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3524             v5 += 2;
3525           }
3526           if (n == sz - 1) {
3527             tmp0 = x[*idx++];
3528             sum1 -= v1[0] * tmp0;
3529             sum2 -= v2[0] * tmp0;
3530             sum3 -= v3[0] * tmp0;
3531             sum4 -= v4[0] * tmp0;
3532             sum5 -= v5[0] * tmp0;
3533             v1++;
3534             v2++;
3535             v3++;
3536             v4++;
3537             v5++;
3538           }
3539           t[row]     = sum1;
3540           t[row + 1] = sum2;
3541           t[row + 2] = sum3;
3542           t[row + 3] = sum4;
3543           t[row + 4] = sum5;
3544           sz         = ii[row + 1] - diag[row] - 5;
3545           idx        = a->j + diag[row] + 5;
3546           v1 += 5;
3547           v2 += 5;
3548           v3 += 5;
3549           v4 += 5;
3550           v5 += 5;
3551           for (n = 0; n < sz - 1; n += 2) {
3552             i1 = idx[0];
3553             i2 = idx[1];
3554             idx += 2;
3555             tmp0 = x[i1];
3556             tmp1 = x[i2];
3557             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3558             v1 += 2;
3559             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3560             v2 += 2;
3561             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3562             v3 += 2;
3563             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3564             v4 += 2;
3565             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3566             v5 += 2;
3567           }
3568           if (n == sz - 1) {
3569             tmp0 = x[*idx];
3570             sum1 -= v1[0] * tmp0;
3571             sum2 -= v2[0] * tmp0;
3572             sum3 -= v3[0] * tmp0;
3573             sum4 -= v4[0] * tmp0;
3574             sum5 -= v5[0] * tmp0;
3575           }
3576           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3577           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3578           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3579           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3580           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3581           break;
3582         default:
3583           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3584         }
3585       }
3586       xb = t;
3587       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3588     } else xb = b;
3589 
3590     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3591       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3592       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3593         ibdiag -= sizes[i] * sizes[i];
3594 
3595         /* set RHS */
3596         if (xb == b) {
3597           /* whole (old way) */
3598           sz  = ii[row + 1] - ii[row];
3599           idx = a->j + ii[row];
3600           switch (sizes[i]) {
3601           case 5:
3602             v5 = a->a + ii[row - 4]; /* fall through */
3603           case 4:
3604             v4 = a->a + ii[row - 3]; /* fall through */
3605           case 3:
3606             v3 = a->a + ii[row - 2]; /* fall through */
3607           case 2:
3608             v2 = a->a + ii[row - 1]; /* fall through */
3609           case 1:
3610             v1 = a->a + ii[row];
3611             break;
3612           default:
3613             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3614           }
3615         } else {
3616           /* upper, no diag */
3617           sz  = ii[row + 1] - diag[row] - 1;
3618           idx = a->j + diag[row] + 1;
3619           switch (sizes[i]) {
3620           case 5:
3621             v5 = a->a + diag[row - 4] + 5; /* fall through */
3622           case 4:
3623             v4 = a->a + diag[row - 3] + 4; /* fall through */
3624           case 3:
3625             v3 = a->a + diag[row - 2] + 3; /* fall through */
3626           case 2:
3627             v2 = a->a + diag[row - 1] + 2; /* fall through */
3628           case 1:
3629             v1 = a->a + diag[row] + 1;
3630           }
3631         }
3632         /* set sum */
3633         switch (sizes[i]) {
3634         case 5:
3635           sum5 = xb[row - 4]; /* fall through */
3636         case 4:
3637           sum4 = xb[row - 3]; /* fall through */
3638         case 3:
3639           sum3 = xb[row - 2]; /* fall through */
3640         case 2:
3641           sum2 = xb[row - 1]; /* fall through */
3642         case 1:
3643           /* note that sum1 is associated with the last row */
3644           sum1 = xb[row];
3645         }
3646         /* do sums */
3647         for (n = 0; n < sz - 1; n += 2) {
3648           i1 = idx[0];
3649           i2 = idx[1];
3650           idx += 2;
3651           tmp0 = x[i1];
3652           tmp1 = x[i2];
3653           switch (sizes[i]) {
3654           case 5:
3655             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3656             v5 += 2; /* fall through */
3657           case 4:
3658             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3659             v4 += 2; /* fall through */
3660           case 3:
3661             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3662             v3 += 2; /* fall through */
3663           case 2:
3664             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3665             v2 += 2; /* fall through */
3666           case 1:
3667             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3668             v1 += 2;
3669           }
3670         }
3671         /* ragged edge */
3672         if (n == sz - 1) {
3673           tmp0 = x[*idx];
3674           switch (sizes[i]) {
3675           case 5:
3676             sum5 -= *v5 * tmp0; /* fall through */
3677           case 4:
3678             sum4 -= *v4 * tmp0; /* fall through */
3679           case 3:
3680             sum3 -= *v3 * tmp0; /* fall through */
3681           case 2:
3682             sum2 -= *v2 * tmp0; /* fall through */
3683           case 1:
3684             sum1 -= *v1 * tmp0;
3685           }
3686         }
3687         /* update */
3688         if (xb == b) {
3689           /* whole (old way) w/ diag */
3690           switch (sizes[i]) {
3691           case 5:
3692             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3693             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3694             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3695             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3696             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3697             break;
3698           case 4:
3699             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3700             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3701             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3702             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3703             break;
3704           case 3:
3705             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3706             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3707             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3708             break;
3709           case 2:
3710             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3711             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3712             break;
3713           case 1:
3714             x[row--] += sum1 * (*ibdiag);
3715             break;
3716           }
3717         } else {
3718           /* no diag so set =  */
3719           switch (sizes[i]) {
3720           case 5:
3721             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3722             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3723             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3724             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3725             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3726             break;
3727           case 4:
3728             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3729             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3730             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3731             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3732             break;
3733           case 3:
3734             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3735             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3736             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3737             break;
3738           case 2:
3739             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3740             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3741             break;
3742           case 1:
3743             x[row--] = sum1 * (*ibdiag);
3744             break;
3745           }
3746         }
3747       }
3748       if (xb == b) {
3749         PetscCall(PetscLogFlops(2.0 * a->nz));
3750       } else {
3751         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3752       }
3753     }
3754   }
3755   if (flag & SOR_EISENSTAT) {
3756     /*
3757           Apply  (U + D)^-1  where D is now the block diagonal
3758     */
3759     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3760     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3761       ibdiag -= sizes[i] * sizes[i];
3762       sz  = ii[row + 1] - diag[row] - 1;
3763       v1  = a->a + diag[row] + 1;
3764       idx = a->j + diag[row] + 1;
3765       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3766       switch (sizes[i]) {
3767       case 1:
3768 
3769         sum1 = b[row];
3770         for (n = 0; n < sz - 1; n += 2) {
3771           i1 = idx[0];
3772           i2 = idx[1];
3773           idx += 2;
3774           tmp0 = x[i1];
3775           tmp1 = x[i2];
3776           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3777           v1 += 2;
3778         }
3779 
3780         if (n == sz - 1) {
3781           tmp0 = x[*idx];
3782           sum1 -= *v1 * tmp0;
3783         }
3784         x[row] = sum1 * (*ibdiag);
3785         row--;
3786         break;
3787 
3788       case 2:
3789 
3790         sum1 = b[row];
3791         sum2 = b[row - 1];
3792         /* note that sum1 is associated with the second of the two rows */
3793         v2 = a->a + diag[row - 1] + 2;
3794         for (n = 0; n < sz - 1; n += 2) {
3795           i1 = idx[0];
3796           i2 = idx[1];
3797           idx += 2;
3798           tmp0 = x[i1];
3799           tmp1 = x[i2];
3800           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3801           v1 += 2;
3802           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3803           v2 += 2;
3804         }
3805 
3806         if (n == sz - 1) {
3807           tmp0 = x[*idx];
3808           sum1 -= *v1 * tmp0;
3809           sum2 -= *v2 * tmp0;
3810         }
3811         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3812         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3813         row -= 2;
3814         break;
3815       case 3:
3816 
3817         sum1 = b[row];
3818         sum2 = b[row - 1];
3819         sum3 = b[row - 2];
3820         v2   = a->a + diag[row - 1] + 2;
3821         v3   = a->a + diag[row - 2] + 3;
3822         for (n = 0; n < sz - 1; n += 2) {
3823           i1 = idx[0];
3824           i2 = idx[1];
3825           idx += 2;
3826           tmp0 = x[i1];
3827           tmp1 = x[i2];
3828           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3829           v1 += 2;
3830           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3831           v2 += 2;
3832           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3833           v3 += 2;
3834         }
3835 
3836         if (n == sz - 1) {
3837           tmp0 = x[*idx];
3838           sum1 -= *v1 * tmp0;
3839           sum2 -= *v2 * tmp0;
3840           sum3 -= *v3 * tmp0;
3841         }
3842         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3843         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3844         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3845         row -= 3;
3846         break;
3847       case 4:
3848 
3849         sum1 = b[row];
3850         sum2 = b[row - 1];
3851         sum3 = b[row - 2];
3852         sum4 = b[row - 3];
3853         v2   = a->a + diag[row - 1] + 2;
3854         v3   = a->a + diag[row - 2] + 3;
3855         v4   = a->a + diag[row - 3] + 4;
3856         for (n = 0; n < sz - 1; n += 2) {
3857           i1 = idx[0];
3858           i2 = idx[1];
3859           idx += 2;
3860           tmp0 = x[i1];
3861           tmp1 = x[i2];
3862           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3863           v1 += 2;
3864           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3865           v2 += 2;
3866           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3867           v3 += 2;
3868           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3869           v4 += 2;
3870         }
3871 
3872         if (n == sz - 1) {
3873           tmp0 = x[*idx];
3874           sum1 -= *v1 * tmp0;
3875           sum2 -= *v2 * tmp0;
3876           sum3 -= *v3 * tmp0;
3877           sum4 -= *v4 * tmp0;
3878         }
3879         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3880         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3881         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3882         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3883         row -= 4;
3884         break;
3885       case 5:
3886 
3887         sum1 = b[row];
3888         sum2 = b[row - 1];
3889         sum3 = b[row - 2];
3890         sum4 = b[row - 3];
3891         sum5 = b[row - 4];
3892         v2   = a->a + diag[row - 1] + 2;
3893         v3   = a->a + diag[row - 2] + 3;
3894         v4   = a->a + diag[row - 3] + 4;
3895         v5   = a->a + diag[row - 4] + 5;
3896         for (n = 0; n < sz - 1; n += 2) {
3897           i1 = idx[0];
3898           i2 = idx[1];
3899           idx += 2;
3900           tmp0 = x[i1];
3901           tmp1 = x[i2];
3902           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3903           v1 += 2;
3904           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3905           v2 += 2;
3906           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3907           v3 += 2;
3908           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3909           v4 += 2;
3910           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3911           v5 += 2;
3912         }
3913 
3914         if (n == sz - 1) {
3915           tmp0 = x[*idx];
3916           sum1 -= *v1 * tmp0;
3917           sum2 -= *v2 * tmp0;
3918           sum3 -= *v3 * tmp0;
3919           sum4 -= *v4 * tmp0;
3920           sum5 -= *v5 * tmp0;
3921         }
3922         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3923         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3924         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3925         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3926         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3927         row -= 5;
3928         break;
3929       default:
3930         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3931       }
3932     }
3933     PetscCall(PetscLogFlops(a->nz));
3934 
3935     /*
3936            t = b - D x    where D is the block diagonal
3937     */
3938     cnt = 0;
3939     for (i = 0, row = 0; i < m; i++) {
3940       switch (sizes[i]) {
3941       case 1:
3942         t[row] = b[row] - bdiag[cnt++] * x[row];
3943         row++;
3944         break;
3945       case 2:
3946         x1         = x[row];
3947         x2         = x[row + 1];
3948         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3949         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3950         t[row]     = b[row] - tmp1;
3951         t[row + 1] = b[row + 1] - tmp2;
3952         row += 2;
3953         cnt += 4;
3954         break;
3955       case 3:
3956         x1         = x[row];
3957         x2         = x[row + 1];
3958         x3         = x[row + 2];
3959         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3960         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3961         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3962         t[row]     = b[row] - tmp1;
3963         t[row + 1] = b[row + 1] - tmp2;
3964         t[row + 2] = b[row + 2] - tmp3;
3965         row += 3;
3966         cnt += 9;
3967         break;
3968       case 4:
3969         x1         = x[row];
3970         x2         = x[row + 1];
3971         x3         = x[row + 2];
3972         x4         = x[row + 3];
3973         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3974         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3975         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3976         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3977         t[row]     = b[row] - tmp1;
3978         t[row + 1] = b[row + 1] - tmp2;
3979         t[row + 2] = b[row + 2] - tmp3;
3980         t[row + 3] = b[row + 3] - tmp4;
3981         row += 4;
3982         cnt += 16;
3983         break;
3984       case 5:
3985         x1         = x[row];
3986         x2         = x[row + 1];
3987         x3         = x[row + 2];
3988         x4         = x[row + 3];
3989         x5         = x[row + 4];
3990         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3991         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3992         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3993         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3994         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3995         t[row]     = b[row] - tmp1;
3996         t[row + 1] = b[row + 1] - tmp2;
3997         t[row + 2] = b[row + 2] - tmp3;
3998         t[row + 3] = b[row + 3] - tmp4;
3999         t[row + 4] = b[row + 4] - tmp5;
4000         row += 5;
4001         cnt += 25;
4002         break;
4003       default:
4004         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4005       }
4006     }
4007     PetscCall(PetscLogFlops(m));
4008 
4009     /*
4010           Apply (L + D)^-1 where D is the block diagonal
4011     */
4012     for (i = 0, row = 0; i < m; i++) {
4013       sz  = diag[row] - ii[row];
4014       v1  = a->a + ii[row];
4015       idx = a->j + ii[row];
4016       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4017       switch (sizes[i]) {
4018       case 1:
4019 
4020         sum1 = t[row];
4021         for (n = 0; n < sz - 1; n += 2) {
4022           i1 = idx[0];
4023           i2 = idx[1];
4024           idx += 2;
4025           tmp0 = t[i1];
4026           tmp1 = t[i2];
4027           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4028           v1 += 2;
4029         }
4030 
4031         if (n == sz - 1) {
4032           tmp0 = t[*idx];
4033           sum1 -= *v1 * tmp0;
4034         }
4035         x[row] += t[row] = sum1 * (*ibdiag++);
4036         row++;
4037         break;
4038       case 2:
4039         v2   = a->a + ii[row + 1];
4040         sum1 = t[row];
4041         sum2 = t[row + 1];
4042         for (n = 0; n < sz - 1; n += 2) {
4043           i1 = idx[0];
4044           i2 = idx[1];
4045           idx += 2;
4046           tmp0 = t[i1];
4047           tmp1 = t[i2];
4048           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4049           v1 += 2;
4050           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4051           v2 += 2;
4052         }
4053 
4054         if (n == sz - 1) {
4055           tmp0 = t[*idx];
4056           sum1 -= v1[0] * tmp0;
4057           sum2 -= v2[0] * tmp0;
4058         }
4059         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4060         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4061         ibdiag += 4;
4062         row += 2;
4063         break;
4064       case 3:
4065         v2   = a->a + ii[row + 1];
4066         v3   = a->a + ii[row + 2];
4067         sum1 = t[row];
4068         sum2 = t[row + 1];
4069         sum3 = t[row + 2];
4070         for (n = 0; n < sz - 1; n += 2) {
4071           i1 = idx[0];
4072           i2 = idx[1];
4073           idx += 2;
4074           tmp0 = t[i1];
4075           tmp1 = t[i2];
4076           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4077           v1 += 2;
4078           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4079           v2 += 2;
4080           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4081           v3 += 2;
4082         }
4083 
4084         if (n == sz - 1) {
4085           tmp0 = t[*idx];
4086           sum1 -= v1[0] * tmp0;
4087           sum2 -= v2[0] * tmp0;
4088           sum3 -= v3[0] * tmp0;
4089         }
4090         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4091         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4092         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4093         ibdiag += 9;
4094         row += 3;
4095         break;
4096       case 4:
4097         v2   = a->a + ii[row + 1];
4098         v3   = a->a + ii[row + 2];
4099         v4   = a->a + ii[row + 3];
4100         sum1 = t[row];
4101         sum2 = t[row + 1];
4102         sum3 = t[row + 2];
4103         sum4 = t[row + 3];
4104         for (n = 0; n < sz - 1; n += 2) {
4105           i1 = idx[0];
4106           i2 = idx[1];
4107           idx += 2;
4108           tmp0 = t[i1];
4109           tmp1 = t[i2];
4110           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4111           v1 += 2;
4112           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4113           v2 += 2;
4114           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4115           v3 += 2;
4116           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4117           v4 += 2;
4118         }
4119 
4120         if (n == sz - 1) {
4121           tmp0 = t[*idx];
4122           sum1 -= v1[0] * tmp0;
4123           sum2 -= v2[0] * tmp0;
4124           sum3 -= v3[0] * tmp0;
4125           sum4 -= v4[0] * tmp0;
4126         }
4127         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4128         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4129         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4130         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4131         ibdiag += 16;
4132         row += 4;
4133         break;
4134       case 5:
4135         v2   = a->a + ii[row + 1];
4136         v3   = a->a + ii[row + 2];
4137         v4   = a->a + ii[row + 3];
4138         v5   = a->a + ii[row + 4];
4139         sum1 = t[row];
4140         sum2 = t[row + 1];
4141         sum3 = t[row + 2];
4142         sum4 = t[row + 3];
4143         sum5 = t[row + 4];
4144         for (n = 0; n < sz - 1; n += 2) {
4145           i1 = idx[0];
4146           i2 = idx[1];
4147           idx += 2;
4148           tmp0 = t[i1];
4149           tmp1 = t[i2];
4150           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4151           v1 += 2;
4152           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4153           v2 += 2;
4154           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4155           v3 += 2;
4156           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4157           v4 += 2;
4158           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4159           v5 += 2;
4160         }
4161 
4162         if (n == sz - 1) {
4163           tmp0 = t[*idx];
4164           sum1 -= v1[0] * tmp0;
4165           sum2 -= v2[0] * tmp0;
4166           sum3 -= v3[0] * tmp0;
4167           sum4 -= v4[0] * tmp0;
4168           sum5 -= v5[0] * tmp0;
4169         }
4170         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4171         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4172         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4173         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4174         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4175         ibdiag += 25;
4176         row += 5;
4177         break;
4178       default:
4179         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4180       }
4181     }
4182     PetscCall(PetscLogFlops(a->nz));
4183   }
4184   PetscCall(VecRestoreArray(xx, &x));
4185   PetscCall(VecRestoreArrayRead(bb, &b));
4186   PetscFunctionReturn(PETSC_SUCCESS);
4187 }
4188 
4189 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4190 {
4191   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4192   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4193   const MatScalar   *bdiag = a->inode.bdiag;
4194   const PetscScalar *b;
4195   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
4196   const PetscInt    *sizes = a->inode.size;
4197 
4198   PetscFunctionBegin;
4199   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4200   PetscCall(VecGetArray(xx, &x));
4201   PetscCall(VecGetArrayRead(bb, &b));
4202   cnt = 0;
4203   for (i = 0, row = 0; i < m; i++) {
4204     switch (sizes[i]) {
4205     case 1:
4206       x[row] = b[row] * bdiag[cnt++];
4207       row++;
4208       break;
4209     case 2:
4210       x1       = b[row];
4211       x2       = b[row + 1];
4212       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4213       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4214       x[row++] = tmp1;
4215       x[row++] = tmp2;
4216       cnt += 4;
4217       break;
4218     case 3:
4219       x1       = b[row];
4220       x2       = b[row + 1];
4221       x3       = b[row + 2];
4222       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4223       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4224       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4225       x[row++] = tmp1;
4226       x[row++] = tmp2;
4227       x[row++] = tmp3;
4228       cnt += 9;
4229       break;
4230     case 4:
4231       x1       = b[row];
4232       x2       = b[row + 1];
4233       x3       = b[row + 2];
4234       x4       = b[row + 3];
4235       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4236       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4237       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4238       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4239       x[row++] = tmp1;
4240       x[row++] = tmp2;
4241       x[row++] = tmp3;
4242       x[row++] = tmp4;
4243       cnt += 16;
4244       break;
4245     case 5:
4246       x1       = b[row];
4247       x2       = b[row + 1];
4248       x3       = b[row + 2];
4249       x4       = b[row + 3];
4250       x5       = b[row + 4];
4251       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4252       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4253       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4254       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4255       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4256       x[row++] = tmp1;
4257       x[row++] = tmp2;
4258       x[row++] = tmp3;
4259       x[row++] = tmp4;
4260       x[row++] = tmp5;
4261       cnt += 25;
4262       break;
4263     default:
4264       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4265     }
4266   }
4267   PetscCall(PetscLogFlops(2.0 * cnt));
4268   PetscCall(VecRestoreArray(xx, &x));
4269   PetscCall(VecRestoreArrayRead(bb, &b));
4270   PetscFunctionReturn(PETSC_SUCCESS);
4271 }
4272 
4273 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4274 {
4275   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4276 
4277   PetscFunctionBegin;
4278   a->inode.node_count       = 0;
4279   a->inode.use              = PETSC_FALSE;
4280   a->inode.checked          = PETSC_FALSE;
4281   a->inode.mat_nonzerostate = -1;
4282   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4283   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4284   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4285   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4286   A->ops->coloringpatch     = NULL;
4287   A->ops->multdiagonalblock = NULL;
4288   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4289   PetscFunctionReturn(PETSC_SUCCESS);
4290 }
4291 
4292 /*
4293     samestructure indicates that the matrix has not changed its nonzero structure so we
4294     do not need to recompute the inodes
4295 */
4296 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4297 {
4298   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4299   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4300   PetscBool       flag;
4301   const PetscInt *idx, *idy, *ii;
4302 
4303   PetscFunctionBegin;
4304   if (!a->inode.use) {
4305     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4306     PetscCall(PetscFree(a->inode.size));
4307     PetscFunctionReturn(PETSC_SUCCESS);
4308   }
4309   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4310 
4311   m = A->rmap->n;
4312   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4313   ns = a->inode.size;
4314 
4315   i          = 0;
4316   node_count = 0;
4317   idx        = a->j;
4318   ii         = a->i;
4319   if (idx) {
4320     while (i < m) {            /* For each row */
4321       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4322       /* Limits the number of elements in a node to 'a->inode.limit' */
4323       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4324         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4325         if (nzy != nzx) break;
4326         idy += nzx; /* Same nonzero pattern */
4327         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4328         if (!flag) break;
4329       }
4330       ns[node_count++] = blk_size;
4331       idx += blk_size * nzx;
4332       i = j;
4333     }
4334   }
4335   /* If not enough inodes found,, do not use inode version of the routines */
4336   if (!m || !idx || node_count > .8 * m) {
4337     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4338     PetscCall(PetscFree(a->inode.size));
4339     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4340   } else {
4341     if (!A->factortype) {
4342       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4343       if (A->rmap->n == A->cmap->n) {
4344         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4345         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4346         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4347         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4348         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4349       }
4350     } else {
4351       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4352     }
4353     a->inode.node_count = node_count;
4354     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4355   }
4356   a->inode.checked          = PETSC_TRUE;
4357   a->inode.mat_nonzerostate = A->nonzerostate;
4358   PetscFunctionReturn(PETSC_SUCCESS);
4359 }
4360 
4361 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4362 {
4363   Mat         B = *C;
4364   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4365   PetscInt    m = A->rmap->n;
4366 
4367   PetscFunctionBegin;
4368   c->inode.use              = a->inode.use;
4369   c->inode.limit            = a->inode.limit;
4370   c->inode.max_limit        = a->inode.max_limit;
4371   c->inode.checked          = PETSC_FALSE;
4372   c->inode.size             = NULL;
4373   c->inode.node_count       = 0;
4374   c->inode.ibdiagvalid      = PETSC_FALSE;
4375   c->inode.ibdiag           = NULL;
4376   c->inode.bdiag            = NULL;
4377   c->inode.mat_nonzerostate = -1;
4378   if (a->inode.use) {
4379     if (a->inode.checked && a->inode.size) {
4380       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4381       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4382 
4383       c->inode.checked          = PETSC_TRUE;
4384       c->inode.node_count       = a->inode.node_count;
4385       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4386     }
4387     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4388     if (!B->factortype) {
4389       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4390       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4391       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4392       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4393       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4394       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4395     } else {
4396       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4397     }
4398   }
4399   PetscFunctionReturn(PETSC_SUCCESS);
4400 }
4401 
4402 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4403 {
4404   PetscInt        k;
4405   const PetscInt *vi;
4406 
4407   PetscFunctionBegin;
4408   vi = aj + ai[row];
4409   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4410   vi        = aj + adiag[row];
4411   cols[nzl] = vi[0];
4412   vi        = aj + adiag[row + 1] + 1;
4413   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4414   PetscFunctionReturn(PETSC_SUCCESS);
4415 }
4416 /*
4417    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4418    Modified from MatSeqAIJCheckInode().
4419 
4420    Input Parameters:
4421 .  Mat A - ILU or LU matrix factor
4422 
4423 */
4424 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4425 {
4426   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4427   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4428   PetscInt       *cols1, *cols2, *ns;
4429   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4430   PetscBool       flag;
4431 
4432   PetscFunctionBegin;
4433   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4434   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4435 
4436   m = A->rmap->n;
4437   if (a->inode.size) ns = a->inode.size;
4438   else PetscCall(PetscMalloc1(m + 1, &ns));
4439 
4440   i          = 0;
4441   node_count = 0;
4442   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4443   while (i < m) {                       /* For each row */
4444     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4445     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4446     nzx  = nzl1 + nzu1 + 1;
4447     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4448 
4449     /* Limits the number of elements in a node to 'a->inode.limit' */
4450     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4451       nzl2 = ai[j + 1] - ai[j];
4452       nzu2 = adiag[j] - adiag[j + 1] - 1;
4453       nzy  = nzl2 + nzu2 + 1;
4454       if (nzy != nzx) break;
4455       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4456       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4457       if (!flag) break;
4458     }
4459     ns[node_count++] = blk_size;
4460     i                = j;
4461   }
4462   PetscCall(PetscFree2(cols1, cols2));
4463   /* If not enough inodes found,, do not use inode version of the routines */
4464   if (!m || node_count > .8 * m) {
4465     PetscCall(PetscFree(ns));
4466 
4467     a->inode.node_count = 0;
4468     a->inode.size       = NULL;
4469     a->inode.use        = PETSC_FALSE;
4470 
4471     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4472   } else {
4473     A->ops->mult              = NULL;
4474     A->ops->sor               = NULL;
4475     A->ops->multadd           = NULL;
4476     A->ops->getrowij          = NULL;
4477     A->ops->restorerowij      = NULL;
4478     A->ops->getcolumnij       = NULL;
4479     A->ops->restorecolumnij   = NULL;
4480     A->ops->coloringpatch     = NULL;
4481     A->ops->multdiagonalblock = NULL;
4482     a->inode.node_count       = node_count;
4483     a->inode.size             = ns;
4484 
4485     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4486   }
4487   a->inode.checked = PETSC_TRUE;
4488   PetscFunctionReturn(PETSC_SUCCESS);
4489 }
4490 
4491 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4492 {
4493   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4494 
4495   PetscFunctionBegin;
4496   a->inode.ibdiagvalid = PETSC_FALSE;
4497   PetscFunctionReturn(PETSC_SUCCESS);
4498 }
4499 
4500 /*
4501      This is really ugly. if inodes are used this replaces the
4502   permutations with ones that correspond to rows/cols of the matrix
4503   rather then inode blocks
4504 */
4505 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4506 {
4507   PetscFunctionBegin;
4508   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4509   PetscFunctionReturn(PETSC_SUCCESS);
4510 }
4511 
4512 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4513 {
4514   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4515   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4516   const PetscInt *ridx, *cidx;
4517   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4518   PetscInt        nslim_col, *ns_col;
4519   IS              ris = *rperm, cis = *cperm;
4520 
4521   PetscFunctionBegin;
4522   if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS);           /* no inodes so return */
4523   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4524 
4525   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4526   PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
4527   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4528 
4529   PetscCall(ISGetIndices(ris, &ridx));
4530   PetscCall(ISGetIndices(cis, &cidx));
4531 
4532   /* Form the inode structure for the rows of permuted matric using inv perm*/
4533   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
4534 
4535   /* Construct the permutations for rows*/
4536   for (i = 0, row = 0; i < nslim_row; ++i) {
4537     indx      = ridx[i];
4538     start_val = tns[indx];
4539     end_val   = tns[indx + 1];
4540     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4541   }
4542 
4543   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4544   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
4545 
4546   /* Construct permutations for columns */
4547   for (i = 0, col = 0; i < nslim_col; ++i) {
4548     indx      = cidx[i];
4549     start_val = tns[indx];
4550     end_val   = tns[indx + 1];
4551     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4552   }
4553 
4554   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4555   PetscCall(ISSetPermutation(*rperm));
4556   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4557   PetscCall(ISSetPermutation(*cperm));
4558 
4559   PetscCall(ISRestoreIndices(ris, &ridx));
4560   PetscCall(ISRestoreIndices(cis, &cidx));
4561 
4562   PetscCall(PetscFree(ns_col));
4563   PetscCall(PetscFree2(permr, permc));
4564   PetscCall(ISDestroy(&cis));
4565   PetscCall(ISDestroy(&ris));
4566   PetscCall(PetscFree(tns));
4567   PetscFunctionReturn(PETSC_SUCCESS);
4568 }
4569 
4570 /*@C
4571    MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4572 
4573    Not Collective
4574 
4575    Input Parameter:
4576 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4577 
4578    Output Parameters:
4579 +  node_count - no of inodes present in the matrix.
4580 .  sizes      - an array of size `node_count`, with the sizes of each inode.
4581 -  limit      - the max size used to generate the inodes.
4582 
4583    Level: advanced
4584 
4585    Note:
4586    It should be called after the matrix is assembled.
4587    The contents of the sizes[] array should not be changed.
4588    `NULL` may be passed for information not needed
4589 
4590 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4591 @*/
4592 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4593 {
4594   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4595 
4596   PetscFunctionBegin;
4597   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4598   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4599   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4600   PetscFunctionReturn(PETSC_SUCCESS);
4601 }
4602 
4603 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4604 {
4605   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4606 
4607   PetscFunctionBegin;
4608   if (node_count) *node_count = a->inode.node_count;
4609   if (sizes) *sizes = a->inode.size;
4610   if (limit) *limit = a->inode.limit;
4611   PetscFunctionReturn(PETSC_SUCCESS);
4612 }
4613