xref: /petsc/src/mat/impls/aij/seq/inode.c (revision dc9a610e8a93b2ceab441eb761644f4cbb77655c)
1 /*
2   This file provides high performance routines for the Inode format (compressed sparse row)
3   by taking advantage of rows with identical nonzero structure (I-nodes).
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #if defined(PETSC_HAVE_XMMINTRIN_H)
7   #include <xmmintrin.h>
8 #endif
9 
10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11 {
12   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
14 
15   PetscFunctionBegin;
16   n = A->cmap->n;
17   m = A->rmap->n;
18   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++);
24     for (; count + 1 < n; count++, i++);
25     if (count < n) i++;
26     *size = i;
27     PetscFunctionReturn(PETSC_SUCCESS);
28   }
29   PetscCall(PetscMalloc1(n + 1, &ns_col));
30 
31   /* Use the same row structure wherever feasible. */
32   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
33 
34   /* if m < n; pad up the remainder with inode_limit */
35   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
36   /* The last node is the odd ball. pad it up with the remaining rows; */
37   if (count < n) {
38     ns_col[i] = n - count;
39     i++;
40   } else if (count > n) {
41     /* Adjust for the over estimation */
42     ns_col[i - 1] += n - count;
43   }
44   *size = i;
45   *ns   = ns_col;
46   PetscFunctionReturn(PETSC_SUCCESS);
47 }
48 
49 /*
50       This builds symmetric version of nonzero structure,
51 */
52 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
53 {
54   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
55   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
56   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
57   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
58 
59   PetscFunctionBegin;
60   nslim_row = a->inode.node_count;
61   m         = A->rmap->n;
62   n         = A->cmap->n;
63   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
64   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
65 
66   /* Use the row_inode as column_inode */
67   nslim_col = nslim_row;
68   ns_col    = ns_row;
69 
70   /* allocate space for reformatted inode structure */
71   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
72   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
73 
74   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
75     nsz = ns_col[i1];
76     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
77   }
78   /* allocate space for row pointers */
79   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
80   *iia = ia;
81   PetscCall(PetscMalloc1(nslim_row + 1, &work));
82 
83   /* determine the number of columns in each row */
84   ia[0] = oshift;
85   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
86     j    = aj + ai[row] + ishift;
87     jmax = aj + ai[row + 1] + ishift;
88     if (j == jmax) continue; /* empty row */
89     col = *j++ + ishift;
90     i2  = tvc[col];
91     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
92       ia[i1 + 1]++;
93       ia[i2 + 1]++;
94       i2++; /* Start col of next node */
95       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
96       i2 = tvc[col];
97     }
98     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
99   }
100 
101   /* shift ia[i] to point to next row */
102   for (i1 = 1; i1 < nslim_row + 1; i1++) {
103     row = ia[i1 - 1];
104     ia[i1] += row;
105     work[i1 - 1] = row - oshift;
106   }
107 
108   /* allocate space for column pointers */
109   nz = ia[nslim_row] + (!ishift);
110   PetscCall(PetscMalloc1(nz, &ja));
111   *jja = ja;
112 
113   /* loop over lower triangular part putting into ja */
114   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
115     j    = aj + ai[row] + ishift;
116     jmax = aj + ai[row + 1] + ishift;
117     if (j == jmax) continue; /* empty row */
118     col = *j++ + ishift;
119     i2  = tvc[col];
120     while (i2 < i1 && j < jmax) {
121       ja[work[i2]++] = i1 + oshift;
122       ja[work[i1]++] = i2 + oshift;
123       ++i2;
124       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
125       i2 = tvc[col];
126     }
127     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
128   }
129   PetscCall(PetscFree(work));
130   PetscCall(PetscFree2(tns, tvc));
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 /*
135       This builds nonsymmetric version of nonzero structure,
136 */
137 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
138 {
139   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
140   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
141   PetscInt       *tns, *tvc, nsz, i1, i2;
142   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
143 
144   PetscFunctionBegin;
145   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
146   nslim_row = a->inode.node_count;
147   n         = A->cmap->n;
148 
149   /* Create The column_inode for this matrix */
150   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
151 
152   /* allocate space for reformatted column_inode structure */
153   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
154   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
155 
156   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
157     nsz = ns_col[i1];
158     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
159   }
160   /* allocate space for row pointers */
161   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
162   *iia = ia;
163   PetscCall(PetscMalloc1(nslim_row + 1, &work));
164 
165   /* determine the number of columns in each row */
166   ia[0] = oshift;
167   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
168     j  = aj + ai[row] + ishift;
169     nz = ai[row + 1] - ai[row];
170     if (!nz) continue; /* empty row */
171     col = *j++ + ishift;
172     i2  = tvc[col];
173     while (nz-- > 0) { /* off-diagonal elements */
174       ia[i1 + 1]++;
175       i2++; /* Start col of next node */
176       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
177       if (nz > 0) i2 = tvc[col];
178     }
179   }
180 
181   /* shift ia[i] to point to next row */
182   for (i1 = 1; i1 < nslim_row + 1; i1++) {
183     row = ia[i1 - 1];
184     ia[i1] += row;
185     work[i1 - 1] = row - oshift;
186   }
187 
188   /* allocate space for column pointers */
189   nz = ia[nslim_row] + (!ishift);
190   PetscCall(PetscMalloc1(nz, &ja));
191   *jja = ja;
192 
193   /* loop over matrix putting into ja */
194   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
195     j  = aj + ai[row] + ishift;
196     nz = ai[row + 1] - ai[row];
197     if (!nz) continue; /* empty row */
198     col = *j++ + ishift;
199     i2  = tvc[col];
200     while (nz-- > 0) {
201       ja[work[i1]++] = i2 + oshift;
202       ++i2;
203       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
204       if (nz > 0) i2 = tvc[col];
205     }
206   }
207   PetscCall(PetscFree(ns_col));
208   PetscCall(PetscFree(work));
209   PetscCall(PetscFree2(tns, tvc));
210   PetscFunctionReturn(PETSC_SUCCESS);
211 }
212 
213 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
214 {
215   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
216 
217   PetscFunctionBegin;
218   if (n) *n = a->inode.node_count;
219   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
220   if (!blockcompressed) {
221     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
222   } else if (symmetric) {
223     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
224   } else {
225     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
226   }
227   PetscFunctionReturn(PETSC_SUCCESS);
228 }
229 
230 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
231 {
232   PetscFunctionBegin;
233   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
234 
235   if (!blockcompressed) {
236     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
237   } else {
238     PetscCall(PetscFree(*ia));
239     PetscCall(PetscFree(*ja));
240   }
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
245 {
246   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
247   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
248   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
249 
250   PetscFunctionBegin;
251   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
252   nslim_row = a->inode.node_count;
253   n         = A->cmap->n;
254 
255   /* Create The column_inode for this matrix */
256   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
257 
258   /* allocate space for reformatted column_inode structure */
259   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
260   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
261 
262   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
263     nsz = ns_col[i1];
264     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
265   }
266   /* allocate space for column pointers */
267   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
268   *iia = ia;
269   PetscCall(PetscMalloc1(nslim_col + 1, &work));
270 
271   /* determine the number of columns in each row */
272   ia[0] = oshift;
273   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
274     j   = aj + ai[row] + ishift;
275     col = *j++ + ishift;
276     i2  = tvc[col];
277     nz  = ai[row + 1] - ai[row];
278     while (nz-- > 0) { /* off-diagonal elements */
279       /* ia[i1+1]++; */
280       ia[i2 + 1]++;
281       i2++;
282       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
283       if (nz > 0) i2 = tvc[col];
284     }
285   }
286 
287   /* shift ia[i] to point to next col */
288   for (i1 = 1; i1 < nslim_col + 1; i1++) {
289     col = ia[i1 - 1];
290     ia[i1] += col;
291     work[i1 - 1] = col - oshift;
292   }
293 
294   /* allocate space for column pointers */
295   nz = ia[nslim_col] + (!ishift);
296   PetscCall(PetscMalloc1(nz, &ja));
297   *jja = ja;
298 
299   /* loop over matrix putting into ja */
300   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
301     j   = aj + ai[row] + ishift;
302     col = *j++ + ishift;
303     i2  = tvc[col];
304     nz  = ai[row + 1] - ai[row];
305     while (nz-- > 0) {
306       /* ja[work[i1]++] = i2 + oshift; */
307       ja[work[i2]++] = i1 + oshift;
308       i2++;
309       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
310       if (nz > 0) i2 = tvc[col];
311     }
312   }
313   PetscCall(PetscFree(ns_col));
314   PetscCall(PetscFree(work));
315   PetscCall(PetscFree2(tns, tvc));
316   PetscFunctionReturn(PETSC_SUCCESS);
317 }
318 
319 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
320 {
321   PetscFunctionBegin;
322   PetscCall(MatCreateColInode_Private(A, n, NULL));
323   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
324 
325   if (!blockcompressed) {
326     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
327   } else if (symmetric) {
328     /* Since the indices are symmetric it doesn't matter */
329     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
330   } else {
331     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
332   }
333   PetscFunctionReturn(PETSC_SUCCESS);
334 }
335 
336 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
337 {
338   PetscFunctionBegin;
339   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
340   if (!blockcompressed) {
341     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
342   } else {
343     PetscCall(PetscFree(*ia));
344     PetscCall(PetscFree(*ja));
345   }
346   PetscFunctionReturn(PETSC_SUCCESS);
347 }
348 
349 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
350 {
351   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
352   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
353   PetscScalar       *y;
354   const PetscScalar *x;
355   const MatScalar   *v1, *v2, *v3, *v4, *v5;
356   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
357   const PetscInt    *idx, *ns, *ii;
358 
359 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
360   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
361 #endif
362 
363   PetscFunctionBegin;
364   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
365   node_max = a->inode.node_count;
366   ns       = a->inode.size; /* Node Size array */
367   PetscCall(VecGetArrayRead(xx, &x));
368   PetscCall(VecGetArray(yy, &y));
369   idx = a->j;
370   v1  = a->a;
371   ii  = a->i;
372 
373   for (i = 0, row = 0; i < node_max; ++i) {
374     nsz = ns[i];
375     n   = ii[1] - ii[0];
376     nonzerorow += (n > 0) * nsz;
377     ii += nsz;
378     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
379     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
380     sz = n;                                                                /* No of non zeros in this row */
381                                                                            /* Switch on the size of Node */
382     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
383     case 1:
384       sum1 = 0.;
385 
386       for (n = 0; n < sz - 1; n += 2) {
387         i1 = idx[0]; /* The instructions are ordered to */
388         i2 = idx[1]; /* make the compiler's job easy */
389         idx += 2;
390         tmp0 = x[i1];
391         tmp1 = x[i2];
392         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
393         v1 += 2;
394       }
395 
396       if (n == sz - 1) { /* Take care of the last nonzero  */
397         tmp0 = x[*idx++];
398         sum1 += *v1++ * tmp0;
399       }
400       y[row++] = sum1;
401       break;
402     case 2:
403       sum1 = 0.;
404       sum2 = 0.;
405       v2   = v1 + n;
406 
407       for (n = 0; n < sz - 1; n += 2) {
408         i1 = idx[0];
409         i2 = idx[1];
410         idx += 2;
411         tmp0 = x[i1];
412         tmp1 = x[i2];
413         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
414         v1 += 2;
415         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
416         v2 += 2;
417       }
418       if (n == sz - 1) {
419         tmp0 = x[*idx++];
420         sum1 += *v1++ * tmp0;
421         sum2 += *v2++ * tmp0;
422       }
423       y[row++] = sum1;
424       y[row++] = sum2;
425       v1       = v2; /* Since the next block to be processed starts there*/
426       idx += sz;
427       break;
428     case 3:
429       sum1 = 0.;
430       sum2 = 0.;
431       sum3 = 0.;
432       v2   = v1 + n;
433       v3   = v2 + n;
434 
435       for (n = 0; n < sz - 1; n += 2) {
436         i1 = idx[0];
437         i2 = idx[1];
438         idx += 2;
439         tmp0 = x[i1];
440         tmp1 = x[i2];
441         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
442         v1 += 2;
443         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
444         v2 += 2;
445         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
446         v3 += 2;
447       }
448       if (n == sz - 1) {
449         tmp0 = x[*idx++];
450         sum1 += *v1++ * tmp0;
451         sum2 += *v2++ * tmp0;
452         sum3 += *v3++ * tmp0;
453       }
454       y[row++] = sum1;
455       y[row++] = sum2;
456       y[row++] = sum3;
457       v1       = v3; /* Since the next block to be processed starts there*/
458       idx += 2 * sz;
459       break;
460     case 4:
461       sum1 = 0.;
462       sum2 = 0.;
463       sum3 = 0.;
464       sum4 = 0.;
465       v2   = v1 + n;
466       v3   = v2 + n;
467       v4   = v3 + n;
468 
469       for (n = 0; n < sz - 1; n += 2) {
470         i1 = idx[0];
471         i2 = idx[1];
472         idx += 2;
473         tmp0 = x[i1];
474         tmp1 = x[i2];
475         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
476         v1 += 2;
477         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
478         v2 += 2;
479         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
480         v3 += 2;
481         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
482         v4 += 2;
483       }
484       if (n == sz - 1) {
485         tmp0 = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489         sum4 += *v4++ * tmp0;
490       }
491       y[row++] = sum1;
492       y[row++] = sum2;
493       y[row++] = sum3;
494       y[row++] = sum4;
495       v1       = v4; /* Since the next block to be processed starts there*/
496       idx += 3 * sz;
497       break;
498     case 5:
499       sum1 = 0.;
500       sum2 = 0.;
501       sum3 = 0.;
502       sum4 = 0.;
503       sum5 = 0.;
504       v2   = v1 + n;
505       v3   = v2 + n;
506       v4   = v3 + n;
507       v5   = v4 + n;
508 
509       for (n = 0; n < sz - 1; n += 2) {
510         i1 = idx[0];
511         i2 = idx[1];
512         idx += 2;
513         tmp0 = x[i1];
514         tmp1 = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
516         v1 += 2;
517         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
518         v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
520         v3 += 2;
521         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
522         v4 += 2;
523         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
524         v5 += 2;
525       }
526       if (n == sz - 1) {
527         tmp0 = x[*idx++];
528         sum1 += *v1++ * tmp0;
529         sum2 += *v2++ * tmp0;
530         sum3 += *v3++ * tmp0;
531         sum4 += *v4++ * tmp0;
532         sum5 += *v5++ * tmp0;
533       }
534       y[row++] = sum1;
535       y[row++] = sum2;
536       y[row++] = sum3;
537       y[row++] = sum4;
538       y[row++] = sum5;
539       v1       = v5; /* Since the next block to be processed starts there */
540       idx += 4 * sz;
541       break;
542     default:
543       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
544     }
545   }
546   PetscCall(VecRestoreArrayRead(xx, &x));
547   PetscCall(VecRestoreArray(yy, &y));
548   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
549   PetscFunctionReturn(PETSC_SUCCESS);
550 }
551 
552 /* Almost same code as the MatMult_SeqAIJ_Inode() */
553 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
554 {
555   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
556   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
557   const MatScalar   *v1, *v2, *v3, *v4, *v5;
558   const PetscScalar *x;
559   PetscScalar       *y, *z, *zt;
560   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
561   const PetscInt    *idx, *ns, *ii;
562 
563   PetscFunctionBegin;
564   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
565   node_max = a->inode.node_count;
566   ns       = a->inode.size; /* Node Size array */
567 
568   PetscCall(VecGetArrayRead(xx, &x));
569   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
570   zt = z;
571 
572   idx = a->j;
573   v1  = a->a;
574   ii  = a->i;
575 
576   for (i = 0, row = 0; i < node_max; ++i) {
577     nsz = ns[i];
578     n   = ii[1] - ii[0];
579     ii += nsz;
580     sz = n;        /* No of non zeros in this row */
581                    /* Switch on the size of Node */
582     switch (nsz) { /* Each loop in 'case' is unrolled */
583     case 1:
584       sum1 = *zt++;
585 
586       for (n = 0; n < sz - 1; n += 2) {
587         i1 = idx[0]; /* The instructions are ordered to */
588         i2 = idx[1]; /* make the compiler's job easy */
589         idx += 2;
590         tmp0 = x[i1];
591         tmp1 = x[i2];
592         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
593         v1 += 2;
594       }
595 
596       if (n == sz - 1) { /* Take care of the last nonzero  */
597         tmp0 = x[*idx++];
598         sum1 += *v1++ * tmp0;
599       }
600       y[row++] = sum1;
601       break;
602     case 2:
603       sum1 = *zt++;
604       sum2 = *zt++;
605       v2   = v1 + n;
606 
607       for (n = 0; n < sz - 1; n += 2) {
608         i1 = idx[0];
609         i2 = idx[1];
610         idx += 2;
611         tmp0 = x[i1];
612         tmp1 = x[i2];
613         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
614         v1 += 2;
615         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
616         v2 += 2;
617       }
618       if (n == sz - 1) {
619         tmp0 = x[*idx++];
620         sum1 += *v1++ * tmp0;
621         sum2 += *v2++ * tmp0;
622       }
623       y[row++] = sum1;
624       y[row++] = sum2;
625       v1       = v2; /* Since the next block to be processed starts there*/
626       idx += sz;
627       break;
628     case 3:
629       sum1 = *zt++;
630       sum2 = *zt++;
631       sum3 = *zt++;
632       v2   = v1 + n;
633       v3   = v2 + n;
634 
635       for (n = 0; n < sz - 1; n += 2) {
636         i1 = idx[0];
637         i2 = idx[1];
638         idx += 2;
639         tmp0 = x[i1];
640         tmp1 = x[i2];
641         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
642         v1 += 2;
643         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
644         v2 += 2;
645         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
646         v3 += 2;
647       }
648       if (n == sz - 1) {
649         tmp0 = x[*idx++];
650         sum1 += *v1++ * tmp0;
651         sum2 += *v2++ * tmp0;
652         sum3 += *v3++ * tmp0;
653       }
654       y[row++] = sum1;
655       y[row++] = sum2;
656       y[row++] = sum3;
657       v1       = v3; /* Since the next block to be processed starts there*/
658       idx += 2 * sz;
659       break;
660     case 4:
661       sum1 = *zt++;
662       sum2 = *zt++;
663       sum3 = *zt++;
664       sum4 = *zt++;
665       v2   = v1 + n;
666       v3   = v2 + n;
667       v4   = v3 + n;
668 
669       for (n = 0; n < sz - 1; n += 2) {
670         i1 = idx[0];
671         i2 = idx[1];
672         idx += 2;
673         tmp0 = x[i1];
674         tmp1 = x[i2];
675         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
676         v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
678         v2 += 2;
679         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
680         v3 += 2;
681         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
682         v4 += 2;
683       }
684       if (n == sz - 1) {
685         tmp0 = x[*idx++];
686         sum1 += *v1++ * tmp0;
687         sum2 += *v2++ * tmp0;
688         sum3 += *v3++ * tmp0;
689         sum4 += *v4++ * tmp0;
690       }
691       y[row++] = sum1;
692       y[row++] = sum2;
693       y[row++] = sum3;
694       y[row++] = sum4;
695       v1       = v4; /* Since the next block to be processed starts there*/
696       idx += 3 * sz;
697       break;
698     case 5:
699       sum1 = *zt++;
700       sum2 = *zt++;
701       sum3 = *zt++;
702       sum4 = *zt++;
703       sum5 = *zt++;
704       v2   = v1 + n;
705       v3   = v2 + n;
706       v4   = v3 + n;
707       v5   = v4 + n;
708 
709       for (n = 0; n < sz - 1; n += 2) {
710         i1 = idx[0];
711         i2 = idx[1];
712         idx += 2;
713         tmp0 = x[i1];
714         tmp1 = x[i2];
715         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
716         v1 += 2;
717         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
718         v2 += 2;
719         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
720         v3 += 2;
721         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
722         v4 += 2;
723         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
724         v5 += 2;
725       }
726       if (n == sz - 1) {
727         tmp0 = x[*idx++];
728         sum1 += *v1++ * tmp0;
729         sum2 += *v2++ * tmp0;
730         sum3 += *v3++ * tmp0;
731         sum4 += *v4++ * tmp0;
732         sum5 += *v5++ * tmp0;
733       }
734       y[row++] = sum1;
735       y[row++] = sum2;
736       y[row++] = sum3;
737       y[row++] = sum4;
738       y[row++] = sum5;
739       v1       = v5; /* Since the next block to be processed starts there */
740       idx += 4 * sz;
741       break;
742     default:
743       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
744     }
745   }
746   PetscCall(VecRestoreArrayRead(xx, &x));
747   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
748   PetscCall(PetscLogFlops(2.0 * a->nz));
749   PetscFunctionReturn(PETSC_SUCCESS);
750 }
751 
752 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
753 {
754   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
755   IS                 iscol = a->col, isrow = a->row;
756   const PetscInt    *r, *c, *rout, *cout;
757   PetscInt           i, j, n = A->rmap->n, nz;
758   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
759   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
760   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
761   PetscScalar        sum1, sum2, sum3, sum4, sum5;
762   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
763   const PetscScalar *b;
764 
765   PetscFunctionBegin;
766   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
767   node_max = a->inode.node_count;
768   ns       = a->inode.size; /* Node Size array */
769 
770   PetscCall(VecGetArrayRead(bb, &b));
771   PetscCall(VecGetArrayWrite(xx, &x));
772   tmp = a->solve_work;
773 
774   PetscCall(ISGetIndices(isrow, &rout));
775   r = rout;
776   PetscCall(ISGetIndices(iscol, &cout));
777   c = cout + (n - 1);
778 
779   /* forward solve the lower triangular */
780   tmps = tmp;
781   aa   = a_a;
782   aj   = a_j;
783   ad   = a->diag;
784 
785   for (i = 0, row = 0; i < node_max; ++i) {
786     nsz = ns[i];
787     aii = ai[row];
788     v1  = aa + aii;
789     vi  = aj + aii;
790     nz  = ad[row] - aii;
791     if (i < node_max - 1) {
792       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
793       * but our indexing to determine its size could. */
794       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
795       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
796       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
797       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
798     }
799 
800     switch (nsz) { /* Each loop in 'case' is unrolled */
801     case 1:
802       sum1 = b[*r++];
803       for (j = 0; j < nz - 1; j += 2) {
804         i0 = vi[0];
805         i1 = vi[1];
806         vi += 2;
807         tmp0 = tmps[i0];
808         tmp1 = tmps[i1];
809         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
810         v1 += 2;
811       }
812       if (j == nz - 1) {
813         tmp0 = tmps[*vi++];
814         sum1 -= *v1++ * tmp0;
815       }
816       tmp[row++] = sum1;
817       break;
818     case 2:
819       sum1 = b[*r++];
820       sum2 = b[*r++];
821       v2   = aa + ai[row + 1];
822 
823       for (j = 0; j < nz - 1; j += 2) {
824         i0 = vi[0];
825         i1 = vi[1];
826         vi += 2;
827         tmp0 = tmps[i0];
828         tmp1 = tmps[i1];
829         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
830         v1 += 2;
831         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
832         v2 += 2;
833       }
834       if (j == nz - 1) {
835         tmp0 = tmps[*vi++];
836         sum1 -= *v1++ * tmp0;
837         sum2 -= *v2++ * tmp0;
838       }
839       sum2 -= *v2++ * sum1;
840       tmp[row++] = sum1;
841       tmp[row++] = sum2;
842       break;
843     case 3:
844       sum1 = b[*r++];
845       sum2 = b[*r++];
846       sum3 = b[*r++];
847       v2   = aa + ai[row + 1];
848       v3   = aa + ai[row + 2];
849 
850       for (j = 0; j < nz - 1; j += 2) {
851         i0 = vi[0];
852         i1 = vi[1];
853         vi += 2;
854         tmp0 = tmps[i0];
855         tmp1 = tmps[i1];
856         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
857         v1 += 2;
858         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
859         v2 += 2;
860         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
861         v3 += 2;
862       }
863       if (j == nz - 1) {
864         tmp0 = tmps[*vi++];
865         sum1 -= *v1++ * tmp0;
866         sum2 -= *v2++ * tmp0;
867         sum3 -= *v3++ * tmp0;
868       }
869       sum2 -= *v2++ * sum1;
870       sum3 -= *v3++ * sum1;
871       sum3 -= *v3++ * sum2;
872 
873       tmp[row++] = sum1;
874       tmp[row++] = sum2;
875       tmp[row++] = sum3;
876       break;
877 
878     case 4:
879       sum1 = b[*r++];
880       sum2 = b[*r++];
881       sum3 = b[*r++];
882       sum4 = b[*r++];
883       v2   = aa + ai[row + 1];
884       v3   = aa + ai[row + 2];
885       v4   = aa + ai[row + 3];
886 
887       for (j = 0; j < nz - 1; j += 2) {
888         i0 = vi[0];
889         i1 = vi[1];
890         vi += 2;
891         tmp0 = tmps[i0];
892         tmp1 = tmps[i1];
893         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
894         v1 += 2;
895         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
896         v2 += 2;
897         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
898         v3 += 2;
899         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
900         v4 += 2;
901       }
902       if (j == nz - 1) {
903         tmp0 = tmps[*vi++];
904         sum1 -= *v1++ * tmp0;
905         sum2 -= *v2++ * tmp0;
906         sum3 -= *v3++ * tmp0;
907         sum4 -= *v4++ * tmp0;
908       }
909       sum2 -= *v2++ * sum1;
910       sum3 -= *v3++ * sum1;
911       sum4 -= *v4++ * sum1;
912       sum3 -= *v3++ * sum2;
913       sum4 -= *v4++ * sum2;
914       sum4 -= *v4++ * sum3;
915 
916       tmp[row++] = sum1;
917       tmp[row++] = sum2;
918       tmp[row++] = sum3;
919       tmp[row++] = sum4;
920       break;
921     case 5:
922       sum1 = b[*r++];
923       sum2 = b[*r++];
924       sum3 = b[*r++];
925       sum4 = b[*r++];
926       sum5 = b[*r++];
927       v2   = aa + ai[row + 1];
928       v3   = aa + ai[row + 2];
929       v4   = aa + ai[row + 3];
930       v5   = aa + ai[row + 4];
931 
932       for (j = 0; j < nz - 1; j += 2) {
933         i0 = vi[0];
934         i1 = vi[1];
935         vi += 2;
936         tmp0 = tmps[i0];
937         tmp1 = tmps[i1];
938         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
939         v1 += 2;
940         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
941         v2 += 2;
942         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
943         v3 += 2;
944         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
945         v4 += 2;
946         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
947         v5 += 2;
948       }
949       if (j == nz - 1) {
950         tmp0 = tmps[*vi++];
951         sum1 -= *v1++ * tmp0;
952         sum2 -= *v2++ * tmp0;
953         sum3 -= *v3++ * tmp0;
954         sum4 -= *v4++ * tmp0;
955         sum5 -= *v5++ * tmp0;
956       }
957 
958       sum2 -= *v2++ * sum1;
959       sum3 -= *v3++ * sum1;
960       sum4 -= *v4++ * sum1;
961       sum5 -= *v5++ * sum1;
962       sum3 -= *v3++ * sum2;
963       sum4 -= *v4++ * sum2;
964       sum5 -= *v5++ * sum2;
965       sum4 -= *v4++ * sum3;
966       sum5 -= *v5++ * sum3;
967       sum5 -= *v5++ * sum4;
968 
969       tmp[row++] = sum1;
970       tmp[row++] = sum2;
971       tmp[row++] = sum3;
972       tmp[row++] = sum4;
973       tmp[row++] = sum5;
974       break;
975     default:
976       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
977     }
978   }
979   /* backward solve the upper triangular */
980   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
981     nsz = ns[i];
982     aii = ai[row + 1] - 1;
983     v1  = aa + aii;
984     vi  = aj + aii;
985     nz  = aii - ad[row];
986     switch (nsz) { /* Each loop in 'case' is unrolled */
987     case 1:
988       sum1 = tmp[row];
989 
990       for (j = nz; j > 1; j -= 2) {
991         vi -= 2;
992         i0   = vi[2];
993         i1   = vi[1];
994         tmp0 = tmps[i0];
995         tmp1 = tmps[i1];
996         v1 -= 2;
997         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
998       }
999       if (j == 1) {
1000         tmp0 = tmps[*vi--];
1001         sum1 -= *v1-- * tmp0;
1002       }
1003       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1004       row--;
1005       break;
1006     case 2:
1007       sum1 = tmp[row];
1008       sum2 = tmp[row - 1];
1009       v2   = aa + ai[row] - 1;
1010       for (j = nz; j > 1; j -= 2) {
1011         vi -= 2;
1012         i0   = vi[2];
1013         i1   = vi[1];
1014         tmp0 = tmps[i0];
1015         tmp1 = tmps[i1];
1016         v1 -= 2;
1017         v2 -= 2;
1018         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1019         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1020       }
1021       if (j == 1) {
1022         tmp0 = tmps[*vi--];
1023         sum1 -= *v1-- * tmp0;
1024         sum2 -= *v2-- * tmp0;
1025       }
1026 
1027       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1028       row--;
1029       sum2 -= *v2-- * tmp0;
1030       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1031       row--;
1032       break;
1033     case 3:
1034       sum1 = tmp[row];
1035       sum2 = tmp[row - 1];
1036       sum3 = tmp[row - 2];
1037       v2   = aa + ai[row] - 1;
1038       v3   = aa + ai[row - 1] - 1;
1039       for (j = nz; j > 1; j -= 2) {
1040         vi -= 2;
1041         i0   = vi[2];
1042         i1   = vi[1];
1043         tmp0 = tmps[i0];
1044         tmp1 = tmps[i1];
1045         v1 -= 2;
1046         v2 -= 2;
1047         v3 -= 2;
1048         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1049         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1050         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1051       }
1052       if (j == 1) {
1053         tmp0 = tmps[*vi--];
1054         sum1 -= *v1-- * tmp0;
1055         sum2 -= *v2-- * tmp0;
1056         sum3 -= *v3-- * tmp0;
1057       }
1058       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1059       row--;
1060       sum2 -= *v2-- * tmp0;
1061       sum3 -= *v3-- * tmp0;
1062       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1063       row--;
1064       sum3 -= *v3-- * tmp0;
1065       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1066       row--;
1067 
1068       break;
1069     case 4:
1070       sum1 = tmp[row];
1071       sum2 = tmp[row - 1];
1072       sum3 = tmp[row - 2];
1073       sum4 = tmp[row - 3];
1074       v2   = aa + ai[row] - 1;
1075       v3   = aa + ai[row - 1] - 1;
1076       v4   = aa + ai[row - 2] - 1;
1077 
1078       for (j = nz; j > 1; j -= 2) {
1079         vi -= 2;
1080         i0   = vi[2];
1081         i1   = vi[1];
1082         tmp0 = tmps[i0];
1083         tmp1 = tmps[i1];
1084         v1 -= 2;
1085         v2 -= 2;
1086         v3 -= 2;
1087         v4 -= 2;
1088         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1089         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1090         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1091         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1092       }
1093       if (j == 1) {
1094         tmp0 = tmps[*vi--];
1095         sum1 -= *v1-- * tmp0;
1096         sum2 -= *v2-- * tmp0;
1097         sum3 -= *v3-- * tmp0;
1098         sum4 -= *v4-- * tmp0;
1099       }
1100 
1101       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1102       row--;
1103       sum2 -= *v2-- * tmp0;
1104       sum3 -= *v3-- * tmp0;
1105       sum4 -= *v4-- * tmp0;
1106       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1107       row--;
1108       sum3 -= *v3-- * tmp0;
1109       sum4 -= *v4-- * tmp0;
1110       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1111       row--;
1112       sum4 -= *v4-- * tmp0;
1113       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1114       row--;
1115       break;
1116     case 5:
1117       sum1 = tmp[row];
1118       sum2 = tmp[row - 1];
1119       sum3 = tmp[row - 2];
1120       sum4 = tmp[row - 3];
1121       sum5 = tmp[row - 4];
1122       v2   = aa + ai[row] - 1;
1123       v3   = aa + ai[row - 1] - 1;
1124       v4   = aa + ai[row - 2] - 1;
1125       v5   = aa + ai[row - 3] - 1;
1126       for (j = nz; j > 1; j -= 2) {
1127         vi -= 2;
1128         i0   = vi[2];
1129         i1   = vi[1];
1130         tmp0 = tmps[i0];
1131         tmp1 = tmps[i1];
1132         v1 -= 2;
1133         v2 -= 2;
1134         v3 -= 2;
1135         v4 -= 2;
1136         v5 -= 2;
1137         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142       }
1143       if (j == 1) {
1144         tmp0 = tmps[*vi--];
1145         sum1 -= *v1-- * tmp0;
1146         sum2 -= *v2-- * tmp0;
1147         sum3 -= *v3-- * tmp0;
1148         sum4 -= *v4-- * tmp0;
1149         sum5 -= *v5-- * tmp0;
1150       }
1151 
1152       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1153       row--;
1154       sum2 -= *v2-- * tmp0;
1155       sum3 -= *v3-- * tmp0;
1156       sum4 -= *v4-- * tmp0;
1157       sum5 -= *v5-- * tmp0;
1158       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1159       row--;
1160       sum3 -= *v3-- * tmp0;
1161       sum4 -= *v4-- * tmp0;
1162       sum5 -= *v5-- * tmp0;
1163       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1164       row--;
1165       sum4 -= *v4-- * tmp0;
1166       sum5 -= *v5-- * tmp0;
1167       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1168       row--;
1169       sum5 -= *v5-- * tmp0;
1170       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1171       row--;
1172       break;
1173     default:
1174       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1175     }
1176   }
1177   PetscCall(ISRestoreIndices(isrow, &rout));
1178   PetscCall(ISRestoreIndices(iscol, &cout));
1179   PetscCall(VecRestoreArrayRead(bb, &b));
1180   PetscCall(VecRestoreArrayWrite(xx, &x));
1181   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1182   PetscFunctionReturn(PETSC_SUCCESS);
1183 }
1184 
1185 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1186 {
1187   Mat              C = B;
1188   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1189   IS               isrow = b->row, isicol = b->icol;
1190   const PetscInt  *r, *ic, *ics;
1191   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1192   PetscInt         i, j, k, nz, nzL, row, *pj;
1193   const PetscInt  *ajtmp, *bjtmp;
1194   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1195   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1196   FactorShiftCtx   sctx;
1197   const PetscInt  *ddiag;
1198   PetscReal        rs;
1199   MatScalar        d;
1200   PetscInt         inod, nodesz, node_max, col;
1201   const PetscInt  *ns;
1202   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1203 
1204   PetscFunctionBegin;
1205   /* MatPivotSetUp(): initialize shift context sctx */
1206   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1207 
1208   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1209     ddiag          = a->diag;
1210     sctx.shift_top = info->zeropivot;
1211     for (i = 0; i < n; i++) {
1212       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1213       d  = (aa)[ddiag[i]];
1214       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1215       v  = aa + ai[i];
1216       nz = ai[i + 1] - ai[i];
1217       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1218       if (rs > sctx.shift_top) sctx.shift_top = rs;
1219     }
1220     sctx.shift_top *= 1.1;
1221     sctx.nshift_max = 5;
1222     sctx.shift_lo   = 0.;
1223     sctx.shift_hi   = 1.;
1224   }
1225 
1226   PetscCall(ISGetIndices(isrow, &r));
1227   PetscCall(ISGetIndices(isicol, &ic));
1228 
1229   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1230   ics = ic;
1231 
1232   node_max = a->inode.node_count;
1233   ns       = a->inode.size;
1234   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1235 
1236   /* If max inode size > 4, split it into two inodes.*/
1237   /* also map the inode sizes according to the ordering */
1238   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1239   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1240     if (ns[i] > 4) {
1241       tmp_vec1[j] = 4;
1242       ++j;
1243       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1244     } else {
1245       tmp_vec1[j] = ns[i];
1246     }
1247   }
1248   /* Use the correct node_max */
1249   node_max = j;
1250 
1251   /* Now reorder the inode info based on mat re-ordering info */
1252   /* First create a row -> inode_size_array_index map */
1253   PetscCall(PetscMalloc1(n + 1, &nsmap));
1254   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1255   for (i = 0, row = 0; i < node_max; i++) {
1256     nodesz = tmp_vec1[i];
1257     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1258   }
1259   /* Using nsmap, create a reordered ns structure */
1260   for (i = 0, j = 0; i < node_max; i++) {
1261     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1262     tmp_vec2[i] = nodesz;
1263     j += nodesz;
1264   }
1265   PetscCall(PetscFree(nsmap));
1266   PetscCall(PetscFree(tmp_vec1));
1267 
1268   /* Now use the correct ns */
1269   ns = tmp_vec2;
1270 
1271   do {
1272     sctx.newshift = PETSC_FALSE;
1273     /* Now loop over each block-row, and do the factorization */
1274     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1275       nodesz = ns[inod];
1276 
1277       switch (nodesz) {
1278       case 1:
1279         /* zero rtmp1 */
1280         /* L part */
1281         nz    = bi[i + 1] - bi[i];
1282         bjtmp = bj + bi[i];
1283         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1284 
1285         /* U part */
1286         nz    = bdiag[i] - bdiag[i + 1];
1287         bjtmp = bj + bdiag[i + 1] + 1;
1288         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1289 
1290         /* load in initial (unfactored row) */
1291         nz    = ai[r[i] + 1] - ai[r[i]];
1292         ajtmp = aj + ai[r[i]];
1293         v     = aa + ai[r[i]];
1294         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1295 
1296         /* ZeropivotApply() */
1297         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1298 
1299         /* elimination */
1300         bjtmp = bj + bi[i];
1301         row   = *bjtmp++;
1302         nzL   = bi[i + 1] - bi[i];
1303         for (k = 0; k < nzL; k++) {
1304           pc = rtmp1 + row;
1305           if (*pc != 0.0) {
1306             pv   = b->a + bdiag[row];
1307             mul1 = *pc * (*pv);
1308             *pc  = mul1;
1309             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1310             pv   = b->a + bdiag[row + 1] + 1;
1311             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1312             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1313             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1314           }
1315           row = *bjtmp++;
1316         }
1317 
1318         /* finished row so stick it into b->a */
1319         rs = 0.0;
1320         /* L part */
1321         pv = b->a + bi[i];
1322         pj = b->j + bi[i];
1323         nz = bi[i + 1] - bi[i];
1324         for (j = 0; j < nz; j++) {
1325           pv[j] = rtmp1[pj[j]];
1326           rs += PetscAbsScalar(pv[j]);
1327         }
1328 
1329         /* U part */
1330         pv = b->a + bdiag[i + 1] + 1;
1331         pj = b->j + bdiag[i + 1] + 1;
1332         nz = bdiag[i] - bdiag[i + 1] - 1;
1333         for (j = 0; j < nz; j++) {
1334           pv[j] = rtmp1[pj[j]];
1335           rs += PetscAbsScalar(pv[j]);
1336         }
1337 
1338         /* Check zero pivot */
1339         sctx.rs = rs;
1340         sctx.pv = rtmp1[i];
1341         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1342         if (sctx.newshift) break;
1343 
1344         /* Mark diagonal and invert diagonal for simpler triangular solves */
1345         pv  = b->a + bdiag[i];
1346         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1347         break;
1348 
1349       case 2:
1350         /* zero rtmp1 and rtmp2 */
1351         /* L part */
1352         nz    = bi[i + 1] - bi[i];
1353         bjtmp = bj + bi[i];
1354         for (j = 0; j < nz; j++) {
1355           col        = bjtmp[j];
1356           rtmp1[col] = 0.0;
1357           rtmp2[col] = 0.0;
1358         }
1359 
1360         /* U part */
1361         nz    = bdiag[i] - bdiag[i + 1];
1362         bjtmp = bj + bdiag[i + 1] + 1;
1363         for (j = 0; j < nz; j++) {
1364           col        = bjtmp[j];
1365           rtmp1[col] = 0.0;
1366           rtmp2[col] = 0.0;
1367         }
1368 
1369         /* load in initial (unfactored row) */
1370         nz    = ai[r[i] + 1] - ai[r[i]];
1371         ajtmp = aj + ai[r[i]];
1372         v1    = aa + ai[r[i]];
1373         v2    = aa + ai[r[i] + 1];
1374         for (j = 0; j < nz; j++) {
1375           col        = ics[ajtmp[j]];
1376           rtmp1[col] = v1[j];
1377           rtmp2[col] = v2[j];
1378         }
1379         /* ZeropivotApply(): shift the diagonal of the matrix  */
1380         rtmp1[i] += sctx.shift_amount;
1381         rtmp2[i + 1] += sctx.shift_amount;
1382 
1383         /* elimination */
1384         bjtmp = bj + bi[i];
1385         row   = *bjtmp++; /* pivot row */
1386         nzL   = bi[i + 1] - bi[i];
1387         for (k = 0; k < nzL; k++) {
1388           pc1 = rtmp1 + row;
1389           pc2 = rtmp2 + row;
1390           if (*pc1 != 0.0 || *pc2 != 0.0) {
1391             pv   = b->a + bdiag[row];
1392             mul1 = *pc1 * (*pv);
1393             mul2 = *pc2 * (*pv);
1394             *pc1 = mul1;
1395             *pc2 = mul2;
1396 
1397             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1398             pv = b->a + bdiag[row + 1] + 1;
1399             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1400             for (j = 0; j < nz; j++) {
1401               col = pj[j];
1402               rtmp1[col] -= mul1 * pv[j];
1403               rtmp2[col] -= mul2 * pv[j];
1404             }
1405             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1406           }
1407           row = *bjtmp++;
1408         }
1409 
1410         /* finished row i; check zero pivot, then stick row i into b->a */
1411         rs = 0.0;
1412         /* L part */
1413         pc1 = b->a + bi[i];
1414         pj  = b->j + bi[i];
1415         nz  = bi[i + 1] - bi[i];
1416         for (j = 0; j < nz; j++) {
1417           col    = pj[j];
1418           pc1[j] = rtmp1[col];
1419           rs += PetscAbsScalar(pc1[j]);
1420         }
1421         /* U part */
1422         pc1 = b->a + bdiag[i + 1] + 1;
1423         pj  = b->j + bdiag[i + 1] + 1;
1424         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1425         for (j = 0; j < nz; j++) {
1426           col    = pj[j];
1427           pc1[j] = rtmp1[col];
1428           rs += PetscAbsScalar(pc1[j]);
1429         }
1430 
1431         sctx.rs = rs;
1432         sctx.pv = rtmp1[i];
1433         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1434         if (sctx.newshift) break;
1435         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1436         *pc1 = 1.0 / sctx.pv;
1437 
1438         /* Now take care of diagonal 2x2 block. */
1439         pc2 = rtmp2 + i;
1440         if (*pc2 != 0.0) {
1441           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1442           *pc2 = mul1;                        /* insert L entry */
1443           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1444           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1445           for (j = 0; j < nz; j++) {
1446             col = pj[j];
1447             rtmp2[col] -= mul1 * rtmp1[col];
1448           }
1449           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1450         }
1451 
1452         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1453         rs = 0.0;
1454         /* L part */
1455         pc2 = b->a + bi[i + 1];
1456         pj  = b->j + bi[i + 1];
1457         nz  = bi[i + 2] - bi[i + 1];
1458         for (j = 0; j < nz; j++) {
1459           col    = pj[j];
1460           pc2[j] = rtmp2[col];
1461           rs += PetscAbsScalar(pc2[j]);
1462         }
1463         /* U part */
1464         pc2 = b->a + bdiag[i + 2] + 1;
1465         pj  = b->j + bdiag[i + 2] + 1;
1466         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1467         for (j = 0; j < nz; j++) {
1468           col    = pj[j];
1469           pc2[j] = rtmp2[col];
1470           rs += PetscAbsScalar(pc2[j]);
1471         }
1472 
1473         sctx.rs = rs;
1474         sctx.pv = rtmp2[i + 1];
1475         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1476         if (sctx.newshift) break;
1477         pc2  = b->a + bdiag[i + 1];
1478         *pc2 = 1.0 / sctx.pv;
1479         break;
1480 
1481       case 3:
1482         /* zero rtmp */
1483         /* L part */
1484         nz    = bi[i + 1] - bi[i];
1485         bjtmp = bj + bi[i];
1486         for (j = 0; j < nz; j++) {
1487           col        = bjtmp[j];
1488           rtmp1[col] = 0.0;
1489           rtmp2[col] = 0.0;
1490           rtmp3[col] = 0.0;
1491         }
1492 
1493         /* U part */
1494         nz    = bdiag[i] - bdiag[i + 1];
1495         bjtmp = bj + bdiag[i + 1] + 1;
1496         for (j = 0; j < nz; j++) {
1497           col        = bjtmp[j];
1498           rtmp1[col] = 0.0;
1499           rtmp2[col] = 0.0;
1500           rtmp3[col] = 0.0;
1501         }
1502 
1503         /* load in initial (unfactored row) */
1504         nz    = ai[r[i] + 1] - ai[r[i]];
1505         ajtmp = aj + ai[r[i]];
1506         v1    = aa + ai[r[i]];
1507         v2    = aa + ai[r[i] + 1];
1508         v3    = aa + ai[r[i] + 2];
1509         for (j = 0; j < nz; j++) {
1510           col        = ics[ajtmp[j]];
1511           rtmp1[col] = v1[j];
1512           rtmp2[col] = v2[j];
1513           rtmp3[col] = v3[j];
1514         }
1515         /* ZeropivotApply(): shift the diagonal of the matrix  */
1516         rtmp1[i] += sctx.shift_amount;
1517         rtmp2[i + 1] += sctx.shift_amount;
1518         rtmp3[i + 2] += sctx.shift_amount;
1519 
1520         /* elimination */
1521         bjtmp = bj + bi[i];
1522         row   = *bjtmp++; /* pivot row */
1523         nzL   = bi[i + 1] - bi[i];
1524         for (k = 0; k < nzL; k++) {
1525           pc1 = rtmp1 + row;
1526           pc2 = rtmp2 + row;
1527           pc3 = rtmp3 + row;
1528           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1529             pv   = b->a + bdiag[row];
1530             mul1 = *pc1 * (*pv);
1531             mul2 = *pc2 * (*pv);
1532             mul3 = *pc3 * (*pv);
1533             *pc1 = mul1;
1534             *pc2 = mul2;
1535             *pc3 = mul3;
1536 
1537             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1538             pv = b->a + bdiag[row + 1] + 1;
1539             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1540             for (j = 0; j < nz; j++) {
1541               col = pj[j];
1542               rtmp1[col] -= mul1 * pv[j];
1543               rtmp2[col] -= mul2 * pv[j];
1544               rtmp3[col] -= mul3 * pv[j];
1545             }
1546             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1547           }
1548           row = *bjtmp++;
1549         }
1550 
1551         /* finished row i; check zero pivot, then stick row i into b->a */
1552         rs = 0.0;
1553         /* L part */
1554         pc1 = b->a + bi[i];
1555         pj  = b->j + bi[i];
1556         nz  = bi[i + 1] - bi[i];
1557         for (j = 0; j < nz; j++) {
1558           col    = pj[j];
1559           pc1[j] = rtmp1[col];
1560           rs += PetscAbsScalar(pc1[j]);
1561         }
1562         /* U part */
1563         pc1 = b->a + bdiag[i + 1] + 1;
1564         pj  = b->j + bdiag[i + 1] + 1;
1565         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1566         for (j = 0; j < nz; j++) {
1567           col    = pj[j];
1568           pc1[j] = rtmp1[col];
1569           rs += PetscAbsScalar(pc1[j]);
1570         }
1571 
1572         sctx.rs = rs;
1573         sctx.pv = rtmp1[i];
1574         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1575         if (sctx.newshift) break;
1576         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1577         *pc1 = 1.0 / sctx.pv;
1578 
1579         /* Now take care of 1st column of diagonal 3x3 block. */
1580         pc2 = rtmp2 + i;
1581         pc3 = rtmp3 + i;
1582         if (*pc2 != 0.0 || *pc3 != 0.0) {
1583           mul2 = (*pc2) * (*pc1);
1584           *pc2 = mul2;
1585           mul3 = (*pc3) * (*pc1);
1586           *pc3 = mul3;
1587           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1588           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1589           for (j = 0; j < nz; j++) {
1590             col = pj[j];
1591             rtmp2[col] -= mul2 * rtmp1[col];
1592             rtmp3[col] -= mul3 * rtmp1[col];
1593           }
1594           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1595         }
1596 
1597         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1598         rs = 0.0;
1599         /* L part */
1600         pc2 = b->a + bi[i + 1];
1601         pj  = b->j + bi[i + 1];
1602         nz  = bi[i + 2] - bi[i + 1];
1603         for (j = 0; j < nz; j++) {
1604           col    = pj[j];
1605           pc2[j] = rtmp2[col];
1606           rs += PetscAbsScalar(pc2[j]);
1607         }
1608         /* U part */
1609         pc2 = b->a + bdiag[i + 2] + 1;
1610         pj  = b->j + bdiag[i + 2] + 1;
1611         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1612         for (j = 0; j < nz; j++) {
1613           col    = pj[j];
1614           pc2[j] = rtmp2[col];
1615           rs += PetscAbsScalar(pc2[j]);
1616         }
1617 
1618         sctx.rs = rs;
1619         sctx.pv = rtmp2[i + 1];
1620         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1621         if (sctx.newshift) break;
1622         pc2  = b->a + bdiag[i + 1];
1623         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1624 
1625         /* Now take care of 2nd column of diagonal 3x3 block. */
1626         pc3 = rtmp3 + i + 1;
1627         if (*pc3 != 0.0) {
1628           mul3 = (*pc3) * (*pc2);
1629           *pc3 = mul3;
1630           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1631           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1632           for (j = 0; j < nz; j++) {
1633             col = pj[j];
1634             rtmp3[col] -= mul3 * rtmp2[col];
1635           }
1636           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1637         }
1638 
1639         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1640         rs = 0.0;
1641         /* L part */
1642         pc3 = b->a + bi[i + 2];
1643         pj  = b->j + bi[i + 2];
1644         nz  = bi[i + 3] - bi[i + 2];
1645         for (j = 0; j < nz; j++) {
1646           col    = pj[j];
1647           pc3[j] = rtmp3[col];
1648           rs += PetscAbsScalar(pc3[j]);
1649         }
1650         /* U part */
1651         pc3 = b->a + bdiag[i + 3] + 1;
1652         pj  = b->j + bdiag[i + 3] + 1;
1653         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1654         for (j = 0; j < nz; j++) {
1655           col    = pj[j];
1656           pc3[j] = rtmp3[col];
1657           rs += PetscAbsScalar(pc3[j]);
1658         }
1659 
1660         sctx.rs = rs;
1661         sctx.pv = rtmp3[i + 2];
1662         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1663         if (sctx.newshift) break;
1664         pc3  = b->a + bdiag[i + 2];
1665         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1666         break;
1667       case 4:
1668         /* zero rtmp */
1669         /* L part */
1670         nz    = bi[i + 1] - bi[i];
1671         bjtmp = bj + bi[i];
1672         for (j = 0; j < nz; j++) {
1673           col        = bjtmp[j];
1674           rtmp1[col] = 0.0;
1675           rtmp2[col] = 0.0;
1676           rtmp3[col] = 0.0;
1677           rtmp4[col] = 0.0;
1678         }
1679 
1680         /* U part */
1681         nz    = bdiag[i] - bdiag[i + 1];
1682         bjtmp = bj + bdiag[i + 1] + 1;
1683         for (j = 0; j < nz; j++) {
1684           col        = bjtmp[j];
1685           rtmp1[col] = 0.0;
1686           rtmp2[col] = 0.0;
1687           rtmp3[col] = 0.0;
1688           rtmp4[col] = 0.0;
1689         }
1690 
1691         /* load in initial (unfactored row) */
1692         nz    = ai[r[i] + 1] - ai[r[i]];
1693         ajtmp = aj + ai[r[i]];
1694         v1    = aa + ai[r[i]];
1695         v2    = aa + ai[r[i] + 1];
1696         v3    = aa + ai[r[i] + 2];
1697         v4    = aa + ai[r[i] + 3];
1698         for (j = 0; j < nz; j++) {
1699           col        = ics[ajtmp[j]];
1700           rtmp1[col] = v1[j];
1701           rtmp2[col] = v2[j];
1702           rtmp3[col] = v3[j];
1703           rtmp4[col] = v4[j];
1704         }
1705         /* ZeropivotApply(): shift the diagonal of the matrix  */
1706         rtmp1[i] += sctx.shift_amount;
1707         rtmp2[i + 1] += sctx.shift_amount;
1708         rtmp3[i + 2] += sctx.shift_amount;
1709         rtmp4[i + 3] += sctx.shift_amount;
1710 
1711         /* elimination */
1712         bjtmp = bj + bi[i];
1713         row   = *bjtmp++; /* pivot row */
1714         nzL   = bi[i + 1] - bi[i];
1715         for (k = 0; k < nzL; k++) {
1716           pc1 = rtmp1 + row;
1717           pc2 = rtmp2 + row;
1718           pc3 = rtmp3 + row;
1719           pc4 = rtmp4 + row;
1720           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1721             pv   = b->a + bdiag[row];
1722             mul1 = *pc1 * (*pv);
1723             mul2 = *pc2 * (*pv);
1724             mul3 = *pc3 * (*pv);
1725             mul4 = *pc4 * (*pv);
1726             *pc1 = mul1;
1727             *pc2 = mul2;
1728             *pc3 = mul3;
1729             *pc4 = mul4;
1730 
1731             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1732             pv = b->a + bdiag[row + 1] + 1;
1733             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1734             for (j = 0; j < nz; j++) {
1735               col = pj[j];
1736               rtmp1[col] -= mul1 * pv[j];
1737               rtmp2[col] -= mul2 * pv[j];
1738               rtmp3[col] -= mul3 * pv[j];
1739               rtmp4[col] -= mul4 * pv[j];
1740             }
1741             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1742           }
1743           row = *bjtmp++;
1744         }
1745 
1746         /* finished row i; check zero pivot, then stick row i into b->a */
1747         rs = 0.0;
1748         /* L part */
1749         pc1 = b->a + bi[i];
1750         pj  = b->j + bi[i];
1751         nz  = bi[i + 1] - bi[i];
1752         for (j = 0; j < nz; j++) {
1753           col    = pj[j];
1754           pc1[j] = rtmp1[col];
1755           rs += PetscAbsScalar(pc1[j]);
1756         }
1757         /* U part */
1758         pc1 = b->a + bdiag[i + 1] + 1;
1759         pj  = b->j + bdiag[i + 1] + 1;
1760         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1761         for (j = 0; j < nz; j++) {
1762           col    = pj[j];
1763           pc1[j] = rtmp1[col];
1764           rs += PetscAbsScalar(pc1[j]);
1765         }
1766 
1767         sctx.rs = rs;
1768         sctx.pv = rtmp1[i];
1769         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1770         if (sctx.newshift) break;
1771         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1772         *pc1 = 1.0 / sctx.pv;
1773 
1774         /* Now take care of 1st column of diagonal 4x4 block. */
1775         pc2 = rtmp2 + i;
1776         pc3 = rtmp3 + i;
1777         pc4 = rtmp4 + i;
1778         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1779           mul2 = (*pc2) * (*pc1);
1780           *pc2 = mul2;
1781           mul3 = (*pc3) * (*pc1);
1782           *pc3 = mul3;
1783           mul4 = (*pc4) * (*pc1);
1784           *pc4 = mul4;
1785           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1786           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1787           for (j = 0; j < nz; j++) {
1788             col = pj[j];
1789             rtmp2[col] -= mul2 * rtmp1[col];
1790             rtmp3[col] -= mul3 * rtmp1[col];
1791             rtmp4[col] -= mul4 * rtmp1[col];
1792           }
1793           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1794         }
1795 
1796         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1797         rs = 0.0;
1798         /* L part */
1799         pc2 = b->a + bi[i + 1];
1800         pj  = b->j + bi[i + 1];
1801         nz  = bi[i + 2] - bi[i + 1];
1802         for (j = 0; j < nz; j++) {
1803           col    = pj[j];
1804           pc2[j] = rtmp2[col];
1805           rs += PetscAbsScalar(pc2[j]);
1806         }
1807         /* U part */
1808         pc2 = b->a + bdiag[i + 2] + 1;
1809         pj  = b->j + bdiag[i + 2] + 1;
1810         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1811         for (j = 0; j < nz; j++) {
1812           col    = pj[j];
1813           pc2[j] = rtmp2[col];
1814           rs += PetscAbsScalar(pc2[j]);
1815         }
1816 
1817         sctx.rs = rs;
1818         sctx.pv = rtmp2[i + 1];
1819         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1820         if (sctx.newshift) break;
1821         pc2  = b->a + bdiag[i + 1];
1822         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1823 
1824         /* Now take care of 2nd column of diagonal 4x4 block. */
1825         pc3 = rtmp3 + i + 1;
1826         pc4 = rtmp4 + i + 1;
1827         if (*pc3 != 0.0 || *pc4 != 0.0) {
1828           mul3 = (*pc3) * (*pc2);
1829           *pc3 = mul3;
1830           mul4 = (*pc4) * (*pc2);
1831           *pc4 = mul4;
1832           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1833           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1834           for (j = 0; j < nz; j++) {
1835             col = pj[j];
1836             rtmp3[col] -= mul3 * rtmp2[col];
1837             rtmp4[col] -= mul4 * rtmp2[col];
1838           }
1839           PetscCall(PetscLogFlops(4.0 * nz));
1840         }
1841 
1842         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1843         rs = 0.0;
1844         /* L part */
1845         pc3 = b->a + bi[i + 2];
1846         pj  = b->j + bi[i + 2];
1847         nz  = bi[i + 3] - bi[i + 2];
1848         for (j = 0; j < nz; j++) {
1849           col    = pj[j];
1850           pc3[j] = rtmp3[col];
1851           rs += PetscAbsScalar(pc3[j]);
1852         }
1853         /* U part */
1854         pc3 = b->a + bdiag[i + 3] + 1;
1855         pj  = b->j + bdiag[i + 3] + 1;
1856         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1857         for (j = 0; j < nz; j++) {
1858           col    = pj[j];
1859           pc3[j] = rtmp3[col];
1860           rs += PetscAbsScalar(pc3[j]);
1861         }
1862 
1863         sctx.rs = rs;
1864         sctx.pv = rtmp3[i + 2];
1865         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1866         if (sctx.newshift) break;
1867         pc3  = b->a + bdiag[i + 2];
1868         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1869 
1870         /* Now take care of 3rd column of diagonal 4x4 block. */
1871         pc4 = rtmp4 + i + 2;
1872         if (*pc4 != 0.0) {
1873           mul4 = (*pc4) * (*pc3);
1874           *pc4 = mul4;
1875           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1876           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1877           for (j = 0; j < nz; j++) {
1878             col = pj[j];
1879             rtmp4[col] -= mul4 * rtmp3[col];
1880           }
1881           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1882         }
1883 
1884         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1885         rs = 0.0;
1886         /* L part */
1887         pc4 = b->a + bi[i + 3];
1888         pj  = b->j + bi[i + 3];
1889         nz  = bi[i + 4] - bi[i + 3];
1890         for (j = 0; j < nz; j++) {
1891           col    = pj[j];
1892           pc4[j] = rtmp4[col];
1893           rs += PetscAbsScalar(pc4[j]);
1894         }
1895         /* U part */
1896         pc4 = b->a + bdiag[i + 4] + 1;
1897         pj  = b->j + bdiag[i + 4] + 1;
1898         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1899         for (j = 0; j < nz; j++) {
1900           col    = pj[j];
1901           pc4[j] = rtmp4[col];
1902           rs += PetscAbsScalar(pc4[j]);
1903         }
1904 
1905         sctx.rs = rs;
1906         sctx.pv = rtmp4[i + 3];
1907         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1908         if (sctx.newshift) break;
1909         pc4  = b->a + bdiag[i + 3];
1910         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1911         break;
1912 
1913       default:
1914         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1915       }
1916       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1917       i += nodesz;              /* Update the row */
1918     }
1919 
1920     /* MatPivotRefine() */
1921     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1922       /*
1923        * if no shift in this attempt & shifting & started shifting & can refine,
1924        * then try lower shift
1925        */
1926       sctx.shift_hi       = sctx.shift_fraction;
1927       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1928       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1929       sctx.newshift       = PETSC_TRUE;
1930       sctx.nshift++;
1931     }
1932   } while (sctx.newshift);
1933 
1934   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1935   PetscCall(PetscFree(tmp_vec2));
1936   PetscCall(ISRestoreIndices(isicol, &ic));
1937   PetscCall(ISRestoreIndices(isrow, &r));
1938 
1939   if (b->inode.size) {
1940     C->ops->solve = MatSolve_SeqAIJ_Inode;
1941   } else {
1942     C->ops->solve = MatSolve_SeqAIJ;
1943   }
1944   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1945   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1946   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1947   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1948   C->assembled              = PETSC_TRUE;
1949   C->preallocated           = PETSC_TRUE;
1950 
1951   PetscCall(PetscLogFlops(C->cmap->n));
1952 
1953   /* MatShiftView(A,info,&sctx) */
1954   if (sctx.nshift) {
1955     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1956       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1957     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1958       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1959     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1960       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1961     }
1962   }
1963   PetscFunctionReturn(PETSC_SUCCESS);
1964 }
1965 
1966 #if 0
1967 // unused
1968 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1969 {
1970   Mat              C = B;
1971   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1972   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1973   const PetscInt  *r, *ic, *c, *ics;
1974   PetscInt         n = A->rmap->n, *bi = b->i;
1975   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1976   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1977   PetscInt        *ai = a->i, *aj = a->j;
1978   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1979   PetscScalar      mul1, mul2, mul3, tmp;
1980   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1981   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1982   PetscReal        rs = 0.0;
1983   FactorShiftCtx   sctx;
1984 
1985   PetscFunctionBegin;
1986   sctx.shift_top      = 0;
1987   sctx.nshift_max     = 0;
1988   sctx.shift_lo       = 0;
1989   sctx.shift_hi       = 0;
1990   sctx.shift_fraction = 0;
1991 
1992   /* if both shift schemes are chosen by user, only use info->shiftpd */
1993   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1994     sctx.shift_top = 0;
1995     for (i = 0; i < n; i++) {
1996       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1997       rs    = 0.0;
1998       ajtmp = aj + ai[i];
1999       rtmp1 = aa + ai[i];
2000       nz    = ai[i + 1] - ai[i];
2001       for (j = 0; j < nz; j++) {
2002         if (*ajtmp != i) {
2003           rs += PetscAbsScalar(*rtmp1++);
2004         } else {
2005           rs -= PetscRealPart(*rtmp1++);
2006         }
2007         ajtmp++;
2008       }
2009       if (rs > sctx.shift_top) sctx.shift_top = rs;
2010     }
2011     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2012     sctx.shift_top *= 1.1;
2013     sctx.nshift_max = 5;
2014     sctx.shift_lo   = 0.;
2015     sctx.shift_hi   = 1.;
2016   }
2017   sctx.shift_amount = 0;
2018   sctx.nshift       = 0;
2019 
2020   PetscCall(ISGetIndices(isrow, &r));
2021   PetscCall(ISGetIndices(iscol, &c));
2022   PetscCall(ISGetIndices(isicol, &ic));
2023   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2024   ics = ic;
2025 
2026   node_max = a->inode.node_count;
2027   ns       = a->inode.size;
2028   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2029 
2030   /* If max inode size > 3, split it into two inodes.*/
2031   /* also map the inode sizes according to the ordering */
2032   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2033   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2034     if (ns[i] > 3) {
2035       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2036       ++j;
2037       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2038     } else {
2039       tmp_vec1[j] = ns[i];
2040     }
2041   }
2042   /* Use the correct node_max */
2043   node_max = j;
2044 
2045   /* Now reorder the inode info based on mat re-ordering info */
2046   /* First create a row -> inode_size_array_index map */
2047   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2048   for (i = 0, row = 0; i < node_max; i++) {
2049     nodesz = tmp_vec1[i];
2050     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2051   }
2052   /* Using nsmap, create a reordered ns structure */
2053   for (i = 0, j = 0; i < node_max; i++) {
2054     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2055     tmp_vec2[i] = nodesz;
2056     j += nodesz;
2057   }
2058   PetscCall(PetscFree2(nsmap, tmp_vec1));
2059   /* Now use the correct ns */
2060   ns = tmp_vec2;
2061 
2062   do {
2063     sctx.newshift = PETSC_FALSE;
2064     /* Now loop over each block-row, and do the factorization */
2065     for (i = 0, row = 0; i < node_max; i++) {
2066       nodesz = ns[i];
2067       nz     = bi[row + 1] - bi[row];
2068       bjtmp  = bj + bi[row];
2069 
2070       switch (nodesz) {
2071       case 1:
2072         for (j = 0; j < nz; j++) {
2073           idx         = bjtmp[j];
2074           rtmp11[idx] = 0.0;
2075         }
2076 
2077         /* load in initial (unfactored row) */
2078         idx    = r[row];
2079         nz_tmp = ai[idx + 1] - ai[idx];
2080         ajtmp  = aj + ai[idx];
2081         v1     = aa + ai[idx];
2082 
2083         for (j = 0; j < nz_tmp; j++) {
2084           idx         = ics[ajtmp[j]];
2085           rtmp11[idx] = v1[j];
2086         }
2087         rtmp11[ics[r[row]]] += sctx.shift_amount;
2088 
2089         prow = *bjtmp++;
2090         while (prow < row) {
2091           pc1 = rtmp11 + prow;
2092           if (*pc1 != 0.0) {
2093             pv     = ba + bd[prow];
2094             pj     = nbj + bd[prow];
2095             mul1   = *pc1 * *pv++;
2096             *pc1   = mul1;
2097             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2098             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2099             for (j = 0; j < nz_tmp; j++) {
2100               tmp = pv[j];
2101               idx = pj[j];
2102               rtmp11[idx] -= mul1 * tmp;
2103             }
2104           }
2105           prow = *bjtmp++;
2106         }
2107         pj  = bj + bi[row];
2108         pc1 = ba + bi[row];
2109 
2110         sctx.pv     = rtmp11[row];
2111         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2112         rs          = 0.0;
2113         for (j = 0; j < nz; j++) {
2114           idx    = pj[j];
2115           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2116           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2117         }
2118         sctx.rs = rs;
2119         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2120         if (sctx.newshift) goto endofwhile;
2121         break;
2122 
2123       case 2:
2124         for (j = 0; j < nz; j++) {
2125           idx         = bjtmp[j];
2126           rtmp11[idx] = 0.0;
2127           rtmp22[idx] = 0.0;
2128         }
2129 
2130         /* load in initial (unfactored row) */
2131         idx    = r[row];
2132         nz_tmp = ai[idx + 1] - ai[idx];
2133         ajtmp  = aj + ai[idx];
2134         v1     = aa + ai[idx];
2135         v2     = aa + ai[idx + 1];
2136         for (j = 0; j < nz_tmp; j++) {
2137           idx         = ics[ajtmp[j]];
2138           rtmp11[idx] = v1[j];
2139           rtmp22[idx] = v2[j];
2140         }
2141         rtmp11[ics[r[row]]] += sctx.shift_amount;
2142         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2143 
2144         prow = *bjtmp++;
2145         while (prow < row) {
2146           pc1 = rtmp11 + prow;
2147           pc2 = rtmp22 + prow;
2148           if (*pc1 != 0.0 || *pc2 != 0.0) {
2149             pv   = ba + bd[prow];
2150             pj   = nbj + bd[prow];
2151             mul1 = *pc1 * *pv;
2152             mul2 = *pc2 * *pv;
2153             ++pv;
2154             *pc1 = mul1;
2155             *pc2 = mul2;
2156 
2157             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2158             for (j = 0; j < nz_tmp; j++) {
2159               tmp = pv[j];
2160               idx = pj[j];
2161               rtmp11[idx] -= mul1 * tmp;
2162               rtmp22[idx] -= mul2 * tmp;
2163             }
2164             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2165           }
2166           prow = *bjtmp++;
2167         }
2168 
2169         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2170         pc1 = rtmp11 + prow;
2171         pc2 = rtmp22 + prow;
2172 
2173         sctx.pv = *pc1;
2174         pj      = bj + bi[prow];
2175         rs      = 0.0;
2176         for (j = 0; j < nz; j++) {
2177           idx = pj[j];
2178           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2179         }
2180         sctx.rs = rs;
2181         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2182         if (sctx.newshift) goto endofwhile;
2183 
2184         if (*pc2 != 0.0) {
2185           pj     = nbj + bd[prow];
2186           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2187           *pc2   = mul2;
2188           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2189           for (j = 0; j < nz_tmp; j++) {
2190             idx = pj[j];
2191             tmp = rtmp11[idx];
2192             rtmp22[idx] -= mul2 * tmp;
2193           }
2194           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2195         }
2196 
2197         pj  = bj + bi[row];
2198         pc1 = ba + bi[row];
2199         pc2 = ba + bi[row + 1];
2200 
2201         sctx.pv         = rtmp22[row + 1];
2202         rs              = 0.0;
2203         rtmp11[row]     = 1.0 / rtmp11[row];
2204         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2205         /* copy row entries from dense representation to sparse */
2206         for (j = 0; j < nz; j++) {
2207           idx    = pj[j];
2208           pc1[j] = rtmp11[idx];
2209           pc2[j] = rtmp22[idx];
2210           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2211         }
2212         sctx.rs = rs;
2213         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2214         if (sctx.newshift) goto endofwhile;
2215         break;
2216 
2217       case 3:
2218         for (j = 0; j < nz; j++) {
2219           idx         = bjtmp[j];
2220           rtmp11[idx] = 0.0;
2221           rtmp22[idx] = 0.0;
2222           rtmp33[idx] = 0.0;
2223         }
2224         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2225         idx    = r[row];
2226         nz_tmp = ai[idx + 1] - ai[idx];
2227         ajtmp  = aj + ai[idx];
2228         v1     = aa + ai[idx];
2229         v2     = aa + ai[idx + 1];
2230         v3     = aa + ai[idx + 2];
2231         for (j = 0; j < nz_tmp; j++) {
2232           idx         = ics[ajtmp[j]];
2233           rtmp11[idx] = v1[j];
2234           rtmp22[idx] = v2[j];
2235           rtmp33[idx] = v3[j];
2236         }
2237         rtmp11[ics[r[row]]] += sctx.shift_amount;
2238         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2239         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2240 
2241         /* loop over all pivot row blocks above this row block */
2242         prow = *bjtmp++;
2243         while (prow < row) {
2244           pc1 = rtmp11 + prow;
2245           pc2 = rtmp22 + prow;
2246           pc3 = rtmp33 + prow;
2247           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2248             pv   = ba + bd[prow];
2249             pj   = nbj + bd[prow];
2250             mul1 = *pc1 * *pv;
2251             mul2 = *pc2 * *pv;
2252             mul3 = *pc3 * *pv;
2253             ++pv;
2254             *pc1 = mul1;
2255             *pc2 = mul2;
2256             *pc3 = mul3;
2257 
2258             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2259             /* update this row based on pivot row */
2260             for (j = 0; j < nz_tmp; j++) {
2261               tmp = pv[j];
2262               idx = pj[j];
2263               rtmp11[idx] -= mul1 * tmp;
2264               rtmp22[idx] -= mul2 * tmp;
2265               rtmp33[idx] -= mul3 * tmp;
2266             }
2267             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2268           }
2269           prow = *bjtmp++;
2270         }
2271 
2272         /* Now take care of diagonal 3x3 block in this set of rows */
2273         /* note: prow = row here */
2274         pc1 = rtmp11 + prow;
2275         pc2 = rtmp22 + prow;
2276         pc3 = rtmp33 + prow;
2277 
2278         sctx.pv = *pc1;
2279         pj      = bj + bi[prow];
2280         rs      = 0.0;
2281         for (j = 0; j < nz; j++) {
2282           idx = pj[j];
2283           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2284         }
2285         sctx.rs = rs;
2286         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2287         if (sctx.newshift) goto endofwhile;
2288 
2289         if (*pc2 != 0.0 || *pc3 != 0.0) {
2290           mul2   = (*pc2) / (*pc1);
2291           mul3   = (*pc3) / (*pc1);
2292           *pc2   = mul2;
2293           *pc3   = mul3;
2294           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2295           pj     = nbj + bd[prow];
2296           for (j = 0; j < nz_tmp; j++) {
2297             idx = pj[j];
2298             tmp = rtmp11[idx];
2299             rtmp22[idx] -= mul2 * tmp;
2300             rtmp33[idx] -= mul3 * tmp;
2301           }
2302           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2303         }
2304         ++prow;
2305 
2306         pc2     = rtmp22 + prow;
2307         pc3     = rtmp33 + prow;
2308         sctx.pv = *pc2;
2309         pj      = bj + bi[prow];
2310         rs      = 0.0;
2311         for (j = 0; j < nz; j++) {
2312           idx = pj[j];
2313           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2314         }
2315         sctx.rs = rs;
2316         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2317         if (sctx.newshift) goto endofwhile;
2318 
2319         if (*pc3 != 0.0) {
2320           mul3   = (*pc3) / (*pc2);
2321           *pc3   = mul3;
2322           pj     = nbj + bd[prow];
2323           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2324           for (j = 0; j < nz_tmp; j++) {
2325             idx = pj[j];
2326             tmp = rtmp22[idx];
2327             rtmp33[idx] -= mul3 * tmp;
2328           }
2329           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2330         }
2331 
2332         pj  = bj + bi[row];
2333         pc1 = ba + bi[row];
2334         pc2 = ba + bi[row + 1];
2335         pc3 = ba + bi[row + 2];
2336 
2337         sctx.pv         = rtmp33[row + 2];
2338         rs              = 0.0;
2339         rtmp11[row]     = 1.0 / rtmp11[row];
2340         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2341         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2342         /* copy row entries from dense representation to sparse */
2343         for (j = 0; j < nz; j++) {
2344           idx    = pj[j];
2345           pc1[j] = rtmp11[idx];
2346           pc2[j] = rtmp22[idx];
2347           pc3[j] = rtmp33[idx];
2348           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2349         }
2350 
2351         sctx.rs = rs;
2352         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2353         if (sctx.newshift) goto endofwhile;
2354         break;
2355 
2356       default:
2357         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2358       }
2359       row += nodesz; /* Update the row */
2360     }
2361   endofwhile:;
2362   } while (sctx.newshift);
2363   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2364   PetscCall(PetscFree(tmp_vec2));
2365   PetscCall(ISRestoreIndices(isicol, &ic));
2366   PetscCall(ISRestoreIndices(isrow, &r));
2367   PetscCall(ISRestoreIndices(iscol, &c));
2368 
2369   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2370   /* do not set solve add, since MatSolve_Inode + Add is faster */
2371   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2372   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2373   C->assembled              = PETSC_TRUE;
2374   C->preallocated           = PETSC_TRUE;
2375   if (sctx.nshift) {
2376     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2377       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2378     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2379       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2380     }
2381   }
2382   PetscCall(PetscLogFlops(C->cmap->n));
2383   PetscCall(MatSeqAIJCheckInode(C));
2384   PetscFunctionReturn(PETSC_SUCCESS);
2385 }
2386 #endif
2387 
2388 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2389 {
2390   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2391   IS                 iscol = a->col, isrow = a->row;
2392   const PetscInt    *r, *c, *rout, *cout;
2393   PetscInt           i, j, n = A->rmap->n;
2394   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2395   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2396   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2397   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2398   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2399   const PetscScalar *b;
2400 
2401   PetscFunctionBegin;
2402   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2403   node_max = a->inode.node_count;
2404   ns       = a->inode.size; /* Node Size array */
2405 
2406   PetscCall(VecGetArrayRead(bb, &b));
2407   PetscCall(VecGetArrayWrite(xx, &x));
2408   tmp = a->solve_work;
2409 
2410   PetscCall(ISGetIndices(isrow, &rout));
2411   r = rout;
2412   PetscCall(ISGetIndices(iscol, &cout));
2413   c = cout;
2414 
2415   /* forward solve the lower triangular */
2416   tmps = tmp;
2417   aa   = a_a;
2418   aj   = a_j;
2419   ad   = a->diag;
2420 
2421   for (i = 0, row = 0; i < node_max; ++i) {
2422     nsz = ns[i];
2423     aii = ai[row];
2424     v1  = aa + aii;
2425     vi  = aj + aii;
2426     nz  = ai[row + 1] - ai[row];
2427 
2428     if (i < node_max - 1) {
2429       /* Prefetch the indices for the next block */
2430       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2431       /* Prefetch the data for the next block */
2432       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2433     }
2434 
2435     switch (nsz) { /* Each loop in 'case' is unrolled */
2436     case 1:
2437       sum1 = b[r[row]];
2438       for (j = 0; j < nz - 1; j += 2) {
2439         i0   = vi[j];
2440         i1   = vi[j + 1];
2441         tmp0 = tmps[i0];
2442         tmp1 = tmps[i1];
2443         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2444       }
2445       if (j == nz - 1) {
2446         tmp0 = tmps[vi[j]];
2447         sum1 -= v1[j] * tmp0;
2448       }
2449       tmp[row++] = sum1;
2450       break;
2451     case 2:
2452       sum1 = b[r[row]];
2453       sum2 = b[r[row + 1]];
2454       v2   = aa + ai[row + 1];
2455 
2456       for (j = 0; j < nz - 1; j += 2) {
2457         i0   = vi[j];
2458         i1   = vi[j + 1];
2459         tmp0 = tmps[i0];
2460         tmp1 = tmps[i1];
2461         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2462         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2463       }
2464       if (j == nz - 1) {
2465         tmp0 = tmps[vi[j]];
2466         sum1 -= v1[j] * tmp0;
2467         sum2 -= v2[j] * tmp0;
2468       }
2469       sum2 -= v2[nz] * sum1;
2470       tmp[row++] = sum1;
2471       tmp[row++] = sum2;
2472       break;
2473     case 3:
2474       sum1 = b[r[row]];
2475       sum2 = b[r[row + 1]];
2476       sum3 = b[r[row + 2]];
2477       v2   = aa + ai[row + 1];
2478       v3   = aa + ai[row + 2];
2479 
2480       for (j = 0; j < nz - 1; j += 2) {
2481         i0   = vi[j];
2482         i1   = vi[j + 1];
2483         tmp0 = tmps[i0];
2484         tmp1 = tmps[i1];
2485         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2486         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2487         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2488       }
2489       if (j == nz - 1) {
2490         tmp0 = tmps[vi[j]];
2491         sum1 -= v1[j] * tmp0;
2492         sum2 -= v2[j] * tmp0;
2493         sum3 -= v3[j] * tmp0;
2494       }
2495       sum2 -= v2[nz] * sum1;
2496       sum3 -= v3[nz] * sum1;
2497       sum3 -= v3[nz + 1] * sum2;
2498       tmp[row++] = sum1;
2499       tmp[row++] = sum2;
2500       tmp[row++] = sum3;
2501       break;
2502 
2503     case 4:
2504       sum1 = b[r[row]];
2505       sum2 = b[r[row + 1]];
2506       sum3 = b[r[row + 2]];
2507       sum4 = b[r[row + 3]];
2508       v2   = aa + ai[row + 1];
2509       v3   = aa + ai[row + 2];
2510       v4   = aa + ai[row + 3];
2511 
2512       for (j = 0; j < nz - 1; j += 2) {
2513         i0   = vi[j];
2514         i1   = vi[j + 1];
2515         tmp0 = tmps[i0];
2516         tmp1 = tmps[i1];
2517         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2518         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2519         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2520         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2521       }
2522       if (j == nz - 1) {
2523         tmp0 = tmps[vi[j]];
2524         sum1 -= v1[j] * tmp0;
2525         sum2 -= v2[j] * tmp0;
2526         sum3 -= v3[j] * tmp0;
2527         sum4 -= v4[j] * tmp0;
2528       }
2529       sum2 -= v2[nz] * sum1;
2530       sum3 -= v3[nz] * sum1;
2531       sum4 -= v4[nz] * sum1;
2532       sum3 -= v3[nz + 1] * sum2;
2533       sum4 -= v4[nz + 1] * sum2;
2534       sum4 -= v4[nz + 2] * sum3;
2535 
2536       tmp[row++] = sum1;
2537       tmp[row++] = sum2;
2538       tmp[row++] = sum3;
2539       tmp[row++] = sum4;
2540       break;
2541     case 5:
2542       sum1 = b[r[row]];
2543       sum2 = b[r[row + 1]];
2544       sum3 = b[r[row + 2]];
2545       sum4 = b[r[row + 3]];
2546       sum5 = b[r[row + 4]];
2547       v2   = aa + ai[row + 1];
2548       v3   = aa + ai[row + 2];
2549       v4   = aa + ai[row + 3];
2550       v5   = aa + ai[row + 4];
2551 
2552       for (j = 0; j < nz - 1; j += 2) {
2553         i0   = vi[j];
2554         i1   = vi[j + 1];
2555         tmp0 = tmps[i0];
2556         tmp1 = tmps[i1];
2557         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2558         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2559         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2560         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2561         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2562       }
2563       if (j == nz - 1) {
2564         tmp0 = tmps[vi[j]];
2565         sum1 -= v1[j] * tmp0;
2566         sum2 -= v2[j] * tmp0;
2567         sum3 -= v3[j] * tmp0;
2568         sum4 -= v4[j] * tmp0;
2569         sum5 -= v5[j] * tmp0;
2570       }
2571 
2572       sum2 -= v2[nz] * sum1;
2573       sum3 -= v3[nz] * sum1;
2574       sum4 -= v4[nz] * sum1;
2575       sum5 -= v5[nz] * sum1;
2576       sum3 -= v3[nz + 1] * sum2;
2577       sum4 -= v4[nz + 1] * sum2;
2578       sum5 -= v5[nz + 1] * sum2;
2579       sum4 -= v4[nz + 2] * sum3;
2580       sum5 -= v5[nz + 2] * sum3;
2581       sum5 -= v5[nz + 3] * sum4;
2582 
2583       tmp[row++] = sum1;
2584       tmp[row++] = sum2;
2585       tmp[row++] = sum3;
2586       tmp[row++] = sum4;
2587       tmp[row++] = sum5;
2588       break;
2589     default:
2590       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2591     }
2592   }
2593   /* backward solve the upper triangular */
2594   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2595     nsz = ns[i];
2596     aii = ad[row + 1] + 1;
2597     v1  = aa + aii;
2598     vi  = aj + aii;
2599     nz  = ad[row] - ad[row + 1] - 1;
2600 
2601     if (i > 0) {
2602       /* Prefetch the indices for the next block */
2603       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2604       /* Prefetch the data for the next block */
2605       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2606     }
2607 
2608     switch (nsz) { /* Each loop in 'case' is unrolled */
2609     case 1:
2610       sum1 = tmp[row];
2611 
2612       for (j = 0; j < nz - 1; j += 2) {
2613         i0   = vi[j];
2614         i1   = vi[j + 1];
2615         tmp0 = tmps[i0];
2616         tmp1 = tmps[i1];
2617         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2618       }
2619       if (j == nz - 1) {
2620         tmp0 = tmps[vi[j]];
2621         sum1 -= v1[j] * tmp0;
2622       }
2623       x[c[row]] = tmp[row] = sum1 * v1[nz];
2624       row--;
2625       break;
2626     case 2:
2627       sum1 = tmp[row];
2628       sum2 = tmp[row - 1];
2629       v2   = aa + ad[row] + 1;
2630       for (j = 0; j < nz - 1; j += 2) {
2631         i0   = vi[j];
2632         i1   = vi[j + 1];
2633         tmp0 = tmps[i0];
2634         tmp1 = tmps[i1];
2635         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2636         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2637       }
2638       if (j == nz - 1) {
2639         tmp0 = tmps[vi[j]];
2640         sum1 -= v1[j] * tmp0;
2641         sum2 -= v2[j + 1] * tmp0;
2642       }
2643 
2644       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2645       row--;
2646       sum2 -= v2[0] * tmp0;
2647       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2648       row--;
2649       break;
2650     case 3:
2651       sum1 = tmp[row];
2652       sum2 = tmp[row - 1];
2653       sum3 = tmp[row - 2];
2654       v2   = aa + ad[row] + 1;
2655       v3   = aa + ad[row - 1] + 1;
2656       for (j = 0; j < nz - 1; j += 2) {
2657         i0   = vi[j];
2658         i1   = vi[j + 1];
2659         tmp0 = tmps[i0];
2660         tmp1 = tmps[i1];
2661         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2662         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2663         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2664       }
2665       if (j == nz - 1) {
2666         tmp0 = tmps[vi[j]];
2667         sum1 -= v1[j] * tmp0;
2668         sum2 -= v2[j + 1] * tmp0;
2669         sum3 -= v3[j + 2] * tmp0;
2670       }
2671       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2672       row--;
2673       sum2 -= v2[0] * tmp0;
2674       sum3 -= v3[1] * tmp0;
2675       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2676       row--;
2677       sum3 -= v3[0] * tmp0;
2678       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2679       row--;
2680 
2681       break;
2682     case 4:
2683       sum1 = tmp[row];
2684       sum2 = tmp[row - 1];
2685       sum3 = tmp[row - 2];
2686       sum4 = tmp[row - 3];
2687       v2   = aa + ad[row] + 1;
2688       v3   = aa + ad[row - 1] + 1;
2689       v4   = aa + ad[row - 2] + 1;
2690 
2691       for (j = 0; j < nz - 1; j += 2) {
2692         i0   = vi[j];
2693         i1   = vi[j + 1];
2694         tmp0 = tmps[i0];
2695         tmp1 = tmps[i1];
2696         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2697         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2698         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2699         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2700       }
2701       if (j == nz - 1) {
2702         tmp0 = tmps[vi[j]];
2703         sum1 -= v1[j] * tmp0;
2704         sum2 -= v2[j + 1] * tmp0;
2705         sum3 -= v3[j + 2] * tmp0;
2706         sum4 -= v4[j + 3] * tmp0;
2707       }
2708 
2709       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2710       row--;
2711       sum2 -= v2[0] * tmp0;
2712       sum3 -= v3[1] * tmp0;
2713       sum4 -= v4[2] * tmp0;
2714       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2715       row--;
2716       sum3 -= v3[0] * tmp0;
2717       sum4 -= v4[1] * tmp0;
2718       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2719       row--;
2720       sum4 -= v4[0] * tmp0;
2721       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2722       row--;
2723       break;
2724     case 5:
2725       sum1 = tmp[row];
2726       sum2 = tmp[row - 1];
2727       sum3 = tmp[row - 2];
2728       sum4 = tmp[row - 3];
2729       sum5 = tmp[row - 4];
2730       v2   = aa + ad[row] + 1;
2731       v3   = aa + ad[row - 1] + 1;
2732       v4   = aa + ad[row - 2] + 1;
2733       v5   = aa + ad[row - 3] + 1;
2734       for (j = 0; j < nz - 1; j += 2) {
2735         i0   = vi[j];
2736         i1   = vi[j + 1];
2737         tmp0 = tmps[i0];
2738         tmp1 = tmps[i1];
2739         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2740         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2741         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2742         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2743         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2744       }
2745       if (j == nz - 1) {
2746         tmp0 = tmps[vi[j]];
2747         sum1 -= v1[j] * tmp0;
2748         sum2 -= v2[j + 1] * tmp0;
2749         sum3 -= v3[j + 2] * tmp0;
2750         sum4 -= v4[j + 3] * tmp0;
2751         sum5 -= v5[j + 4] * tmp0;
2752       }
2753 
2754       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2755       row--;
2756       sum2 -= v2[0] * tmp0;
2757       sum3 -= v3[1] * tmp0;
2758       sum4 -= v4[2] * tmp0;
2759       sum5 -= v5[3] * tmp0;
2760       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2761       row--;
2762       sum3 -= v3[0] * tmp0;
2763       sum4 -= v4[1] * tmp0;
2764       sum5 -= v5[2] * tmp0;
2765       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2766       row--;
2767       sum4 -= v4[0] * tmp0;
2768       sum5 -= v5[1] * tmp0;
2769       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2770       row--;
2771       sum5 -= v5[0] * tmp0;
2772       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2773       row--;
2774       break;
2775     default:
2776       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2777     }
2778   }
2779   PetscCall(ISRestoreIndices(isrow, &rout));
2780   PetscCall(ISRestoreIndices(iscol, &cout));
2781   PetscCall(VecRestoreArrayRead(bb, &b));
2782   PetscCall(VecRestoreArrayWrite(xx, &x));
2783   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2784   PetscFunctionReturn(PETSC_SUCCESS);
2785 }
2786 
2787 /*
2788      Makes a longer coloring[] array and calls the usual code with that
2789 */
2790 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2791 {
2792   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2793   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2794   PetscInt        *colorused, i;
2795   ISColoringValue *newcolor;
2796 
2797   PetscFunctionBegin;
2798   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2799   PetscCall(PetscMalloc1(n + 1, &newcolor));
2800   /* loop over inodes, marking a color for each column*/
2801   row = 0;
2802   for (i = 0; i < m; i++) {
2803     for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2804   }
2805 
2806   /* eliminate unneeded colors */
2807   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2808   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2809 
2810   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2811   ncolors = colorused[5 * ncolors - 1];
2812   for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2813   PetscCall(PetscFree(colorused));
2814   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2815   PetscCall(PetscFree(coloring));
2816   PetscFunctionReturn(PETSC_SUCCESS);
2817 }
2818 
2819 #include <petsc/private/kernels/blockinvert.h>
2820 
2821 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2822 {
2823   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2824   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2825   MatScalar         *ibdiag, *bdiag, work[25], *t;
2826   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2827   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2828   const PetscScalar *xb, *b;
2829   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2830   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2831   PetscInt           sz, k, ipvt[5];
2832   PetscBool          allowzeropivot, zeropivotdetected;
2833   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
2834 
2835   PetscFunctionBegin;
2836   allowzeropivot = PetscNot(A->erroriffailure);
2837   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2838   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2839   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2840 
2841   if (!a->inode.ibdiagvalid) {
2842     if (!a->inode.ibdiag) {
2843       /* calculate space needed for diagonal blocks */
2844       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2845       a->inode.bdiagsize = cnt;
2846 
2847       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2848     }
2849 
2850     /* copy over the diagonal blocks and invert them */
2851     ibdiag = a->inode.ibdiag;
2852     bdiag  = a->inode.bdiag;
2853     cnt    = 0;
2854     for (i = 0, row = 0; i < m; i++) {
2855       for (j = 0; j < sizes[i]; j++) {
2856         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2857       }
2858       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
2859 
2860       switch (sizes[i]) {
2861       case 1:
2862         /* Create matrix data structure */
2863         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2864           if (allowzeropivot) {
2865             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2866             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2867             A->factorerror_zeropivot_row   = row;
2868             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2869           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2870         }
2871         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2872         break;
2873       case 2:
2874         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2875         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2876         break;
2877       case 3:
2878         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2879         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2880         break;
2881       case 4:
2882         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2883         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2884         break;
2885       case 5:
2886         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2887         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2888         break;
2889       default:
2890         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2891       }
2892       cnt += sizes[i] * sizes[i];
2893       row += sizes[i];
2894     }
2895     a->inode.ibdiagvalid = PETSC_TRUE;
2896   }
2897   ibdiag = a->inode.ibdiag;
2898   bdiag  = a->inode.bdiag;
2899   t      = a->inode.ssor_work;
2900 
2901   PetscCall(VecGetArray(xx, &x));
2902   PetscCall(VecGetArrayRead(bb, &b));
2903   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2904   if (flag & SOR_ZERO_INITIAL_GUESS) {
2905     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2906       for (i = 0, row = 0; i < m; i++) {
2907         sz  = diag[row] - ii[row];
2908         v1  = a->a + ii[row];
2909         idx = a->j + ii[row];
2910 
2911         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2912         switch (sizes[i]) {
2913         case 1:
2914 
2915           sum1 = b[row];
2916           for (n = 0; n < sz - 1; n += 2) {
2917             i1 = idx[0];
2918             i2 = idx[1];
2919             idx += 2;
2920             tmp0 = x[i1];
2921             tmp1 = x[i2];
2922             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2923             v1 += 2;
2924           }
2925 
2926           if (n == sz - 1) {
2927             tmp0 = x[*idx];
2928             sum1 -= *v1 * tmp0;
2929           }
2930           t[row]   = sum1;
2931           x[row++] = sum1 * (*ibdiag++);
2932           break;
2933         case 2:
2934           v2   = a->a + ii[row + 1];
2935           sum1 = b[row];
2936           sum2 = b[row + 1];
2937           for (n = 0; n < sz - 1; n += 2) {
2938             i1 = idx[0];
2939             i2 = idx[1];
2940             idx += 2;
2941             tmp0 = x[i1];
2942             tmp1 = x[i2];
2943             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2944             v1 += 2;
2945             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2946             v2 += 2;
2947           }
2948 
2949           if (n == sz - 1) {
2950             tmp0 = x[*idx];
2951             sum1 -= v1[0] * tmp0;
2952             sum2 -= v2[0] * tmp0;
2953           }
2954           t[row]     = sum1;
2955           t[row + 1] = sum2;
2956           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2957           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2958           ibdiag += 4;
2959           break;
2960         case 3:
2961           v2   = a->a + ii[row + 1];
2962           v3   = a->a + ii[row + 2];
2963           sum1 = b[row];
2964           sum2 = b[row + 1];
2965           sum3 = b[row + 2];
2966           for (n = 0; n < sz - 1; n += 2) {
2967             i1 = idx[0];
2968             i2 = idx[1];
2969             idx += 2;
2970             tmp0 = x[i1];
2971             tmp1 = x[i2];
2972             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2973             v1 += 2;
2974             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2975             v2 += 2;
2976             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2977             v3 += 2;
2978           }
2979 
2980           if (n == sz - 1) {
2981             tmp0 = x[*idx];
2982             sum1 -= v1[0] * tmp0;
2983             sum2 -= v2[0] * tmp0;
2984             sum3 -= v3[0] * tmp0;
2985           }
2986           t[row]     = sum1;
2987           t[row + 1] = sum2;
2988           t[row + 2] = sum3;
2989           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2990           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2991           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2992           ibdiag += 9;
2993           break;
2994         case 4:
2995           v2   = a->a + ii[row + 1];
2996           v3   = a->a + ii[row + 2];
2997           v4   = a->a + ii[row + 3];
2998           sum1 = b[row];
2999           sum2 = b[row + 1];
3000           sum3 = b[row + 2];
3001           sum4 = b[row + 3];
3002           for (n = 0; n < sz - 1; n += 2) {
3003             i1 = idx[0];
3004             i2 = idx[1];
3005             idx += 2;
3006             tmp0 = x[i1];
3007             tmp1 = x[i2];
3008             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3009             v1 += 2;
3010             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3011             v2 += 2;
3012             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3013             v3 += 2;
3014             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3015             v4 += 2;
3016           }
3017 
3018           if (n == sz - 1) {
3019             tmp0 = x[*idx];
3020             sum1 -= v1[0] * tmp0;
3021             sum2 -= v2[0] * tmp0;
3022             sum3 -= v3[0] * tmp0;
3023             sum4 -= v4[0] * tmp0;
3024           }
3025           t[row]     = sum1;
3026           t[row + 1] = sum2;
3027           t[row + 2] = sum3;
3028           t[row + 3] = sum4;
3029           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3030           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3031           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3032           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3033           ibdiag += 16;
3034           break;
3035         case 5:
3036           v2   = a->a + ii[row + 1];
3037           v3   = a->a + ii[row + 2];
3038           v4   = a->a + ii[row + 3];
3039           v5   = a->a + ii[row + 4];
3040           sum1 = b[row];
3041           sum2 = b[row + 1];
3042           sum3 = b[row + 2];
3043           sum4 = b[row + 3];
3044           sum5 = b[row + 4];
3045           for (n = 0; n < sz - 1; n += 2) {
3046             i1 = idx[0];
3047             i2 = idx[1];
3048             idx += 2;
3049             tmp0 = x[i1];
3050             tmp1 = x[i2];
3051             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3052             v1 += 2;
3053             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3054             v2 += 2;
3055             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3056             v3 += 2;
3057             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3058             v4 += 2;
3059             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3060             v5 += 2;
3061           }
3062 
3063           if (n == sz - 1) {
3064             tmp0 = x[*idx];
3065             sum1 -= v1[0] * tmp0;
3066             sum2 -= v2[0] * tmp0;
3067             sum3 -= v3[0] * tmp0;
3068             sum4 -= v4[0] * tmp0;
3069             sum5 -= v5[0] * tmp0;
3070           }
3071           t[row]     = sum1;
3072           t[row + 1] = sum2;
3073           t[row + 2] = sum3;
3074           t[row + 3] = sum4;
3075           t[row + 4] = sum5;
3076           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3077           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3078           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3079           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3080           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3081           ibdiag += 25;
3082           break;
3083         default:
3084           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3085         }
3086       }
3087 
3088       xb = t;
3089       PetscCall(PetscLogFlops(a->nz));
3090     } else xb = b;
3091     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3092       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3093       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3094         ibdiag -= sizes[i] * sizes[i];
3095         sz  = ii[row + 1] - diag[row] - 1;
3096         v1  = a->a + diag[row] + 1;
3097         idx = a->j + diag[row] + 1;
3098 
3099         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3100         switch (sizes[i]) {
3101         case 1:
3102 
3103           sum1 = xb[row];
3104           for (n = 0; n < sz - 1; n += 2) {
3105             i1 = idx[0];
3106             i2 = idx[1];
3107             idx += 2;
3108             tmp0 = x[i1];
3109             tmp1 = x[i2];
3110             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3111             v1 += 2;
3112           }
3113 
3114           if (n == sz - 1) {
3115             tmp0 = x[*idx];
3116             sum1 -= *v1 * tmp0;
3117           }
3118           x[row--] = sum1 * (*ibdiag);
3119           break;
3120 
3121         case 2:
3122 
3123           sum1 = xb[row];
3124           sum2 = xb[row - 1];
3125           /* note that sum1 is associated with the second of the two rows */
3126           v2 = a->a + diag[row - 1] + 2;
3127           for (n = 0; n < sz - 1; n += 2) {
3128             i1 = idx[0];
3129             i2 = idx[1];
3130             idx += 2;
3131             tmp0 = x[i1];
3132             tmp1 = x[i2];
3133             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3134             v1 += 2;
3135             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3136             v2 += 2;
3137           }
3138 
3139           if (n == sz - 1) {
3140             tmp0 = x[*idx];
3141             sum1 -= *v1 * tmp0;
3142             sum2 -= *v2 * tmp0;
3143           }
3144           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3145           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3146           break;
3147         case 3:
3148 
3149           sum1 = xb[row];
3150           sum2 = xb[row - 1];
3151           sum3 = xb[row - 2];
3152           v2   = a->a + diag[row - 1] + 2;
3153           v3   = a->a + diag[row - 2] + 3;
3154           for (n = 0; n < sz - 1; n += 2) {
3155             i1 = idx[0];
3156             i2 = idx[1];
3157             idx += 2;
3158             tmp0 = x[i1];
3159             tmp1 = x[i2];
3160             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3161             v1 += 2;
3162             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3163             v2 += 2;
3164             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3165             v3 += 2;
3166           }
3167 
3168           if (n == sz - 1) {
3169             tmp0 = x[*idx];
3170             sum1 -= *v1 * tmp0;
3171             sum2 -= *v2 * tmp0;
3172             sum3 -= *v3 * tmp0;
3173           }
3174           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3175           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3176           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3177           break;
3178         case 4:
3179 
3180           sum1 = xb[row];
3181           sum2 = xb[row - 1];
3182           sum3 = xb[row - 2];
3183           sum4 = xb[row - 3];
3184           v2   = a->a + diag[row - 1] + 2;
3185           v3   = a->a + diag[row - 2] + 3;
3186           v4   = a->a + diag[row - 3] + 4;
3187           for (n = 0; n < sz - 1; n += 2) {
3188             i1 = idx[0];
3189             i2 = idx[1];
3190             idx += 2;
3191             tmp0 = x[i1];
3192             tmp1 = x[i2];
3193             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3194             v1 += 2;
3195             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3196             v2 += 2;
3197             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3198             v3 += 2;
3199             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3200             v4 += 2;
3201           }
3202 
3203           if (n == sz - 1) {
3204             tmp0 = x[*idx];
3205             sum1 -= *v1 * tmp0;
3206             sum2 -= *v2 * tmp0;
3207             sum3 -= *v3 * tmp0;
3208             sum4 -= *v4 * tmp0;
3209           }
3210           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3211           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3212           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3213           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3214           break;
3215         case 5:
3216 
3217           sum1 = xb[row];
3218           sum2 = xb[row - 1];
3219           sum3 = xb[row - 2];
3220           sum4 = xb[row - 3];
3221           sum5 = xb[row - 4];
3222           v2   = a->a + diag[row - 1] + 2;
3223           v3   = a->a + diag[row - 2] + 3;
3224           v4   = a->a + diag[row - 3] + 4;
3225           v5   = a->a + diag[row - 4] + 5;
3226           for (n = 0; n < sz - 1; n += 2) {
3227             i1 = idx[0];
3228             i2 = idx[1];
3229             idx += 2;
3230             tmp0 = x[i1];
3231             tmp1 = x[i2];
3232             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3233             v1 += 2;
3234             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3235             v2 += 2;
3236             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3237             v3 += 2;
3238             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3239             v4 += 2;
3240             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3241             v5 += 2;
3242           }
3243 
3244           if (n == sz - 1) {
3245             tmp0 = x[*idx];
3246             sum1 -= *v1 * tmp0;
3247             sum2 -= *v2 * tmp0;
3248             sum3 -= *v3 * tmp0;
3249             sum4 -= *v4 * tmp0;
3250             sum5 -= *v5 * tmp0;
3251           }
3252           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3253           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3254           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3255           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3256           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3257           break;
3258         default:
3259           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3260         }
3261       }
3262 
3263       PetscCall(PetscLogFlops(a->nz));
3264     }
3265     its--;
3266   }
3267   while (its--) {
3268     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3269       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3270         sz  = diag[row] - ii[row];
3271         v1  = a->a + ii[row];
3272         idx = a->j + ii[row];
3273         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3274         switch (sizes[i]) {
3275         case 1:
3276           sum1 = b[row];
3277           for (n = 0; n < sz - 1; n += 2) {
3278             i1 = idx[0];
3279             i2 = idx[1];
3280             idx += 2;
3281             tmp0 = x[i1];
3282             tmp1 = x[i2];
3283             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3284             v1 += 2;
3285           }
3286           if (n == sz - 1) {
3287             tmp0 = x[*idx++];
3288             sum1 -= *v1 * tmp0;
3289             v1++;
3290           }
3291           t[row] = sum1;
3292           sz     = ii[row + 1] - diag[row] - 1;
3293           idx    = a->j + diag[row] + 1;
3294           v1 += 1;
3295           for (n = 0; n < sz - 1; n += 2) {
3296             i1 = idx[0];
3297             i2 = idx[1];
3298             idx += 2;
3299             tmp0 = x[i1];
3300             tmp1 = x[i2];
3301             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3302             v1 += 2;
3303           }
3304           if (n == sz - 1) {
3305             tmp0 = x[*idx++];
3306             sum1 -= *v1 * tmp0;
3307           }
3308           /* in MatSOR_SeqAIJ this line would be
3309            *
3310            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3311            *
3312            * but omega == 1, so this becomes
3313            *
3314            * x[row] = sum1*(*ibdiag++);
3315            *
3316            */
3317           x[row] = sum1 * (*ibdiag);
3318           break;
3319         case 2:
3320           v2   = a->a + ii[row + 1];
3321           sum1 = b[row];
3322           sum2 = b[row + 1];
3323           for (n = 0; n < sz - 1; n += 2) {
3324             i1 = idx[0];
3325             i2 = idx[1];
3326             idx += 2;
3327             tmp0 = x[i1];
3328             tmp1 = x[i2];
3329             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3330             v1 += 2;
3331             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3332             v2 += 2;
3333           }
3334           if (n == sz - 1) {
3335             tmp0 = x[*idx++];
3336             sum1 -= v1[0] * tmp0;
3337             sum2 -= v2[0] * tmp0;
3338             v1++;
3339             v2++;
3340           }
3341           t[row]     = sum1;
3342           t[row + 1] = sum2;
3343           sz         = ii[row + 1] - diag[row] - 2;
3344           idx        = a->j + diag[row] + 2;
3345           v1 += 2;
3346           v2 += 2;
3347           for (n = 0; n < sz - 1; n += 2) {
3348             i1 = idx[0];
3349             i2 = idx[1];
3350             idx += 2;
3351             tmp0 = x[i1];
3352             tmp1 = x[i2];
3353             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3354             v1 += 2;
3355             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3356             v2 += 2;
3357           }
3358           if (n == sz - 1) {
3359             tmp0 = x[*idx];
3360             sum1 -= v1[0] * tmp0;
3361             sum2 -= v2[0] * tmp0;
3362           }
3363           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3364           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3365           break;
3366         case 3:
3367           v2   = a->a + ii[row + 1];
3368           v3   = a->a + ii[row + 2];
3369           sum1 = b[row];
3370           sum2 = b[row + 1];
3371           sum3 = b[row + 2];
3372           for (n = 0; n < sz - 1; n += 2) {
3373             i1 = idx[0];
3374             i2 = idx[1];
3375             idx += 2;
3376             tmp0 = x[i1];
3377             tmp1 = x[i2];
3378             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3379             v1 += 2;
3380             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3381             v2 += 2;
3382             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3383             v3 += 2;
3384           }
3385           if (n == sz - 1) {
3386             tmp0 = x[*idx++];
3387             sum1 -= v1[0] * tmp0;
3388             sum2 -= v2[0] * tmp0;
3389             sum3 -= v3[0] * tmp0;
3390             v1++;
3391             v2++;
3392             v3++;
3393           }
3394           t[row]     = sum1;
3395           t[row + 1] = sum2;
3396           t[row + 2] = sum3;
3397           sz         = ii[row + 1] - diag[row] - 3;
3398           idx        = a->j + diag[row] + 3;
3399           v1 += 3;
3400           v2 += 3;
3401           v3 += 3;
3402           for (n = 0; n < sz - 1; n += 2) {
3403             i1 = idx[0];
3404             i2 = idx[1];
3405             idx += 2;
3406             tmp0 = x[i1];
3407             tmp1 = x[i2];
3408             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3409             v1 += 2;
3410             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3411             v2 += 2;
3412             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3413             v3 += 2;
3414           }
3415           if (n == sz - 1) {
3416             tmp0 = x[*idx];
3417             sum1 -= v1[0] * tmp0;
3418             sum2 -= v2[0] * tmp0;
3419             sum3 -= v3[0] * tmp0;
3420           }
3421           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3422           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3423           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3424           break;
3425         case 4:
3426           v2   = a->a + ii[row + 1];
3427           v3   = a->a + ii[row + 2];
3428           v4   = a->a + ii[row + 3];
3429           sum1 = b[row];
3430           sum2 = b[row + 1];
3431           sum3 = b[row + 2];
3432           sum4 = b[row + 3];
3433           for (n = 0; n < sz - 1; n += 2) {
3434             i1 = idx[0];
3435             i2 = idx[1];
3436             idx += 2;
3437             tmp0 = x[i1];
3438             tmp1 = x[i2];
3439             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3440             v1 += 2;
3441             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3442             v2 += 2;
3443             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3444             v3 += 2;
3445             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3446             v4 += 2;
3447           }
3448           if (n == sz - 1) {
3449             tmp0 = x[*idx++];
3450             sum1 -= v1[0] * tmp0;
3451             sum2 -= v2[0] * tmp0;
3452             sum3 -= v3[0] * tmp0;
3453             sum4 -= v4[0] * tmp0;
3454             v1++;
3455             v2++;
3456             v3++;
3457             v4++;
3458           }
3459           t[row]     = sum1;
3460           t[row + 1] = sum2;
3461           t[row + 2] = sum3;
3462           t[row + 3] = sum4;
3463           sz         = ii[row + 1] - diag[row] - 4;
3464           idx        = a->j + diag[row] + 4;
3465           v1 += 4;
3466           v2 += 4;
3467           v3 += 4;
3468           v4 += 4;
3469           for (n = 0; n < sz - 1; n += 2) {
3470             i1 = idx[0];
3471             i2 = idx[1];
3472             idx += 2;
3473             tmp0 = x[i1];
3474             tmp1 = x[i2];
3475             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3476             v1 += 2;
3477             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3478             v2 += 2;
3479             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3480             v3 += 2;
3481             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3482             v4 += 2;
3483           }
3484           if (n == sz - 1) {
3485             tmp0 = x[*idx];
3486             sum1 -= v1[0] * tmp0;
3487             sum2 -= v2[0] * tmp0;
3488             sum3 -= v3[0] * tmp0;
3489             sum4 -= v4[0] * tmp0;
3490           }
3491           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3492           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3493           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3494           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3495           break;
3496         case 5:
3497           v2   = a->a + ii[row + 1];
3498           v3   = a->a + ii[row + 2];
3499           v4   = a->a + ii[row + 3];
3500           v5   = a->a + ii[row + 4];
3501           sum1 = b[row];
3502           sum2 = b[row + 1];
3503           sum3 = b[row + 2];
3504           sum4 = b[row + 3];
3505           sum5 = b[row + 4];
3506           for (n = 0; n < sz - 1; n += 2) {
3507             i1 = idx[0];
3508             i2 = idx[1];
3509             idx += 2;
3510             tmp0 = x[i1];
3511             tmp1 = x[i2];
3512             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3513             v1 += 2;
3514             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3515             v2 += 2;
3516             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3517             v3 += 2;
3518             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3519             v4 += 2;
3520             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3521             v5 += 2;
3522           }
3523           if (n == sz - 1) {
3524             tmp0 = x[*idx++];
3525             sum1 -= v1[0] * tmp0;
3526             sum2 -= v2[0] * tmp0;
3527             sum3 -= v3[0] * tmp0;
3528             sum4 -= v4[0] * tmp0;
3529             sum5 -= v5[0] * tmp0;
3530             v1++;
3531             v2++;
3532             v3++;
3533             v4++;
3534             v5++;
3535           }
3536           t[row]     = sum1;
3537           t[row + 1] = sum2;
3538           t[row + 2] = sum3;
3539           t[row + 3] = sum4;
3540           t[row + 4] = sum5;
3541           sz         = ii[row + 1] - diag[row] - 5;
3542           idx        = a->j + diag[row] + 5;
3543           v1 += 5;
3544           v2 += 5;
3545           v3 += 5;
3546           v4 += 5;
3547           v5 += 5;
3548           for (n = 0; n < sz - 1; n += 2) {
3549             i1 = idx[0];
3550             i2 = idx[1];
3551             idx += 2;
3552             tmp0 = x[i1];
3553             tmp1 = x[i2];
3554             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3555             v1 += 2;
3556             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3557             v2 += 2;
3558             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3559             v3 += 2;
3560             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3561             v4 += 2;
3562             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3563             v5 += 2;
3564           }
3565           if (n == sz - 1) {
3566             tmp0 = x[*idx];
3567             sum1 -= v1[0] * tmp0;
3568             sum2 -= v2[0] * tmp0;
3569             sum3 -= v3[0] * tmp0;
3570             sum4 -= v4[0] * tmp0;
3571             sum5 -= v5[0] * tmp0;
3572           }
3573           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3574           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3575           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3576           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3577           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3578           break;
3579         default:
3580           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3581         }
3582       }
3583       xb = t;
3584       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3585     } else xb = b;
3586 
3587     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3588       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3589       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3590         ibdiag -= sizes[i] * sizes[i];
3591 
3592         /* set RHS */
3593         if (xb == b) {
3594           /* whole (old way) */
3595           sz  = ii[row + 1] - ii[row];
3596           idx = a->j + ii[row];
3597           switch (sizes[i]) {
3598           case 5:
3599             v5 = a->a + ii[row - 4]; /* fall through */
3600           case 4:
3601             v4 = a->a + ii[row - 3]; /* fall through */
3602           case 3:
3603             v3 = a->a + ii[row - 2]; /* fall through */
3604           case 2:
3605             v2 = a->a + ii[row - 1]; /* fall through */
3606           case 1:
3607             v1 = a->a + ii[row];
3608             break;
3609           default:
3610             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3611           }
3612         } else {
3613           /* upper, no diag */
3614           sz  = ii[row + 1] - diag[row] - 1;
3615           idx = a->j + diag[row] + 1;
3616           switch (sizes[i]) {
3617           case 5:
3618             v5 = a->a + diag[row - 4] + 5; /* fall through */
3619           case 4:
3620             v4 = a->a + diag[row - 3] + 4; /* fall through */
3621           case 3:
3622             v3 = a->a + diag[row - 2] + 3; /* fall through */
3623           case 2:
3624             v2 = a->a + diag[row - 1] + 2; /* fall through */
3625           case 1:
3626             v1 = a->a + diag[row] + 1;
3627           }
3628         }
3629         /* set sum */
3630         switch (sizes[i]) {
3631         case 5:
3632           sum5 = xb[row - 4]; /* fall through */
3633         case 4:
3634           sum4 = xb[row - 3]; /* fall through */
3635         case 3:
3636           sum3 = xb[row - 2]; /* fall through */
3637         case 2:
3638           sum2 = xb[row - 1]; /* fall through */
3639         case 1:
3640           /* note that sum1 is associated with the last row */
3641           sum1 = xb[row];
3642         }
3643         /* do sums */
3644         for (n = 0; n < sz - 1; n += 2) {
3645           i1 = idx[0];
3646           i2 = idx[1];
3647           idx += 2;
3648           tmp0 = x[i1];
3649           tmp1 = x[i2];
3650           switch (sizes[i]) {
3651           case 5:
3652             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3653             v5 += 2; /* fall through */
3654           case 4:
3655             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3656             v4 += 2; /* fall through */
3657           case 3:
3658             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3659             v3 += 2; /* fall through */
3660           case 2:
3661             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3662             v2 += 2; /* fall through */
3663           case 1:
3664             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3665             v1 += 2;
3666           }
3667         }
3668         /* ragged edge */
3669         if (n == sz - 1) {
3670           tmp0 = x[*idx];
3671           switch (sizes[i]) {
3672           case 5:
3673             sum5 -= *v5 * tmp0; /* fall through */
3674           case 4:
3675             sum4 -= *v4 * tmp0; /* fall through */
3676           case 3:
3677             sum3 -= *v3 * tmp0; /* fall through */
3678           case 2:
3679             sum2 -= *v2 * tmp0; /* fall through */
3680           case 1:
3681             sum1 -= *v1 * tmp0;
3682           }
3683         }
3684         /* update */
3685         if (xb == b) {
3686           /* whole (old way) w/ diag */
3687           switch (sizes[i]) {
3688           case 5:
3689             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3690             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3691             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3692             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3693             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3694             break;
3695           case 4:
3696             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3697             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3698             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3699             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3700             break;
3701           case 3:
3702             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3703             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3704             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3705             break;
3706           case 2:
3707             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3708             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3709             break;
3710           case 1:
3711             x[row--] += sum1 * (*ibdiag);
3712             break;
3713           }
3714         } else {
3715           /* no diag so set =  */
3716           switch (sizes[i]) {
3717           case 5:
3718             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3719             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3720             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3721             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3722             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3723             break;
3724           case 4:
3725             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3726             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3727             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3728             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3729             break;
3730           case 3:
3731             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3732             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3733             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3734             break;
3735           case 2:
3736             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3737             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3738             break;
3739           case 1:
3740             x[row--] = sum1 * (*ibdiag);
3741             break;
3742           }
3743         }
3744       }
3745       if (xb == b) {
3746         PetscCall(PetscLogFlops(2.0 * a->nz));
3747       } else {
3748         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3749       }
3750     }
3751   }
3752   if (flag & SOR_EISENSTAT) {
3753     /*
3754           Apply  (U + D)^-1  where D is now the block diagonal
3755     */
3756     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3757     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3758       ibdiag -= sizes[i] * sizes[i];
3759       sz  = ii[row + 1] - diag[row] - 1;
3760       v1  = a->a + diag[row] + 1;
3761       idx = a->j + diag[row] + 1;
3762       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3763       switch (sizes[i]) {
3764       case 1:
3765 
3766         sum1 = b[row];
3767         for (n = 0; n < sz - 1; n += 2) {
3768           i1 = idx[0];
3769           i2 = idx[1];
3770           idx += 2;
3771           tmp0 = x[i1];
3772           tmp1 = x[i2];
3773           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3774           v1 += 2;
3775         }
3776 
3777         if (n == sz - 1) {
3778           tmp0 = x[*idx];
3779           sum1 -= *v1 * tmp0;
3780         }
3781         x[row] = sum1 * (*ibdiag);
3782         row--;
3783         break;
3784 
3785       case 2:
3786 
3787         sum1 = b[row];
3788         sum2 = b[row - 1];
3789         /* note that sum1 is associated with the second of the two rows */
3790         v2 = a->a + diag[row - 1] + 2;
3791         for (n = 0; n < sz - 1; n += 2) {
3792           i1 = idx[0];
3793           i2 = idx[1];
3794           idx += 2;
3795           tmp0 = x[i1];
3796           tmp1 = x[i2];
3797           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3798           v1 += 2;
3799           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3800           v2 += 2;
3801         }
3802 
3803         if (n == sz - 1) {
3804           tmp0 = x[*idx];
3805           sum1 -= *v1 * tmp0;
3806           sum2 -= *v2 * tmp0;
3807         }
3808         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3809         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3810         row -= 2;
3811         break;
3812       case 3:
3813 
3814         sum1 = b[row];
3815         sum2 = b[row - 1];
3816         sum3 = b[row - 2];
3817         v2   = a->a + diag[row - 1] + 2;
3818         v3   = a->a + diag[row - 2] + 3;
3819         for (n = 0; n < sz - 1; n += 2) {
3820           i1 = idx[0];
3821           i2 = idx[1];
3822           idx += 2;
3823           tmp0 = x[i1];
3824           tmp1 = x[i2];
3825           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3826           v1 += 2;
3827           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3828           v2 += 2;
3829           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3830           v3 += 2;
3831         }
3832 
3833         if (n == sz - 1) {
3834           tmp0 = x[*idx];
3835           sum1 -= *v1 * tmp0;
3836           sum2 -= *v2 * tmp0;
3837           sum3 -= *v3 * tmp0;
3838         }
3839         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3840         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3841         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3842         row -= 3;
3843         break;
3844       case 4:
3845 
3846         sum1 = b[row];
3847         sum2 = b[row - 1];
3848         sum3 = b[row - 2];
3849         sum4 = b[row - 3];
3850         v2   = a->a + diag[row - 1] + 2;
3851         v3   = a->a + diag[row - 2] + 3;
3852         v4   = a->a + diag[row - 3] + 4;
3853         for (n = 0; n < sz - 1; n += 2) {
3854           i1 = idx[0];
3855           i2 = idx[1];
3856           idx += 2;
3857           tmp0 = x[i1];
3858           tmp1 = x[i2];
3859           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3860           v1 += 2;
3861           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3862           v2 += 2;
3863           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3864           v3 += 2;
3865           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3866           v4 += 2;
3867         }
3868 
3869         if (n == sz - 1) {
3870           tmp0 = x[*idx];
3871           sum1 -= *v1 * tmp0;
3872           sum2 -= *v2 * tmp0;
3873           sum3 -= *v3 * tmp0;
3874           sum4 -= *v4 * tmp0;
3875         }
3876         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3877         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3878         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3879         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3880         row -= 4;
3881         break;
3882       case 5:
3883 
3884         sum1 = b[row];
3885         sum2 = b[row - 1];
3886         sum3 = b[row - 2];
3887         sum4 = b[row - 3];
3888         sum5 = b[row - 4];
3889         v2   = a->a + diag[row - 1] + 2;
3890         v3   = a->a + diag[row - 2] + 3;
3891         v4   = a->a + diag[row - 3] + 4;
3892         v5   = a->a + diag[row - 4] + 5;
3893         for (n = 0; n < sz - 1; n += 2) {
3894           i1 = idx[0];
3895           i2 = idx[1];
3896           idx += 2;
3897           tmp0 = x[i1];
3898           tmp1 = x[i2];
3899           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3900           v1 += 2;
3901           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3902           v2 += 2;
3903           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3904           v3 += 2;
3905           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3906           v4 += 2;
3907           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3908           v5 += 2;
3909         }
3910 
3911         if (n == sz - 1) {
3912           tmp0 = x[*idx];
3913           sum1 -= *v1 * tmp0;
3914           sum2 -= *v2 * tmp0;
3915           sum3 -= *v3 * tmp0;
3916           sum4 -= *v4 * tmp0;
3917           sum5 -= *v5 * tmp0;
3918         }
3919         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3920         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3921         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3922         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3923         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3924         row -= 5;
3925         break;
3926       default:
3927         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3928       }
3929     }
3930     PetscCall(PetscLogFlops(a->nz));
3931 
3932     /*
3933            t = b - D x    where D is the block diagonal
3934     */
3935     cnt = 0;
3936     for (i = 0, row = 0; i < m; i++) {
3937       switch (sizes[i]) {
3938       case 1:
3939         t[row] = b[row] - bdiag[cnt++] * x[row];
3940         row++;
3941         break;
3942       case 2:
3943         x1         = x[row];
3944         x2         = x[row + 1];
3945         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3946         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3947         t[row]     = b[row] - tmp1;
3948         t[row + 1] = b[row + 1] - tmp2;
3949         row += 2;
3950         cnt += 4;
3951         break;
3952       case 3:
3953         x1         = x[row];
3954         x2         = x[row + 1];
3955         x3         = x[row + 2];
3956         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3957         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3958         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3959         t[row]     = b[row] - tmp1;
3960         t[row + 1] = b[row + 1] - tmp2;
3961         t[row + 2] = b[row + 2] - tmp3;
3962         row += 3;
3963         cnt += 9;
3964         break;
3965       case 4:
3966         x1         = x[row];
3967         x2         = x[row + 1];
3968         x3         = x[row + 2];
3969         x4         = x[row + 3];
3970         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3971         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3972         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3973         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3974         t[row]     = b[row] - tmp1;
3975         t[row + 1] = b[row + 1] - tmp2;
3976         t[row + 2] = b[row + 2] - tmp3;
3977         t[row + 3] = b[row + 3] - tmp4;
3978         row += 4;
3979         cnt += 16;
3980         break;
3981       case 5:
3982         x1         = x[row];
3983         x2         = x[row + 1];
3984         x3         = x[row + 2];
3985         x4         = x[row + 3];
3986         x5         = x[row + 4];
3987         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3988         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3989         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3990         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3991         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3992         t[row]     = b[row] - tmp1;
3993         t[row + 1] = b[row + 1] - tmp2;
3994         t[row + 2] = b[row + 2] - tmp3;
3995         t[row + 3] = b[row + 3] - tmp4;
3996         t[row + 4] = b[row + 4] - tmp5;
3997         row += 5;
3998         cnt += 25;
3999         break;
4000       default:
4001         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4002       }
4003     }
4004     PetscCall(PetscLogFlops(m));
4005 
4006     /*
4007           Apply (L + D)^-1 where D is the block diagonal
4008     */
4009     for (i = 0, row = 0; i < m; i++) {
4010       sz  = diag[row] - ii[row];
4011       v1  = a->a + ii[row];
4012       idx = a->j + ii[row];
4013       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4014       switch (sizes[i]) {
4015       case 1:
4016 
4017         sum1 = t[row];
4018         for (n = 0; n < sz - 1; n += 2) {
4019           i1 = idx[0];
4020           i2 = idx[1];
4021           idx += 2;
4022           tmp0 = t[i1];
4023           tmp1 = t[i2];
4024           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4025           v1 += 2;
4026         }
4027 
4028         if (n == sz - 1) {
4029           tmp0 = t[*idx];
4030           sum1 -= *v1 * tmp0;
4031         }
4032         x[row] += t[row] = sum1 * (*ibdiag++);
4033         row++;
4034         break;
4035       case 2:
4036         v2   = a->a + ii[row + 1];
4037         sum1 = t[row];
4038         sum2 = t[row + 1];
4039         for (n = 0; n < sz - 1; n += 2) {
4040           i1 = idx[0];
4041           i2 = idx[1];
4042           idx += 2;
4043           tmp0 = t[i1];
4044           tmp1 = t[i2];
4045           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4046           v1 += 2;
4047           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4048           v2 += 2;
4049         }
4050 
4051         if (n == sz - 1) {
4052           tmp0 = t[*idx];
4053           sum1 -= v1[0] * tmp0;
4054           sum2 -= v2[0] * tmp0;
4055         }
4056         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4057         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4058         ibdiag += 4;
4059         row += 2;
4060         break;
4061       case 3:
4062         v2   = a->a + ii[row + 1];
4063         v3   = a->a + ii[row + 2];
4064         sum1 = t[row];
4065         sum2 = t[row + 1];
4066         sum3 = t[row + 2];
4067         for (n = 0; n < sz - 1; n += 2) {
4068           i1 = idx[0];
4069           i2 = idx[1];
4070           idx += 2;
4071           tmp0 = t[i1];
4072           tmp1 = t[i2];
4073           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4074           v1 += 2;
4075           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4076           v2 += 2;
4077           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4078           v3 += 2;
4079         }
4080 
4081         if (n == sz - 1) {
4082           tmp0 = t[*idx];
4083           sum1 -= v1[0] * tmp0;
4084           sum2 -= v2[0] * tmp0;
4085           sum3 -= v3[0] * tmp0;
4086         }
4087         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4088         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4089         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4090         ibdiag += 9;
4091         row += 3;
4092         break;
4093       case 4:
4094         v2   = a->a + ii[row + 1];
4095         v3   = a->a + ii[row + 2];
4096         v4   = a->a + ii[row + 3];
4097         sum1 = t[row];
4098         sum2 = t[row + 1];
4099         sum3 = t[row + 2];
4100         sum4 = t[row + 3];
4101         for (n = 0; n < sz - 1; n += 2) {
4102           i1 = idx[0];
4103           i2 = idx[1];
4104           idx += 2;
4105           tmp0 = t[i1];
4106           tmp1 = t[i2];
4107           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4108           v1 += 2;
4109           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4110           v2 += 2;
4111           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4112           v3 += 2;
4113           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4114           v4 += 2;
4115         }
4116 
4117         if (n == sz - 1) {
4118           tmp0 = t[*idx];
4119           sum1 -= v1[0] * tmp0;
4120           sum2 -= v2[0] * tmp0;
4121           sum3 -= v3[0] * tmp0;
4122           sum4 -= v4[0] * tmp0;
4123         }
4124         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4125         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4126         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4127         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4128         ibdiag += 16;
4129         row += 4;
4130         break;
4131       case 5:
4132         v2   = a->a + ii[row + 1];
4133         v3   = a->a + ii[row + 2];
4134         v4   = a->a + ii[row + 3];
4135         v5   = a->a + ii[row + 4];
4136         sum1 = t[row];
4137         sum2 = t[row + 1];
4138         sum3 = t[row + 2];
4139         sum4 = t[row + 3];
4140         sum5 = t[row + 4];
4141         for (n = 0; n < sz - 1; n += 2) {
4142           i1 = idx[0];
4143           i2 = idx[1];
4144           idx += 2;
4145           tmp0 = t[i1];
4146           tmp1 = t[i2];
4147           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4148           v1 += 2;
4149           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4150           v2 += 2;
4151           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4152           v3 += 2;
4153           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4154           v4 += 2;
4155           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4156           v5 += 2;
4157         }
4158 
4159         if (n == sz - 1) {
4160           tmp0 = t[*idx];
4161           sum1 -= v1[0] * tmp0;
4162           sum2 -= v2[0] * tmp0;
4163           sum3 -= v3[0] * tmp0;
4164           sum4 -= v4[0] * tmp0;
4165           sum5 -= v5[0] * tmp0;
4166         }
4167         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4168         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4169         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4170         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4171         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4172         ibdiag += 25;
4173         row += 5;
4174         break;
4175       default:
4176         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4177       }
4178     }
4179     PetscCall(PetscLogFlops(a->nz));
4180   }
4181   PetscCall(VecRestoreArray(xx, &x));
4182   PetscCall(VecRestoreArrayRead(bb, &b));
4183   PetscFunctionReturn(PETSC_SUCCESS);
4184 }
4185 
4186 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4187 {
4188   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4189   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4190   const MatScalar   *bdiag = a->inode.bdiag;
4191   const PetscScalar *b;
4192   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
4193   const PetscInt    *sizes = a->inode.size;
4194 
4195   PetscFunctionBegin;
4196   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4197   PetscCall(VecGetArray(xx, &x));
4198   PetscCall(VecGetArrayRead(bb, &b));
4199   cnt = 0;
4200   for (i = 0, row = 0; i < m; i++) {
4201     switch (sizes[i]) {
4202     case 1:
4203       x[row] = b[row] * bdiag[cnt++];
4204       row++;
4205       break;
4206     case 2:
4207       x1       = b[row];
4208       x2       = b[row + 1];
4209       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4210       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4211       x[row++] = tmp1;
4212       x[row++] = tmp2;
4213       cnt += 4;
4214       break;
4215     case 3:
4216       x1       = b[row];
4217       x2       = b[row + 1];
4218       x3       = b[row + 2];
4219       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4220       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4221       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4222       x[row++] = tmp1;
4223       x[row++] = tmp2;
4224       x[row++] = tmp3;
4225       cnt += 9;
4226       break;
4227     case 4:
4228       x1       = b[row];
4229       x2       = b[row + 1];
4230       x3       = b[row + 2];
4231       x4       = b[row + 3];
4232       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4233       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4234       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4235       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4236       x[row++] = tmp1;
4237       x[row++] = tmp2;
4238       x[row++] = tmp3;
4239       x[row++] = tmp4;
4240       cnt += 16;
4241       break;
4242     case 5:
4243       x1       = b[row];
4244       x2       = b[row + 1];
4245       x3       = b[row + 2];
4246       x4       = b[row + 3];
4247       x5       = b[row + 4];
4248       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4249       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4250       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4251       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4252       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4253       x[row++] = tmp1;
4254       x[row++] = tmp2;
4255       x[row++] = tmp3;
4256       x[row++] = tmp4;
4257       x[row++] = tmp5;
4258       cnt += 25;
4259       break;
4260     default:
4261       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4262     }
4263   }
4264   PetscCall(PetscLogFlops(2.0 * cnt));
4265   PetscCall(VecRestoreArray(xx, &x));
4266   PetscCall(VecRestoreArrayRead(bb, &b));
4267   PetscFunctionReturn(PETSC_SUCCESS);
4268 }
4269 
4270 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4271 {
4272   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4273 
4274   PetscFunctionBegin;
4275   a->inode.node_count       = 0;
4276   a->inode.use              = PETSC_FALSE;
4277   a->inode.checked          = PETSC_FALSE;
4278   a->inode.mat_nonzerostate = -1;
4279   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4280   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4281   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4282   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4283   A->ops->coloringpatch     = NULL;
4284   A->ops->multdiagonalblock = NULL;
4285   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4286   PetscFunctionReturn(PETSC_SUCCESS);
4287 }
4288 
4289 /*
4290     samestructure indicates that the matrix has not changed its nonzero structure so we
4291     do not need to recompute the inodes
4292 */
4293 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4294 {
4295   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4296   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4297   PetscBool       flag;
4298   const PetscInt *idx, *idy, *ii;
4299 
4300   PetscFunctionBegin;
4301   if (!a->inode.use) {
4302     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4303     PetscCall(PetscFree(a->inode.size));
4304     PetscFunctionReturn(PETSC_SUCCESS);
4305   }
4306   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4307 
4308   m = A->rmap->n;
4309   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4310   ns = a->inode.size;
4311 
4312   i          = 0;
4313   node_count = 0;
4314   idx        = a->j;
4315   ii         = a->i;
4316   if (idx) {
4317     while (i < m) {            /* For each row */
4318       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4319       /* Limits the number of elements in a node to 'a->inode.limit' */
4320       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4321         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4322         if (nzy != nzx) break;
4323         idy += nzx; /* Same nonzero pattern */
4324         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4325         if (!flag) break;
4326       }
4327       ns[node_count++] = blk_size;
4328       idx += blk_size * nzx;
4329       i = j;
4330     }
4331   }
4332   /* If not enough inodes found,, do not use inode version of the routines */
4333   if (!m || !idx || node_count > .8 * m) {
4334     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4335     PetscCall(PetscFree(a->inode.size));
4336     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4337   } else {
4338     if (!A->factortype) {
4339       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4340       if (A->rmap->n == A->cmap->n) {
4341         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4342         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4343         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4344         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4345         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4346       }
4347     } else {
4348       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4349     }
4350     a->inode.node_count = node_count;
4351     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4352   }
4353   a->inode.checked          = PETSC_TRUE;
4354   a->inode.mat_nonzerostate = A->nonzerostate;
4355   PetscFunctionReturn(PETSC_SUCCESS);
4356 }
4357 
4358 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4359 {
4360   Mat         B = *C;
4361   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4362   PetscInt    m = A->rmap->n;
4363 
4364   PetscFunctionBegin;
4365   c->inode.use              = a->inode.use;
4366   c->inode.limit            = a->inode.limit;
4367   c->inode.max_limit        = a->inode.max_limit;
4368   c->inode.checked          = PETSC_FALSE;
4369   c->inode.size             = NULL;
4370   c->inode.node_count       = 0;
4371   c->inode.ibdiagvalid      = PETSC_FALSE;
4372   c->inode.ibdiag           = NULL;
4373   c->inode.bdiag            = NULL;
4374   c->inode.mat_nonzerostate = -1;
4375   if (a->inode.use) {
4376     if (a->inode.checked && a->inode.size) {
4377       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4378       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4379 
4380       c->inode.checked          = PETSC_TRUE;
4381       c->inode.node_count       = a->inode.node_count;
4382       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4383     }
4384     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4385     if (!B->factortype) {
4386       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4387       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4388       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4389       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4390       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4391       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4392     } else {
4393       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4394     }
4395   }
4396   PetscFunctionReturn(PETSC_SUCCESS);
4397 }
4398 
4399 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4400 {
4401   PetscInt        k;
4402   const PetscInt *vi;
4403 
4404   PetscFunctionBegin;
4405   vi = aj + ai[row];
4406   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4407   vi        = aj + adiag[row];
4408   cols[nzl] = vi[0];
4409   vi        = aj + adiag[row + 1] + 1;
4410   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4411   PetscFunctionReturn(PETSC_SUCCESS);
4412 }
4413 /*
4414    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4415    Modified from MatSeqAIJCheckInode().
4416 
4417    Input Parameters:
4418 .  Mat A - ILU or LU matrix factor
4419 
4420 */
4421 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4422 {
4423   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4424   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4425   PetscInt       *cols1, *cols2, *ns;
4426   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4427   PetscBool       flag;
4428 
4429   PetscFunctionBegin;
4430   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4431   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4432 
4433   m = A->rmap->n;
4434   if (a->inode.size) ns = a->inode.size;
4435   else PetscCall(PetscMalloc1(m + 1, &ns));
4436 
4437   i          = 0;
4438   node_count = 0;
4439   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4440   while (i < m) {                       /* For each row */
4441     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4442     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4443     nzx  = nzl1 + nzu1 + 1;
4444     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4445 
4446     /* Limits the number of elements in a node to 'a->inode.limit' */
4447     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4448       nzl2 = ai[j + 1] - ai[j];
4449       nzu2 = adiag[j] - adiag[j + 1] - 1;
4450       nzy  = nzl2 + nzu2 + 1;
4451       if (nzy != nzx) break;
4452       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4453       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4454       if (!flag) break;
4455     }
4456     ns[node_count++] = blk_size;
4457     i                = j;
4458   }
4459   PetscCall(PetscFree2(cols1, cols2));
4460   /* If not enough inodes found,, do not use inode version of the routines */
4461   if (!m || node_count > .8 * m) {
4462     PetscCall(PetscFree(ns));
4463 
4464     a->inode.node_count = 0;
4465     a->inode.size       = NULL;
4466     a->inode.use        = PETSC_FALSE;
4467 
4468     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4469   } else {
4470     A->ops->mult              = NULL;
4471     A->ops->sor               = NULL;
4472     A->ops->multadd           = NULL;
4473     A->ops->getrowij          = NULL;
4474     A->ops->restorerowij      = NULL;
4475     A->ops->getcolumnij       = NULL;
4476     A->ops->restorecolumnij   = NULL;
4477     A->ops->coloringpatch     = NULL;
4478     A->ops->multdiagonalblock = NULL;
4479     a->inode.node_count       = node_count;
4480     a->inode.size             = ns;
4481 
4482     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4483   }
4484   a->inode.checked = PETSC_TRUE;
4485   PetscFunctionReturn(PETSC_SUCCESS);
4486 }
4487 
4488 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4489 {
4490   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4491 
4492   PetscFunctionBegin;
4493   a->inode.ibdiagvalid = PETSC_FALSE;
4494   PetscFunctionReturn(PETSC_SUCCESS);
4495 }
4496 
4497 /*
4498      This is really ugly. if inodes are used this replaces the
4499   permutations with ones that correspond to rows/cols of the matrix
4500   rather than inode blocks
4501 */
4502 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4503 {
4504   PetscFunctionBegin;
4505   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4506   PetscFunctionReturn(PETSC_SUCCESS);
4507 }
4508 
4509 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4510 {
4511   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4512   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4513   const PetscInt *ridx, *cidx;
4514   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4515   PetscInt        nslim_col, *ns_col;
4516   IS              ris = *rperm, cis = *cperm;
4517 
4518   PetscFunctionBegin;
4519   if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS);           /* no inodes so return */
4520   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4521 
4522   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4523   PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
4524   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4525 
4526   PetscCall(ISGetIndices(ris, &ridx));
4527   PetscCall(ISGetIndices(cis, &cidx));
4528 
4529   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4530   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
4531 
4532   /* Construct the permutations for rows*/
4533   for (i = 0, row = 0; i < nslim_row; ++i) {
4534     indx      = ridx[i];
4535     start_val = tns[indx];
4536     end_val   = tns[indx + 1];
4537     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4538   }
4539 
4540   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4541   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
4542 
4543   /* Construct permutations for columns */
4544   for (i = 0, col = 0; i < nslim_col; ++i) {
4545     indx      = cidx[i];
4546     start_val = tns[indx];
4547     end_val   = tns[indx + 1];
4548     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4549   }
4550 
4551   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4552   PetscCall(ISSetPermutation(*rperm));
4553   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4554   PetscCall(ISSetPermutation(*cperm));
4555 
4556   PetscCall(ISRestoreIndices(ris, &ridx));
4557   PetscCall(ISRestoreIndices(cis, &cidx));
4558 
4559   PetscCall(PetscFree(ns_col));
4560   PetscCall(PetscFree2(permr, permc));
4561   PetscCall(ISDestroy(&cis));
4562   PetscCall(ISDestroy(&ris));
4563   PetscCall(PetscFree(tns));
4564   PetscFunctionReturn(PETSC_SUCCESS);
4565 }
4566 
4567 /*@C
4568   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4569 
4570   Not Collective
4571 
4572   Input Parameter:
4573 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4574 
4575   Output Parameters:
4576 + node_count - no of inodes present in the matrix.
4577 . sizes      - an array of size `node_count`, with the sizes of each inode.
4578 - limit      - the max size used to generate the inodes.
4579 
4580   Level: advanced
4581 
4582   Note:
4583   It should be called after the matrix is assembled.
4584   The contents of the sizes[] array should not be changed.
4585   `NULL` may be passed for information not needed
4586 
4587 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4588 @*/
4589 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4590 {
4591   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4592 
4593   PetscFunctionBegin;
4594   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4595   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4596   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4597   PetscFunctionReturn(PETSC_SUCCESS);
4598 }
4599 
4600 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4601 {
4602   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4603 
4604   PetscFunctionBegin;
4605   if (node_count) *node_count = a->inode.node_count;
4606   if (sizes) *sizes = a->inode.size;
4607   if (limit) *limit = a->inode.limit;
4608   PetscFunctionReturn(PETSC_SUCCESS);
4609 }
4610