xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 750b007cd8d816cecd9de99077bb0a703b4cf61a)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) {
9   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
10   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
11 
12   PetscFunctionBegin;
13   n = A->cmap->n;
14   m = A->rmap->n;
15   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
16   ns_row = a->inode.size;
17 
18   min_mn = (m < n) ? m : n;
19   if (!ns) {
20     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
21       ;
22     for (; count + 1 < n; count++, i++)
23       ;
24     if (count < n) i++;
25     *size = i;
26     PetscFunctionReturn(0);
27   }
28   PetscCall(PetscMalloc1(n + 1, &ns_col));
29 
30   /* Use the same row structure wherever feasible. */
31   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
32 
33   /* if m < n; pad up the remainder with inode_limit */
34   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
35   /* The last node is the odd ball. padd it up with the remaining rows; */
36   if (count < n) {
37     ns_col[i] = n - count;
38     i++;
39   } else if (count > n) {
40     /* Adjust for the over estimation */
41     ns_col[i - 1] += n - count;
42   }
43   *size = i;
44   *ns   = ns_col;
45   PetscFunctionReturn(0);
46 }
47 
48 /*
49       This builds symmetric version of nonzero structure,
50 */
51 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) {
52   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
53   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
54   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
55   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
56 
57   PetscFunctionBegin;
58   nslim_row = a->inode.node_count;
59   m         = A->rmap->n;
60   n         = A->cmap->n;
61   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
62   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
63 
64   /* Use the row_inode as column_inode */
65   nslim_col = nslim_row;
66   ns_col    = ns_row;
67 
68   /* allocate space for reformated inode structure */
69   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
70   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
71 
72   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
73     nsz = ns_col[i1];
74     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
75   }
76   /* allocate space for row pointers */
77   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
78   *iia = ia;
79   PetscCall(PetscMalloc1(nslim_row + 1, &work));
80 
81   /* determine the number of columns in each row */
82   ia[0] = oshift;
83   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
84     j    = aj + ai[row] + ishift;
85     jmax = aj + ai[row + 1] + ishift;
86     if (j == jmax) continue; /* empty row */
87     col = *j++ + ishift;
88     i2  = tvc[col];
89     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
90       ia[i1 + 1]++;
91       ia[i2 + 1]++;
92       i2++; /* Start col of next node */
93       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
94       i2 = tvc[col];
95     }
96     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
97   }
98 
99   /* shift ia[i] to point to next row */
100   for (i1 = 1; i1 < nslim_row + 1; i1++) {
101     row = ia[i1 - 1];
102     ia[i1] += row;
103     work[i1 - 1] = row - oshift;
104   }
105 
106   /* allocate space for column pointers */
107   nz = ia[nslim_row] + (!ishift);
108   PetscCall(PetscMalloc1(nz, &ja));
109   *jja = ja;
110 
111   /* loop over lower triangular part putting into ja */
112   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
113     j    = aj + ai[row] + ishift;
114     jmax = aj + ai[row + 1] + ishift;
115     if (j == jmax) continue; /* empty row */
116     col = *j++ + ishift;
117     i2  = tvc[col];
118     while (i2 < i1 && j < jmax) {
119       ja[work[i2]++] = i1 + oshift;
120       ja[work[i1]++] = i2 + oshift;
121       ++i2;
122       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
123       i2 = tvc[col];
124     }
125     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
126   }
127   PetscCall(PetscFree(work));
128   PetscCall(PetscFree2(tns, tvc));
129   PetscFunctionReturn(0);
130 }
131 
132 /*
133       This builds nonsymmetric version of nonzero structure,
134 */
135 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) {
136   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
137   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
138   PetscInt       *tns, *tvc, nsz, i1, i2;
139   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
140 
141   PetscFunctionBegin;
142   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
143   nslim_row = a->inode.node_count;
144   n         = A->cmap->n;
145 
146   /* Create The column_inode for this matrix */
147   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
148 
149   /* allocate space for reformated column_inode structure */
150   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
151   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
152 
153   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
154     nsz = ns_col[i1];
155     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
156   }
157   /* allocate space for row pointers */
158   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
159   *iia = ia;
160   PetscCall(PetscMalloc1(nslim_row + 1, &work));
161 
162   /* determine the number of columns in each row */
163   ia[0] = oshift;
164   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
165     j  = aj + ai[row] + ishift;
166     nz = ai[row + 1] - ai[row];
167     if (!nz) continue; /* empty row */
168     col = *j++ + ishift;
169     i2  = tvc[col];
170     while (nz-- > 0) { /* off-diagonal elements */
171       ia[i1 + 1]++;
172       i2++; /* Start col of next node */
173       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
174       if (nz > 0) i2 = tvc[col];
175     }
176   }
177 
178   /* shift ia[i] to point to next row */
179   for (i1 = 1; i1 < nslim_row + 1; i1++) {
180     row = ia[i1 - 1];
181     ia[i1] += row;
182     work[i1 - 1] = row - oshift;
183   }
184 
185   /* allocate space for column pointers */
186   nz = ia[nslim_row] + (!ishift);
187   PetscCall(PetscMalloc1(nz, &ja));
188   *jja = ja;
189 
190   /* loop over matrix putting into ja */
191   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
192     j  = aj + ai[row] + ishift;
193     nz = ai[row + 1] - ai[row];
194     if (!nz) continue; /* empty row */
195     col = *j++ + ishift;
196     i2  = tvc[col];
197     while (nz-- > 0) {
198       ja[work[i1]++] = i2 + oshift;
199       ++i2;
200       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
201       if (nz > 0) i2 = tvc[col];
202     }
203   }
204   PetscCall(PetscFree(ns_col));
205   PetscCall(PetscFree(work));
206   PetscCall(PetscFree2(tns, tvc));
207   PetscFunctionReturn(0);
208 }
209 
210 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) {
211   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
212 
213   PetscFunctionBegin;
214   if (n) *n = a->inode.node_count;
215   if (!ia) PetscFunctionReturn(0);
216   if (!blockcompressed) {
217     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
218   } else if (symmetric) {
219     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
220   } else {
221     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
222   }
223   PetscFunctionReturn(0);
224 }
225 
226 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) {
227   PetscFunctionBegin;
228   if (!ia) PetscFunctionReturn(0);
229 
230   if (!blockcompressed) {
231     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
232   } else {
233     PetscCall(PetscFree(*ia));
234     PetscCall(PetscFree(*ja));
235   }
236   PetscFunctionReturn(0);
237 }
238 
239 /* ----------------------------------------------------------- */
240 
241 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) {
242   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
243   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
244   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
245 
246   PetscFunctionBegin;
247   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
248   nslim_row = a->inode.node_count;
249   n         = A->cmap->n;
250 
251   /* Create The column_inode for this matrix */
252   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
253 
254   /* allocate space for reformated column_inode structure */
255   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
256   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
257 
258   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
259     nsz = ns_col[i1];
260     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
261   }
262   /* allocate space for column pointers */
263   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
264   *iia = ia;
265   PetscCall(PetscMalloc1(nslim_col + 1, &work));
266 
267   /* determine the number of columns in each row */
268   ia[0] = oshift;
269   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
270     j   = aj + ai[row] + ishift;
271     col = *j++ + ishift;
272     i2  = tvc[col];
273     nz  = ai[row + 1] - ai[row];
274     while (nz-- > 0) { /* off-diagonal elements */
275       /* ia[i1+1]++; */
276       ia[i2 + 1]++;
277       i2++;
278       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
279       if (nz > 0) i2 = tvc[col];
280     }
281   }
282 
283   /* shift ia[i] to point to next col */
284   for (i1 = 1; i1 < nslim_col + 1; i1++) {
285     col = ia[i1 - 1];
286     ia[i1] += col;
287     work[i1 - 1] = col - oshift;
288   }
289 
290   /* allocate space for column pointers */
291   nz = ia[nslim_col] + (!ishift);
292   PetscCall(PetscMalloc1(nz, &ja));
293   *jja = ja;
294 
295   /* loop over matrix putting into ja */
296   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
297     j   = aj + ai[row] + ishift;
298     col = *j++ + ishift;
299     i2  = tvc[col];
300     nz  = ai[row + 1] - ai[row];
301     while (nz-- > 0) {
302       /* ja[work[i1]++] = i2 + oshift; */
303       ja[work[i2]++] = i1 + oshift;
304       i2++;
305       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
306       if (nz > 0) i2 = tvc[col];
307     }
308   }
309   PetscCall(PetscFree(ns_col));
310   PetscCall(PetscFree(work));
311   PetscCall(PetscFree2(tns, tvc));
312   PetscFunctionReturn(0);
313 }
314 
315 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) {
316   PetscFunctionBegin;
317   PetscCall(MatCreateColInode_Private(A, n, NULL));
318   if (!ia) PetscFunctionReturn(0);
319 
320   if (!blockcompressed) {
321     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
322   } else if (symmetric) {
323     /* Since the indices are symmetric it doesn't matter */
324     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
325   } else {
326     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
327   }
328   PetscFunctionReturn(0);
329 }
330 
331 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) {
332   PetscFunctionBegin;
333   if (!ia) PetscFunctionReturn(0);
334   if (!blockcompressed) {
335     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
336   } else {
337     PetscCall(PetscFree(*ia));
338     PetscCall(PetscFree(*ja));
339   }
340   PetscFunctionReturn(0);
341 }
342 
343 /* ----------------------------------------------------------- */
344 
345 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) {
346   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
347   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
348   PetscScalar       *y;
349   const PetscScalar *x;
350   const MatScalar   *v1, *v2, *v3, *v4, *v5;
351   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
352   const PetscInt    *idx, *ns, *ii;
353 
354 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
355 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
356 #endif
357 
358   PetscFunctionBegin;
359   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
360   node_max = a->inode.node_count;
361   ns       = a->inode.size; /* Node Size array */
362   PetscCall(VecGetArrayRead(xx, &x));
363   PetscCall(VecGetArray(yy, &y));
364   idx = a->j;
365   v1  = a->a;
366   ii  = a->i;
367 
368   for (i = 0, row = 0; i < node_max; ++i) {
369     nsz = ns[i];
370     n   = ii[1] - ii[0];
371     nonzerorow += (n > 0) * nsz;
372     ii += nsz;
373     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
374     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
375     sz = n;                                                                /* No of non zeros in this row */
376                                                                            /* Switch on the size of Node */
377     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
378     case 1:
379       sum1 = 0.;
380 
381       for (n = 0; n < sz - 1; n += 2) {
382         i1 = idx[0]; /* The instructions are ordered to */
383         i2 = idx[1]; /* make the compiler's job easy */
384         idx += 2;
385         tmp0 = x[i1];
386         tmp1 = x[i2];
387         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
388         v1 += 2;
389       }
390 
391       if (n == sz - 1) { /* Take care of the last nonzero  */
392         tmp0 = x[*idx++];
393         sum1 += *v1++ * tmp0;
394       }
395       y[row++] = sum1;
396       break;
397     case 2:
398       sum1 = 0.;
399       sum2 = 0.;
400       v2   = v1 + n;
401 
402       for (n = 0; n < sz - 1; n += 2) {
403         i1 = idx[0];
404         i2 = idx[1];
405         idx += 2;
406         tmp0 = x[i1];
407         tmp1 = x[i2];
408         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
409         v1 += 2;
410         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
411         v2 += 2;
412       }
413       if (n == sz - 1) {
414         tmp0 = x[*idx++];
415         sum1 += *v1++ * tmp0;
416         sum2 += *v2++ * tmp0;
417       }
418       y[row++] = sum1;
419       y[row++] = sum2;
420       v1       = v2; /* Since the next block to be processed starts there*/
421       idx += sz;
422       break;
423     case 3:
424       sum1 = 0.;
425       sum2 = 0.;
426       sum3 = 0.;
427       v2   = v1 + n;
428       v3   = v2 + n;
429 
430       for (n = 0; n < sz - 1; n += 2) {
431         i1 = idx[0];
432         i2 = idx[1];
433         idx += 2;
434         tmp0 = x[i1];
435         tmp1 = x[i2];
436         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
437         v1 += 2;
438         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
439         v2 += 2;
440         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
441         v3 += 2;
442       }
443       if (n == sz - 1) {
444         tmp0 = x[*idx++];
445         sum1 += *v1++ * tmp0;
446         sum2 += *v2++ * tmp0;
447         sum3 += *v3++ * tmp0;
448       }
449       y[row++] = sum1;
450       y[row++] = sum2;
451       y[row++] = sum3;
452       v1       = v3; /* Since the next block to be processed starts there*/
453       idx += 2 * sz;
454       break;
455     case 4:
456       sum1 = 0.;
457       sum2 = 0.;
458       sum3 = 0.;
459       sum4 = 0.;
460       v2   = v1 + n;
461       v3   = v2 + n;
462       v4   = v3 + n;
463 
464       for (n = 0; n < sz - 1; n += 2) {
465         i1 = idx[0];
466         i2 = idx[1];
467         idx += 2;
468         tmp0 = x[i1];
469         tmp1 = x[i2];
470         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
471         v1 += 2;
472         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
473         v2 += 2;
474         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
475         v3 += 2;
476         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
477         v4 += 2;
478       }
479       if (n == sz - 1) {
480         tmp0 = x[*idx++];
481         sum1 += *v1++ * tmp0;
482         sum2 += *v2++ * tmp0;
483         sum3 += *v3++ * tmp0;
484         sum4 += *v4++ * tmp0;
485       }
486       y[row++] = sum1;
487       y[row++] = sum2;
488       y[row++] = sum3;
489       y[row++] = sum4;
490       v1       = v4; /* Since the next block to be processed starts there*/
491       idx += 3 * sz;
492       break;
493     case 5:
494       sum1 = 0.;
495       sum2 = 0.;
496       sum3 = 0.;
497       sum4 = 0.;
498       sum5 = 0.;
499       v2   = v1 + n;
500       v3   = v2 + n;
501       v4   = v3 + n;
502       v5   = v4 + n;
503 
504       for (n = 0; n < sz - 1; n += 2) {
505         i1 = idx[0];
506         i2 = idx[1];
507         idx += 2;
508         tmp0 = x[i1];
509         tmp1 = x[i2];
510         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
511         v1 += 2;
512         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
513         v2 += 2;
514         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
515         v3 += 2;
516         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
517         v4 += 2;
518         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
519         v5 += 2;
520       }
521       if (n == sz - 1) {
522         tmp0 = x[*idx++];
523         sum1 += *v1++ * tmp0;
524         sum2 += *v2++ * tmp0;
525         sum3 += *v3++ * tmp0;
526         sum4 += *v4++ * tmp0;
527         sum5 += *v5++ * tmp0;
528       }
529       y[row++] = sum1;
530       y[row++] = sum2;
531       y[row++] = sum3;
532       y[row++] = sum4;
533       y[row++] = sum5;
534       v1       = v5; /* Since the next block to be processed starts there */
535       idx += 4 * sz;
536       break;
537     default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
538     }
539   }
540   PetscCall(VecRestoreArrayRead(xx, &x));
541   PetscCall(VecRestoreArray(yy, &y));
542   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
543   PetscFunctionReturn(0);
544 }
545 /* ----------------------------------------------------------- */
546 /* Almost same code as the MatMult_SeqAIJ_Inode() */
547 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) {
548   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
549   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
550   const MatScalar   *v1, *v2, *v3, *v4, *v5;
551   const PetscScalar *x;
552   PetscScalar       *y, *z, *zt;
553   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
554   const PetscInt    *idx, *ns, *ii;
555 
556   PetscFunctionBegin;
557   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
558   node_max = a->inode.node_count;
559   ns       = a->inode.size; /* Node Size array */
560 
561   PetscCall(VecGetArrayRead(xx, &x));
562   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
563   zt = z;
564 
565   idx = a->j;
566   v1  = a->a;
567   ii  = a->i;
568 
569   for (i = 0, row = 0; i < node_max; ++i) {
570     nsz = ns[i];
571     n   = ii[1] - ii[0];
572     ii += nsz;
573     sz = n;        /* No of non zeros in this row */
574                    /* Switch on the size of Node */
575     switch (nsz) { /* Each loop in 'case' is unrolled */
576     case 1:
577       sum1 = *zt++;
578 
579       for (n = 0; n < sz - 1; n += 2) {
580         i1 = idx[0]; /* The instructions are ordered to */
581         i2 = idx[1]; /* make the compiler's job easy */
582         idx += 2;
583         tmp0 = x[i1];
584         tmp1 = x[i2];
585         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
586         v1 += 2;
587       }
588 
589       if (n == sz - 1) { /* Take care of the last nonzero  */
590         tmp0 = x[*idx++];
591         sum1 += *v1++ * tmp0;
592       }
593       y[row++] = sum1;
594       break;
595     case 2:
596       sum1 = *zt++;
597       sum2 = *zt++;
598       v2   = v1 + n;
599 
600       for (n = 0; n < sz - 1; n += 2) {
601         i1 = idx[0];
602         i2 = idx[1];
603         idx += 2;
604         tmp0 = x[i1];
605         tmp1 = x[i2];
606         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
607         v1 += 2;
608         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
609         v2 += 2;
610       }
611       if (n == sz - 1) {
612         tmp0 = x[*idx++];
613         sum1 += *v1++ * tmp0;
614         sum2 += *v2++ * tmp0;
615       }
616       y[row++] = sum1;
617       y[row++] = sum2;
618       v1       = v2; /* Since the next block to be processed starts there*/
619       idx += sz;
620       break;
621     case 3:
622       sum1 = *zt++;
623       sum2 = *zt++;
624       sum3 = *zt++;
625       v2   = v1 + n;
626       v3   = v2 + n;
627 
628       for (n = 0; n < sz - 1; n += 2) {
629         i1 = idx[0];
630         i2 = idx[1];
631         idx += 2;
632         tmp0 = x[i1];
633         tmp1 = x[i2];
634         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
635         v1 += 2;
636         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
637         v2 += 2;
638         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
639         v3 += 2;
640       }
641       if (n == sz - 1) {
642         tmp0 = x[*idx++];
643         sum1 += *v1++ * tmp0;
644         sum2 += *v2++ * tmp0;
645         sum3 += *v3++ * tmp0;
646       }
647       y[row++] = sum1;
648       y[row++] = sum2;
649       y[row++] = sum3;
650       v1       = v3; /* Since the next block to be processed starts there*/
651       idx += 2 * sz;
652       break;
653     case 4:
654       sum1 = *zt++;
655       sum2 = *zt++;
656       sum3 = *zt++;
657       sum4 = *zt++;
658       v2   = v1 + n;
659       v3   = v2 + n;
660       v4   = v3 + n;
661 
662       for (n = 0; n < sz - 1; n += 2) {
663         i1 = idx[0];
664         i2 = idx[1];
665         idx += 2;
666         tmp0 = x[i1];
667         tmp1 = x[i2];
668         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
669         v1 += 2;
670         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
671         v2 += 2;
672         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
673         v3 += 2;
674         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
675         v4 += 2;
676       }
677       if (n == sz - 1) {
678         tmp0 = x[*idx++];
679         sum1 += *v1++ * tmp0;
680         sum2 += *v2++ * tmp0;
681         sum3 += *v3++ * tmp0;
682         sum4 += *v4++ * tmp0;
683       }
684       y[row++] = sum1;
685       y[row++] = sum2;
686       y[row++] = sum3;
687       y[row++] = sum4;
688       v1       = v4; /* Since the next block to be processed starts there*/
689       idx += 3 * sz;
690       break;
691     case 5:
692       sum1 = *zt++;
693       sum2 = *zt++;
694       sum3 = *zt++;
695       sum4 = *zt++;
696       sum5 = *zt++;
697       v2   = v1 + n;
698       v3   = v2 + n;
699       v4   = v3 + n;
700       v5   = v4 + n;
701 
702       for (n = 0; n < sz - 1; n += 2) {
703         i1 = idx[0];
704         i2 = idx[1];
705         idx += 2;
706         tmp0 = x[i1];
707         tmp1 = x[i2];
708         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
709         v1 += 2;
710         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
711         v2 += 2;
712         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
713         v3 += 2;
714         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
715         v4 += 2;
716         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
717         v5 += 2;
718       }
719       if (n == sz - 1) {
720         tmp0 = x[*idx++];
721         sum1 += *v1++ * tmp0;
722         sum2 += *v2++ * tmp0;
723         sum3 += *v3++ * tmp0;
724         sum4 += *v4++ * tmp0;
725         sum5 += *v5++ * tmp0;
726       }
727       y[row++] = sum1;
728       y[row++] = sum2;
729       y[row++] = sum3;
730       y[row++] = sum4;
731       y[row++] = sum5;
732       v1       = v5; /* Since the next block to be processed starts there */
733       idx += 4 * sz;
734       break;
735     default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
736     }
737   }
738   PetscCall(VecRestoreArrayRead(xx, &x));
739   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
740   PetscCall(PetscLogFlops(2.0 * a->nz));
741   PetscFunctionReturn(0);
742 }
743 
744 /* ----------------------------------------------------------- */
745 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) {
746   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
747   IS                 iscol = a->col, isrow = a->row;
748   const PetscInt    *r, *c, *rout, *cout;
749   PetscInt           i, j, n = A->rmap->n, nz;
750   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
751   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
752   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
753   PetscScalar        sum1, sum2, sum3, sum4, sum5;
754   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
755   const PetscScalar *b;
756 
757   PetscFunctionBegin;
758   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
759   node_max = a->inode.node_count;
760   ns       = a->inode.size; /* Node Size array */
761 
762   PetscCall(VecGetArrayRead(bb, &b));
763   PetscCall(VecGetArrayWrite(xx, &x));
764   tmp = a->solve_work;
765 
766   PetscCall(ISGetIndices(isrow, &rout));
767   r = rout;
768   PetscCall(ISGetIndices(iscol, &cout));
769   c = cout + (n - 1);
770 
771   /* forward solve the lower triangular */
772   tmps = tmp;
773   aa   = a_a;
774   aj   = a_j;
775   ad   = a->diag;
776 
777   for (i = 0, row = 0; i < node_max; ++i) {
778     nsz = ns[i];
779     aii = ai[row];
780     v1  = aa + aii;
781     vi  = aj + aii;
782     nz  = ad[row] - aii;
783     if (i < node_max - 1) {
784       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
785       * but our indexing to determine it's size could. */
786       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
787       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
788       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
789       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
790     }
791 
792     switch (nsz) { /* Each loop in 'case' is unrolled */
793     case 1:
794       sum1 = b[*r++];
795       for (j = 0; j < nz - 1; j += 2) {
796         i0 = vi[0];
797         i1 = vi[1];
798         vi += 2;
799         tmp0 = tmps[i0];
800         tmp1 = tmps[i1];
801         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
802         v1 += 2;
803       }
804       if (j == nz - 1) {
805         tmp0 = tmps[*vi++];
806         sum1 -= *v1++ * tmp0;
807       }
808       tmp[row++] = sum1;
809       break;
810     case 2:
811       sum1 = b[*r++];
812       sum2 = b[*r++];
813       v2   = aa + ai[row + 1];
814 
815       for (j = 0; j < nz - 1; j += 2) {
816         i0 = vi[0];
817         i1 = vi[1];
818         vi += 2;
819         tmp0 = tmps[i0];
820         tmp1 = tmps[i1];
821         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
822         v1 += 2;
823         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
824         v2 += 2;
825       }
826       if (j == nz - 1) {
827         tmp0 = tmps[*vi++];
828         sum1 -= *v1++ * tmp0;
829         sum2 -= *v2++ * tmp0;
830       }
831       sum2 -= *v2++ * sum1;
832       tmp[row++] = sum1;
833       tmp[row++] = sum2;
834       break;
835     case 3:
836       sum1 = b[*r++];
837       sum2 = b[*r++];
838       sum3 = b[*r++];
839       v2   = aa + ai[row + 1];
840       v3   = aa + ai[row + 2];
841 
842       for (j = 0; j < nz - 1; j += 2) {
843         i0 = vi[0];
844         i1 = vi[1];
845         vi += 2;
846         tmp0 = tmps[i0];
847         tmp1 = tmps[i1];
848         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
849         v1 += 2;
850         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
851         v2 += 2;
852         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
853         v3 += 2;
854       }
855       if (j == nz - 1) {
856         tmp0 = tmps[*vi++];
857         sum1 -= *v1++ * tmp0;
858         sum2 -= *v2++ * tmp0;
859         sum3 -= *v3++ * tmp0;
860       }
861       sum2 -= *v2++ * sum1;
862       sum3 -= *v3++ * sum1;
863       sum3 -= *v3++ * sum2;
864 
865       tmp[row++] = sum1;
866       tmp[row++] = sum2;
867       tmp[row++] = sum3;
868       break;
869 
870     case 4:
871       sum1 = b[*r++];
872       sum2 = b[*r++];
873       sum3 = b[*r++];
874       sum4 = b[*r++];
875       v2   = aa + ai[row + 1];
876       v3   = aa + ai[row + 2];
877       v4   = aa + ai[row + 3];
878 
879       for (j = 0; j < nz - 1; j += 2) {
880         i0 = vi[0];
881         i1 = vi[1];
882         vi += 2;
883         tmp0 = tmps[i0];
884         tmp1 = tmps[i1];
885         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
886         v1 += 2;
887         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
888         v2 += 2;
889         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
890         v3 += 2;
891         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
892         v4 += 2;
893       }
894       if (j == nz - 1) {
895         tmp0 = tmps[*vi++];
896         sum1 -= *v1++ * tmp0;
897         sum2 -= *v2++ * tmp0;
898         sum3 -= *v3++ * tmp0;
899         sum4 -= *v4++ * tmp0;
900       }
901       sum2 -= *v2++ * sum1;
902       sum3 -= *v3++ * sum1;
903       sum4 -= *v4++ * sum1;
904       sum3 -= *v3++ * sum2;
905       sum4 -= *v4++ * sum2;
906       sum4 -= *v4++ * sum3;
907 
908       tmp[row++] = sum1;
909       tmp[row++] = sum2;
910       tmp[row++] = sum3;
911       tmp[row++] = sum4;
912       break;
913     case 5:
914       sum1 = b[*r++];
915       sum2 = b[*r++];
916       sum3 = b[*r++];
917       sum4 = b[*r++];
918       sum5 = b[*r++];
919       v2   = aa + ai[row + 1];
920       v3   = aa + ai[row + 2];
921       v4   = aa + ai[row + 3];
922       v5   = aa + ai[row + 4];
923 
924       for (j = 0; j < nz - 1; j += 2) {
925         i0 = vi[0];
926         i1 = vi[1];
927         vi += 2;
928         tmp0 = tmps[i0];
929         tmp1 = tmps[i1];
930         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
931         v1 += 2;
932         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
933         v2 += 2;
934         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
935         v3 += 2;
936         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
937         v4 += 2;
938         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
939         v5 += 2;
940       }
941       if (j == nz - 1) {
942         tmp0 = tmps[*vi++];
943         sum1 -= *v1++ * tmp0;
944         sum2 -= *v2++ * tmp0;
945         sum3 -= *v3++ * tmp0;
946         sum4 -= *v4++ * tmp0;
947         sum5 -= *v5++ * tmp0;
948       }
949 
950       sum2 -= *v2++ * sum1;
951       sum3 -= *v3++ * sum1;
952       sum4 -= *v4++ * sum1;
953       sum5 -= *v5++ * sum1;
954       sum3 -= *v3++ * sum2;
955       sum4 -= *v4++ * sum2;
956       sum5 -= *v5++ * sum2;
957       sum4 -= *v4++ * sum3;
958       sum5 -= *v5++ * sum3;
959       sum5 -= *v5++ * sum4;
960 
961       tmp[row++] = sum1;
962       tmp[row++] = sum2;
963       tmp[row++] = sum3;
964       tmp[row++] = sum4;
965       tmp[row++] = sum5;
966       break;
967     default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
968     }
969   }
970   /* backward solve the upper triangular */
971   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
972     nsz = ns[i];
973     aii = ai[row + 1] - 1;
974     v1  = aa + aii;
975     vi  = aj + aii;
976     nz  = aii - ad[row];
977     switch (nsz) { /* Each loop in 'case' is unrolled */
978     case 1:
979       sum1 = tmp[row];
980 
981       for (j = nz; j > 1; j -= 2) {
982         vi -= 2;
983         i0   = vi[2];
984         i1   = vi[1];
985         tmp0 = tmps[i0];
986         tmp1 = tmps[i1];
987         v1 -= 2;
988         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
989       }
990       if (j == 1) {
991         tmp0 = tmps[*vi--];
992         sum1 -= *v1-- * tmp0;
993       }
994       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
995       row--;
996       break;
997     case 2:
998       sum1 = tmp[row];
999       sum2 = tmp[row - 1];
1000       v2   = aa + ai[row] - 1;
1001       for (j = nz; j > 1; j -= 2) {
1002         vi -= 2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1 -= 2;
1008         v2 -= 2;
1009         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1010         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1011       }
1012       if (j == 1) {
1013         tmp0 = tmps[*vi--];
1014         sum1 -= *v1-- * tmp0;
1015         sum2 -= *v2-- * tmp0;
1016       }
1017 
1018       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1019       row--;
1020       sum2 -= *v2-- * tmp0;
1021       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1022       row--;
1023       break;
1024     case 3:
1025       sum1 = tmp[row];
1026       sum2 = tmp[row - 1];
1027       sum3 = tmp[row - 2];
1028       v2   = aa + ai[row] - 1;
1029       v3   = aa + ai[row - 1] - 1;
1030       for (j = nz; j > 1; j -= 2) {
1031         vi -= 2;
1032         i0   = vi[2];
1033         i1   = vi[1];
1034         tmp0 = tmps[i0];
1035         tmp1 = tmps[i1];
1036         v1 -= 2;
1037         v2 -= 2;
1038         v3 -= 2;
1039         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1040         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1041         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1042       }
1043       if (j == 1) {
1044         tmp0 = tmps[*vi--];
1045         sum1 -= *v1-- * tmp0;
1046         sum2 -= *v2-- * tmp0;
1047         sum3 -= *v3-- * tmp0;
1048       }
1049       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1050       row--;
1051       sum2 -= *v2-- * tmp0;
1052       sum3 -= *v3-- * tmp0;
1053       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1054       row--;
1055       sum3 -= *v3-- * tmp0;
1056       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1057       row--;
1058 
1059       break;
1060     case 4:
1061       sum1 = tmp[row];
1062       sum2 = tmp[row - 1];
1063       sum3 = tmp[row - 2];
1064       sum4 = tmp[row - 3];
1065       v2   = aa + ai[row] - 1;
1066       v3   = aa + ai[row - 1] - 1;
1067       v4   = aa + ai[row - 2] - 1;
1068 
1069       for (j = nz; j > 1; j -= 2) {
1070         vi -= 2;
1071         i0   = vi[2];
1072         i1   = vi[1];
1073         tmp0 = tmps[i0];
1074         tmp1 = tmps[i1];
1075         v1 -= 2;
1076         v2 -= 2;
1077         v3 -= 2;
1078         v4 -= 2;
1079         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1080         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1081         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1082         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1083       }
1084       if (j == 1) {
1085         tmp0 = tmps[*vi--];
1086         sum1 -= *v1-- * tmp0;
1087         sum2 -= *v2-- * tmp0;
1088         sum3 -= *v3-- * tmp0;
1089         sum4 -= *v4-- * tmp0;
1090       }
1091 
1092       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1093       row--;
1094       sum2 -= *v2-- * tmp0;
1095       sum3 -= *v3-- * tmp0;
1096       sum4 -= *v4-- * tmp0;
1097       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1098       row--;
1099       sum3 -= *v3-- * tmp0;
1100       sum4 -= *v4-- * tmp0;
1101       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1102       row--;
1103       sum4 -= *v4-- * tmp0;
1104       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1105       row--;
1106       break;
1107     case 5:
1108       sum1 = tmp[row];
1109       sum2 = tmp[row - 1];
1110       sum3 = tmp[row - 2];
1111       sum4 = tmp[row - 3];
1112       sum5 = tmp[row - 4];
1113       v2   = aa + ai[row] - 1;
1114       v3   = aa + ai[row - 1] - 1;
1115       v4   = aa + ai[row - 2] - 1;
1116       v5   = aa + ai[row - 3] - 1;
1117       for (j = nz; j > 1; j -= 2) {
1118         vi -= 2;
1119         i0   = vi[2];
1120         i1   = vi[1];
1121         tmp0 = tmps[i0];
1122         tmp1 = tmps[i1];
1123         v1 -= 2;
1124         v2 -= 2;
1125         v3 -= 2;
1126         v4 -= 2;
1127         v5 -= 2;
1128         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1129         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1130         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1131         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1132         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1133       }
1134       if (j == 1) {
1135         tmp0 = tmps[*vi--];
1136         sum1 -= *v1-- * tmp0;
1137         sum2 -= *v2-- * tmp0;
1138         sum3 -= *v3-- * tmp0;
1139         sum4 -= *v4-- * tmp0;
1140         sum5 -= *v5-- * tmp0;
1141       }
1142 
1143       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1144       row--;
1145       sum2 -= *v2-- * tmp0;
1146       sum3 -= *v3-- * tmp0;
1147       sum4 -= *v4-- * tmp0;
1148       sum5 -= *v5-- * tmp0;
1149       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1150       row--;
1151       sum3 -= *v3-- * tmp0;
1152       sum4 -= *v4-- * tmp0;
1153       sum5 -= *v5-- * tmp0;
1154       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1155       row--;
1156       sum4 -= *v4-- * tmp0;
1157       sum5 -= *v5-- * tmp0;
1158       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1159       row--;
1160       sum5 -= *v5-- * tmp0;
1161       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1162       row--;
1163       break;
1164     default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1165     }
1166   }
1167   PetscCall(ISRestoreIndices(isrow, &rout));
1168   PetscCall(ISRestoreIndices(iscol, &cout));
1169   PetscCall(VecRestoreArrayRead(bb, &b));
1170   PetscCall(VecRestoreArrayWrite(xx, &x));
1171   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1172   PetscFunctionReturn(0);
1173 }
1174 
1175 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) {
1176   Mat              C = B;
1177   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1178   IS               isrow = b->row, isicol = b->icol;
1179   const PetscInt  *r, *ic, *ics;
1180   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1181   PetscInt         i, j, k, nz, nzL, row, *pj;
1182   const PetscInt  *ajtmp, *bjtmp;
1183   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1184   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1185   FactorShiftCtx   sctx;
1186   const PetscInt  *ddiag;
1187   PetscReal        rs;
1188   MatScalar        d;
1189   PetscInt         inod, nodesz, node_max, col;
1190   const PetscInt  *ns;
1191   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1192 
1193   PetscFunctionBegin;
1194   /* MatPivotSetUp(): initialize shift context sctx */
1195   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1196 
1197   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1198     ddiag          = a->diag;
1199     sctx.shift_top = info->zeropivot;
1200     for (i = 0; i < n; i++) {
1201       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1202       d  = (aa)[ddiag[i]];
1203       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1204       v  = aa + ai[i];
1205       nz = ai[i + 1] - ai[i];
1206       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1207       if (rs > sctx.shift_top) sctx.shift_top = rs;
1208     }
1209     sctx.shift_top *= 1.1;
1210     sctx.nshift_max = 5;
1211     sctx.shift_lo   = 0.;
1212     sctx.shift_hi   = 1.;
1213   }
1214 
1215   PetscCall(ISGetIndices(isrow, &r));
1216   PetscCall(ISGetIndices(isicol, &ic));
1217 
1218   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1219   ics = ic;
1220 
1221   node_max = a->inode.node_count;
1222   ns       = a->inode.size;
1223   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1224 
1225   /* If max inode size > 4, split it into two inodes.*/
1226   /* also map the inode sizes according to the ordering */
1227   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1228   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1229     if (ns[i] > 4) {
1230       tmp_vec1[j] = 4;
1231       ++j;
1232       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1233     } else {
1234       tmp_vec1[j] = ns[i];
1235     }
1236   }
1237   /* Use the correct node_max */
1238   node_max = j;
1239 
1240   /* Now reorder the inode info based on mat re-ordering info */
1241   /* First create a row -> inode_size_array_index map */
1242   PetscCall(PetscMalloc1(n + 1, &nsmap));
1243   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1244   for (i = 0, row = 0; i < node_max; i++) {
1245     nodesz = tmp_vec1[i];
1246     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1247   }
1248   /* Using nsmap, create a reordered ns structure */
1249   for (i = 0, j = 0; i < node_max; i++) {
1250     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1251     tmp_vec2[i] = nodesz;
1252     j += nodesz;
1253   }
1254   PetscCall(PetscFree(nsmap));
1255   PetscCall(PetscFree(tmp_vec1));
1256 
1257   /* Now use the correct ns */
1258   ns = tmp_vec2;
1259 
1260   do {
1261     sctx.newshift = PETSC_FALSE;
1262     /* Now loop over each block-row, and do the factorization */
1263     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1264       nodesz = ns[inod];
1265 
1266       switch (nodesz) {
1267       case 1:
1268         /*----------*/
1269         /* zero rtmp1 */
1270         /* L part */
1271         nz    = bi[i + 1] - bi[i];
1272         bjtmp = bj + bi[i];
1273         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1274 
1275         /* U part */
1276         nz    = bdiag[i] - bdiag[i + 1];
1277         bjtmp = bj + bdiag[i + 1] + 1;
1278         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1279 
1280         /* load in initial (unfactored row) */
1281         nz    = ai[r[i] + 1] - ai[r[i]];
1282         ajtmp = aj + ai[r[i]];
1283         v     = aa + ai[r[i]];
1284         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1285 
1286         /* ZeropivotApply() */
1287         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1288 
1289         /* elimination */
1290         bjtmp = bj + bi[i];
1291         row   = *bjtmp++;
1292         nzL   = bi[i + 1] - bi[i];
1293         for (k = 0; k < nzL; k++) {
1294           pc = rtmp1 + row;
1295           if (*pc != 0.0) {
1296             pv   = b->a + bdiag[row];
1297             mul1 = *pc * (*pv);
1298             *pc  = mul1;
1299             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1300             pv   = b->a + bdiag[row + 1] + 1;
1301             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1302             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1303             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1304           }
1305           row = *bjtmp++;
1306         }
1307 
1308         /* finished row so stick it into b->a */
1309         rs = 0.0;
1310         /* L part */
1311         pv = b->a + bi[i];
1312         pj = b->j + bi[i];
1313         nz = bi[i + 1] - bi[i];
1314         for (j = 0; j < nz; j++) {
1315           pv[j] = rtmp1[pj[j]];
1316           rs += PetscAbsScalar(pv[j]);
1317         }
1318 
1319         /* U part */
1320         pv = b->a + bdiag[i + 1] + 1;
1321         pj = b->j + bdiag[i + 1] + 1;
1322         nz = bdiag[i] - bdiag[i + 1] - 1;
1323         for (j = 0; j < nz; j++) {
1324           pv[j] = rtmp1[pj[j]];
1325           rs += PetscAbsScalar(pv[j]);
1326         }
1327 
1328         /* Check zero pivot */
1329         sctx.rs = rs;
1330         sctx.pv = rtmp1[i];
1331         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1332         if (sctx.newshift) break;
1333 
1334         /* Mark diagonal and invert diagonal for simpler triangular solves */
1335         pv  = b->a + bdiag[i];
1336         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1337         break;
1338 
1339       case 2:
1340         /*----------*/
1341         /* zero rtmp1 and rtmp2 */
1342         /* L part */
1343         nz    = bi[i + 1] - bi[i];
1344         bjtmp = bj + bi[i];
1345         for (j = 0; j < nz; j++) {
1346           col        = bjtmp[j];
1347           rtmp1[col] = 0.0;
1348           rtmp2[col] = 0.0;
1349         }
1350 
1351         /* U part */
1352         nz    = bdiag[i] - bdiag[i + 1];
1353         bjtmp = bj + bdiag[i + 1] + 1;
1354         for (j = 0; j < nz; j++) {
1355           col        = bjtmp[j];
1356           rtmp1[col] = 0.0;
1357           rtmp2[col] = 0.0;
1358         }
1359 
1360         /* load in initial (unfactored row) */
1361         nz    = ai[r[i] + 1] - ai[r[i]];
1362         ajtmp = aj + ai[r[i]];
1363         v1    = aa + ai[r[i]];
1364         v2    = aa + ai[r[i] + 1];
1365         for (j = 0; j < nz; j++) {
1366           col        = ics[ajtmp[j]];
1367           rtmp1[col] = v1[j];
1368           rtmp2[col] = v2[j];
1369         }
1370         /* ZeropivotApply(): shift the diagonal of the matrix  */
1371         rtmp1[i] += sctx.shift_amount;
1372         rtmp2[i + 1] += sctx.shift_amount;
1373 
1374         /* elimination */
1375         bjtmp = bj + bi[i];
1376         row   = *bjtmp++; /* pivot row */
1377         nzL   = bi[i + 1] - bi[i];
1378         for (k = 0; k < nzL; k++) {
1379           pc1 = rtmp1 + row;
1380           pc2 = rtmp2 + row;
1381           if (*pc1 != 0.0 || *pc2 != 0.0) {
1382             pv   = b->a + bdiag[row];
1383             mul1 = *pc1 * (*pv);
1384             mul2 = *pc2 * (*pv);
1385             *pc1 = mul1;
1386             *pc2 = mul2;
1387 
1388             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1389             pv = b->a + bdiag[row + 1] + 1;
1390             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1391             for (j = 0; j < nz; j++) {
1392               col = pj[j];
1393               rtmp1[col] -= mul1 * pv[j];
1394               rtmp2[col] -= mul2 * pv[j];
1395             }
1396             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1397           }
1398           row = *bjtmp++;
1399         }
1400 
1401         /* finished row i; check zero pivot, then stick row i into b->a */
1402         rs  = 0.0;
1403         /* L part */
1404         pc1 = b->a + bi[i];
1405         pj  = b->j + bi[i];
1406         nz  = bi[i + 1] - bi[i];
1407         for (j = 0; j < nz; j++) {
1408           col    = pj[j];
1409           pc1[j] = rtmp1[col];
1410           rs += PetscAbsScalar(pc1[j]);
1411         }
1412         /* U part */
1413         pc1 = b->a + bdiag[i + 1] + 1;
1414         pj  = b->j + bdiag[i + 1] + 1;
1415         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1416         for (j = 0; j < nz; j++) {
1417           col    = pj[j];
1418           pc1[j] = rtmp1[col];
1419           rs += PetscAbsScalar(pc1[j]);
1420         }
1421 
1422         sctx.rs = rs;
1423         sctx.pv = rtmp1[i];
1424         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1425         if (sctx.newshift) break;
1426         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1427         *pc1 = 1.0 / sctx.pv;
1428 
1429         /* Now take care of diagonal 2x2 block. */
1430         pc2 = rtmp2 + i;
1431         if (*pc2 != 0.0) {
1432           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1433           *pc2 = mul1;                        /* insert L entry */
1434           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1435           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1436           for (j = 0; j < nz; j++) {
1437             col = pj[j];
1438             rtmp2[col] -= mul1 * rtmp1[col];
1439           }
1440           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1441         }
1442 
1443         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1444         rs  = 0.0;
1445         /* L part */
1446         pc2 = b->a + bi[i + 1];
1447         pj  = b->j + bi[i + 1];
1448         nz  = bi[i + 2] - bi[i + 1];
1449         for (j = 0; j < nz; j++) {
1450           col    = pj[j];
1451           pc2[j] = rtmp2[col];
1452           rs += PetscAbsScalar(pc2[j]);
1453         }
1454         /* U part */
1455         pc2 = b->a + bdiag[i + 2] + 1;
1456         pj  = b->j + bdiag[i + 2] + 1;
1457         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1458         for (j = 0; j < nz; j++) {
1459           col    = pj[j];
1460           pc2[j] = rtmp2[col];
1461           rs += PetscAbsScalar(pc2[j]);
1462         }
1463 
1464         sctx.rs = rs;
1465         sctx.pv = rtmp2[i + 1];
1466         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1467         if (sctx.newshift) break;
1468         pc2  = b->a + bdiag[i + 1];
1469         *pc2 = 1.0 / sctx.pv;
1470         break;
1471 
1472       case 3:
1473         /*----------*/
1474         /* zero rtmp */
1475         /* L part */
1476         nz    = bi[i + 1] - bi[i];
1477         bjtmp = bj + bi[i];
1478         for (j = 0; j < nz; j++) {
1479           col        = bjtmp[j];
1480           rtmp1[col] = 0.0;
1481           rtmp2[col] = 0.0;
1482           rtmp3[col] = 0.0;
1483         }
1484 
1485         /* U part */
1486         nz    = bdiag[i] - bdiag[i + 1];
1487         bjtmp = bj + bdiag[i + 1] + 1;
1488         for (j = 0; j < nz; j++) {
1489           col        = bjtmp[j];
1490           rtmp1[col] = 0.0;
1491           rtmp2[col] = 0.0;
1492           rtmp3[col] = 0.0;
1493         }
1494 
1495         /* load in initial (unfactored row) */
1496         nz    = ai[r[i] + 1] - ai[r[i]];
1497         ajtmp = aj + ai[r[i]];
1498         v1    = aa + ai[r[i]];
1499         v2    = aa + ai[r[i] + 1];
1500         v3    = aa + ai[r[i] + 2];
1501         for (j = 0; j < nz; j++) {
1502           col        = ics[ajtmp[j]];
1503           rtmp1[col] = v1[j];
1504           rtmp2[col] = v2[j];
1505           rtmp3[col] = v3[j];
1506         }
1507         /* ZeropivotApply(): shift the diagonal of the matrix  */
1508         rtmp1[i] += sctx.shift_amount;
1509         rtmp2[i + 1] += sctx.shift_amount;
1510         rtmp3[i + 2] += sctx.shift_amount;
1511 
1512         /* elimination */
1513         bjtmp = bj + bi[i];
1514         row   = *bjtmp++; /* pivot row */
1515         nzL   = bi[i + 1] - bi[i];
1516         for (k = 0; k < nzL; k++) {
1517           pc1 = rtmp1 + row;
1518           pc2 = rtmp2 + row;
1519           pc3 = rtmp3 + row;
1520           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1521             pv   = b->a + bdiag[row];
1522             mul1 = *pc1 * (*pv);
1523             mul2 = *pc2 * (*pv);
1524             mul3 = *pc3 * (*pv);
1525             *pc1 = mul1;
1526             *pc2 = mul2;
1527             *pc3 = mul3;
1528 
1529             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1530             pv = b->a + bdiag[row + 1] + 1;
1531             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1532             for (j = 0; j < nz; j++) {
1533               col = pj[j];
1534               rtmp1[col] -= mul1 * pv[j];
1535               rtmp2[col] -= mul2 * pv[j];
1536               rtmp3[col] -= mul3 * pv[j];
1537             }
1538             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1539           }
1540           row = *bjtmp++;
1541         }
1542 
1543         /* finished row i; check zero pivot, then stick row i into b->a */
1544         rs  = 0.0;
1545         /* L part */
1546         pc1 = b->a + bi[i];
1547         pj  = b->j + bi[i];
1548         nz  = bi[i + 1] - bi[i];
1549         for (j = 0; j < nz; j++) {
1550           col    = pj[j];
1551           pc1[j] = rtmp1[col];
1552           rs += PetscAbsScalar(pc1[j]);
1553         }
1554         /* U part */
1555         pc1 = b->a + bdiag[i + 1] + 1;
1556         pj  = b->j + bdiag[i + 1] + 1;
1557         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1558         for (j = 0; j < nz; j++) {
1559           col    = pj[j];
1560           pc1[j] = rtmp1[col];
1561           rs += PetscAbsScalar(pc1[j]);
1562         }
1563 
1564         sctx.rs = rs;
1565         sctx.pv = rtmp1[i];
1566         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1567         if (sctx.newshift) break;
1568         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1569         *pc1 = 1.0 / sctx.pv;
1570 
1571         /* Now take care of 1st column of diagonal 3x3 block. */
1572         pc2 = rtmp2 + i;
1573         pc3 = rtmp3 + i;
1574         if (*pc2 != 0.0 || *pc3 != 0.0) {
1575           mul2 = (*pc2) * (*pc1);
1576           *pc2 = mul2;
1577           mul3 = (*pc3) * (*pc1);
1578           *pc3 = mul3;
1579           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1580           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1581           for (j = 0; j < nz; j++) {
1582             col = pj[j];
1583             rtmp2[col] -= mul2 * rtmp1[col];
1584             rtmp3[col] -= mul3 * rtmp1[col];
1585           }
1586           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1587         }
1588 
1589         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1590         rs  = 0.0;
1591         /* L part */
1592         pc2 = b->a + bi[i + 1];
1593         pj  = b->j + bi[i + 1];
1594         nz  = bi[i + 2] - bi[i + 1];
1595         for (j = 0; j < nz; j++) {
1596           col    = pj[j];
1597           pc2[j] = rtmp2[col];
1598           rs += PetscAbsScalar(pc2[j]);
1599         }
1600         /* U part */
1601         pc2 = b->a + bdiag[i + 2] + 1;
1602         pj  = b->j + bdiag[i + 2] + 1;
1603         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1604         for (j = 0; j < nz; j++) {
1605           col    = pj[j];
1606           pc2[j] = rtmp2[col];
1607           rs += PetscAbsScalar(pc2[j]);
1608         }
1609 
1610         sctx.rs = rs;
1611         sctx.pv = rtmp2[i + 1];
1612         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1613         if (sctx.newshift) break;
1614         pc2  = b->a + bdiag[i + 1];
1615         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1616 
1617         /* Now take care of 2nd column of diagonal 3x3 block. */
1618         pc3 = rtmp3 + i + 1;
1619         if (*pc3 != 0.0) {
1620           mul3 = (*pc3) * (*pc2);
1621           *pc3 = mul3;
1622           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1623           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1624           for (j = 0; j < nz; j++) {
1625             col = pj[j];
1626             rtmp3[col] -= mul3 * rtmp2[col];
1627           }
1628           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1629         }
1630 
1631         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1632         rs  = 0.0;
1633         /* L part */
1634         pc3 = b->a + bi[i + 2];
1635         pj  = b->j + bi[i + 2];
1636         nz  = bi[i + 3] - bi[i + 2];
1637         for (j = 0; j < nz; j++) {
1638           col    = pj[j];
1639           pc3[j] = rtmp3[col];
1640           rs += PetscAbsScalar(pc3[j]);
1641         }
1642         /* U part */
1643         pc3 = b->a + bdiag[i + 3] + 1;
1644         pj  = b->j + bdiag[i + 3] + 1;
1645         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1646         for (j = 0; j < nz; j++) {
1647           col    = pj[j];
1648           pc3[j] = rtmp3[col];
1649           rs += PetscAbsScalar(pc3[j]);
1650         }
1651 
1652         sctx.rs = rs;
1653         sctx.pv = rtmp3[i + 2];
1654         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1655         if (sctx.newshift) break;
1656         pc3  = b->a + bdiag[i + 2];
1657         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1658         break;
1659       case 4:
1660         /*----------*/
1661         /* zero rtmp */
1662         /* L part */
1663         nz    = bi[i + 1] - bi[i];
1664         bjtmp = bj + bi[i];
1665         for (j = 0; j < nz; j++) {
1666           col        = bjtmp[j];
1667           rtmp1[col] = 0.0;
1668           rtmp2[col] = 0.0;
1669           rtmp3[col] = 0.0;
1670           rtmp4[col] = 0.0;
1671         }
1672 
1673         /* U part */
1674         nz    = bdiag[i] - bdiag[i + 1];
1675         bjtmp = bj + bdiag[i + 1] + 1;
1676         for (j = 0; j < nz; j++) {
1677           col        = bjtmp[j];
1678           rtmp1[col] = 0.0;
1679           rtmp2[col] = 0.0;
1680           rtmp3[col] = 0.0;
1681           rtmp4[col] = 0.0;
1682         }
1683 
1684         /* load in initial (unfactored row) */
1685         nz    = ai[r[i] + 1] - ai[r[i]];
1686         ajtmp = aj + ai[r[i]];
1687         v1    = aa + ai[r[i]];
1688         v2    = aa + ai[r[i] + 1];
1689         v3    = aa + ai[r[i] + 2];
1690         v4    = aa + ai[r[i] + 3];
1691         for (j = 0; j < nz; j++) {
1692           col        = ics[ajtmp[j]];
1693           rtmp1[col] = v1[j];
1694           rtmp2[col] = v2[j];
1695           rtmp3[col] = v3[j];
1696           rtmp4[col] = v4[j];
1697         }
1698         /* ZeropivotApply(): shift the diagonal of the matrix  */
1699         rtmp1[i] += sctx.shift_amount;
1700         rtmp2[i + 1] += sctx.shift_amount;
1701         rtmp3[i + 2] += sctx.shift_amount;
1702         rtmp4[i + 3] += sctx.shift_amount;
1703 
1704         /* elimination */
1705         bjtmp = bj + bi[i];
1706         row   = *bjtmp++; /* pivot row */
1707         nzL   = bi[i + 1] - bi[i];
1708         for (k = 0; k < nzL; k++) {
1709           pc1 = rtmp1 + row;
1710           pc2 = rtmp2 + row;
1711           pc3 = rtmp3 + row;
1712           pc4 = rtmp4 + row;
1713           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1714             pv   = b->a + bdiag[row];
1715             mul1 = *pc1 * (*pv);
1716             mul2 = *pc2 * (*pv);
1717             mul3 = *pc3 * (*pv);
1718             mul4 = *pc4 * (*pv);
1719             *pc1 = mul1;
1720             *pc2 = mul2;
1721             *pc3 = mul3;
1722             *pc4 = mul4;
1723 
1724             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1725             pv = b->a + bdiag[row + 1] + 1;
1726             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1727             for (j = 0; j < nz; j++) {
1728               col = pj[j];
1729               rtmp1[col] -= mul1 * pv[j];
1730               rtmp2[col] -= mul2 * pv[j];
1731               rtmp3[col] -= mul3 * pv[j];
1732               rtmp4[col] -= mul4 * pv[j];
1733             }
1734             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1735           }
1736           row = *bjtmp++;
1737         }
1738 
1739         /* finished row i; check zero pivot, then stick row i into b->a */
1740         rs  = 0.0;
1741         /* L part */
1742         pc1 = b->a + bi[i];
1743         pj  = b->j + bi[i];
1744         nz  = bi[i + 1] - bi[i];
1745         for (j = 0; j < nz; j++) {
1746           col    = pj[j];
1747           pc1[j] = rtmp1[col];
1748           rs += PetscAbsScalar(pc1[j]);
1749         }
1750         /* U part */
1751         pc1 = b->a + bdiag[i + 1] + 1;
1752         pj  = b->j + bdiag[i + 1] + 1;
1753         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1754         for (j = 0; j < nz; j++) {
1755           col    = pj[j];
1756           pc1[j] = rtmp1[col];
1757           rs += PetscAbsScalar(pc1[j]);
1758         }
1759 
1760         sctx.rs = rs;
1761         sctx.pv = rtmp1[i];
1762         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1763         if (sctx.newshift) break;
1764         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1765         *pc1 = 1.0 / sctx.pv;
1766 
1767         /* Now take care of 1st column of diagonal 4x4 block. */
1768         pc2 = rtmp2 + i;
1769         pc3 = rtmp3 + i;
1770         pc4 = rtmp4 + i;
1771         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1772           mul2 = (*pc2) * (*pc1);
1773           *pc2 = mul2;
1774           mul3 = (*pc3) * (*pc1);
1775           *pc3 = mul3;
1776           mul4 = (*pc4) * (*pc1);
1777           *pc4 = mul4;
1778           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1779           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1780           for (j = 0; j < nz; j++) {
1781             col = pj[j];
1782             rtmp2[col] -= mul2 * rtmp1[col];
1783             rtmp3[col] -= mul3 * rtmp1[col];
1784             rtmp4[col] -= mul4 * rtmp1[col];
1785           }
1786           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1787         }
1788 
1789         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1790         rs  = 0.0;
1791         /* L part */
1792         pc2 = b->a + bi[i + 1];
1793         pj  = b->j + bi[i + 1];
1794         nz  = bi[i + 2] - bi[i + 1];
1795         for (j = 0; j < nz; j++) {
1796           col    = pj[j];
1797           pc2[j] = rtmp2[col];
1798           rs += PetscAbsScalar(pc2[j]);
1799         }
1800         /* U part */
1801         pc2 = b->a + bdiag[i + 2] + 1;
1802         pj  = b->j + bdiag[i + 2] + 1;
1803         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1804         for (j = 0; j < nz; j++) {
1805           col    = pj[j];
1806           pc2[j] = rtmp2[col];
1807           rs += PetscAbsScalar(pc2[j]);
1808         }
1809 
1810         sctx.rs = rs;
1811         sctx.pv = rtmp2[i + 1];
1812         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1813         if (sctx.newshift) break;
1814         pc2  = b->a + bdiag[i + 1];
1815         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1816 
1817         /* Now take care of 2nd column of diagonal 4x4 block. */
1818         pc3 = rtmp3 + i + 1;
1819         pc4 = rtmp4 + i + 1;
1820         if (*pc3 != 0.0 || *pc4 != 0.0) {
1821           mul3 = (*pc3) * (*pc2);
1822           *pc3 = mul3;
1823           mul4 = (*pc4) * (*pc2);
1824           *pc4 = mul4;
1825           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1826           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1827           for (j = 0; j < nz; j++) {
1828             col = pj[j];
1829             rtmp3[col] -= mul3 * rtmp2[col];
1830             rtmp4[col] -= mul4 * rtmp2[col];
1831           }
1832           PetscCall(PetscLogFlops(4.0 * nz));
1833         }
1834 
1835         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1836         rs  = 0.0;
1837         /* L part */
1838         pc3 = b->a + bi[i + 2];
1839         pj  = b->j + bi[i + 2];
1840         nz  = bi[i + 3] - bi[i + 2];
1841         for (j = 0; j < nz; j++) {
1842           col    = pj[j];
1843           pc3[j] = rtmp3[col];
1844           rs += PetscAbsScalar(pc3[j]);
1845         }
1846         /* U part */
1847         pc3 = b->a + bdiag[i + 3] + 1;
1848         pj  = b->j + bdiag[i + 3] + 1;
1849         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1850         for (j = 0; j < nz; j++) {
1851           col    = pj[j];
1852           pc3[j] = rtmp3[col];
1853           rs += PetscAbsScalar(pc3[j]);
1854         }
1855 
1856         sctx.rs = rs;
1857         sctx.pv = rtmp3[i + 2];
1858         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1859         if (sctx.newshift) break;
1860         pc3  = b->a + bdiag[i + 2];
1861         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1862 
1863         /* Now take care of 3rd column of diagonal 4x4 block. */
1864         pc4 = rtmp4 + i + 2;
1865         if (*pc4 != 0.0) {
1866           mul4 = (*pc4) * (*pc3);
1867           *pc4 = mul4;
1868           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1869           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1870           for (j = 0; j < nz; j++) {
1871             col = pj[j];
1872             rtmp4[col] -= mul4 * rtmp3[col];
1873           }
1874           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1875         }
1876 
1877         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1878         rs  = 0.0;
1879         /* L part */
1880         pc4 = b->a + bi[i + 3];
1881         pj  = b->j + bi[i + 3];
1882         nz  = bi[i + 4] - bi[i + 3];
1883         for (j = 0; j < nz; j++) {
1884           col    = pj[j];
1885           pc4[j] = rtmp4[col];
1886           rs += PetscAbsScalar(pc4[j]);
1887         }
1888         /* U part */
1889         pc4 = b->a + bdiag[i + 4] + 1;
1890         pj  = b->j + bdiag[i + 4] + 1;
1891         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1892         for (j = 0; j < nz; j++) {
1893           col    = pj[j];
1894           pc4[j] = rtmp4[col];
1895           rs += PetscAbsScalar(pc4[j]);
1896         }
1897 
1898         sctx.rs = rs;
1899         sctx.pv = rtmp4[i + 3];
1900         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1901         if (sctx.newshift) break;
1902         pc4  = b->a + bdiag[i + 3];
1903         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1904         break;
1905 
1906       default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1907       }
1908       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1909       i += nodesz;              /* Update the row */
1910     }
1911 
1912     /* MatPivotRefine() */
1913     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1914       /*
1915        * if no shift in this attempt & shifting & started shifting & can refine,
1916        * then try lower shift
1917        */
1918       sctx.shift_hi       = sctx.shift_fraction;
1919       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1920       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1921       sctx.newshift       = PETSC_TRUE;
1922       sctx.nshift++;
1923     }
1924   } while (sctx.newshift);
1925 
1926   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1927   PetscCall(PetscFree(tmp_vec2));
1928   PetscCall(ISRestoreIndices(isicol, &ic));
1929   PetscCall(ISRestoreIndices(isrow, &r));
1930 
1931   if (b->inode.size) {
1932     C->ops->solve = MatSolve_SeqAIJ_Inode;
1933   } else {
1934     C->ops->solve = MatSolve_SeqAIJ;
1935   }
1936   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1937   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1938   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1939   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1940   C->assembled              = PETSC_TRUE;
1941   C->preallocated           = PETSC_TRUE;
1942 
1943   PetscCall(PetscLogFlops(C->cmap->n));
1944 
1945   /* MatShiftView(A,info,&sctx) */
1946   if (sctx.nshift) {
1947     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1948       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1949     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1950       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1951     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1952       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1953     }
1954   }
1955   PetscFunctionReturn(0);
1956 }
1957 
1958 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) {
1959   Mat              C = B;
1960   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1961   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1962   const PetscInt  *r, *ic, *c, *ics;
1963   PetscInt         n = A->rmap->n, *bi = b->i;
1964   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1965   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1966   PetscInt        *ai = a->i, *aj = a->j;
1967   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1968   PetscScalar      mul1, mul2, mul3, tmp;
1969   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1970   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1971   PetscReal        rs = 0.0;
1972   FactorShiftCtx   sctx;
1973 
1974   PetscFunctionBegin;
1975   sctx.shift_top      = 0;
1976   sctx.nshift_max     = 0;
1977   sctx.shift_lo       = 0;
1978   sctx.shift_hi       = 0;
1979   sctx.shift_fraction = 0;
1980 
1981   /* if both shift schemes are chosen by user, only use info->shiftpd */
1982   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1983     sctx.shift_top = 0;
1984     for (i = 0; i < n; i++) {
1985       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1986       rs    = 0.0;
1987       ajtmp = aj + ai[i];
1988       rtmp1 = aa + ai[i];
1989       nz    = ai[i + 1] - ai[i];
1990       for (j = 0; j < nz; j++) {
1991         if (*ajtmp != i) {
1992           rs += PetscAbsScalar(*rtmp1++);
1993         } else {
1994           rs -= PetscRealPart(*rtmp1++);
1995         }
1996         ajtmp++;
1997       }
1998       if (rs > sctx.shift_top) sctx.shift_top = rs;
1999     }
2000     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2001     sctx.shift_top *= 1.1;
2002     sctx.nshift_max = 5;
2003     sctx.shift_lo   = 0.;
2004     sctx.shift_hi   = 1.;
2005   }
2006   sctx.shift_amount = 0;
2007   sctx.nshift       = 0;
2008 
2009   PetscCall(ISGetIndices(isrow, &r));
2010   PetscCall(ISGetIndices(iscol, &c));
2011   PetscCall(ISGetIndices(isicol, &ic));
2012   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2013   ics = ic;
2014 
2015   node_max = a->inode.node_count;
2016   ns       = a->inode.size;
2017   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2018 
2019   /* If max inode size > 3, split it into two inodes.*/
2020   /* also map the inode sizes according to the ordering */
2021   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2022   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2023     if (ns[i] > 3) {
2024       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2025       ++j;
2026       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2027     } else {
2028       tmp_vec1[j] = ns[i];
2029     }
2030   }
2031   /* Use the correct node_max */
2032   node_max = j;
2033 
2034   /* Now reorder the inode info based on mat re-ordering info */
2035   /* First create a row -> inode_size_array_index map */
2036   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2037   for (i = 0, row = 0; i < node_max; i++) {
2038     nodesz = tmp_vec1[i];
2039     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2040   }
2041   /* Using nsmap, create a reordered ns structure */
2042   for (i = 0, j = 0; i < node_max; i++) {
2043     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2044     tmp_vec2[i] = nodesz;
2045     j += nodesz;
2046   }
2047   PetscCall(PetscFree2(nsmap, tmp_vec1));
2048   /* Now use the correct ns */
2049   ns = tmp_vec2;
2050 
2051   do {
2052     sctx.newshift = PETSC_FALSE;
2053     /* Now loop over each block-row, and do the factorization */
2054     for (i = 0, row = 0; i < node_max; i++) {
2055       nodesz = ns[i];
2056       nz     = bi[row + 1] - bi[row];
2057       bjtmp  = bj + bi[row];
2058 
2059       switch (nodesz) {
2060       case 1:
2061         for (j = 0; j < nz; j++) {
2062           idx         = bjtmp[j];
2063           rtmp11[idx] = 0.0;
2064         }
2065 
2066         /* load in initial (unfactored row) */
2067         idx    = r[row];
2068         nz_tmp = ai[idx + 1] - ai[idx];
2069         ajtmp  = aj + ai[idx];
2070         v1     = aa + ai[idx];
2071 
2072         for (j = 0; j < nz_tmp; j++) {
2073           idx         = ics[ajtmp[j]];
2074           rtmp11[idx] = v1[j];
2075         }
2076         rtmp11[ics[r[row]]] += sctx.shift_amount;
2077 
2078         prow = *bjtmp++;
2079         while (prow < row) {
2080           pc1 = rtmp11 + prow;
2081           if (*pc1 != 0.0) {
2082             pv     = ba + bd[prow];
2083             pj     = nbj + bd[prow];
2084             mul1   = *pc1 * *pv++;
2085             *pc1   = mul1;
2086             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2087             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2088             for (j = 0; j < nz_tmp; j++) {
2089               tmp = pv[j];
2090               idx = pj[j];
2091               rtmp11[idx] -= mul1 * tmp;
2092             }
2093           }
2094           prow = *bjtmp++;
2095         }
2096         pj  = bj + bi[row];
2097         pc1 = ba + bi[row];
2098 
2099         sctx.pv     = rtmp11[row];
2100         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2101         rs          = 0.0;
2102         for (j = 0; j < nz; j++) {
2103           idx    = pj[j];
2104           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2105           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2106         }
2107         sctx.rs = rs;
2108         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2109         if (sctx.newshift) goto endofwhile;
2110         break;
2111 
2112       case 2:
2113         for (j = 0; j < nz; j++) {
2114           idx         = bjtmp[j];
2115           rtmp11[idx] = 0.0;
2116           rtmp22[idx] = 0.0;
2117         }
2118 
2119         /* load in initial (unfactored row) */
2120         idx    = r[row];
2121         nz_tmp = ai[idx + 1] - ai[idx];
2122         ajtmp  = aj + ai[idx];
2123         v1     = aa + ai[idx];
2124         v2     = aa + ai[idx + 1];
2125         for (j = 0; j < nz_tmp; j++) {
2126           idx         = ics[ajtmp[j]];
2127           rtmp11[idx] = v1[j];
2128           rtmp22[idx] = v2[j];
2129         }
2130         rtmp11[ics[r[row]]] += sctx.shift_amount;
2131         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2132 
2133         prow = *bjtmp++;
2134         while (prow < row) {
2135           pc1 = rtmp11 + prow;
2136           pc2 = rtmp22 + prow;
2137           if (*pc1 != 0.0 || *pc2 != 0.0) {
2138             pv   = ba + bd[prow];
2139             pj   = nbj + bd[prow];
2140             mul1 = *pc1 * *pv;
2141             mul2 = *pc2 * *pv;
2142             ++pv;
2143             *pc1 = mul1;
2144             *pc2 = mul2;
2145 
2146             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2147             for (j = 0; j < nz_tmp; j++) {
2148               tmp = pv[j];
2149               idx = pj[j];
2150               rtmp11[idx] -= mul1 * tmp;
2151               rtmp22[idx] -= mul2 * tmp;
2152             }
2153             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2154           }
2155           prow = *bjtmp++;
2156         }
2157 
2158         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2159         pc1 = rtmp11 + prow;
2160         pc2 = rtmp22 + prow;
2161 
2162         sctx.pv = *pc1;
2163         pj      = bj + bi[prow];
2164         rs      = 0.0;
2165         for (j = 0; j < nz; j++) {
2166           idx = pj[j];
2167           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2168         }
2169         sctx.rs = rs;
2170         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2171         if (sctx.newshift) goto endofwhile;
2172 
2173         if (*pc2 != 0.0) {
2174           pj     = nbj + bd[prow];
2175           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2176           *pc2   = mul2;
2177           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2178           for (j = 0; j < nz_tmp; j++) {
2179             idx = pj[j];
2180             tmp = rtmp11[idx];
2181             rtmp22[idx] -= mul2 * tmp;
2182           }
2183           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2184         }
2185 
2186         pj  = bj + bi[row];
2187         pc1 = ba + bi[row];
2188         pc2 = ba + bi[row + 1];
2189 
2190         sctx.pv         = rtmp22[row + 1];
2191         rs              = 0.0;
2192         rtmp11[row]     = 1.0 / rtmp11[row];
2193         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2194         /* copy row entries from dense representation to sparse */
2195         for (j = 0; j < nz; j++) {
2196           idx    = pj[j];
2197           pc1[j] = rtmp11[idx];
2198           pc2[j] = rtmp22[idx];
2199           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2200         }
2201         sctx.rs = rs;
2202         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2203         if (sctx.newshift) goto endofwhile;
2204         break;
2205 
2206       case 3:
2207         for (j = 0; j < nz; j++) {
2208           idx         = bjtmp[j];
2209           rtmp11[idx] = 0.0;
2210           rtmp22[idx] = 0.0;
2211           rtmp33[idx] = 0.0;
2212         }
2213         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2214         idx    = r[row];
2215         nz_tmp = ai[idx + 1] - ai[idx];
2216         ajtmp  = aj + ai[idx];
2217         v1     = aa + ai[idx];
2218         v2     = aa + ai[idx + 1];
2219         v3     = aa + ai[idx + 2];
2220         for (j = 0; j < nz_tmp; j++) {
2221           idx         = ics[ajtmp[j]];
2222           rtmp11[idx] = v1[j];
2223           rtmp22[idx] = v2[j];
2224           rtmp33[idx] = v3[j];
2225         }
2226         rtmp11[ics[r[row]]] += sctx.shift_amount;
2227         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2228         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2229 
2230         /* loop over all pivot row blocks above this row block */
2231         prow = *bjtmp++;
2232         while (prow < row) {
2233           pc1 = rtmp11 + prow;
2234           pc2 = rtmp22 + prow;
2235           pc3 = rtmp33 + prow;
2236           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2237             pv   = ba + bd[prow];
2238             pj   = nbj + bd[prow];
2239             mul1 = *pc1 * *pv;
2240             mul2 = *pc2 * *pv;
2241             mul3 = *pc3 * *pv;
2242             ++pv;
2243             *pc1 = mul1;
2244             *pc2 = mul2;
2245             *pc3 = mul3;
2246 
2247             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2248             /* update this row based on pivot row */
2249             for (j = 0; j < nz_tmp; j++) {
2250               tmp = pv[j];
2251               idx = pj[j];
2252               rtmp11[idx] -= mul1 * tmp;
2253               rtmp22[idx] -= mul2 * tmp;
2254               rtmp33[idx] -= mul3 * tmp;
2255             }
2256             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2257           }
2258           prow = *bjtmp++;
2259         }
2260 
2261         /* Now take care of diagonal 3x3 block in this set of rows */
2262         /* note: prow = row here */
2263         pc1 = rtmp11 + prow;
2264         pc2 = rtmp22 + prow;
2265         pc3 = rtmp33 + prow;
2266 
2267         sctx.pv = *pc1;
2268         pj      = bj + bi[prow];
2269         rs      = 0.0;
2270         for (j = 0; j < nz; j++) {
2271           idx = pj[j];
2272           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2273         }
2274         sctx.rs = rs;
2275         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2276         if (sctx.newshift) goto endofwhile;
2277 
2278         if (*pc2 != 0.0 || *pc3 != 0.0) {
2279           mul2   = (*pc2) / (*pc1);
2280           mul3   = (*pc3) / (*pc1);
2281           *pc2   = mul2;
2282           *pc3   = mul3;
2283           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2284           pj     = nbj + bd[prow];
2285           for (j = 0; j < nz_tmp; j++) {
2286             idx = pj[j];
2287             tmp = rtmp11[idx];
2288             rtmp22[idx] -= mul2 * tmp;
2289             rtmp33[idx] -= mul3 * tmp;
2290           }
2291           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2292         }
2293         ++prow;
2294 
2295         pc2     = rtmp22 + prow;
2296         pc3     = rtmp33 + prow;
2297         sctx.pv = *pc2;
2298         pj      = bj + bi[prow];
2299         rs      = 0.0;
2300         for (j = 0; j < nz; j++) {
2301           idx = pj[j];
2302           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2303         }
2304         sctx.rs = rs;
2305         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2306         if (sctx.newshift) goto endofwhile;
2307 
2308         if (*pc3 != 0.0) {
2309           mul3   = (*pc3) / (*pc2);
2310           *pc3   = mul3;
2311           pj     = nbj + bd[prow];
2312           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2313           for (j = 0; j < nz_tmp; j++) {
2314             idx = pj[j];
2315             tmp = rtmp22[idx];
2316             rtmp33[idx] -= mul3 * tmp;
2317           }
2318           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2319         }
2320 
2321         pj  = bj + bi[row];
2322         pc1 = ba + bi[row];
2323         pc2 = ba + bi[row + 1];
2324         pc3 = ba + bi[row + 2];
2325 
2326         sctx.pv         = rtmp33[row + 2];
2327         rs              = 0.0;
2328         rtmp11[row]     = 1.0 / rtmp11[row];
2329         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2330         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2331         /* copy row entries from dense representation to sparse */
2332         for (j = 0; j < nz; j++) {
2333           idx    = pj[j];
2334           pc1[j] = rtmp11[idx];
2335           pc2[j] = rtmp22[idx];
2336           pc3[j] = rtmp33[idx];
2337           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2338         }
2339 
2340         sctx.rs = rs;
2341         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2342         if (sctx.newshift) goto endofwhile;
2343         break;
2344 
2345       default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2346       }
2347       row += nodesz; /* Update the row */
2348     }
2349   endofwhile:;
2350   } while (sctx.newshift);
2351   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2352   PetscCall(PetscFree(tmp_vec2));
2353   PetscCall(ISRestoreIndices(isicol, &ic));
2354   PetscCall(ISRestoreIndices(isrow, &r));
2355   PetscCall(ISRestoreIndices(iscol, &c));
2356 
2357   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2358   /* do not set solve add, since MatSolve_Inode + Add is faster */
2359   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2360   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2361   C->assembled              = PETSC_TRUE;
2362   C->preallocated           = PETSC_TRUE;
2363   if (sctx.nshift) {
2364     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2365       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2366     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2367       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2368     }
2369   }
2370   PetscCall(PetscLogFlops(C->cmap->n));
2371   PetscCall(MatSeqAIJCheckInode(C));
2372   PetscFunctionReturn(0);
2373 }
2374 
2375 /* ----------------------------------------------------------- */
2376 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) {
2377   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2378   IS                 iscol = a->col, isrow = a->row;
2379   const PetscInt    *r, *c, *rout, *cout;
2380   PetscInt           i, j, n = A->rmap->n;
2381   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2382   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2383   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2384   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2385   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2386   const PetscScalar *b;
2387 
2388   PetscFunctionBegin;
2389   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2390   node_max = a->inode.node_count;
2391   ns       = a->inode.size; /* Node Size array */
2392 
2393   PetscCall(VecGetArrayRead(bb, &b));
2394   PetscCall(VecGetArrayWrite(xx, &x));
2395   tmp = a->solve_work;
2396 
2397   PetscCall(ISGetIndices(isrow, &rout));
2398   r = rout;
2399   PetscCall(ISGetIndices(iscol, &cout));
2400   c = cout;
2401 
2402   /* forward solve the lower triangular */
2403   tmps = tmp;
2404   aa   = a_a;
2405   aj   = a_j;
2406   ad   = a->diag;
2407 
2408   for (i = 0, row = 0; i < node_max; ++i) {
2409     nsz = ns[i];
2410     aii = ai[row];
2411     v1  = aa + aii;
2412     vi  = aj + aii;
2413     nz  = ai[row + 1] - ai[row];
2414 
2415     if (i < node_max - 1) {
2416       /* Prefetch the indices for the next block */
2417       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2418       /* Prefetch the data for the next block */
2419       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2420     }
2421 
2422     switch (nsz) { /* Each loop in 'case' is unrolled */
2423     case 1:
2424       sum1 = b[r[row]];
2425       for (j = 0; j < nz - 1; j += 2) {
2426         i0   = vi[j];
2427         i1   = vi[j + 1];
2428         tmp0 = tmps[i0];
2429         tmp1 = tmps[i1];
2430         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2431       }
2432       if (j == nz - 1) {
2433         tmp0 = tmps[vi[j]];
2434         sum1 -= v1[j] * tmp0;
2435       }
2436       tmp[row++] = sum1;
2437       break;
2438     case 2:
2439       sum1 = b[r[row]];
2440       sum2 = b[r[row + 1]];
2441       v2   = aa + ai[row + 1];
2442 
2443       for (j = 0; j < nz - 1; j += 2) {
2444         i0   = vi[j];
2445         i1   = vi[j + 1];
2446         tmp0 = tmps[i0];
2447         tmp1 = tmps[i1];
2448         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2449         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2450       }
2451       if (j == nz - 1) {
2452         tmp0 = tmps[vi[j]];
2453         sum1 -= v1[j] * tmp0;
2454         sum2 -= v2[j] * tmp0;
2455       }
2456       sum2 -= v2[nz] * sum1;
2457       tmp[row++] = sum1;
2458       tmp[row++] = sum2;
2459       break;
2460     case 3:
2461       sum1 = b[r[row]];
2462       sum2 = b[r[row + 1]];
2463       sum3 = b[r[row + 2]];
2464       v2   = aa + ai[row + 1];
2465       v3   = aa + ai[row + 2];
2466 
2467       for (j = 0; j < nz - 1; j += 2) {
2468         i0   = vi[j];
2469         i1   = vi[j + 1];
2470         tmp0 = tmps[i0];
2471         tmp1 = tmps[i1];
2472         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2473         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2474         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2475       }
2476       if (j == nz - 1) {
2477         tmp0 = tmps[vi[j]];
2478         sum1 -= v1[j] * tmp0;
2479         sum2 -= v2[j] * tmp0;
2480         sum3 -= v3[j] * tmp0;
2481       }
2482       sum2 -= v2[nz] * sum1;
2483       sum3 -= v3[nz] * sum1;
2484       sum3 -= v3[nz + 1] * sum2;
2485       tmp[row++] = sum1;
2486       tmp[row++] = sum2;
2487       tmp[row++] = sum3;
2488       break;
2489 
2490     case 4:
2491       sum1 = b[r[row]];
2492       sum2 = b[r[row + 1]];
2493       sum3 = b[r[row + 2]];
2494       sum4 = b[r[row + 3]];
2495       v2   = aa + ai[row + 1];
2496       v3   = aa + ai[row + 2];
2497       v4   = aa + ai[row + 3];
2498 
2499       for (j = 0; j < nz - 1; j += 2) {
2500         i0   = vi[j];
2501         i1   = vi[j + 1];
2502         tmp0 = tmps[i0];
2503         tmp1 = tmps[i1];
2504         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2505         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2506         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2507         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2508       }
2509       if (j == nz - 1) {
2510         tmp0 = tmps[vi[j]];
2511         sum1 -= v1[j] * tmp0;
2512         sum2 -= v2[j] * tmp0;
2513         sum3 -= v3[j] * tmp0;
2514         sum4 -= v4[j] * tmp0;
2515       }
2516       sum2 -= v2[nz] * sum1;
2517       sum3 -= v3[nz] * sum1;
2518       sum4 -= v4[nz] * sum1;
2519       sum3 -= v3[nz + 1] * sum2;
2520       sum4 -= v4[nz + 1] * sum2;
2521       sum4 -= v4[nz + 2] * sum3;
2522 
2523       tmp[row++] = sum1;
2524       tmp[row++] = sum2;
2525       tmp[row++] = sum3;
2526       tmp[row++] = sum4;
2527       break;
2528     case 5:
2529       sum1 = b[r[row]];
2530       sum2 = b[r[row + 1]];
2531       sum3 = b[r[row + 2]];
2532       sum4 = b[r[row + 3]];
2533       sum5 = b[r[row + 4]];
2534       v2   = aa + ai[row + 1];
2535       v3   = aa + ai[row + 2];
2536       v4   = aa + ai[row + 3];
2537       v5   = aa + ai[row + 4];
2538 
2539       for (j = 0; j < nz - 1; j += 2) {
2540         i0   = vi[j];
2541         i1   = vi[j + 1];
2542         tmp0 = tmps[i0];
2543         tmp1 = tmps[i1];
2544         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2545         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2546         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2547         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2548         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2549       }
2550       if (j == nz - 1) {
2551         tmp0 = tmps[vi[j]];
2552         sum1 -= v1[j] * tmp0;
2553         sum2 -= v2[j] * tmp0;
2554         sum3 -= v3[j] * tmp0;
2555         sum4 -= v4[j] * tmp0;
2556         sum5 -= v5[j] * tmp0;
2557       }
2558 
2559       sum2 -= v2[nz] * sum1;
2560       sum3 -= v3[nz] * sum1;
2561       sum4 -= v4[nz] * sum1;
2562       sum5 -= v5[nz] * sum1;
2563       sum3 -= v3[nz + 1] * sum2;
2564       sum4 -= v4[nz + 1] * sum2;
2565       sum5 -= v5[nz + 1] * sum2;
2566       sum4 -= v4[nz + 2] * sum3;
2567       sum5 -= v5[nz + 2] * sum3;
2568       sum5 -= v5[nz + 3] * sum4;
2569 
2570       tmp[row++] = sum1;
2571       tmp[row++] = sum2;
2572       tmp[row++] = sum3;
2573       tmp[row++] = sum4;
2574       tmp[row++] = sum5;
2575       break;
2576     default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2577     }
2578   }
2579   /* backward solve the upper triangular */
2580   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2581     nsz = ns[i];
2582     aii = ad[row + 1] + 1;
2583     v1  = aa + aii;
2584     vi  = aj + aii;
2585     nz  = ad[row] - ad[row + 1] - 1;
2586 
2587     if (i > 0) {
2588       /* Prefetch the indices for the next block */
2589       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2590       /* Prefetch the data for the next block */
2591       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2592     }
2593 
2594     switch (nsz) { /* Each loop in 'case' is unrolled */
2595     case 1:
2596       sum1 = tmp[row];
2597 
2598       for (j = 0; j < nz - 1; j += 2) {
2599         i0   = vi[j];
2600         i1   = vi[j + 1];
2601         tmp0 = tmps[i0];
2602         tmp1 = tmps[i1];
2603         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2604       }
2605       if (j == nz - 1) {
2606         tmp0 = tmps[vi[j]];
2607         sum1 -= v1[j] * tmp0;
2608       }
2609       x[c[row]] = tmp[row] = sum1 * v1[nz];
2610       row--;
2611       break;
2612     case 2:
2613       sum1 = tmp[row];
2614       sum2 = tmp[row - 1];
2615       v2   = aa + ad[row] + 1;
2616       for (j = 0; j < nz - 1; j += 2) {
2617         i0   = vi[j];
2618         i1   = vi[j + 1];
2619         tmp0 = tmps[i0];
2620         tmp1 = tmps[i1];
2621         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2622         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2623       }
2624       if (j == nz - 1) {
2625         tmp0 = tmps[vi[j]];
2626         sum1 -= v1[j] * tmp0;
2627         sum2 -= v2[j + 1] * tmp0;
2628       }
2629 
2630       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2631       row--;
2632       sum2 -= v2[0] * tmp0;
2633       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2634       row--;
2635       break;
2636     case 3:
2637       sum1 = tmp[row];
2638       sum2 = tmp[row - 1];
2639       sum3 = tmp[row - 2];
2640       v2   = aa + ad[row] + 1;
2641       v3   = aa + ad[row - 1] + 1;
2642       for (j = 0; j < nz - 1; j += 2) {
2643         i0   = vi[j];
2644         i1   = vi[j + 1];
2645         tmp0 = tmps[i0];
2646         tmp1 = tmps[i1];
2647         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2648         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2649         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2650       }
2651       if (j == nz - 1) {
2652         tmp0 = tmps[vi[j]];
2653         sum1 -= v1[j] * tmp0;
2654         sum2 -= v2[j + 1] * tmp0;
2655         sum3 -= v3[j + 2] * tmp0;
2656       }
2657       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2658       row--;
2659       sum2 -= v2[0] * tmp0;
2660       sum3 -= v3[1] * tmp0;
2661       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2662       row--;
2663       sum3 -= v3[0] * tmp0;
2664       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2665       row--;
2666 
2667       break;
2668     case 4:
2669       sum1 = tmp[row];
2670       sum2 = tmp[row - 1];
2671       sum3 = tmp[row - 2];
2672       sum4 = tmp[row - 3];
2673       v2   = aa + ad[row] + 1;
2674       v3   = aa + ad[row - 1] + 1;
2675       v4   = aa + ad[row - 2] + 1;
2676 
2677       for (j = 0; j < nz - 1; j += 2) {
2678         i0   = vi[j];
2679         i1   = vi[j + 1];
2680         tmp0 = tmps[i0];
2681         tmp1 = tmps[i1];
2682         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2683         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2684         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2685         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2686       }
2687       if (j == nz - 1) {
2688         tmp0 = tmps[vi[j]];
2689         sum1 -= v1[j] * tmp0;
2690         sum2 -= v2[j + 1] * tmp0;
2691         sum3 -= v3[j + 2] * tmp0;
2692         sum4 -= v4[j + 3] * tmp0;
2693       }
2694 
2695       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2696       row--;
2697       sum2 -= v2[0] * tmp0;
2698       sum3 -= v3[1] * tmp0;
2699       sum4 -= v4[2] * tmp0;
2700       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2701       row--;
2702       sum3 -= v3[0] * tmp0;
2703       sum4 -= v4[1] * tmp0;
2704       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2705       row--;
2706       sum4 -= v4[0] * tmp0;
2707       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2708       row--;
2709       break;
2710     case 5:
2711       sum1 = tmp[row];
2712       sum2 = tmp[row - 1];
2713       sum3 = tmp[row - 2];
2714       sum4 = tmp[row - 3];
2715       sum5 = tmp[row - 4];
2716       v2   = aa + ad[row] + 1;
2717       v3   = aa + ad[row - 1] + 1;
2718       v4   = aa + ad[row - 2] + 1;
2719       v5   = aa + ad[row - 3] + 1;
2720       for (j = 0; j < nz - 1; j += 2) {
2721         i0   = vi[j];
2722         i1   = vi[j + 1];
2723         tmp0 = tmps[i0];
2724         tmp1 = tmps[i1];
2725         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2726         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2727         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2728         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2729         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2730       }
2731       if (j == nz - 1) {
2732         tmp0 = tmps[vi[j]];
2733         sum1 -= v1[j] * tmp0;
2734         sum2 -= v2[j + 1] * tmp0;
2735         sum3 -= v3[j + 2] * tmp0;
2736         sum4 -= v4[j + 3] * tmp0;
2737         sum5 -= v5[j + 4] * tmp0;
2738       }
2739 
2740       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2741       row--;
2742       sum2 -= v2[0] * tmp0;
2743       sum3 -= v3[1] * tmp0;
2744       sum4 -= v4[2] * tmp0;
2745       sum5 -= v5[3] * tmp0;
2746       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2747       row--;
2748       sum3 -= v3[0] * tmp0;
2749       sum4 -= v4[1] * tmp0;
2750       sum5 -= v5[2] * tmp0;
2751       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2752       row--;
2753       sum4 -= v4[0] * tmp0;
2754       sum5 -= v5[1] * tmp0;
2755       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2756       row--;
2757       sum5 -= v5[0] * tmp0;
2758       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2759       row--;
2760       break;
2761     default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2762     }
2763   }
2764   PetscCall(ISRestoreIndices(isrow, &rout));
2765   PetscCall(ISRestoreIndices(iscol, &cout));
2766   PetscCall(VecRestoreArrayRead(bb, &b));
2767   PetscCall(VecRestoreArrayWrite(xx, &x));
2768   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2769   PetscFunctionReturn(0);
2770 }
2771 
2772 /*
2773      Makes a longer coloring[] array and calls the usual code with that
2774 */
2775 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) {
2776   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2777   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2778   PetscInt        *colorused, i;
2779   ISColoringValue *newcolor;
2780 
2781   PetscFunctionBegin;
2782   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2783   PetscCall(PetscMalloc1(n + 1, &newcolor));
2784   /* loop over inodes, marking a color for each column*/
2785   row = 0;
2786   for (i = 0; i < m; i++) {
2787     for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2788   }
2789 
2790   /* eliminate unneeded colors */
2791   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2792   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2793 
2794   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2795   ncolors = colorused[5 * ncolors - 1];
2796   for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2797   PetscCall(PetscFree(colorused));
2798   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2799   PetscCall(PetscFree(coloring));
2800   PetscFunctionReturn(0);
2801 }
2802 
2803 #include <petsc/private/kernels/blockinvert.h>
2804 
2805 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) {
2806   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2807   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2808   MatScalar         *ibdiag, *bdiag, work[25], *t;
2809   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2810   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2811   const PetscScalar *xb, *b;
2812   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2813   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2814   PetscInt           sz, k, ipvt[5];
2815   PetscBool          allowzeropivot, zeropivotdetected;
2816   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
2817 
2818   PetscFunctionBegin;
2819   allowzeropivot = PetscNot(A->erroriffailure);
2820   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2821   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2822   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2823 
2824   if (!a->inode.ibdiagvalid) {
2825     if (!a->inode.ibdiag) {
2826       /* calculate space needed for diagonal blocks */
2827       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2828       a->inode.bdiagsize = cnt;
2829 
2830       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2831     }
2832 
2833     /* copy over the diagonal blocks and invert them */
2834     ibdiag = a->inode.ibdiag;
2835     bdiag  = a->inode.bdiag;
2836     cnt    = 0;
2837     for (i = 0, row = 0; i < m; i++) {
2838       for (j = 0; j < sizes[i]; j++) {
2839         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2840       }
2841       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
2842 
2843       switch (sizes[i]) {
2844       case 1:
2845         /* Create matrix data structure */
2846         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2847           if (allowzeropivot) {
2848             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2849             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2850             A->factorerror_zeropivot_row   = row;
2851             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2852           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2853         }
2854         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2855         break;
2856       case 2:
2857         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2858         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2859         break;
2860       case 3:
2861         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2862         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2863         break;
2864       case 4:
2865         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2866         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2867         break;
2868       case 5:
2869         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2870         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2871         break;
2872       default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2873       }
2874       cnt += sizes[i] * sizes[i];
2875       row += sizes[i];
2876     }
2877     a->inode.ibdiagvalid = PETSC_TRUE;
2878   }
2879   ibdiag = a->inode.ibdiag;
2880   bdiag  = a->inode.bdiag;
2881   t      = a->inode.ssor_work;
2882 
2883   PetscCall(VecGetArray(xx, &x));
2884   PetscCall(VecGetArrayRead(bb, &b));
2885   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2886   if (flag & SOR_ZERO_INITIAL_GUESS) {
2887     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2888       for (i = 0, row = 0; i < m; i++) {
2889         sz  = diag[row] - ii[row];
2890         v1  = a->a + ii[row];
2891         idx = a->j + ii[row];
2892 
2893         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2894         switch (sizes[i]) {
2895         case 1:
2896 
2897           sum1 = b[row];
2898           for (n = 0; n < sz - 1; n += 2) {
2899             i1 = idx[0];
2900             i2 = idx[1];
2901             idx += 2;
2902             tmp0 = x[i1];
2903             tmp1 = x[i2];
2904             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2905             v1 += 2;
2906           }
2907 
2908           if (n == sz - 1) {
2909             tmp0 = x[*idx];
2910             sum1 -= *v1 * tmp0;
2911           }
2912           t[row]   = sum1;
2913           x[row++] = sum1 * (*ibdiag++);
2914           break;
2915         case 2:
2916           v2   = a->a + ii[row + 1];
2917           sum1 = b[row];
2918           sum2 = b[row + 1];
2919           for (n = 0; n < sz - 1; n += 2) {
2920             i1 = idx[0];
2921             i2 = idx[1];
2922             idx += 2;
2923             tmp0 = x[i1];
2924             tmp1 = x[i2];
2925             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2926             v1 += 2;
2927             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2928             v2 += 2;
2929           }
2930 
2931           if (n == sz - 1) {
2932             tmp0 = x[*idx];
2933             sum1 -= v1[0] * tmp0;
2934             sum2 -= v2[0] * tmp0;
2935           }
2936           t[row]     = sum1;
2937           t[row + 1] = sum2;
2938           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2939           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2940           ibdiag += 4;
2941           break;
2942         case 3:
2943           v2   = a->a + ii[row + 1];
2944           v3   = a->a + ii[row + 2];
2945           sum1 = b[row];
2946           sum2 = b[row + 1];
2947           sum3 = b[row + 2];
2948           for (n = 0; n < sz - 1; n += 2) {
2949             i1 = idx[0];
2950             i2 = idx[1];
2951             idx += 2;
2952             tmp0 = x[i1];
2953             tmp1 = x[i2];
2954             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2955             v1 += 2;
2956             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2957             v2 += 2;
2958             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2959             v3 += 2;
2960           }
2961 
2962           if (n == sz - 1) {
2963             tmp0 = x[*idx];
2964             sum1 -= v1[0] * tmp0;
2965             sum2 -= v2[0] * tmp0;
2966             sum3 -= v3[0] * tmp0;
2967           }
2968           t[row]     = sum1;
2969           t[row + 1] = sum2;
2970           t[row + 2] = sum3;
2971           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2972           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2973           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2974           ibdiag += 9;
2975           break;
2976         case 4:
2977           v2   = a->a + ii[row + 1];
2978           v3   = a->a + ii[row + 2];
2979           v4   = a->a + ii[row + 3];
2980           sum1 = b[row];
2981           sum2 = b[row + 1];
2982           sum3 = b[row + 2];
2983           sum4 = b[row + 3];
2984           for (n = 0; n < sz - 1; n += 2) {
2985             i1 = idx[0];
2986             i2 = idx[1];
2987             idx += 2;
2988             tmp0 = x[i1];
2989             tmp1 = x[i2];
2990             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2991             v1 += 2;
2992             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2993             v2 += 2;
2994             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2995             v3 += 2;
2996             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2997             v4 += 2;
2998           }
2999 
3000           if (n == sz - 1) {
3001             tmp0 = x[*idx];
3002             sum1 -= v1[0] * tmp0;
3003             sum2 -= v2[0] * tmp0;
3004             sum3 -= v3[0] * tmp0;
3005             sum4 -= v4[0] * tmp0;
3006           }
3007           t[row]     = sum1;
3008           t[row + 1] = sum2;
3009           t[row + 2] = sum3;
3010           t[row + 3] = sum4;
3011           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3012           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3013           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3014           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3015           ibdiag += 16;
3016           break;
3017         case 5:
3018           v2   = a->a + ii[row + 1];
3019           v3   = a->a + ii[row + 2];
3020           v4   = a->a + ii[row + 3];
3021           v5   = a->a + ii[row + 4];
3022           sum1 = b[row];
3023           sum2 = b[row + 1];
3024           sum3 = b[row + 2];
3025           sum4 = b[row + 3];
3026           sum5 = b[row + 4];
3027           for (n = 0; n < sz - 1; n += 2) {
3028             i1 = idx[0];
3029             i2 = idx[1];
3030             idx += 2;
3031             tmp0 = x[i1];
3032             tmp1 = x[i2];
3033             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3034             v1 += 2;
3035             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3036             v2 += 2;
3037             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3038             v3 += 2;
3039             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3040             v4 += 2;
3041             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3042             v5 += 2;
3043           }
3044 
3045           if (n == sz - 1) {
3046             tmp0 = x[*idx];
3047             sum1 -= v1[0] * tmp0;
3048             sum2 -= v2[0] * tmp0;
3049             sum3 -= v3[0] * tmp0;
3050             sum4 -= v4[0] * tmp0;
3051             sum5 -= v5[0] * tmp0;
3052           }
3053           t[row]     = sum1;
3054           t[row + 1] = sum2;
3055           t[row + 2] = sum3;
3056           t[row + 3] = sum4;
3057           t[row + 4] = sum5;
3058           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3059           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3060           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3061           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3062           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3063           ibdiag += 25;
3064           break;
3065         default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3066         }
3067       }
3068 
3069       xb = t;
3070       PetscCall(PetscLogFlops(a->nz));
3071     } else xb = b;
3072     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3073       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3074       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3075         ibdiag -= sizes[i] * sizes[i];
3076         sz  = ii[row + 1] - diag[row] - 1;
3077         v1  = a->a + diag[row] + 1;
3078         idx = a->j + diag[row] + 1;
3079 
3080         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3081         switch (sizes[i]) {
3082         case 1:
3083 
3084           sum1 = xb[row];
3085           for (n = 0; n < sz - 1; n += 2) {
3086             i1 = idx[0];
3087             i2 = idx[1];
3088             idx += 2;
3089             tmp0 = x[i1];
3090             tmp1 = x[i2];
3091             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3092             v1 += 2;
3093           }
3094 
3095           if (n == sz - 1) {
3096             tmp0 = x[*idx];
3097             sum1 -= *v1 * tmp0;
3098           }
3099           x[row--] = sum1 * (*ibdiag);
3100           break;
3101 
3102         case 2:
3103 
3104           sum1 = xb[row];
3105           sum2 = xb[row - 1];
3106           /* note that sum1 is associated with the second of the two rows */
3107           v2   = a->a + diag[row - 1] + 2;
3108           for (n = 0; n < sz - 1; n += 2) {
3109             i1 = idx[0];
3110             i2 = idx[1];
3111             idx += 2;
3112             tmp0 = x[i1];
3113             tmp1 = x[i2];
3114             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3115             v1 += 2;
3116             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3117             v2 += 2;
3118           }
3119 
3120           if (n == sz - 1) {
3121             tmp0 = x[*idx];
3122             sum1 -= *v1 * tmp0;
3123             sum2 -= *v2 * tmp0;
3124           }
3125           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3126           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3127           break;
3128         case 3:
3129 
3130           sum1 = xb[row];
3131           sum2 = xb[row - 1];
3132           sum3 = xb[row - 2];
3133           v2   = a->a + diag[row - 1] + 2;
3134           v3   = a->a + diag[row - 2] + 3;
3135           for (n = 0; n < sz - 1; n += 2) {
3136             i1 = idx[0];
3137             i2 = idx[1];
3138             idx += 2;
3139             tmp0 = x[i1];
3140             tmp1 = x[i2];
3141             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3142             v1 += 2;
3143             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3144             v2 += 2;
3145             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3146             v3 += 2;
3147           }
3148 
3149           if (n == sz - 1) {
3150             tmp0 = x[*idx];
3151             sum1 -= *v1 * tmp0;
3152             sum2 -= *v2 * tmp0;
3153             sum3 -= *v3 * tmp0;
3154           }
3155           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3156           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3157           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3158           break;
3159         case 4:
3160 
3161           sum1 = xb[row];
3162           sum2 = xb[row - 1];
3163           sum3 = xb[row - 2];
3164           sum4 = xb[row - 3];
3165           v2   = a->a + diag[row - 1] + 2;
3166           v3   = a->a + diag[row - 2] + 3;
3167           v4   = a->a + diag[row - 3] + 4;
3168           for (n = 0; n < sz - 1; n += 2) {
3169             i1 = idx[0];
3170             i2 = idx[1];
3171             idx += 2;
3172             tmp0 = x[i1];
3173             tmp1 = x[i2];
3174             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3175             v1 += 2;
3176             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3177             v2 += 2;
3178             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3179             v3 += 2;
3180             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3181             v4 += 2;
3182           }
3183 
3184           if (n == sz - 1) {
3185             tmp0 = x[*idx];
3186             sum1 -= *v1 * tmp0;
3187             sum2 -= *v2 * tmp0;
3188             sum3 -= *v3 * tmp0;
3189             sum4 -= *v4 * tmp0;
3190           }
3191           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3192           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3193           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3194           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3195           break;
3196         case 5:
3197 
3198           sum1 = xb[row];
3199           sum2 = xb[row - 1];
3200           sum3 = xb[row - 2];
3201           sum4 = xb[row - 3];
3202           sum5 = xb[row - 4];
3203           v2   = a->a + diag[row - 1] + 2;
3204           v3   = a->a + diag[row - 2] + 3;
3205           v4   = a->a + diag[row - 3] + 4;
3206           v5   = a->a + diag[row - 4] + 5;
3207           for (n = 0; n < sz - 1; n += 2) {
3208             i1 = idx[0];
3209             i2 = idx[1];
3210             idx += 2;
3211             tmp0 = x[i1];
3212             tmp1 = x[i2];
3213             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3214             v1 += 2;
3215             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3216             v2 += 2;
3217             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3218             v3 += 2;
3219             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3220             v4 += 2;
3221             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3222             v5 += 2;
3223           }
3224 
3225           if (n == sz - 1) {
3226             tmp0 = x[*idx];
3227             sum1 -= *v1 * tmp0;
3228             sum2 -= *v2 * tmp0;
3229             sum3 -= *v3 * tmp0;
3230             sum4 -= *v4 * tmp0;
3231             sum5 -= *v5 * tmp0;
3232           }
3233           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3234           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3235           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3236           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3237           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3238           break;
3239         default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3240         }
3241       }
3242 
3243       PetscCall(PetscLogFlops(a->nz));
3244     }
3245     its--;
3246   }
3247   while (its--) {
3248     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3249       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3250         sz  = diag[row] - ii[row];
3251         v1  = a->a + ii[row];
3252         idx = a->j + ii[row];
3253         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3254         switch (sizes[i]) {
3255         case 1:
3256           sum1 = b[row];
3257           for (n = 0; n < sz - 1; n += 2) {
3258             i1 = idx[0];
3259             i2 = idx[1];
3260             idx += 2;
3261             tmp0 = x[i1];
3262             tmp1 = x[i2];
3263             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3264             v1 += 2;
3265           }
3266           if (n == sz - 1) {
3267             tmp0 = x[*idx++];
3268             sum1 -= *v1 * tmp0;
3269             v1++;
3270           }
3271           t[row] = sum1;
3272           sz     = ii[row + 1] - diag[row] - 1;
3273           idx    = a->j + diag[row] + 1;
3274           v1 += 1;
3275           for (n = 0; n < sz - 1; n += 2) {
3276             i1 = idx[0];
3277             i2 = idx[1];
3278             idx += 2;
3279             tmp0 = x[i1];
3280             tmp1 = x[i2];
3281             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3282             v1 += 2;
3283           }
3284           if (n == sz - 1) {
3285             tmp0 = x[*idx++];
3286             sum1 -= *v1 * tmp0;
3287           }
3288           /* in MatSOR_SeqAIJ this line would be
3289            *
3290            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3291            *
3292            * but omega == 1, so this becomes
3293            *
3294            * x[row] = sum1*(*ibdiag++);
3295            *
3296            */
3297           x[row] = sum1 * (*ibdiag);
3298           break;
3299         case 2:
3300           v2   = a->a + ii[row + 1];
3301           sum1 = b[row];
3302           sum2 = b[row + 1];
3303           for (n = 0; n < sz - 1; n += 2) {
3304             i1 = idx[0];
3305             i2 = idx[1];
3306             idx += 2;
3307             tmp0 = x[i1];
3308             tmp1 = x[i2];
3309             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3310             v1 += 2;
3311             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3312             v2 += 2;
3313           }
3314           if (n == sz - 1) {
3315             tmp0 = x[*idx++];
3316             sum1 -= v1[0] * tmp0;
3317             sum2 -= v2[0] * tmp0;
3318             v1++;
3319             v2++;
3320           }
3321           t[row]     = sum1;
3322           t[row + 1] = sum2;
3323           sz         = ii[row + 1] - diag[row] - 2;
3324           idx        = a->j + diag[row] + 2;
3325           v1 += 2;
3326           v2 += 2;
3327           for (n = 0; n < sz - 1; n += 2) {
3328             i1 = idx[0];
3329             i2 = idx[1];
3330             idx += 2;
3331             tmp0 = x[i1];
3332             tmp1 = x[i2];
3333             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3334             v1 += 2;
3335             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3336             v2 += 2;
3337           }
3338           if (n == sz - 1) {
3339             tmp0 = x[*idx];
3340             sum1 -= v1[0] * tmp0;
3341             sum2 -= v2[0] * tmp0;
3342           }
3343           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3344           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3345           break;
3346         case 3:
3347           v2   = a->a + ii[row + 1];
3348           v3   = a->a + ii[row + 2];
3349           sum1 = b[row];
3350           sum2 = b[row + 1];
3351           sum3 = b[row + 2];
3352           for (n = 0; n < sz - 1; n += 2) {
3353             i1 = idx[0];
3354             i2 = idx[1];
3355             idx += 2;
3356             tmp0 = x[i1];
3357             tmp1 = x[i2];
3358             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3359             v1 += 2;
3360             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3361             v2 += 2;
3362             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3363             v3 += 2;
3364           }
3365           if (n == sz - 1) {
3366             tmp0 = x[*idx++];
3367             sum1 -= v1[0] * tmp0;
3368             sum2 -= v2[0] * tmp0;
3369             sum3 -= v3[0] * tmp0;
3370             v1++;
3371             v2++;
3372             v3++;
3373           }
3374           t[row]     = sum1;
3375           t[row + 1] = sum2;
3376           t[row + 2] = sum3;
3377           sz         = ii[row + 1] - diag[row] - 3;
3378           idx        = a->j + diag[row] + 3;
3379           v1 += 3;
3380           v2 += 3;
3381           v3 += 3;
3382           for (n = 0; n < sz - 1; n += 2) {
3383             i1 = idx[0];
3384             i2 = idx[1];
3385             idx += 2;
3386             tmp0 = x[i1];
3387             tmp1 = x[i2];
3388             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3389             v1 += 2;
3390             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3391             v2 += 2;
3392             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3393             v3 += 2;
3394           }
3395           if (n == sz - 1) {
3396             tmp0 = x[*idx];
3397             sum1 -= v1[0] * tmp0;
3398             sum2 -= v2[0] * tmp0;
3399             sum3 -= v3[0] * tmp0;
3400           }
3401           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3402           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3403           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3404           break;
3405         case 4:
3406           v2   = a->a + ii[row + 1];
3407           v3   = a->a + ii[row + 2];
3408           v4   = a->a + ii[row + 3];
3409           sum1 = b[row];
3410           sum2 = b[row + 1];
3411           sum3 = b[row + 2];
3412           sum4 = b[row + 3];
3413           for (n = 0; n < sz - 1; n += 2) {
3414             i1 = idx[0];
3415             i2 = idx[1];
3416             idx += 2;
3417             tmp0 = x[i1];
3418             tmp1 = x[i2];
3419             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3420             v1 += 2;
3421             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3422             v2 += 2;
3423             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3424             v3 += 2;
3425             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3426             v4 += 2;
3427           }
3428           if (n == sz - 1) {
3429             tmp0 = x[*idx++];
3430             sum1 -= v1[0] * tmp0;
3431             sum2 -= v2[0] * tmp0;
3432             sum3 -= v3[0] * tmp0;
3433             sum4 -= v4[0] * tmp0;
3434             v1++;
3435             v2++;
3436             v3++;
3437             v4++;
3438           }
3439           t[row]     = sum1;
3440           t[row + 1] = sum2;
3441           t[row + 2] = sum3;
3442           t[row + 3] = sum4;
3443           sz         = ii[row + 1] - diag[row] - 4;
3444           idx        = a->j + diag[row] + 4;
3445           v1 += 4;
3446           v2 += 4;
3447           v3 += 4;
3448           v4 += 4;
3449           for (n = 0; n < sz - 1; n += 2) {
3450             i1 = idx[0];
3451             i2 = idx[1];
3452             idx += 2;
3453             tmp0 = x[i1];
3454             tmp1 = x[i2];
3455             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3456             v1 += 2;
3457             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3458             v2 += 2;
3459             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3460             v3 += 2;
3461             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3462             v4 += 2;
3463           }
3464           if (n == sz - 1) {
3465             tmp0 = x[*idx];
3466             sum1 -= v1[0] * tmp0;
3467             sum2 -= v2[0] * tmp0;
3468             sum3 -= v3[0] * tmp0;
3469             sum4 -= v4[0] * tmp0;
3470           }
3471           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3472           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3473           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3474           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3475           break;
3476         case 5:
3477           v2   = a->a + ii[row + 1];
3478           v3   = a->a + ii[row + 2];
3479           v4   = a->a + ii[row + 3];
3480           v5   = a->a + ii[row + 4];
3481           sum1 = b[row];
3482           sum2 = b[row + 1];
3483           sum3 = b[row + 2];
3484           sum4 = b[row + 3];
3485           sum5 = b[row + 4];
3486           for (n = 0; n < sz - 1; n += 2) {
3487             i1 = idx[0];
3488             i2 = idx[1];
3489             idx += 2;
3490             tmp0 = x[i1];
3491             tmp1 = x[i2];
3492             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3493             v1 += 2;
3494             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3495             v2 += 2;
3496             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3497             v3 += 2;
3498             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3499             v4 += 2;
3500             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3501             v5 += 2;
3502           }
3503           if (n == sz - 1) {
3504             tmp0 = x[*idx++];
3505             sum1 -= v1[0] * tmp0;
3506             sum2 -= v2[0] * tmp0;
3507             sum3 -= v3[0] * tmp0;
3508             sum4 -= v4[0] * tmp0;
3509             sum5 -= v5[0] * tmp0;
3510             v1++;
3511             v2++;
3512             v3++;
3513             v4++;
3514             v5++;
3515           }
3516           t[row]     = sum1;
3517           t[row + 1] = sum2;
3518           t[row + 2] = sum3;
3519           t[row + 3] = sum4;
3520           t[row + 4] = sum5;
3521           sz         = ii[row + 1] - diag[row] - 5;
3522           idx        = a->j + diag[row] + 5;
3523           v1 += 5;
3524           v2 += 5;
3525           v3 += 5;
3526           v4 += 5;
3527           v5 += 5;
3528           for (n = 0; n < sz - 1; n += 2) {
3529             i1 = idx[0];
3530             i2 = idx[1];
3531             idx += 2;
3532             tmp0 = x[i1];
3533             tmp1 = x[i2];
3534             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3535             v1 += 2;
3536             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3537             v2 += 2;
3538             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3539             v3 += 2;
3540             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3541             v4 += 2;
3542             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3543             v5 += 2;
3544           }
3545           if (n == sz - 1) {
3546             tmp0 = x[*idx];
3547             sum1 -= v1[0] * tmp0;
3548             sum2 -= v2[0] * tmp0;
3549             sum3 -= v3[0] * tmp0;
3550             sum4 -= v4[0] * tmp0;
3551             sum5 -= v5[0] * tmp0;
3552           }
3553           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3554           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3555           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3556           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3557           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3558           break;
3559         default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3560         }
3561       }
3562       xb = t;
3563       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3564     } else xb = b;
3565 
3566     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3567       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3568       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3569         ibdiag -= sizes[i] * sizes[i];
3570 
3571         /* set RHS */
3572         if (xb == b) {
3573           /* whole (old way) */
3574           sz  = ii[row + 1] - ii[row];
3575           idx = a->j + ii[row];
3576           switch (sizes[i]) {
3577           case 5: v5 = a->a + ii[row - 4]; /* fall through */
3578           case 4: v4 = a->a + ii[row - 3]; /* fall through */
3579           case 3: v3 = a->a + ii[row - 2]; /* fall through */
3580           case 2: v2 = a->a + ii[row - 1]; /* fall through */
3581           case 1: v1 = a->a + ii[row]; break;
3582           default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3583           }
3584         } else {
3585           /* upper, no diag */
3586           sz  = ii[row + 1] - diag[row] - 1;
3587           idx = a->j + diag[row] + 1;
3588           switch (sizes[i]) {
3589           case 5: v5 = a->a + diag[row - 4] + 5; /* fall through */
3590           case 4: v4 = a->a + diag[row - 3] + 4; /* fall through */
3591           case 3: v3 = a->a + diag[row - 2] + 3; /* fall through */
3592           case 2: v2 = a->a + diag[row - 1] + 2; /* fall through */
3593           case 1: v1 = a->a + diag[row] + 1;
3594           }
3595         }
3596         /* set sum */
3597         switch (sizes[i]) {
3598         case 5: sum5 = xb[row - 4]; /* fall through */
3599         case 4: sum4 = xb[row - 3]; /* fall through */
3600         case 3: sum3 = xb[row - 2]; /* fall through */
3601         case 2: sum2 = xb[row - 1]; /* fall through */
3602         case 1:
3603           /* note that sum1 is associated with the last row */
3604           sum1 = xb[row];
3605         }
3606         /* do sums */
3607         for (n = 0; n < sz - 1; n += 2) {
3608           i1 = idx[0];
3609           i2 = idx[1];
3610           idx += 2;
3611           tmp0 = x[i1];
3612           tmp1 = x[i2];
3613           switch (sizes[i]) {
3614           case 5: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; /* fall through */
3615           case 4: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; /* fall through */
3616           case 3: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; /* fall through */
3617           case 2: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; /* fall through */
3618           case 1: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3619           }
3620         }
3621         /* ragged edge */
3622         if (n == sz - 1) {
3623           tmp0 = x[*idx];
3624           switch (sizes[i]) {
3625           case 5: sum5 -= *v5 * tmp0; /* fall through */
3626           case 4: sum4 -= *v4 * tmp0; /* fall through */
3627           case 3: sum3 -= *v3 * tmp0; /* fall through */
3628           case 2: sum2 -= *v2 * tmp0; /* fall through */
3629           case 1: sum1 -= *v1 * tmp0;
3630           }
3631         }
3632         /* update */
3633         if (xb == b) {
3634           /* whole (old way) w/ diag */
3635           switch (sizes[i]) {
3636           case 5:
3637             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3638             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3639             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3640             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3641             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3642             break;
3643           case 4:
3644             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3645             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3646             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3647             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3648             break;
3649           case 3:
3650             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3651             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3652             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3653             break;
3654           case 2:
3655             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3656             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3657             break;
3658           case 1: x[row--] += sum1 * (*ibdiag); break;
3659           }
3660         } else {
3661           /* no diag so set =  */
3662           switch (sizes[i]) {
3663           case 5:
3664             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3665             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3666             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3667             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3668             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3669             break;
3670           case 4:
3671             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3672             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3673             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3674             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3675             break;
3676           case 3:
3677             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3678             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3679             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3680             break;
3681           case 2:
3682             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3683             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3684             break;
3685           case 1: x[row--] = sum1 * (*ibdiag); break;
3686           }
3687         }
3688       }
3689       if (xb == b) {
3690         PetscCall(PetscLogFlops(2.0 * a->nz));
3691       } else {
3692         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3693       }
3694     }
3695   }
3696   if (flag & SOR_EISENSTAT) {
3697     /*
3698           Apply  (U + D)^-1  where D is now the block diagonal
3699     */
3700     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3701     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3702       ibdiag -= sizes[i] * sizes[i];
3703       sz  = ii[row + 1] - diag[row] - 1;
3704       v1  = a->a + diag[row] + 1;
3705       idx = a->j + diag[row] + 1;
3706       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3707       switch (sizes[i]) {
3708       case 1:
3709 
3710         sum1 = b[row];
3711         for (n = 0; n < sz - 1; n += 2) {
3712           i1 = idx[0];
3713           i2 = idx[1];
3714           idx += 2;
3715           tmp0 = x[i1];
3716           tmp1 = x[i2];
3717           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3718           v1 += 2;
3719         }
3720 
3721         if (n == sz - 1) {
3722           tmp0 = x[*idx];
3723           sum1 -= *v1 * tmp0;
3724         }
3725         x[row] = sum1 * (*ibdiag);
3726         row--;
3727         break;
3728 
3729       case 2:
3730 
3731         sum1 = b[row];
3732         sum2 = b[row - 1];
3733         /* note that sum1 is associated with the second of the two rows */
3734         v2   = a->a + diag[row - 1] + 2;
3735         for (n = 0; n < sz - 1; n += 2) {
3736           i1 = idx[0];
3737           i2 = idx[1];
3738           idx += 2;
3739           tmp0 = x[i1];
3740           tmp1 = x[i2];
3741           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3742           v1 += 2;
3743           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3744           v2 += 2;
3745         }
3746 
3747         if (n == sz - 1) {
3748           tmp0 = x[*idx];
3749           sum1 -= *v1 * tmp0;
3750           sum2 -= *v2 * tmp0;
3751         }
3752         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3753         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3754         row -= 2;
3755         break;
3756       case 3:
3757 
3758         sum1 = b[row];
3759         sum2 = b[row - 1];
3760         sum3 = b[row - 2];
3761         v2   = a->a + diag[row - 1] + 2;
3762         v3   = a->a + diag[row - 2] + 3;
3763         for (n = 0; n < sz - 1; n += 2) {
3764           i1 = idx[0];
3765           i2 = idx[1];
3766           idx += 2;
3767           tmp0 = x[i1];
3768           tmp1 = x[i2];
3769           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3770           v1 += 2;
3771           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3772           v2 += 2;
3773           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3774           v3 += 2;
3775         }
3776 
3777         if (n == sz - 1) {
3778           tmp0 = x[*idx];
3779           sum1 -= *v1 * tmp0;
3780           sum2 -= *v2 * tmp0;
3781           sum3 -= *v3 * tmp0;
3782         }
3783         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3784         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3785         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3786         row -= 3;
3787         break;
3788       case 4:
3789 
3790         sum1 = b[row];
3791         sum2 = b[row - 1];
3792         sum3 = b[row - 2];
3793         sum4 = b[row - 3];
3794         v2   = a->a + diag[row - 1] + 2;
3795         v3   = a->a + diag[row - 2] + 3;
3796         v4   = a->a + diag[row - 3] + 4;
3797         for (n = 0; n < sz - 1; n += 2) {
3798           i1 = idx[0];
3799           i2 = idx[1];
3800           idx += 2;
3801           tmp0 = x[i1];
3802           tmp1 = x[i2];
3803           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3804           v1 += 2;
3805           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3806           v2 += 2;
3807           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3808           v3 += 2;
3809           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3810           v4 += 2;
3811         }
3812 
3813         if (n == sz - 1) {
3814           tmp0 = x[*idx];
3815           sum1 -= *v1 * tmp0;
3816           sum2 -= *v2 * tmp0;
3817           sum3 -= *v3 * tmp0;
3818           sum4 -= *v4 * tmp0;
3819         }
3820         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3821         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3822         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3823         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3824         row -= 4;
3825         break;
3826       case 5:
3827 
3828         sum1 = b[row];
3829         sum2 = b[row - 1];
3830         sum3 = b[row - 2];
3831         sum4 = b[row - 3];
3832         sum5 = b[row - 4];
3833         v2   = a->a + diag[row - 1] + 2;
3834         v3   = a->a + diag[row - 2] + 3;
3835         v4   = a->a + diag[row - 3] + 4;
3836         v5   = a->a + diag[row - 4] + 5;
3837         for (n = 0; n < sz - 1; n += 2) {
3838           i1 = idx[0];
3839           i2 = idx[1];
3840           idx += 2;
3841           tmp0 = x[i1];
3842           tmp1 = x[i2];
3843           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3844           v1 += 2;
3845           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3846           v2 += 2;
3847           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3848           v3 += 2;
3849           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3850           v4 += 2;
3851           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3852           v5 += 2;
3853         }
3854 
3855         if (n == sz - 1) {
3856           tmp0 = x[*idx];
3857           sum1 -= *v1 * tmp0;
3858           sum2 -= *v2 * tmp0;
3859           sum3 -= *v3 * tmp0;
3860           sum4 -= *v4 * tmp0;
3861           sum5 -= *v5 * tmp0;
3862         }
3863         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3864         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3865         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3866         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3867         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3868         row -= 5;
3869         break;
3870       default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3871       }
3872     }
3873     PetscCall(PetscLogFlops(a->nz));
3874 
3875     /*
3876            t = b - D x    where D is the block diagonal
3877     */
3878     cnt = 0;
3879     for (i = 0, row = 0; i < m; i++) {
3880       switch (sizes[i]) {
3881       case 1:
3882         t[row] = b[row] - bdiag[cnt++] * x[row];
3883         row++;
3884         break;
3885       case 2:
3886         x1         = x[row];
3887         x2         = x[row + 1];
3888         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3889         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3890         t[row]     = b[row] - tmp1;
3891         t[row + 1] = b[row + 1] - tmp2;
3892         row += 2;
3893         cnt += 4;
3894         break;
3895       case 3:
3896         x1         = x[row];
3897         x2         = x[row + 1];
3898         x3         = x[row + 2];
3899         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3900         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3901         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3902         t[row]     = b[row] - tmp1;
3903         t[row + 1] = b[row + 1] - tmp2;
3904         t[row + 2] = b[row + 2] - tmp3;
3905         row += 3;
3906         cnt += 9;
3907         break;
3908       case 4:
3909         x1         = x[row];
3910         x2         = x[row + 1];
3911         x3         = x[row + 2];
3912         x4         = x[row + 3];
3913         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3914         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3915         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3916         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3917         t[row]     = b[row] - tmp1;
3918         t[row + 1] = b[row + 1] - tmp2;
3919         t[row + 2] = b[row + 2] - tmp3;
3920         t[row + 3] = b[row + 3] - tmp4;
3921         row += 4;
3922         cnt += 16;
3923         break;
3924       case 5:
3925         x1         = x[row];
3926         x2         = x[row + 1];
3927         x3         = x[row + 2];
3928         x4         = x[row + 3];
3929         x5         = x[row + 4];
3930         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3931         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3932         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3933         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3934         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3935         t[row]     = b[row] - tmp1;
3936         t[row + 1] = b[row + 1] - tmp2;
3937         t[row + 2] = b[row + 2] - tmp3;
3938         t[row + 3] = b[row + 3] - tmp4;
3939         t[row + 4] = b[row + 4] - tmp5;
3940         row += 5;
3941         cnt += 25;
3942         break;
3943       default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3944       }
3945     }
3946     PetscCall(PetscLogFlops(m));
3947 
3948     /*
3949           Apply (L + D)^-1 where D is the block diagonal
3950     */
3951     for (i = 0, row = 0; i < m; i++) {
3952       sz  = diag[row] - ii[row];
3953       v1  = a->a + ii[row];
3954       idx = a->j + ii[row];
3955       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3956       switch (sizes[i]) {
3957       case 1:
3958 
3959         sum1 = t[row];
3960         for (n = 0; n < sz - 1; n += 2) {
3961           i1 = idx[0];
3962           i2 = idx[1];
3963           idx += 2;
3964           tmp0 = t[i1];
3965           tmp1 = t[i2];
3966           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3967           v1 += 2;
3968         }
3969 
3970         if (n == sz - 1) {
3971           tmp0 = t[*idx];
3972           sum1 -= *v1 * tmp0;
3973         }
3974         x[row] += t[row] = sum1 * (*ibdiag++);
3975         row++;
3976         break;
3977       case 2:
3978         v2   = a->a + ii[row + 1];
3979         sum1 = t[row];
3980         sum2 = t[row + 1];
3981         for (n = 0; n < sz - 1; n += 2) {
3982           i1 = idx[0];
3983           i2 = idx[1];
3984           idx += 2;
3985           tmp0 = t[i1];
3986           tmp1 = t[i2];
3987           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3988           v1 += 2;
3989           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3990           v2 += 2;
3991         }
3992 
3993         if (n == sz - 1) {
3994           tmp0 = t[*idx];
3995           sum1 -= v1[0] * tmp0;
3996           sum2 -= v2[0] * tmp0;
3997         }
3998         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3999         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4000         ibdiag += 4;
4001         row += 2;
4002         break;
4003       case 3:
4004         v2   = a->a + ii[row + 1];
4005         v3   = a->a + ii[row + 2];
4006         sum1 = t[row];
4007         sum2 = t[row + 1];
4008         sum3 = t[row + 2];
4009         for (n = 0; n < sz - 1; n += 2) {
4010           i1 = idx[0];
4011           i2 = idx[1];
4012           idx += 2;
4013           tmp0 = t[i1];
4014           tmp1 = t[i2];
4015           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4016           v1 += 2;
4017           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4018           v2 += 2;
4019           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4020           v3 += 2;
4021         }
4022 
4023         if (n == sz - 1) {
4024           tmp0 = t[*idx];
4025           sum1 -= v1[0] * tmp0;
4026           sum2 -= v2[0] * tmp0;
4027           sum3 -= v3[0] * tmp0;
4028         }
4029         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4030         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4031         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4032         ibdiag += 9;
4033         row += 3;
4034         break;
4035       case 4:
4036         v2   = a->a + ii[row + 1];
4037         v3   = a->a + ii[row + 2];
4038         v4   = a->a + ii[row + 3];
4039         sum1 = t[row];
4040         sum2 = t[row + 1];
4041         sum3 = t[row + 2];
4042         sum4 = t[row + 3];
4043         for (n = 0; n < sz - 1; n += 2) {
4044           i1 = idx[0];
4045           i2 = idx[1];
4046           idx += 2;
4047           tmp0 = t[i1];
4048           tmp1 = t[i2];
4049           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4050           v1 += 2;
4051           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4052           v2 += 2;
4053           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4054           v3 += 2;
4055           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4056           v4 += 2;
4057         }
4058 
4059         if (n == sz - 1) {
4060           tmp0 = t[*idx];
4061           sum1 -= v1[0] * tmp0;
4062           sum2 -= v2[0] * tmp0;
4063           sum3 -= v3[0] * tmp0;
4064           sum4 -= v4[0] * tmp0;
4065         }
4066         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4067         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4068         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4069         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4070         ibdiag += 16;
4071         row += 4;
4072         break;
4073       case 5:
4074         v2   = a->a + ii[row + 1];
4075         v3   = a->a + ii[row + 2];
4076         v4   = a->a + ii[row + 3];
4077         v5   = a->a + ii[row + 4];
4078         sum1 = t[row];
4079         sum2 = t[row + 1];
4080         sum3 = t[row + 2];
4081         sum4 = t[row + 3];
4082         sum5 = t[row + 4];
4083         for (n = 0; n < sz - 1; n += 2) {
4084           i1 = idx[0];
4085           i2 = idx[1];
4086           idx += 2;
4087           tmp0 = t[i1];
4088           tmp1 = t[i2];
4089           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4090           v1 += 2;
4091           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4092           v2 += 2;
4093           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4094           v3 += 2;
4095           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4096           v4 += 2;
4097           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4098           v5 += 2;
4099         }
4100 
4101         if (n == sz - 1) {
4102           tmp0 = t[*idx];
4103           sum1 -= v1[0] * tmp0;
4104           sum2 -= v2[0] * tmp0;
4105           sum3 -= v3[0] * tmp0;
4106           sum4 -= v4[0] * tmp0;
4107           sum5 -= v5[0] * tmp0;
4108         }
4109         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4110         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4111         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4112         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4113         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4114         ibdiag += 25;
4115         row += 5;
4116         break;
4117       default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4118       }
4119     }
4120     PetscCall(PetscLogFlops(a->nz));
4121   }
4122   PetscCall(VecRestoreArray(xx, &x));
4123   PetscCall(VecRestoreArrayRead(bb, &b));
4124   PetscFunctionReturn(0);
4125 }
4126 
4127 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) {
4128   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4129   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4130   const MatScalar   *bdiag = a->inode.bdiag;
4131   const PetscScalar *b;
4132   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
4133   const PetscInt    *sizes = a->inode.size;
4134 
4135   PetscFunctionBegin;
4136   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4137   PetscCall(VecGetArray(xx, &x));
4138   PetscCall(VecGetArrayRead(bb, &b));
4139   cnt = 0;
4140   for (i = 0, row = 0; i < m; i++) {
4141     switch (sizes[i]) {
4142     case 1:
4143       x[row] = b[row] * bdiag[cnt++];
4144       row++;
4145       break;
4146     case 2:
4147       x1       = b[row];
4148       x2       = b[row + 1];
4149       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4150       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4151       x[row++] = tmp1;
4152       x[row++] = tmp2;
4153       cnt += 4;
4154       break;
4155     case 3:
4156       x1       = b[row];
4157       x2       = b[row + 1];
4158       x3       = b[row + 2];
4159       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4160       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4161       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4162       x[row++] = tmp1;
4163       x[row++] = tmp2;
4164       x[row++] = tmp3;
4165       cnt += 9;
4166       break;
4167     case 4:
4168       x1       = b[row];
4169       x2       = b[row + 1];
4170       x3       = b[row + 2];
4171       x4       = b[row + 3];
4172       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4173       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4174       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4175       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4176       x[row++] = tmp1;
4177       x[row++] = tmp2;
4178       x[row++] = tmp3;
4179       x[row++] = tmp4;
4180       cnt += 16;
4181       break;
4182     case 5:
4183       x1       = b[row];
4184       x2       = b[row + 1];
4185       x3       = b[row + 2];
4186       x4       = b[row + 3];
4187       x5       = b[row + 4];
4188       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4189       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4190       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4191       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4192       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4193       x[row++] = tmp1;
4194       x[row++] = tmp2;
4195       x[row++] = tmp3;
4196       x[row++] = tmp4;
4197       x[row++] = tmp5;
4198       cnt += 25;
4199       break;
4200     default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4201     }
4202   }
4203   PetscCall(PetscLogFlops(2.0 * cnt));
4204   PetscCall(VecRestoreArray(xx, &x));
4205   PetscCall(VecRestoreArrayRead(bb, &b));
4206   PetscFunctionReturn(0);
4207 }
4208 
4209 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) {
4210   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4211 
4212   PetscFunctionBegin;
4213   a->inode.node_count       = 0;
4214   a->inode.use              = PETSC_FALSE;
4215   a->inode.checked          = PETSC_FALSE;
4216   a->inode.mat_nonzerostate = -1;
4217   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4218   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4219   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4220   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4221   A->ops->coloringpatch     = NULL;
4222   A->ops->multdiagonalblock = NULL;
4223   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4224   PetscFunctionReturn(0);
4225 }
4226 
4227 /*
4228     samestructure indicates that the matrix has not changed its nonzero structure so we
4229     do not need to recompute the inodes
4230 */
4231 PetscErrorCode MatSeqAIJCheckInode(Mat A) {
4232   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4233   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4234   PetscBool       flag;
4235   const PetscInt *idx, *idy, *ii;
4236 
4237   PetscFunctionBegin;
4238   if (!a->inode.use) {
4239     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4240     PetscCall(PetscFree(a->inode.size));
4241     PetscFunctionReturn(0);
4242   }
4243   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4244 
4245   m = A->rmap->n;
4246   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4247   ns = a->inode.size;
4248 
4249   i          = 0;
4250   node_count = 0;
4251   idx        = a->j;
4252   ii         = a->i;
4253   while (i < m) {            /* For each row */
4254     nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4255     /* Limits the number of elements in a node to 'a->inode.limit' */
4256     for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4257       nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4258       if (nzy != nzx) break;
4259       idy += nzx; /* Same nonzero pattern */
4260       PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4261       if (!flag) break;
4262     }
4263     ns[node_count++] = blk_size;
4264     idx += blk_size * nzx;
4265     i = j;
4266   }
4267 
4268   /* If not enough inodes found,, do not use inode version of the routines */
4269   if (!m || node_count > .8 * m) {
4270     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4271     PetscCall(PetscFree(a->inode.size));
4272     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4273   } else {
4274     if (!A->factortype) {
4275       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4276       if (A->rmap->n == A->cmap->n) {
4277         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4278         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4279         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4280         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4281         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4282       }
4283     } else {
4284       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4285     }
4286     a->inode.node_count = node_count;
4287     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4288   }
4289   a->inode.checked          = PETSC_TRUE;
4290   a->inode.mat_nonzerostate = A->nonzerostate;
4291   PetscFunctionReturn(0);
4292 }
4293 
4294 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) {
4295   Mat         B = *C;
4296   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4297   PetscInt    m = A->rmap->n;
4298 
4299   PetscFunctionBegin;
4300   c->inode.use              = a->inode.use;
4301   c->inode.limit            = a->inode.limit;
4302   c->inode.max_limit        = a->inode.max_limit;
4303   c->inode.checked          = PETSC_FALSE;
4304   c->inode.size             = NULL;
4305   c->inode.node_count       = 0;
4306   c->inode.ibdiagvalid      = PETSC_FALSE;
4307   c->inode.ibdiag           = NULL;
4308   c->inode.bdiag            = NULL;
4309   c->inode.mat_nonzerostate = -1;
4310   if (a->inode.use) {
4311     if (a->inode.checked && a->inode.size) {
4312       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4313       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4314 
4315       c->inode.checked          = PETSC_TRUE;
4316       c->inode.node_count       = a->inode.node_count;
4317       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4318     }
4319     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4320     if (!B->factortype) {
4321       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4322       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4323       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4324       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4325       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4326       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4327     } else {
4328       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4329     }
4330   }
4331   PetscFunctionReturn(0);
4332 }
4333 
4334 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) {
4335   PetscInt        k;
4336   const PetscInt *vi;
4337 
4338   PetscFunctionBegin;
4339   vi = aj + ai[row];
4340   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4341   vi        = aj + adiag[row];
4342   cols[nzl] = vi[0];
4343   vi        = aj + adiag[row + 1] + 1;
4344   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4345   PetscFunctionReturn(0);
4346 }
4347 /*
4348    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4349    Modified from MatSeqAIJCheckInode().
4350 
4351    Input Parameters:
4352 .  Mat A - ILU or LU matrix factor
4353 
4354 */
4355 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) {
4356   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4357   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4358   PetscInt       *cols1, *cols2, *ns;
4359   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4360   PetscBool       flag;
4361 
4362   PetscFunctionBegin;
4363   if (!a->inode.use) PetscFunctionReturn(0);
4364   if (a->inode.checked) PetscFunctionReturn(0);
4365 
4366   m = A->rmap->n;
4367   if (a->inode.size) ns = a->inode.size;
4368   else PetscCall(PetscMalloc1(m + 1, &ns));
4369 
4370   i          = 0;
4371   node_count = 0;
4372   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4373   while (i < m) {                       /* For each row */
4374     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4375     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4376     nzx  = nzl1 + nzu1 + 1;
4377     MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i);
4378 
4379     /* Limits the number of elements in a node to 'a->inode.limit' */
4380     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4381       nzl2 = ai[j + 1] - ai[j];
4382       nzu2 = adiag[j] - adiag[j + 1] - 1;
4383       nzy  = nzl2 + nzu2 + 1;
4384       if (nzy != nzx) break;
4385       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4386       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4387       if (!flag) break;
4388     }
4389     ns[node_count++] = blk_size;
4390     i                = j;
4391   }
4392   PetscCall(PetscFree2(cols1, cols2));
4393   /* If not enough inodes found,, do not use inode version of the routines */
4394   if (!m || node_count > .8 * m) {
4395     PetscCall(PetscFree(ns));
4396 
4397     a->inode.node_count = 0;
4398     a->inode.size       = NULL;
4399     a->inode.use        = PETSC_FALSE;
4400 
4401     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4402   } else {
4403     A->ops->mult              = NULL;
4404     A->ops->sor               = NULL;
4405     A->ops->multadd           = NULL;
4406     A->ops->getrowij          = NULL;
4407     A->ops->restorerowij      = NULL;
4408     A->ops->getcolumnij       = NULL;
4409     A->ops->restorecolumnij   = NULL;
4410     A->ops->coloringpatch     = NULL;
4411     A->ops->multdiagonalblock = NULL;
4412     a->inode.node_count       = node_count;
4413     a->inode.size             = ns;
4414 
4415     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4416   }
4417   a->inode.checked = PETSC_TRUE;
4418   PetscFunctionReturn(0);
4419 }
4420 
4421 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) {
4422   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4423 
4424   PetscFunctionBegin;
4425   a->inode.ibdiagvalid = PETSC_FALSE;
4426   PetscFunctionReturn(0);
4427 }
4428 
4429 /*
4430      This is really ugly. if inodes are used this replaces the
4431   permutations with ones that correspond to rows/cols of the matrix
4432   rather then inode blocks
4433 */
4434 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) {
4435   PetscFunctionBegin;
4436   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4437   PetscFunctionReturn(0);
4438 }
4439 
4440 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) {
4441   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4442   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4443   const PetscInt *ridx, *cidx;
4444   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4445   PetscInt        nslim_col, *ns_col;
4446   IS              ris = *rperm, cis = *cperm;
4447 
4448   PetscFunctionBegin;
4449   if (!a->inode.size) PetscFunctionReturn(0);           /* no inodes so return */
4450   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4451 
4452   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4453   PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
4454   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4455 
4456   PetscCall(ISGetIndices(ris, &ridx));
4457   PetscCall(ISGetIndices(cis, &cidx));
4458 
4459   /* Form the inode structure for the rows of permuted matric using inv perm*/
4460   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
4461 
4462   /* Construct the permutations for rows*/
4463   for (i = 0, row = 0; i < nslim_row; ++i) {
4464     indx      = ridx[i];
4465     start_val = tns[indx];
4466     end_val   = tns[indx + 1];
4467     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4468   }
4469 
4470   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4471   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
4472 
4473   /* Construct permutations for columns */
4474   for (i = 0, col = 0; i < nslim_col; ++i) {
4475     indx      = cidx[i];
4476     start_val = tns[indx];
4477     end_val   = tns[indx + 1];
4478     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4479   }
4480 
4481   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4482   PetscCall(ISSetPermutation(*rperm));
4483   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4484   PetscCall(ISSetPermutation(*cperm));
4485 
4486   PetscCall(ISRestoreIndices(ris, &ridx));
4487   PetscCall(ISRestoreIndices(cis, &cidx));
4488 
4489   PetscCall(PetscFree(ns_col));
4490   PetscCall(PetscFree2(permr, permc));
4491   PetscCall(ISDestroy(&cis));
4492   PetscCall(ISDestroy(&ris));
4493   PetscCall(PetscFree(tns));
4494   PetscFunctionReturn(0);
4495 }
4496 
4497 /*@C
4498    MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4499 
4500    Not Collective
4501 
4502    Input Parameter:
4503 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4504 
4505    Output Parameters:
4506 +  node_count - no of inodes present in the matrix.
4507 .  sizes      - an array of size node_count,with sizes of each inode.
4508 -  limit      - the max size used to generate the inodes.
4509 
4510    Level: advanced
4511 
4512    Note:
4513     This routine returns some internal storage information
4514    of the matrix, it is intended to be used by advanced users.
4515    It should be called after the matrix is assembled.
4516    The contents of the sizes[] array should not be changed.
4517    NULL may be passed for information not requested.
4518 
4519 .seealso: `MatGetInfo()`
4520 @*/
4521 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) {
4522   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4523 
4524   PetscFunctionBegin;
4525   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4526   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4527   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4528   PetscFunctionReturn(0);
4529 }
4530 
4531 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) {
4532   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4533 
4534   PetscFunctionBegin;
4535   if (node_count) *node_count = a->inode.node_count;
4536   if (sizes) *sizes = a->inode.size;
4537   if (limit) *limit = a->inode.limit;
4538   PetscFunctionReturn(0);
4539 }
4540