xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 3f7878b3ed00daeee6ae8d2839a0d41a4d3e79ca)
1 /*
2   This file provides high performance routines for the Inode format (compressed sparse row)
3   by taking advantage of rows with identical nonzero structure (I-nodes).
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #if defined(PETSC_HAVE_XMMINTRIN_H)
7   #include <xmmintrin.h>
8 #endif
9 
10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11 {
12   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
14 
15   PetscFunctionBegin;
16   n = A->cmap->n;
17   m = A->rmap->n;
18   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19   ns_row = a->inode.size_csr;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24     for (; count + 1 < n; count++, i++);
25     if (count < n) i++;
26     *size = i;
27     PetscFunctionReturn(PETSC_SUCCESS);
28   }
29   PetscCall(PetscMalloc1(n + 1, &ns_col));
30   ns_col[0] = 0;
31 
32   /* Use the same row structure wherever feasible. */
33   for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
34 
35   /* if m < n; pad up the remainder with inode_limit */
36   for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37   /* The last node is the odd ball. pad it up with the remaining rows; */
38   if (count < n) {
39     ns_col[i + 1] = ns_col[i] + (n - count);
40     i++;
41   } else if (count > n) {
42     /* Adjust for the over estimation */
43     ns_col[i] += n - count;
44   }
45   *size = i;
46   *ns   = ns_col;
47   PetscFunctionReturn(PETSC_SUCCESS);
48 }
49 
50 /*
51       This builds symmetric version of nonzero structure,
52 */
53 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54 {
55   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
56   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
57   PetscInt       *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
58   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
59 
60   PetscFunctionBegin;
61   nslim_row = a->inode.node_count;
62   m         = A->rmap->n;
63   n         = A->cmap->n;
64   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
65   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
66 
67   /* Use the row_inode as column_inode */
68   nslim_col = nslim_row;
69   ns_col    = ns_row;
70 
71   /* allocate space for reformatted inode structure */
72   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
73   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
74 
75   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
76     nsz = ns_col[i1 + 1] - ns_col[i1];
77     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
78   }
79   /* allocate space for row pointers */
80   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
81   *iia = ia;
82   PetscCall(PetscMalloc1(nslim_row + 1, &work));
83 
84   /* determine the number of columns in each row */
85   ia[0] = oshift;
86   for (i1 = 0; i1 < nslim_row; i1++) {
87     row  = ns_row[i1];
88     j    = aj + ai[row] + ishift;
89     jmax = aj + ai[row + 1] + ishift;
90     if (j == jmax) continue; /* empty row */
91     col = *j++ + ishift;
92     i2  = tvc[col];
93     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94       ia[i1 + 1]++;
95       ia[i2 + 1]++;
96       i2++; /* Start col of next node */
97       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98       i2 = tvc[col];
99     }
100     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101   }
102 
103   /* shift ia[i] to point to next row */
104   for (i1 = 1; i1 < nslim_row + 1; i1++) {
105     row = ia[i1 - 1];
106     ia[i1] += row;
107     work[i1 - 1] = row - oshift;
108   }
109 
110   /* allocate space for column pointers */
111   nz = ia[nslim_row] + (!ishift);
112   PetscCall(PetscMalloc1(nz, &ja));
113   *jja = ja;
114 
115   /* loop over lower triangular part putting into ja */
116   for (i1 = 0; i1 < nslim_row; i1++) {
117     row  = ns_row[i1];
118     j    = aj + ai[row] + ishift;
119     jmax = aj + ai[row + 1] + ishift;
120     if (j == jmax) continue; /* empty row */
121     col = *j++ + ishift;
122     i2  = tvc[col];
123     while (i2 < i1 && j < jmax) {
124       ja[work[i2]++] = i1 + oshift;
125       ja[work[i1]++] = i2 + oshift;
126       ++i2;
127       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128       i2 = tvc[col];
129     }
130     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131   }
132   PetscCall(PetscFree(work));
133   PetscCall(PetscFree2(tns, tvc));
134   PetscFunctionReturn(PETSC_SUCCESS);
135 }
136 
137 /*
138       This builds nonsymmetric version of nonzero structure,
139 */
140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141 {
142   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
143   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144   PetscInt       *tns, *tvc, nsz, i1, i2;
145   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
146 
147   PetscFunctionBegin;
148   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149   nslim_row = a->inode.node_count;
150   n         = A->cmap->n;
151 
152   /* Create The column_inode for this matrix */
153   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
154 
155   /* allocate space for reformatted column_inode structure */
156   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
158 
159   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160     nsz = ns_col[i1 + 1] - ns_col[i1];
161     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162   }
163   /* allocate space for row pointers */
164   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165   *iia = ia;
166   PetscCall(PetscMalloc1(nslim_row + 1, &work));
167 
168   /* determine the number of columns in each row */
169   ia[0] = oshift;
170   for (i1 = 0; i1 < nslim_row; i1++) {
171     row = ns_row[i1];
172     j   = aj + ai[row] + ishift;
173     nz  = ai[row + 1] - ai[row];
174     if (!nz) continue; /* empty row */
175     col = *j++ + ishift;
176     i2  = tvc[col];
177     while (nz-- > 0) { /* off-diagonal elements */
178       ia[i1 + 1]++;
179       i2++; /* Start col of next node */
180       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
181       if (nz > 0) i2 = tvc[col];
182     }
183   }
184 
185   /* shift ia[i] to point to next row */
186   for (i1 = 1; i1 < nslim_row + 1; i1++) {
187     row = ia[i1 - 1];
188     ia[i1] += row;
189     work[i1 - 1] = row - oshift;
190   }
191 
192   /* allocate space for column pointers */
193   nz = ia[nslim_row] + (!ishift);
194   PetscCall(PetscMalloc1(nz, &ja));
195   *jja = ja;
196 
197   /* loop over matrix putting into ja */
198   for (i1 = 0; i1 < nslim_row; i1++) {
199     row = ns_row[i1];
200     j   = aj + ai[row] + ishift;
201     nz  = ai[row + 1] - ai[row];
202     if (!nz) continue; /* empty row */
203     col = *j++ + ishift;
204     i2  = tvc[col];
205     while (nz-- > 0) {
206       ja[work[i1]++] = i2 + oshift;
207       ++i2;
208       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
209       if (nz > 0) i2 = tvc[col];
210     }
211   }
212   PetscCall(PetscFree(ns_col));
213   PetscCall(PetscFree(work));
214   PetscCall(PetscFree2(tns, tvc));
215   PetscFunctionReturn(PETSC_SUCCESS);
216 }
217 
218 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219 {
220   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
221 
222   PetscFunctionBegin;
223   if (n) *n = a->inode.node_count;
224   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
225   if (!blockcompressed) {
226     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
227   } else if (symmetric) {
228     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
229   } else {
230     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
231   }
232   PetscFunctionReturn(PETSC_SUCCESS);
233 }
234 
235 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236 {
237   PetscFunctionBegin;
238   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
239 
240   if (!blockcompressed) {
241     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
242   } else {
243     PetscCall(PetscFree(*ia));
244     PetscCall(PetscFree(*ja));
245   }
246   PetscFunctionReturn(PETSC_SUCCESS);
247 }
248 
249 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250 {
251   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253   PetscInt   *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
254 
255   PetscFunctionBegin;
256   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
257   nslim_row = a->inode.node_count;
258   n         = A->cmap->n;
259 
260   /* Create The column_inode for this matrix */
261   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
262 
263   /* allocate space for reformatted column_inode structure */
264   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
266 
267   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268     nsz = ns_col[i1 + 1] - ns_col[i1];
269     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
270   }
271   /* allocate space for column pointers */
272   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
273   *iia = ia;
274   PetscCall(PetscMalloc1(nslim_col + 1, &work));
275 
276   /* determine the number of columns in each row */
277   ia[0] = oshift;
278   for (i1 = 0; i1 < nslim_row; i1++) {
279     row = ns_row[i1];
280     j   = aj + ai[row] + ishift;
281     col = *j++ + ishift;
282     i2  = tvc[col];
283     nz  = ai[row + 1] - ai[row];
284     while (nz-- > 0) { /* off-diagonal elements */
285       /* ia[i1+1]++; */
286       ia[i2 + 1]++;
287       i2++;
288       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
289       if (nz > 0) i2 = tvc[col];
290     }
291   }
292 
293   /* shift ia[i] to point to next col */
294   for (i1 = 1; i1 < nslim_col + 1; i1++) {
295     col = ia[i1 - 1];
296     ia[i1] += col;
297     work[i1 - 1] = col - oshift;
298   }
299 
300   /* allocate space for column pointers */
301   nz = ia[nslim_col] + (!ishift);
302   PetscCall(PetscMalloc1(nz, &ja));
303   *jja = ja;
304 
305   /* loop over matrix putting into ja */
306   for (i1 = 0; i1 < nslim_row; i1++) {
307     row = ns_row[i1];
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row + 1] - ai[row];
312     while (nz-- > 0) {
313       /* ja[work[i1]++] = i2 + oshift; */
314       ja[work[i2]++] = i1 + oshift;
315       i2++;
316       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320   PetscCall(PetscFree(ns_col));
321   PetscCall(PetscFree(work));
322   PetscCall(PetscFree2(tns, tvc));
323   PetscFunctionReturn(PETSC_SUCCESS);
324 }
325 
326 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327 {
328   PetscFunctionBegin;
329   PetscCall(MatCreateColInode_Private(A, n, NULL));
330   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
331 
332   if (!blockcompressed) {
333     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
334   } else if (symmetric) {
335     /* Since the indices are symmetric it doesn't matter */
336     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
337   } else {
338     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
339   }
340   PetscFunctionReturn(PETSC_SUCCESS);
341 }
342 
343 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344 {
345   PetscFunctionBegin;
346   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
347   if (!blockcompressed) {
348     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
349   } else {
350     PetscCall(PetscFree(*ia));
351     PetscCall(PetscFree(*ja));
352   }
353   PetscFunctionReturn(PETSC_SUCCESS);
354 }
355 
356 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357 {
358   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
359   PetscScalar       *y;
360   const PetscScalar *x;
361   PetscInt           row, node_max, nonzerorow = 0;
362   PetscInt          *ns;
363 
364   PetscFunctionBegin;
365   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
366   node_max = a->inode.node_count;
367   ns       = a->inode.size_csr; /* Node Size array */
368   PetscCall(VecGetArrayRead(xx, &x));
369   PetscCall(VecGetArray(yy, &y));
370 
371   PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372   for (PetscInt i = 0; i < node_max; ++i) {
373     PetscInt         i1, i2, nsz, n, sz;
374     const MatScalar *v1, *v2, *v3, *v4, *v5;
375     PetscScalar      sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376     const PetscInt  *idx;
377 
378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380 #endif
381     row = ns[i];
382     nsz = ns[i + 1] - ns[i];
383     n   = a->i[row + 1] - a->i[row];
384     nonzerorow += (n > 0) * nsz;
385 
386     idx = &a->j[a->i[row]];
387     v1  = &a->a[a->i[row]];
388     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
389     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
390     sz = n;                                                                /* No of non zeros in this row */
391                                                                            /* Switch on the size of Node */
392     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
393     case 1:
394       sum1 = 0.;
395 
396       for (n = 0; n < sz - 1; n += 2) {
397         i1 = idx[0]; /* The instructions are ordered to */
398         i2 = idx[1]; /* make the compiler's job easy */
399         idx += 2;
400         tmp0 = x[i1];
401         tmp1 = x[i2];
402         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
403         v1 += 2;
404       }
405 
406       if (n == sz - 1) { /* Take care of the last nonzero  */
407         tmp0 = x[*idx++];
408         sum1 += *v1++ * tmp0;
409       }
410       y[row++] = sum1;
411       break;
412     case 2:
413       sum1 = 0.;
414       sum2 = 0.;
415       v2   = v1 + n;
416 
417       for (n = 0; n < sz - 1; n += 2) {
418         i1 = idx[0];
419         i2 = idx[1];
420         idx += 2;
421         tmp0 = x[i1];
422         tmp1 = x[i2];
423         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
424         v1 += 2;
425         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
426         v2 += 2;
427       }
428       if (n == sz - 1) {
429         tmp0 = x[*idx++];
430         sum1 += *v1++ * tmp0;
431         sum2 += *v2++ * tmp0;
432       }
433       y[row++] = sum1;
434       y[row++] = sum2;
435       v1       = v2; /* Since the next block to be processed starts there*/
436       idx += sz;
437       break;
438     case 3:
439       sum1 = 0.;
440       sum2 = 0.;
441       sum3 = 0.;
442       v2   = v1 + n;
443       v3   = v2 + n;
444 
445       for (n = 0; n < sz - 1; n += 2) {
446         i1 = idx[0];
447         i2 = idx[1];
448         idx += 2;
449         tmp0 = x[i1];
450         tmp1 = x[i2];
451         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
452         v1 += 2;
453         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
454         v2 += 2;
455         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
456         v3 += 2;
457       }
458       if (n == sz - 1) {
459         tmp0 = x[*idx++];
460         sum1 += *v1++ * tmp0;
461         sum2 += *v2++ * tmp0;
462         sum3 += *v3++ * tmp0;
463       }
464       y[row++] = sum1;
465       y[row++] = sum2;
466       y[row++] = sum3;
467       v1       = v3; /* Since the next block to be processed starts there*/
468       idx += 2 * sz;
469       break;
470     case 4:
471       sum1 = 0.;
472       sum2 = 0.;
473       sum3 = 0.;
474       sum4 = 0.;
475       v2   = v1 + n;
476       v3   = v2 + n;
477       v4   = v3 + n;
478 
479       for (n = 0; n < sz - 1; n += 2) {
480         i1 = idx[0];
481         i2 = idx[1];
482         idx += 2;
483         tmp0 = x[i1];
484         tmp1 = x[i2];
485         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
486         v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
488         v2 += 2;
489         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
490         v3 += 2;
491         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
492         v4 += 2;
493       }
494       if (n == sz - 1) {
495         tmp0 = x[*idx++];
496         sum1 += *v1++ * tmp0;
497         sum2 += *v2++ * tmp0;
498         sum3 += *v3++ * tmp0;
499         sum4 += *v4++ * tmp0;
500       }
501       y[row++] = sum1;
502       y[row++] = sum2;
503       y[row++] = sum3;
504       y[row++] = sum4;
505       v1       = v4; /* Since the next block to be processed starts there*/
506       idx += 3 * sz;
507       break;
508     case 5:
509       sum1 = 0.;
510       sum2 = 0.;
511       sum3 = 0.;
512       sum4 = 0.;
513       sum5 = 0.;
514       v2   = v1 + n;
515       v3   = v2 + n;
516       v4   = v3 + n;
517       v5   = v4 + n;
518 
519       for (n = 0; n < sz - 1; n += 2) {
520         i1 = idx[0];
521         i2 = idx[1];
522         idx += 2;
523         tmp0 = x[i1];
524         tmp1 = x[i2];
525         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
526         v1 += 2;
527         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
528         v2 += 2;
529         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
530         v3 += 2;
531         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
532         v4 += 2;
533         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
534         v5 += 2;
535       }
536       if (n == sz - 1) {
537         tmp0 = x[*idx++];
538         sum1 += *v1++ * tmp0;
539         sum2 += *v2++ * tmp0;
540         sum3 += *v3++ * tmp0;
541         sum4 += *v4++ * tmp0;
542         sum5 += *v5++ * tmp0;
543       }
544       y[row++] = sum1;
545       y[row++] = sum2;
546       y[row++] = sum3;
547       y[row++] = sum4;
548       y[row++] = sum5;
549       v1       = v5; /* Since the next block to be processed starts there */
550       idx += 4 * sz;
551       break;
552     default:
553       SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
554     }
555   }
556   PetscCall(VecRestoreArrayRead(xx, &x));
557   PetscCall(VecRestoreArray(yy, &y));
558   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
559   PetscFunctionReturn(PETSC_SUCCESS);
560 }
561 
562 /* Almost same code as the MatMult_SeqAIJ_Inode() */
563 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564 {
565   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
566   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
567   const MatScalar   *v1, *v2, *v3, *v4, *v5;
568   const PetscScalar *x;
569   PetscScalar       *y, *z, *zt;
570   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
571   const PetscInt    *idx, *ns, *ii;
572 
573   PetscFunctionBegin;
574   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
575   node_max = a->inode.node_count;
576   ns       = a->inode.size_csr; /* Node Size array */
577 
578   PetscCall(VecGetArrayRead(xx, &x));
579   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
580   zt = z;
581 
582   idx = a->j;
583   v1  = a->a;
584   ii  = a->i;
585 
586   for (i = 0; i < node_max; ++i) {
587     row = ns[i];
588     nsz = ns[i + 1] - ns[i];
589     n   = ii[1] - ii[0];
590     ii += nsz;
591     sz = n;        /* No of non zeros in this row */
592                    /* Switch on the size of Node */
593     switch (nsz) { /* Each loop in 'case' is unrolled */
594     case 1:
595       sum1 = *zt++;
596 
597       for (n = 0; n < sz - 1; n += 2) {
598         i1 = idx[0]; /* The instructions are ordered to */
599         i2 = idx[1]; /* make the compiler's job easy */
600         idx += 2;
601         tmp0 = x[i1];
602         tmp1 = x[i2];
603         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
604         v1 += 2;
605       }
606 
607       if (n == sz - 1) { /* Take care of the last nonzero  */
608         tmp0 = x[*idx++];
609         sum1 += *v1++ * tmp0;
610       }
611       y[row++] = sum1;
612       break;
613     case 2:
614       sum1 = *zt++;
615       sum2 = *zt++;
616       v2   = v1 + n;
617 
618       for (n = 0; n < sz - 1; n += 2) {
619         i1 = idx[0];
620         i2 = idx[1];
621         idx += 2;
622         tmp0 = x[i1];
623         tmp1 = x[i2];
624         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
625         v1 += 2;
626         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
627         v2 += 2;
628       }
629       if (n == sz - 1) {
630         tmp0 = x[*idx++];
631         sum1 += *v1++ * tmp0;
632         sum2 += *v2++ * tmp0;
633       }
634       y[row++] = sum1;
635       y[row++] = sum2;
636       v1       = v2; /* Since the next block to be processed starts there*/
637       idx += sz;
638       break;
639     case 3:
640       sum1 = *zt++;
641       sum2 = *zt++;
642       sum3 = *zt++;
643       v2   = v1 + n;
644       v3   = v2 + n;
645 
646       for (n = 0; n < sz - 1; n += 2) {
647         i1 = idx[0];
648         i2 = idx[1];
649         idx += 2;
650         tmp0 = x[i1];
651         tmp1 = x[i2];
652         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
653         v1 += 2;
654         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
655         v2 += 2;
656         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
657         v3 += 2;
658       }
659       if (n == sz - 1) {
660         tmp0 = x[*idx++];
661         sum1 += *v1++ * tmp0;
662         sum2 += *v2++ * tmp0;
663         sum3 += *v3++ * tmp0;
664       }
665       y[row++] = sum1;
666       y[row++] = sum2;
667       y[row++] = sum3;
668       v1       = v3; /* Since the next block to be processed starts there*/
669       idx += 2 * sz;
670       break;
671     case 4:
672       sum1 = *zt++;
673       sum2 = *zt++;
674       sum3 = *zt++;
675       sum4 = *zt++;
676       v2   = v1 + n;
677       v3   = v2 + n;
678       v4   = v3 + n;
679 
680       for (n = 0; n < sz - 1; n += 2) {
681         i1 = idx[0];
682         i2 = idx[1];
683         idx += 2;
684         tmp0 = x[i1];
685         tmp1 = x[i2];
686         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
687         v1 += 2;
688         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
689         v2 += 2;
690         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
691         v3 += 2;
692         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
693         v4 += 2;
694       }
695       if (n == sz - 1) {
696         tmp0 = x[*idx++];
697         sum1 += *v1++ * tmp0;
698         sum2 += *v2++ * tmp0;
699         sum3 += *v3++ * tmp0;
700         sum4 += *v4++ * tmp0;
701       }
702       y[row++] = sum1;
703       y[row++] = sum2;
704       y[row++] = sum3;
705       y[row++] = sum4;
706       v1       = v4; /* Since the next block to be processed starts there*/
707       idx += 3 * sz;
708       break;
709     case 5:
710       sum1 = *zt++;
711       sum2 = *zt++;
712       sum3 = *zt++;
713       sum4 = *zt++;
714       sum5 = *zt++;
715       v2   = v1 + n;
716       v3   = v2 + n;
717       v4   = v3 + n;
718       v5   = v4 + n;
719 
720       for (n = 0; n < sz - 1; n += 2) {
721         i1 = idx[0];
722         i2 = idx[1];
723         idx += 2;
724         tmp0 = x[i1];
725         tmp1 = x[i2];
726         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
727         v1 += 2;
728         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
729         v2 += 2;
730         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
731         v3 += 2;
732         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
733         v4 += 2;
734         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
735         v5 += 2;
736       }
737       if (n == sz - 1) {
738         tmp0 = x[*idx++];
739         sum1 += *v1++ * tmp0;
740         sum2 += *v2++ * tmp0;
741         sum3 += *v3++ * tmp0;
742         sum4 += *v4++ * tmp0;
743         sum5 += *v5++ * tmp0;
744       }
745       y[row++] = sum1;
746       y[row++] = sum2;
747       y[row++] = sum3;
748       y[row++] = sum4;
749       y[row++] = sum5;
750       v1       = v5; /* Since the next block to be processed starts there */
751       idx += 4 * sz;
752       break;
753     default:
754       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
755     }
756   }
757   PetscCall(VecRestoreArrayRead(xx, &x));
758   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
759   PetscCall(PetscLogFlops(2.0 * a->nz));
760   PetscFunctionReturn(PETSC_SUCCESS);
761 }
762 
763 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764 {
765   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
766   IS                 iscol = a->col, isrow = a->row;
767   const PetscInt    *r, *c, *rout, *cout;
768   PetscInt           i, j, n = A->rmap->n, nz;
769   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
770   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
772   PetscScalar        sum1, sum2, sum3, sum4, sum5;
773   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774   const PetscScalar *b;
775 
776   PetscFunctionBegin;
777   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
778   node_max = a->inode.node_count;
779   ns       = a->inode.size_csr; /* Node Size array */
780 
781   PetscCall(VecGetArrayRead(bb, &b));
782   PetscCall(VecGetArrayWrite(xx, &x));
783   tmp = a->solve_work;
784 
785   PetscCall(ISGetIndices(isrow, &rout));
786   r = rout;
787   PetscCall(ISGetIndices(iscol, &cout));
788   c = cout + (n - 1);
789 
790   /* forward solve the lower triangular */
791   tmps = tmp;
792   aa   = a_a;
793   aj   = a_j;
794   ad   = a->diag;
795 
796   for (i = 0, row = 0; i < node_max; ++i) {
797     row = ns[i];
798     nsz = ns[i + 1] - ns[i];
799     aii = ai[row];
800     v1  = aa + aii;
801     vi  = aj + aii;
802     nz  = ad[row] - aii;
803     if (i < node_max - 1) {
804       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
805       * but our indexing to determine its size could. */
806       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
807       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
808       PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
809       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
810     }
811 
812     switch (nsz) { /* Each loop in 'case' is unrolled */
813     case 1:
814       sum1 = b[*r++];
815       for (j = 0; j < nz - 1; j += 2) {
816         i0 = vi[0];
817         i1 = vi[1];
818         vi += 2;
819         tmp0 = tmps[i0];
820         tmp1 = tmps[i1];
821         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
822         v1 += 2;
823       }
824       if (j == nz - 1) {
825         tmp0 = tmps[*vi++];
826         sum1 -= *v1++ * tmp0;
827       }
828       tmp[row++] = sum1;
829       break;
830     case 2:
831       sum1 = b[*r++];
832       sum2 = b[*r++];
833       v2   = aa + ai[row + 1];
834 
835       for (j = 0; j < nz - 1; j += 2) {
836         i0 = vi[0];
837         i1 = vi[1];
838         vi += 2;
839         tmp0 = tmps[i0];
840         tmp1 = tmps[i1];
841         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
842         v1 += 2;
843         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
844         v2 += 2;
845       }
846       if (j == nz - 1) {
847         tmp0 = tmps[*vi++];
848         sum1 -= *v1++ * tmp0;
849         sum2 -= *v2++ * tmp0;
850       }
851       sum2 -= *v2++ * sum1;
852       tmp[row++] = sum1;
853       tmp[row++] = sum2;
854       break;
855     case 3:
856       sum1 = b[*r++];
857       sum2 = b[*r++];
858       sum3 = b[*r++];
859       v2   = aa + ai[row + 1];
860       v3   = aa + ai[row + 2];
861 
862       for (j = 0; j < nz - 1; j += 2) {
863         i0 = vi[0];
864         i1 = vi[1];
865         vi += 2;
866         tmp0 = tmps[i0];
867         tmp1 = tmps[i1];
868         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
869         v1 += 2;
870         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
871         v2 += 2;
872         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
873         v3 += 2;
874       }
875       if (j == nz - 1) {
876         tmp0 = tmps[*vi++];
877         sum1 -= *v1++ * tmp0;
878         sum2 -= *v2++ * tmp0;
879         sum3 -= *v3++ * tmp0;
880       }
881       sum2 -= *v2++ * sum1;
882       sum3 -= *v3++ * sum1;
883       sum3 -= *v3++ * sum2;
884 
885       tmp[row++] = sum1;
886       tmp[row++] = sum2;
887       tmp[row++] = sum3;
888       break;
889 
890     case 4:
891       sum1 = b[*r++];
892       sum2 = b[*r++];
893       sum3 = b[*r++];
894       sum4 = b[*r++];
895       v2   = aa + ai[row + 1];
896       v3   = aa + ai[row + 2];
897       v4   = aa + ai[row + 3];
898 
899       for (j = 0; j < nz - 1; j += 2) {
900         i0 = vi[0];
901         i1 = vi[1];
902         vi += 2;
903         tmp0 = tmps[i0];
904         tmp1 = tmps[i1];
905         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
906         v1 += 2;
907         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
908         v2 += 2;
909         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
910         v3 += 2;
911         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
912         v4 += 2;
913       }
914       if (j == nz - 1) {
915         tmp0 = tmps[*vi++];
916         sum1 -= *v1++ * tmp0;
917         sum2 -= *v2++ * tmp0;
918         sum3 -= *v3++ * tmp0;
919         sum4 -= *v4++ * tmp0;
920       }
921       sum2 -= *v2++ * sum1;
922       sum3 -= *v3++ * sum1;
923       sum4 -= *v4++ * sum1;
924       sum3 -= *v3++ * sum2;
925       sum4 -= *v4++ * sum2;
926       sum4 -= *v4++ * sum3;
927 
928       tmp[row++] = sum1;
929       tmp[row++] = sum2;
930       tmp[row++] = sum3;
931       tmp[row++] = sum4;
932       break;
933     case 5:
934       sum1 = b[*r++];
935       sum2 = b[*r++];
936       sum3 = b[*r++];
937       sum4 = b[*r++];
938       sum5 = b[*r++];
939       v2   = aa + ai[row + 1];
940       v3   = aa + ai[row + 2];
941       v4   = aa + ai[row + 3];
942       v5   = aa + ai[row + 4];
943 
944       for (j = 0; j < nz - 1; j += 2) {
945         i0 = vi[0];
946         i1 = vi[1];
947         vi += 2;
948         tmp0 = tmps[i0];
949         tmp1 = tmps[i1];
950         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
951         v1 += 2;
952         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
953         v2 += 2;
954         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
955         v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
957         v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
959         v5 += 2;
960       }
961       if (j == nz - 1) {
962         tmp0 = tmps[*vi++];
963         sum1 -= *v1++ * tmp0;
964         sum2 -= *v2++ * tmp0;
965         sum3 -= *v3++ * tmp0;
966         sum4 -= *v4++ * tmp0;
967         sum5 -= *v5++ * tmp0;
968       }
969 
970       sum2 -= *v2++ * sum1;
971       sum3 -= *v3++ * sum1;
972       sum4 -= *v4++ * sum1;
973       sum5 -= *v5++ * sum1;
974       sum3 -= *v3++ * sum2;
975       sum4 -= *v4++ * sum2;
976       sum5 -= *v5++ * sum2;
977       sum4 -= *v4++ * sum3;
978       sum5 -= *v5++ * sum3;
979       sum5 -= *v5++ * sum4;
980 
981       tmp[row++] = sum1;
982       tmp[row++] = sum2;
983       tmp[row++] = sum3;
984       tmp[row++] = sum4;
985       tmp[row++] = sum5;
986       break;
987     default:
988       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
989     }
990   }
991   /* backward solve the upper triangular */
992   for (i = node_max - 1; i >= 0; i--) {
993     row = ns[i + 1];
994     nsz = ns[i + 1] - ns[i];
995     aii = ai[row + 1] - 1;
996     v1  = aa + aii;
997     vi  = aj + aii;
998     nz  = aii - ad[row];
999     switch (nsz) { /* Each loop in 'case' is unrolled */
1000     case 1:
1001       sum1 = tmp[row];
1002 
1003       for (j = nz; j > 1; j -= 2) {
1004         vi -= 2;
1005         i0   = vi[2];
1006         i1   = vi[1];
1007         tmp0 = tmps[i0];
1008         tmp1 = tmps[i1];
1009         v1 -= 2;
1010         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1011       }
1012       if (j == 1) {
1013         tmp0 = tmps[*vi--];
1014         sum1 -= *v1-- * tmp0;
1015       }
1016       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1017       row--;
1018       break;
1019     case 2:
1020       sum1 = tmp[row];
1021       sum2 = tmp[row - 1];
1022       v2   = aa + ai[row] - 1;
1023       for (j = nz; j > 1; j -= 2) {
1024         vi -= 2;
1025         i0   = vi[2];
1026         i1   = vi[1];
1027         tmp0 = tmps[i0];
1028         tmp1 = tmps[i1];
1029         v1 -= 2;
1030         v2 -= 2;
1031         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1032         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1033       }
1034       if (j == 1) {
1035         tmp0 = tmps[*vi--];
1036         sum1 -= *v1-- * tmp0;
1037         sum2 -= *v2-- * tmp0;
1038       }
1039 
1040       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1041       row--;
1042       sum2 -= *v2-- * tmp0;
1043       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1044       row--;
1045       break;
1046     case 3:
1047       sum1 = tmp[row];
1048       sum2 = tmp[row - 1];
1049       sum3 = tmp[row - 2];
1050       v2   = aa + ai[row] - 1;
1051       v3   = aa + ai[row - 1] - 1;
1052       for (j = nz; j > 1; j -= 2) {
1053         vi -= 2;
1054         i0   = vi[2];
1055         i1   = vi[1];
1056         tmp0 = tmps[i0];
1057         tmp1 = tmps[i1];
1058         v1 -= 2;
1059         v2 -= 2;
1060         v3 -= 2;
1061         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064       }
1065       if (j == 1) {
1066         tmp0 = tmps[*vi--];
1067         sum1 -= *v1-- * tmp0;
1068         sum2 -= *v2-- * tmp0;
1069         sum3 -= *v3-- * tmp0;
1070       }
1071       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1072       row--;
1073       sum2 -= *v2-- * tmp0;
1074       sum3 -= *v3-- * tmp0;
1075       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1076       row--;
1077       sum3 -= *v3-- * tmp0;
1078       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1079       row--;
1080 
1081       break;
1082     case 4:
1083       sum1 = tmp[row];
1084       sum2 = tmp[row - 1];
1085       sum3 = tmp[row - 2];
1086       sum4 = tmp[row - 3];
1087       v2   = aa + ai[row] - 1;
1088       v3   = aa + ai[row - 1] - 1;
1089       v4   = aa + ai[row - 2] - 1;
1090 
1091       for (j = nz; j > 1; j -= 2) {
1092         vi -= 2;
1093         i0   = vi[2];
1094         i1   = vi[1];
1095         tmp0 = tmps[i0];
1096         tmp1 = tmps[i1];
1097         v1 -= 2;
1098         v2 -= 2;
1099         v3 -= 2;
1100         v4 -= 2;
1101         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1102         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1103         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1104         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1105       }
1106       if (j == 1) {
1107         tmp0 = tmps[*vi--];
1108         sum1 -= *v1-- * tmp0;
1109         sum2 -= *v2-- * tmp0;
1110         sum3 -= *v3-- * tmp0;
1111         sum4 -= *v4-- * tmp0;
1112       }
1113 
1114       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1115       row--;
1116       sum2 -= *v2-- * tmp0;
1117       sum3 -= *v3-- * tmp0;
1118       sum4 -= *v4-- * tmp0;
1119       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1120       row--;
1121       sum3 -= *v3-- * tmp0;
1122       sum4 -= *v4-- * tmp0;
1123       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1124       row--;
1125       sum4 -= *v4-- * tmp0;
1126       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1127       row--;
1128       break;
1129     case 5:
1130       sum1 = tmp[row];
1131       sum2 = tmp[row - 1];
1132       sum3 = tmp[row - 2];
1133       sum4 = tmp[row - 3];
1134       sum5 = tmp[row - 4];
1135       v2   = aa + ai[row] - 1;
1136       v3   = aa + ai[row - 1] - 1;
1137       v4   = aa + ai[row - 2] - 1;
1138       v5   = aa + ai[row - 3] - 1;
1139       for (j = nz; j > 1; j -= 2) {
1140         vi -= 2;
1141         i0   = vi[2];
1142         i1   = vi[1];
1143         tmp0 = tmps[i0];
1144         tmp1 = tmps[i1];
1145         v1 -= 2;
1146         v2 -= 2;
1147         v3 -= 2;
1148         v4 -= 2;
1149         v5 -= 2;
1150         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1151         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1152         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1153         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1154         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1155       }
1156       if (j == 1) {
1157         tmp0 = tmps[*vi--];
1158         sum1 -= *v1-- * tmp0;
1159         sum2 -= *v2-- * tmp0;
1160         sum3 -= *v3-- * tmp0;
1161         sum4 -= *v4-- * tmp0;
1162         sum5 -= *v5-- * tmp0;
1163       }
1164 
1165       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1166       row--;
1167       sum2 -= *v2-- * tmp0;
1168       sum3 -= *v3-- * tmp0;
1169       sum4 -= *v4-- * tmp0;
1170       sum5 -= *v5-- * tmp0;
1171       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1172       row--;
1173       sum3 -= *v3-- * tmp0;
1174       sum4 -= *v4-- * tmp0;
1175       sum5 -= *v5-- * tmp0;
1176       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1177       row--;
1178       sum4 -= *v4-- * tmp0;
1179       sum5 -= *v5-- * tmp0;
1180       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1181       row--;
1182       sum5 -= *v5-- * tmp0;
1183       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1184       row--;
1185       break;
1186     default:
1187       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1188     }
1189   }
1190   PetscCall(ISRestoreIndices(isrow, &rout));
1191   PetscCall(ISRestoreIndices(iscol, &cout));
1192   PetscCall(VecRestoreArrayRead(bb, &b));
1193   PetscCall(VecRestoreArrayWrite(xx, &x));
1194   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1195   PetscFunctionReturn(PETSC_SUCCESS);
1196 }
1197 
1198 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199 {
1200   Mat              C = B;
1201   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1202   IS               isrow = b->row, isicol = b->icol;
1203   const PetscInt  *r, *ic, *ics;
1204   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1205   PetscInt         i, j, k, nz, nzL, row, *pj;
1206   const PetscInt  *ajtmp, *bjtmp;
1207   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1208   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1209   FactorShiftCtx   sctx;
1210   const PetscInt  *ddiag;
1211   PetscReal        rs;
1212   MatScalar        d;
1213   PetscInt         inod, nodesz, node_max, col;
1214   const PetscInt  *ns;
1215   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1216 
1217   PetscFunctionBegin;
1218   /* MatPivotSetUp(): initialize shift context sctx */
1219   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1220 
1221   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1222     ddiag          = a->diag;
1223     sctx.shift_top = info->zeropivot;
1224     for (i = 0; i < n; i++) {
1225       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1226       d  = (aa)[ddiag[i]];
1227       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1228       v  = aa + ai[i];
1229       nz = ai[i + 1] - ai[i];
1230       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1231       if (rs > sctx.shift_top) sctx.shift_top = rs;
1232     }
1233     sctx.shift_top *= 1.1;
1234     sctx.nshift_max = 5;
1235     sctx.shift_lo   = 0.;
1236     sctx.shift_hi   = 1.;
1237   }
1238 
1239   PetscCall(ISGetIndices(isrow, &r));
1240   PetscCall(ISGetIndices(isicol, &ic));
1241 
1242   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1243   ics = ic;
1244 
1245   node_max = a->inode.node_count;
1246   ns       = a->inode.size_csr;
1247   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1248 
1249   /* If max inode size > 4, split it into two inodes.*/
1250   /* also map the inode sizes according to the ordering */
1251   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1252   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1253     nodesz = ns[i + 1] - ns[i];
1254     if (nodesz > 4) {
1255       tmp_vec1[j] = 4;
1256       ++j;
1257       tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
1258     } else {
1259       tmp_vec1[j] = nodesz;
1260     }
1261   }
1262   /* Use the correct node_max */
1263   node_max = j;
1264 
1265   /* Now reorder the inode info based on mat re-ordering info */
1266   /* First create a row -> inode_size_array_index map */
1267   PetscCall(PetscMalloc1(n + 1, &nsmap));
1268   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1269   tmp_vec2[0] = 0;
1270   for (i = 0, row = 0; i < node_max; i++) {
1271     nodesz = tmp_vec1[i];
1272     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1273   }
1274   /* Using nsmap, create a reordered ns structure */
1275   for (i = 0, j = 0; i < node_max; i++) {
1276     nodesz          = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1277     tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
1278     j += nodesz;
1279   }
1280   PetscCall(PetscFree(nsmap));
1281   PetscCall(PetscFree(tmp_vec1));
1282 
1283   /* Now use the correct ns */
1284   ns = tmp_vec2;
1285 
1286   do {
1287     sctx.newshift = PETSC_FALSE;
1288     /* Now loop over each block-row, and do the factorization */
1289     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1290       nodesz = ns[inod + 1] - ns[inod];
1291 
1292       switch (nodesz) {
1293       case 1:
1294         /* zero rtmp1 */
1295         /* L part */
1296         nz    = bi[i + 1] - bi[i];
1297         bjtmp = bj + bi[i];
1298         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1299 
1300         /* U part */
1301         nz    = bdiag[i] - bdiag[i + 1];
1302         bjtmp = bj + bdiag[i + 1] + 1;
1303         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1304 
1305         /* load in initial (unfactored row) */
1306         nz    = ai[r[i] + 1] - ai[r[i]];
1307         ajtmp = aj + ai[r[i]];
1308         v     = aa + ai[r[i]];
1309         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1310 
1311         /* ZeropivotApply() */
1312         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1313 
1314         /* elimination */
1315         bjtmp = bj + bi[i];
1316         row   = *bjtmp++;
1317         nzL   = bi[i + 1] - bi[i];
1318         for (k = 0; k < nzL; k++) {
1319           pc = rtmp1 + row;
1320           if (*pc != 0.0) {
1321             pv   = b->a + bdiag[row];
1322             mul1 = *pc * (*pv);
1323             *pc  = mul1;
1324             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1325             pv   = b->a + bdiag[row + 1] + 1;
1326             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1328             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1329           }
1330           row = *bjtmp++;
1331         }
1332 
1333         /* finished row so stick it into b->a */
1334         rs = 0.0;
1335         /* L part */
1336         pv = b->a + bi[i];
1337         pj = b->j + bi[i];
1338         nz = bi[i + 1] - bi[i];
1339         for (j = 0; j < nz; j++) {
1340           pv[j] = rtmp1[pj[j]];
1341           rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* U part */
1345         pv = b->a + bdiag[i + 1] + 1;
1346         pj = b->j + bdiag[i + 1] + 1;
1347         nz = bdiag[i] - bdiag[i + 1] - 1;
1348         for (j = 0; j < nz; j++) {
1349           pv[j] = rtmp1[pj[j]];
1350           rs += PetscAbsScalar(pv[j]);
1351         }
1352 
1353         /* Check zero pivot */
1354         sctx.rs = rs;
1355         sctx.pv = rtmp1[i];
1356         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1357         if (sctx.newshift) break;
1358 
1359         /* Mark diagonal and invert diagonal for simpler triangular solves */
1360         pv  = b->a + bdiag[i];
1361         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1362         break;
1363 
1364       case 2:
1365         /* zero rtmp1 and rtmp2 */
1366         /* L part */
1367         nz    = bi[i + 1] - bi[i];
1368         bjtmp = bj + bi[i];
1369         for (j = 0; j < nz; j++) {
1370           col        = bjtmp[j];
1371           rtmp1[col] = 0.0;
1372           rtmp2[col] = 0.0;
1373         }
1374 
1375         /* U part */
1376         nz    = bdiag[i] - bdiag[i + 1];
1377         bjtmp = bj + bdiag[i + 1] + 1;
1378         for (j = 0; j < nz; j++) {
1379           col        = bjtmp[j];
1380           rtmp1[col] = 0.0;
1381           rtmp2[col] = 0.0;
1382         }
1383 
1384         /* load in initial (unfactored row) */
1385         nz    = ai[r[i] + 1] - ai[r[i]];
1386         ajtmp = aj + ai[r[i]];
1387         v1    = aa + ai[r[i]];
1388         v2    = aa + ai[r[i + 1]];
1389         for (j = 0; j < nz; j++) {
1390           col        = ics[ajtmp[j]];
1391           rtmp1[col] = v1[j];
1392           rtmp2[col] = v2[j];
1393         }
1394         /* ZeropivotApply(): shift the diagonal of the matrix  */
1395         rtmp1[i] += sctx.shift_amount;
1396         rtmp2[i + 1] += sctx.shift_amount;
1397 
1398         /* elimination */
1399         bjtmp = bj + bi[i];
1400         row   = *bjtmp++; /* pivot row */
1401         nzL   = bi[i + 1] - bi[i];
1402         for (k = 0; k < nzL; k++) {
1403           pc1 = rtmp1 + row;
1404           pc2 = rtmp2 + row;
1405           if (*pc1 != 0.0 || *pc2 != 0.0) {
1406             pv   = b->a + bdiag[row];
1407             mul1 = *pc1 * (*pv);
1408             mul2 = *pc2 * (*pv);
1409             *pc1 = mul1;
1410             *pc2 = mul2;
1411 
1412             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1413             pv = b->a + bdiag[row + 1] + 1;
1414             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1415             for (j = 0; j < nz; j++) {
1416               col = pj[j];
1417               rtmp1[col] -= mul1 * pv[j];
1418               rtmp2[col] -= mul2 * pv[j];
1419             }
1420             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1421           }
1422           row = *bjtmp++;
1423         }
1424 
1425         /* finished row i; check zero pivot, then stick row i into b->a */
1426         rs = 0.0;
1427         /* L part */
1428         pc1 = b->a + bi[i];
1429         pj  = b->j + bi[i];
1430         nz  = bi[i + 1] - bi[i];
1431         for (j = 0; j < nz; j++) {
1432           col    = pj[j];
1433           pc1[j] = rtmp1[col];
1434           rs += PetscAbsScalar(pc1[j]);
1435         }
1436         /* U part */
1437         pc1 = b->a + bdiag[i + 1] + 1;
1438         pj  = b->j + bdiag[i + 1] + 1;
1439         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1440         for (j = 0; j < nz; j++) {
1441           col    = pj[j];
1442           pc1[j] = rtmp1[col];
1443           rs += PetscAbsScalar(pc1[j]);
1444         }
1445 
1446         sctx.rs = rs;
1447         sctx.pv = rtmp1[i];
1448         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1449         if (sctx.newshift) break;
1450         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1451         *pc1 = 1.0 / sctx.pv;
1452 
1453         /* Now take care of diagonal 2x2 block. */
1454         pc2 = rtmp2 + i;
1455         if (*pc2 != 0.0) {
1456           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1457           *pc2 = mul1;                        /* insert L entry */
1458           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1459           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460           for (j = 0; j < nz; j++) {
1461             col = pj[j];
1462             rtmp2[col] -= mul1 * rtmp1[col];
1463           }
1464           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465         }
1466 
1467         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468         rs = 0.0;
1469         /* L part */
1470         pc2 = b->a + bi[i + 1];
1471         pj  = b->j + bi[i + 1];
1472         nz  = bi[i + 2] - bi[i + 1];
1473         for (j = 0; j < nz; j++) {
1474           col    = pj[j];
1475           pc2[j] = rtmp2[col];
1476           rs += PetscAbsScalar(pc2[j]);
1477         }
1478         /* U part */
1479         pc2 = b->a + bdiag[i + 2] + 1;
1480         pj  = b->j + bdiag[i + 2] + 1;
1481         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482         for (j = 0; j < nz; j++) {
1483           col    = pj[j];
1484           pc2[j] = rtmp2[col];
1485           rs += PetscAbsScalar(pc2[j]);
1486         }
1487 
1488         sctx.rs = rs;
1489         sctx.pv = rtmp2[i + 1];
1490         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1491         if (sctx.newshift) break;
1492         pc2  = b->a + bdiag[i + 1];
1493         *pc2 = 1.0 / sctx.pv;
1494         break;
1495 
1496       case 3:
1497         /* zero rtmp */
1498         /* L part */
1499         nz    = bi[i + 1] - bi[i];
1500         bjtmp = bj + bi[i];
1501         for (j = 0; j < nz; j++) {
1502           col        = bjtmp[j];
1503           rtmp1[col] = 0.0;
1504           rtmp2[col] = 0.0;
1505           rtmp3[col] = 0.0;
1506         }
1507 
1508         /* U part */
1509         nz    = bdiag[i] - bdiag[i + 1];
1510         bjtmp = bj + bdiag[i + 1] + 1;
1511         for (j = 0; j < nz; j++) {
1512           col        = bjtmp[j];
1513           rtmp1[col] = 0.0;
1514           rtmp2[col] = 0.0;
1515           rtmp3[col] = 0.0;
1516         }
1517 
1518         /* load in initial (unfactored row) */
1519         nz    = ai[r[i] + 1] - ai[r[i]];
1520         ajtmp = aj + ai[r[i]];
1521         v1    = aa + ai[r[i]];
1522         v2    = aa + ai[r[i + 1]];
1523         v3    = aa + ai[r[i + 2]];
1524         for (j = 0; j < nz; j++) {
1525           col        = ics[ajtmp[j]];
1526           rtmp1[col] = v1[j];
1527           rtmp2[col] = v2[j];
1528           rtmp3[col] = v3[j];
1529         }
1530         /* ZeropivotApply(): shift the diagonal of the matrix  */
1531         rtmp1[i] += sctx.shift_amount;
1532         rtmp2[i + 1] += sctx.shift_amount;
1533         rtmp3[i + 2] += sctx.shift_amount;
1534 
1535         /* elimination */
1536         bjtmp = bj + bi[i];
1537         row   = *bjtmp++; /* pivot row */
1538         nzL   = bi[i + 1] - bi[i];
1539         for (k = 0; k < nzL; k++) {
1540           pc1 = rtmp1 + row;
1541           pc2 = rtmp2 + row;
1542           pc3 = rtmp3 + row;
1543           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1544             pv   = b->a + bdiag[row];
1545             mul1 = *pc1 * (*pv);
1546             mul2 = *pc2 * (*pv);
1547             mul3 = *pc3 * (*pv);
1548             *pc1 = mul1;
1549             *pc2 = mul2;
1550             *pc3 = mul3;
1551 
1552             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1553             pv = b->a + bdiag[row + 1] + 1;
1554             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1555             for (j = 0; j < nz; j++) {
1556               col = pj[j];
1557               rtmp1[col] -= mul1 * pv[j];
1558               rtmp2[col] -= mul2 * pv[j];
1559               rtmp3[col] -= mul3 * pv[j];
1560             }
1561             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1562           }
1563           row = *bjtmp++;
1564         }
1565 
1566         /* finished row i; check zero pivot, then stick row i into b->a */
1567         rs = 0.0;
1568         /* L part */
1569         pc1 = b->a + bi[i];
1570         pj  = b->j + bi[i];
1571         nz  = bi[i + 1] - bi[i];
1572         for (j = 0; j < nz; j++) {
1573           col    = pj[j];
1574           pc1[j] = rtmp1[col];
1575           rs += PetscAbsScalar(pc1[j]);
1576         }
1577         /* U part */
1578         pc1 = b->a + bdiag[i + 1] + 1;
1579         pj  = b->j + bdiag[i + 1] + 1;
1580         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581         for (j = 0; j < nz; j++) {
1582           col    = pj[j];
1583           pc1[j] = rtmp1[col];
1584           rs += PetscAbsScalar(pc1[j]);
1585         }
1586 
1587         sctx.rs = rs;
1588         sctx.pv = rtmp1[i];
1589         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1590         if (sctx.newshift) break;
1591         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1592         *pc1 = 1.0 / sctx.pv;
1593 
1594         /* Now take care of 1st column of diagonal 3x3 block. */
1595         pc2 = rtmp2 + i;
1596         pc3 = rtmp3 + i;
1597         if (*pc2 != 0.0 || *pc3 != 0.0) {
1598           mul2 = (*pc2) * (*pc1);
1599           *pc2 = mul2;
1600           mul3 = (*pc3) * (*pc1);
1601           *pc3 = mul3;
1602           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1603           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1604           for (j = 0; j < nz; j++) {
1605             col = pj[j];
1606             rtmp2[col] -= mul2 * rtmp1[col];
1607             rtmp3[col] -= mul3 * rtmp1[col];
1608           }
1609           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1610         }
1611 
1612         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613         rs = 0.0;
1614         /* L part */
1615         pc2 = b->a + bi[i + 1];
1616         pj  = b->j + bi[i + 1];
1617         nz  = bi[i + 2] - bi[i + 1];
1618         for (j = 0; j < nz; j++) {
1619           col    = pj[j];
1620           pc2[j] = rtmp2[col];
1621           rs += PetscAbsScalar(pc2[j]);
1622         }
1623         /* U part */
1624         pc2 = b->a + bdiag[i + 2] + 1;
1625         pj  = b->j + bdiag[i + 2] + 1;
1626         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627         for (j = 0; j < nz; j++) {
1628           col    = pj[j];
1629           pc2[j] = rtmp2[col];
1630           rs += PetscAbsScalar(pc2[j]);
1631         }
1632 
1633         sctx.rs = rs;
1634         sctx.pv = rtmp2[i + 1];
1635         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1636         if (sctx.newshift) break;
1637         pc2  = b->a + bdiag[i + 1];
1638         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1639 
1640         /* Now take care of 2nd column of diagonal 3x3 block. */
1641         pc3 = rtmp3 + i + 1;
1642         if (*pc3 != 0.0) {
1643           mul3 = (*pc3) * (*pc2);
1644           *pc3 = mul3;
1645           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1646           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1647           for (j = 0; j < nz; j++) {
1648             col = pj[j];
1649             rtmp3[col] -= mul3 * rtmp2[col];
1650           }
1651           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1652         }
1653 
1654         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1655         rs = 0.0;
1656         /* L part */
1657         pc3 = b->a + bi[i + 2];
1658         pj  = b->j + bi[i + 2];
1659         nz  = bi[i + 3] - bi[i + 2];
1660         for (j = 0; j < nz; j++) {
1661           col    = pj[j];
1662           pc3[j] = rtmp3[col];
1663           rs += PetscAbsScalar(pc3[j]);
1664         }
1665         /* U part */
1666         pc3 = b->a + bdiag[i + 3] + 1;
1667         pj  = b->j + bdiag[i + 3] + 1;
1668         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1669         for (j = 0; j < nz; j++) {
1670           col    = pj[j];
1671           pc3[j] = rtmp3[col];
1672           rs += PetscAbsScalar(pc3[j]);
1673         }
1674 
1675         sctx.rs = rs;
1676         sctx.pv = rtmp3[i + 2];
1677         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1678         if (sctx.newshift) break;
1679         pc3  = b->a + bdiag[i + 2];
1680         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1681         break;
1682       case 4:
1683         /* zero rtmp */
1684         /* L part */
1685         nz    = bi[i + 1] - bi[i];
1686         bjtmp = bj + bi[i];
1687         for (j = 0; j < nz; j++) {
1688           col        = bjtmp[j];
1689           rtmp1[col] = 0.0;
1690           rtmp2[col] = 0.0;
1691           rtmp3[col] = 0.0;
1692           rtmp4[col] = 0.0;
1693         }
1694 
1695         /* U part */
1696         nz    = bdiag[i] - bdiag[i + 1];
1697         bjtmp = bj + bdiag[i + 1] + 1;
1698         for (j = 0; j < nz; j++) {
1699           col        = bjtmp[j];
1700           rtmp1[col] = 0.0;
1701           rtmp2[col] = 0.0;
1702           rtmp3[col] = 0.0;
1703           rtmp4[col] = 0.0;
1704         }
1705 
1706         /* load in initial (unfactored row) */
1707         nz    = ai[r[i] + 1] - ai[r[i]];
1708         ajtmp = aj + ai[r[i]];
1709         v1    = aa + ai[r[i]];
1710         v2    = aa + ai[r[i + 1]];
1711         v3    = aa + ai[r[i + 2]];
1712         v4    = aa + ai[r[i + 3]];
1713         for (j = 0; j < nz; j++) {
1714           col        = ics[ajtmp[j]];
1715           rtmp1[col] = v1[j];
1716           rtmp2[col] = v2[j];
1717           rtmp3[col] = v3[j];
1718           rtmp4[col] = v4[j];
1719         }
1720         /* ZeropivotApply(): shift the diagonal of the matrix  */
1721         rtmp1[i] += sctx.shift_amount;
1722         rtmp2[i + 1] += sctx.shift_amount;
1723         rtmp3[i + 2] += sctx.shift_amount;
1724         rtmp4[i + 3] += sctx.shift_amount;
1725 
1726         /* elimination */
1727         bjtmp = bj + bi[i];
1728         row   = *bjtmp++; /* pivot row */
1729         nzL   = bi[i + 1] - bi[i];
1730         for (k = 0; k < nzL; k++) {
1731           pc1 = rtmp1 + row;
1732           pc2 = rtmp2 + row;
1733           pc3 = rtmp3 + row;
1734           pc4 = rtmp4 + row;
1735           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1736             pv   = b->a + bdiag[row];
1737             mul1 = *pc1 * (*pv);
1738             mul2 = *pc2 * (*pv);
1739             mul3 = *pc3 * (*pv);
1740             mul4 = *pc4 * (*pv);
1741             *pc1 = mul1;
1742             *pc2 = mul2;
1743             *pc3 = mul3;
1744             *pc4 = mul4;
1745 
1746             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1747             pv = b->a + bdiag[row + 1] + 1;
1748             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1749             for (j = 0; j < nz; j++) {
1750               col = pj[j];
1751               rtmp1[col] -= mul1 * pv[j];
1752               rtmp2[col] -= mul2 * pv[j];
1753               rtmp3[col] -= mul3 * pv[j];
1754               rtmp4[col] -= mul4 * pv[j];
1755             }
1756             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1757           }
1758           row = *bjtmp++;
1759         }
1760 
1761         /* finished row i; check zero pivot, then stick row i into b->a */
1762         rs = 0.0;
1763         /* L part */
1764         pc1 = b->a + bi[i];
1765         pj  = b->j + bi[i];
1766         nz  = bi[i + 1] - bi[i];
1767         for (j = 0; j < nz; j++) {
1768           col    = pj[j];
1769           pc1[j] = rtmp1[col];
1770           rs += PetscAbsScalar(pc1[j]);
1771         }
1772         /* U part */
1773         pc1 = b->a + bdiag[i + 1] + 1;
1774         pj  = b->j + bdiag[i + 1] + 1;
1775         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1776         for (j = 0; j < nz; j++) {
1777           col    = pj[j];
1778           pc1[j] = rtmp1[col];
1779           rs += PetscAbsScalar(pc1[j]);
1780         }
1781 
1782         sctx.rs = rs;
1783         sctx.pv = rtmp1[i];
1784         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1785         if (sctx.newshift) break;
1786         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1787         *pc1 = 1.0 / sctx.pv;
1788 
1789         /* Now take care of 1st column of diagonal 4x4 block. */
1790         pc2 = rtmp2 + i;
1791         pc3 = rtmp3 + i;
1792         pc4 = rtmp4 + i;
1793         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1794           mul2 = (*pc2) * (*pc1);
1795           *pc2 = mul2;
1796           mul3 = (*pc3) * (*pc1);
1797           *pc3 = mul3;
1798           mul4 = (*pc4) * (*pc1);
1799           *pc4 = mul4;
1800           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1801           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1802           for (j = 0; j < nz; j++) {
1803             col = pj[j];
1804             rtmp2[col] -= mul2 * rtmp1[col];
1805             rtmp3[col] -= mul3 * rtmp1[col];
1806             rtmp4[col] -= mul4 * rtmp1[col];
1807           }
1808           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1809         }
1810 
1811         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1812         rs = 0.0;
1813         /* L part */
1814         pc2 = b->a + bi[i + 1];
1815         pj  = b->j + bi[i + 1];
1816         nz  = bi[i + 2] - bi[i + 1];
1817         for (j = 0; j < nz; j++) {
1818           col    = pj[j];
1819           pc2[j] = rtmp2[col];
1820           rs += PetscAbsScalar(pc2[j]);
1821         }
1822         /* U part */
1823         pc2 = b->a + bdiag[i + 2] + 1;
1824         pj  = b->j + bdiag[i + 2] + 1;
1825         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1826         for (j = 0; j < nz; j++) {
1827           col    = pj[j];
1828           pc2[j] = rtmp2[col];
1829           rs += PetscAbsScalar(pc2[j]);
1830         }
1831 
1832         sctx.rs = rs;
1833         sctx.pv = rtmp2[i + 1];
1834         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1835         if (sctx.newshift) break;
1836         pc2  = b->a + bdiag[i + 1];
1837         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1838 
1839         /* Now take care of 2nd column of diagonal 4x4 block. */
1840         pc3 = rtmp3 + i + 1;
1841         pc4 = rtmp4 + i + 1;
1842         if (*pc3 != 0.0 || *pc4 != 0.0) {
1843           mul3 = (*pc3) * (*pc2);
1844           *pc3 = mul3;
1845           mul4 = (*pc4) * (*pc2);
1846           *pc4 = mul4;
1847           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1848           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1849           for (j = 0; j < nz; j++) {
1850             col = pj[j];
1851             rtmp3[col] -= mul3 * rtmp2[col];
1852             rtmp4[col] -= mul4 * rtmp2[col];
1853           }
1854           PetscCall(PetscLogFlops(4.0 * nz));
1855         }
1856 
1857         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1858         rs = 0.0;
1859         /* L part */
1860         pc3 = b->a + bi[i + 2];
1861         pj  = b->j + bi[i + 2];
1862         nz  = bi[i + 3] - bi[i + 2];
1863         for (j = 0; j < nz; j++) {
1864           col    = pj[j];
1865           pc3[j] = rtmp3[col];
1866           rs += PetscAbsScalar(pc3[j]);
1867         }
1868         /* U part */
1869         pc3 = b->a + bdiag[i + 3] + 1;
1870         pj  = b->j + bdiag[i + 3] + 1;
1871         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1872         for (j = 0; j < nz; j++) {
1873           col    = pj[j];
1874           pc3[j] = rtmp3[col];
1875           rs += PetscAbsScalar(pc3[j]);
1876         }
1877 
1878         sctx.rs = rs;
1879         sctx.pv = rtmp3[i + 2];
1880         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1881         if (sctx.newshift) break;
1882         pc3  = b->a + bdiag[i + 2];
1883         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1884 
1885         /* Now take care of 3rd column of diagonal 4x4 block. */
1886         pc4 = rtmp4 + i + 2;
1887         if (*pc4 != 0.0) {
1888           mul4 = (*pc4) * (*pc3);
1889           *pc4 = mul4;
1890           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1891           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1892           for (j = 0; j < nz; j++) {
1893             col = pj[j];
1894             rtmp4[col] -= mul4 * rtmp3[col];
1895           }
1896           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1897         }
1898 
1899         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1900         rs = 0.0;
1901         /* L part */
1902         pc4 = b->a + bi[i + 3];
1903         pj  = b->j + bi[i + 3];
1904         nz  = bi[i + 4] - bi[i + 3];
1905         for (j = 0; j < nz; j++) {
1906           col    = pj[j];
1907           pc4[j] = rtmp4[col];
1908           rs += PetscAbsScalar(pc4[j]);
1909         }
1910         /* U part */
1911         pc4 = b->a + bdiag[i + 4] + 1;
1912         pj  = b->j + bdiag[i + 4] + 1;
1913         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1914         for (j = 0; j < nz; j++) {
1915           col    = pj[j];
1916           pc4[j] = rtmp4[col];
1917           rs += PetscAbsScalar(pc4[j]);
1918         }
1919 
1920         sctx.rs = rs;
1921         sctx.pv = rtmp4[i + 3];
1922         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1923         if (sctx.newshift) break;
1924         pc4  = b->a + bdiag[i + 3];
1925         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1926         break;
1927 
1928       default:
1929         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1930       }
1931       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1932       i += nodesz;              /* Update the row */
1933     }
1934 
1935     /* MatPivotRefine() */
1936     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1937       /*
1938        * if no shift in this attempt & shifting & started shifting & can refine,
1939        * then try lower shift
1940        */
1941       sctx.shift_hi       = sctx.shift_fraction;
1942       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1943       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1944       sctx.newshift       = PETSC_TRUE;
1945       sctx.nshift++;
1946     }
1947   } while (sctx.newshift);
1948 
1949   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1950   PetscCall(PetscFree(tmp_vec2));
1951   PetscCall(ISRestoreIndices(isicol, &ic));
1952   PetscCall(ISRestoreIndices(isrow, &r));
1953 
1954   if (b->inode.size_csr) {
1955     C->ops->solve = MatSolve_SeqAIJ_Inode;
1956   } else {
1957     C->ops->solve = MatSolve_SeqAIJ;
1958   }
1959   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1960   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1961   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1962   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1963   C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1964   C->assembled              = PETSC_TRUE;
1965   C->preallocated           = PETSC_TRUE;
1966 
1967   PetscCall(PetscLogFlops(C->cmap->n));
1968 
1969   /* MatShiftView(A,info,&sctx) */
1970   if (sctx.nshift) {
1971     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1972       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1974       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1976       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1977     }
1978   }
1979   PetscFunctionReturn(PETSC_SUCCESS);
1980 }
1981 
1982 #if 0
1983 // unused
1984 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1985 {
1986   Mat              C = B;
1987   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1988   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1989   const PetscInt  *r, *ic, *c, *ics;
1990   PetscInt         n = A->rmap->n, *bi = b->i;
1991   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1992   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1993   PetscInt        *ai = a->i, *aj = a->j;
1994   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1995   PetscScalar      mul1, mul2, mul3, tmp;
1996   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1997   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1998   PetscReal        rs = 0.0;
1999   FactorShiftCtx   sctx;
2000 
2001   PetscFunctionBegin;
2002   sctx.shift_top      = 0;
2003   sctx.nshift_max     = 0;
2004   sctx.shift_lo       = 0;
2005   sctx.shift_hi       = 0;
2006   sctx.shift_fraction = 0;
2007 
2008   /* if both shift schemes are chosen by user, only use info->shiftpd */
2009   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2010     sctx.shift_top = 0;
2011     for (i = 0; i < n; i++) {
2012       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2013       rs    = 0.0;
2014       ajtmp = aj + ai[i];
2015       rtmp1 = aa + ai[i];
2016       nz    = ai[i + 1] - ai[i];
2017       for (j = 0; j < nz; j++) {
2018         if (*ajtmp != i) {
2019           rs += PetscAbsScalar(*rtmp1++);
2020         } else {
2021           rs -= PetscRealPart(*rtmp1++);
2022         }
2023         ajtmp++;
2024       }
2025       if (rs > sctx.shift_top) sctx.shift_top = rs;
2026     }
2027     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2028     sctx.shift_top *= 1.1;
2029     sctx.nshift_max = 5;
2030     sctx.shift_lo   = 0.;
2031     sctx.shift_hi   = 1.;
2032   }
2033   sctx.shift_amount = 0;
2034   sctx.nshift       = 0;
2035 
2036   PetscCall(ISGetIndices(isrow, &r));
2037   PetscCall(ISGetIndices(iscol, &c));
2038   PetscCall(ISGetIndices(isicol, &ic));
2039   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2040   ics = ic;
2041 
2042   node_max = a->inode.node_count;
2043   ns       = a->inode.size;
2044   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2045 
2046   /* If max inode size > 3, split it into two inodes.*/
2047   /* also map the inode sizes according to the ordering */
2048   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2049   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2050     if (ns[i] > 3) {
2051       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2052       ++j;
2053       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2054     } else {
2055       tmp_vec1[j] = ns[i];
2056     }
2057   }
2058   /* Use the correct node_max */
2059   node_max = j;
2060 
2061   /* Now reorder the inode info based on mat re-ordering info */
2062   /* First create a row -> inode_size_array_index map */
2063   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2064   for (i = 0, row = 0; i < node_max; i++) {
2065     nodesz = tmp_vec1[i];
2066     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2067   }
2068   /* Using nsmap, create a reordered ns structure */
2069   for (i = 0, j = 0; i < node_max; i++) {
2070     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2071     tmp_vec2[i] = nodesz;
2072     j += nodesz;
2073   }
2074   PetscCall(PetscFree2(nsmap, tmp_vec1));
2075   /* Now use the correct ns */
2076   ns = tmp_vec2;
2077 
2078   do {
2079     sctx.newshift = PETSC_FALSE;
2080     /* Now loop over each block-row, and do the factorization */
2081     for (i = 0, row = 0; i < node_max; i++) {
2082       nodesz = ns[i];
2083       nz     = bi[row + 1] - bi[row];
2084       bjtmp  = bj + bi[row];
2085 
2086       switch (nodesz) {
2087       case 1:
2088         for (j = 0; j < nz; j++) {
2089           idx         = bjtmp[j];
2090           rtmp11[idx] = 0.0;
2091         }
2092 
2093         /* load in initial (unfactored row) */
2094         idx    = r[row];
2095         nz_tmp = ai[idx + 1] - ai[idx];
2096         ajtmp  = aj + ai[idx];
2097         v1     = aa + ai[idx];
2098 
2099         for (j = 0; j < nz_tmp; j++) {
2100           idx         = ics[ajtmp[j]];
2101           rtmp11[idx] = v1[j];
2102         }
2103         rtmp11[ics[r[row]]] += sctx.shift_amount;
2104 
2105         prow = *bjtmp++;
2106         while (prow < row) {
2107           pc1 = rtmp11 + prow;
2108           if (*pc1 != 0.0) {
2109             pv     = ba + bd[prow];
2110             pj     = nbj + bd[prow];
2111             mul1   = *pc1 * *pv++;
2112             *pc1   = mul1;
2113             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2114             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2115             for (j = 0; j < nz_tmp; j++) {
2116               tmp = pv[j];
2117               idx = pj[j];
2118               rtmp11[idx] -= mul1 * tmp;
2119             }
2120           }
2121           prow = *bjtmp++;
2122         }
2123         pj  = bj + bi[row];
2124         pc1 = ba + bi[row];
2125 
2126         sctx.pv     = rtmp11[row];
2127         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2128         rs          = 0.0;
2129         for (j = 0; j < nz; j++) {
2130           idx    = pj[j];
2131           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2132           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2133         }
2134         sctx.rs = rs;
2135         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2136         if (sctx.newshift) goto endofwhile;
2137         break;
2138 
2139       case 2:
2140         for (j = 0; j < nz; j++) {
2141           idx         = bjtmp[j];
2142           rtmp11[idx] = 0.0;
2143           rtmp22[idx] = 0.0;
2144         }
2145 
2146         /* load in initial (unfactored row) */
2147         idx    = r[row];
2148         nz_tmp = ai[idx + 1] - ai[idx];
2149         ajtmp  = aj + ai[idx];
2150         v1     = aa + ai[idx];
2151         v2     = aa + ai[idx + 1];
2152         for (j = 0; j < nz_tmp; j++) {
2153           idx         = ics[ajtmp[j]];
2154           rtmp11[idx] = v1[j];
2155           rtmp22[idx] = v2[j];
2156         }
2157         rtmp11[ics[r[row]]] += sctx.shift_amount;
2158         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2159 
2160         prow = *bjtmp++;
2161         while (prow < row) {
2162           pc1 = rtmp11 + prow;
2163           pc2 = rtmp22 + prow;
2164           if (*pc1 != 0.0 || *pc2 != 0.0) {
2165             pv   = ba + bd[prow];
2166             pj   = nbj + bd[prow];
2167             mul1 = *pc1 * *pv;
2168             mul2 = *pc2 * *pv;
2169             ++pv;
2170             *pc1 = mul1;
2171             *pc2 = mul2;
2172 
2173             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2174             for (j = 0; j < nz_tmp; j++) {
2175               tmp = pv[j];
2176               idx = pj[j];
2177               rtmp11[idx] -= mul1 * tmp;
2178               rtmp22[idx] -= mul2 * tmp;
2179             }
2180             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2181           }
2182           prow = *bjtmp++;
2183         }
2184 
2185         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2186         pc1 = rtmp11 + prow;
2187         pc2 = rtmp22 + prow;
2188 
2189         sctx.pv = *pc1;
2190         pj      = bj + bi[prow];
2191         rs      = 0.0;
2192         for (j = 0; j < nz; j++) {
2193           idx = pj[j];
2194           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2195         }
2196         sctx.rs = rs;
2197         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2198         if (sctx.newshift) goto endofwhile;
2199 
2200         if (*pc2 != 0.0) {
2201           pj     = nbj + bd[prow];
2202           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2203           *pc2   = mul2;
2204           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2205           for (j = 0; j < nz_tmp; j++) {
2206             idx = pj[j];
2207             tmp = rtmp11[idx];
2208             rtmp22[idx] -= mul2 * tmp;
2209           }
2210           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2211         }
2212 
2213         pj  = bj + bi[row];
2214         pc1 = ba + bi[row];
2215         pc2 = ba + bi[row + 1];
2216 
2217         sctx.pv         = rtmp22[row + 1];
2218         rs              = 0.0;
2219         rtmp11[row]     = 1.0 / rtmp11[row];
2220         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2221         /* copy row entries from dense representation to sparse */
2222         for (j = 0; j < nz; j++) {
2223           idx    = pj[j];
2224           pc1[j] = rtmp11[idx];
2225           pc2[j] = rtmp22[idx];
2226           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2227         }
2228         sctx.rs = rs;
2229         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2230         if (sctx.newshift) goto endofwhile;
2231         break;
2232 
2233       case 3:
2234         for (j = 0; j < nz; j++) {
2235           idx         = bjtmp[j];
2236           rtmp11[idx] = 0.0;
2237           rtmp22[idx] = 0.0;
2238           rtmp33[idx] = 0.0;
2239         }
2240         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2241         idx    = r[row];
2242         nz_tmp = ai[idx + 1] - ai[idx];
2243         ajtmp  = aj + ai[idx];
2244         v1     = aa + ai[idx];
2245         v2     = aa + ai[idx + 1];
2246         v3     = aa + ai[idx + 2];
2247         for (j = 0; j < nz_tmp; j++) {
2248           idx         = ics[ajtmp[j]];
2249           rtmp11[idx] = v1[j];
2250           rtmp22[idx] = v2[j];
2251           rtmp33[idx] = v3[j];
2252         }
2253         rtmp11[ics[r[row]]] += sctx.shift_amount;
2254         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2255         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2256 
2257         /* loop over all pivot row blocks above this row block */
2258         prow = *bjtmp++;
2259         while (prow < row) {
2260           pc1 = rtmp11 + prow;
2261           pc2 = rtmp22 + prow;
2262           pc3 = rtmp33 + prow;
2263           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2264             pv   = ba + bd[prow];
2265             pj   = nbj + bd[prow];
2266             mul1 = *pc1 * *pv;
2267             mul2 = *pc2 * *pv;
2268             mul3 = *pc3 * *pv;
2269             ++pv;
2270             *pc1 = mul1;
2271             *pc2 = mul2;
2272             *pc3 = mul3;
2273 
2274             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2275             /* update this row based on pivot row */
2276             for (j = 0; j < nz_tmp; j++) {
2277               tmp = pv[j];
2278               idx = pj[j];
2279               rtmp11[idx] -= mul1 * tmp;
2280               rtmp22[idx] -= mul2 * tmp;
2281               rtmp33[idx] -= mul3 * tmp;
2282             }
2283             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2284           }
2285           prow = *bjtmp++;
2286         }
2287 
2288         /* Now take care of diagonal 3x3 block in this set of rows */
2289         /* note: prow = row here */
2290         pc1 = rtmp11 + prow;
2291         pc2 = rtmp22 + prow;
2292         pc3 = rtmp33 + prow;
2293 
2294         sctx.pv = *pc1;
2295         pj      = bj + bi[prow];
2296         rs      = 0.0;
2297         for (j = 0; j < nz; j++) {
2298           idx = pj[j];
2299           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2300         }
2301         sctx.rs = rs;
2302         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2303         if (sctx.newshift) goto endofwhile;
2304 
2305         if (*pc2 != 0.0 || *pc3 != 0.0) {
2306           mul2   = (*pc2) / (*pc1);
2307           mul3   = (*pc3) / (*pc1);
2308           *pc2   = mul2;
2309           *pc3   = mul3;
2310           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2311           pj     = nbj + bd[prow];
2312           for (j = 0; j < nz_tmp; j++) {
2313             idx = pj[j];
2314             tmp = rtmp11[idx];
2315             rtmp22[idx] -= mul2 * tmp;
2316             rtmp33[idx] -= mul3 * tmp;
2317           }
2318           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2319         }
2320         ++prow;
2321 
2322         pc2     = rtmp22 + prow;
2323         pc3     = rtmp33 + prow;
2324         sctx.pv = *pc2;
2325         pj      = bj + bi[prow];
2326         rs      = 0.0;
2327         for (j = 0; j < nz; j++) {
2328           idx = pj[j];
2329           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2330         }
2331         sctx.rs = rs;
2332         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2333         if (sctx.newshift) goto endofwhile;
2334 
2335         if (*pc3 != 0.0) {
2336           mul3   = (*pc3) / (*pc2);
2337           *pc3   = mul3;
2338           pj     = nbj + bd[prow];
2339           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2340           for (j = 0; j < nz_tmp; j++) {
2341             idx = pj[j];
2342             tmp = rtmp22[idx];
2343             rtmp33[idx] -= mul3 * tmp;
2344           }
2345           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2346         }
2347 
2348         pj  = bj + bi[row];
2349         pc1 = ba + bi[row];
2350         pc2 = ba + bi[row + 1];
2351         pc3 = ba + bi[row + 2];
2352 
2353         sctx.pv         = rtmp33[row + 2];
2354         rs              = 0.0;
2355         rtmp11[row]     = 1.0 / rtmp11[row];
2356         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2357         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2358         /* copy row entries from dense representation to sparse */
2359         for (j = 0; j < nz; j++) {
2360           idx    = pj[j];
2361           pc1[j] = rtmp11[idx];
2362           pc2[j] = rtmp22[idx];
2363           pc3[j] = rtmp33[idx];
2364           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2365         }
2366 
2367         sctx.rs = rs;
2368         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2369         if (sctx.newshift) goto endofwhile;
2370         break;
2371 
2372       default:
2373         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2374       }
2375       row += nodesz; /* Update the row */
2376     }
2377   endofwhile:;
2378   } while (sctx.newshift);
2379   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2380   PetscCall(PetscFree(tmp_vec2));
2381   PetscCall(ISRestoreIndices(isicol, &ic));
2382   PetscCall(ISRestoreIndices(isrow, &r));
2383   PetscCall(ISRestoreIndices(iscol, &c));
2384 
2385   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2386   /* do not set solve add, since MatSolve_Inode + Add is faster */
2387   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2388   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2389   C->assembled              = PETSC_TRUE;
2390   C->preallocated           = PETSC_TRUE;
2391   if (sctx.nshift) {
2392     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2393       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2394     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2395       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2396     }
2397   }
2398   PetscCall(PetscLogFlops(C->cmap->n));
2399   PetscCall(MatSeqAIJCheckInode(C));
2400   PetscFunctionReturn(PETSC_SUCCESS);
2401 }
2402 #endif
2403 
2404 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2405 {
2406   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2407   IS                 iscol = a->col, isrow = a->row;
2408   const PetscInt    *r, *c, *rout, *cout;
2409   PetscInt           i, j;
2410   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2411   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2412   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2413   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2414   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2415   const PetscScalar *b;
2416 
2417   PetscFunctionBegin;
2418   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2419   node_max = a->inode.node_count;
2420   ns       = a->inode.size_csr; /* Node Size array */
2421 
2422   PetscCall(VecGetArrayRead(bb, &b));
2423   PetscCall(VecGetArrayWrite(xx, &x));
2424   tmp = a->solve_work;
2425 
2426   PetscCall(ISGetIndices(isrow, &rout));
2427   r = rout;
2428   PetscCall(ISGetIndices(iscol, &cout));
2429   c = cout;
2430 
2431   /* forward solve the lower triangular */
2432   tmps = tmp;
2433   aa   = a_a;
2434   aj   = a_j;
2435   ad   = a->diag;
2436 
2437   for (i = 0; i < node_max; ++i) {
2438     row = ns[i];
2439     nsz = ns[i + 1] - ns[i];
2440     aii = ai[row];
2441     v1  = aa + aii;
2442     vi  = aj + aii;
2443     nz  = ai[row + 1] - ai[row];
2444 
2445     if (i < node_max - 1) {
2446       /* Prefetch the indices for the next block */
2447       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2448       /* Prefetch the data for the next block */
2449       PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2450     }
2451 
2452     switch (nsz) { /* Each loop in 'case' is unrolled */
2453     case 1:
2454       sum1 = b[r[row]];
2455       for (j = 0; j < nz - 1; j += 2) {
2456         i0   = vi[j];
2457         i1   = vi[j + 1];
2458         tmp0 = tmps[i0];
2459         tmp1 = tmps[i1];
2460         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2461       }
2462       if (j == nz - 1) {
2463         tmp0 = tmps[vi[j]];
2464         sum1 -= v1[j] * tmp0;
2465       }
2466       tmp[row++] = sum1;
2467       break;
2468     case 2:
2469       sum1 = b[r[row]];
2470       sum2 = b[r[row + 1]];
2471       v2   = aa + ai[row + 1];
2472 
2473       for (j = 0; j < nz - 1; j += 2) {
2474         i0   = vi[j];
2475         i1   = vi[j + 1];
2476         tmp0 = tmps[i0];
2477         tmp1 = tmps[i1];
2478         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2479         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2480       }
2481       if (j == nz - 1) {
2482         tmp0 = tmps[vi[j]];
2483         sum1 -= v1[j] * tmp0;
2484         sum2 -= v2[j] * tmp0;
2485       }
2486       sum2 -= v2[nz] * sum1;
2487       tmp[row++] = sum1;
2488       tmp[row++] = sum2;
2489       break;
2490     case 3:
2491       sum1 = b[r[row]];
2492       sum2 = b[r[row + 1]];
2493       sum3 = b[r[row + 2]];
2494       v2   = aa + ai[row + 1];
2495       v3   = aa + ai[row + 2];
2496 
2497       for (j = 0; j < nz - 1; j += 2) {
2498         i0   = vi[j];
2499         i1   = vi[j + 1];
2500         tmp0 = tmps[i0];
2501         tmp1 = tmps[i1];
2502         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2503         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2504         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2505       }
2506       if (j == nz - 1) {
2507         tmp0 = tmps[vi[j]];
2508         sum1 -= v1[j] * tmp0;
2509         sum2 -= v2[j] * tmp0;
2510         sum3 -= v3[j] * tmp0;
2511       }
2512       sum2 -= v2[nz] * sum1;
2513       sum3 -= v3[nz] * sum1;
2514       sum3 -= v3[nz + 1] * sum2;
2515       tmp[row++] = sum1;
2516       tmp[row++] = sum2;
2517       tmp[row++] = sum3;
2518       break;
2519 
2520     case 4:
2521       sum1 = b[r[row]];
2522       sum2 = b[r[row + 1]];
2523       sum3 = b[r[row + 2]];
2524       sum4 = b[r[row + 3]];
2525       v2   = aa + ai[row + 1];
2526       v3   = aa + ai[row + 2];
2527       v4   = aa + ai[row + 3];
2528 
2529       for (j = 0; j < nz - 1; j += 2) {
2530         i0   = vi[j];
2531         i1   = vi[j + 1];
2532         tmp0 = tmps[i0];
2533         tmp1 = tmps[i1];
2534         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2535         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2536         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2537         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2538       }
2539       if (j == nz - 1) {
2540         tmp0 = tmps[vi[j]];
2541         sum1 -= v1[j] * tmp0;
2542         sum2 -= v2[j] * tmp0;
2543         sum3 -= v3[j] * tmp0;
2544         sum4 -= v4[j] * tmp0;
2545       }
2546       sum2 -= v2[nz] * sum1;
2547       sum3 -= v3[nz] * sum1;
2548       sum4 -= v4[nz] * sum1;
2549       sum3 -= v3[nz + 1] * sum2;
2550       sum4 -= v4[nz + 1] * sum2;
2551       sum4 -= v4[nz + 2] * sum3;
2552 
2553       tmp[row++] = sum1;
2554       tmp[row++] = sum2;
2555       tmp[row++] = sum3;
2556       tmp[row++] = sum4;
2557       break;
2558     case 5:
2559       sum1 = b[r[row]];
2560       sum2 = b[r[row + 1]];
2561       sum3 = b[r[row + 2]];
2562       sum4 = b[r[row + 3]];
2563       sum5 = b[r[row + 4]];
2564       v2   = aa + ai[row + 1];
2565       v3   = aa + ai[row + 2];
2566       v4   = aa + ai[row + 3];
2567       v5   = aa + ai[row + 4];
2568 
2569       for (j = 0; j < nz - 1; j += 2) {
2570         i0   = vi[j];
2571         i1   = vi[j + 1];
2572         tmp0 = tmps[i0];
2573         tmp1 = tmps[i1];
2574         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2575         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2576         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2577         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2578         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2579       }
2580       if (j == nz - 1) {
2581         tmp0 = tmps[vi[j]];
2582         sum1 -= v1[j] * tmp0;
2583         sum2 -= v2[j] * tmp0;
2584         sum3 -= v3[j] * tmp0;
2585         sum4 -= v4[j] * tmp0;
2586         sum5 -= v5[j] * tmp0;
2587       }
2588 
2589       sum2 -= v2[nz] * sum1;
2590       sum3 -= v3[nz] * sum1;
2591       sum4 -= v4[nz] * sum1;
2592       sum5 -= v5[nz] * sum1;
2593       sum3 -= v3[nz + 1] * sum2;
2594       sum4 -= v4[nz + 1] * sum2;
2595       sum5 -= v5[nz + 1] * sum2;
2596       sum4 -= v4[nz + 2] * sum3;
2597       sum5 -= v5[nz + 2] * sum3;
2598       sum5 -= v5[nz + 3] * sum4;
2599 
2600       tmp[row++] = sum1;
2601       tmp[row++] = sum2;
2602       tmp[row++] = sum3;
2603       tmp[row++] = sum4;
2604       tmp[row++] = sum5;
2605       break;
2606     default:
2607       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2608     }
2609   }
2610   /* backward solve the upper triangular */
2611   for (i = node_max - 1; i >= 0; i--) {
2612     row = ns[i + 1] - 1;
2613     nsz = ns[i + 1] - ns[i];
2614     aii = ad[row + 1] + 1;
2615     v1  = aa + aii;
2616     vi  = aj + aii;
2617     nz  = ad[row] - ad[row + 1] - 1;
2618 
2619     if (i > 0) {
2620       /* Prefetch the indices for the next block */
2621       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2622       /* Prefetch the data for the next block */
2623       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2624     }
2625 
2626     switch (nsz) { /* Each loop in 'case' is unrolled */
2627     case 1:
2628       sum1 = tmp[row];
2629 
2630       for (j = 0; j < nz - 1; j += 2) {
2631         i0   = vi[j];
2632         i1   = vi[j + 1];
2633         tmp0 = tmps[i0];
2634         tmp1 = tmps[i1];
2635         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2636       }
2637       if (j == nz - 1) {
2638         tmp0 = tmps[vi[j]];
2639         sum1 -= v1[j] * tmp0;
2640       }
2641       x[c[row]] = tmp[row] = sum1 * v1[nz];
2642       row--;
2643       break;
2644     case 2:
2645       sum1 = tmp[row];
2646       sum2 = tmp[row - 1];
2647       v2   = aa + ad[row] + 1;
2648       for (j = 0; j < nz - 1; j += 2) {
2649         i0   = vi[j];
2650         i1   = vi[j + 1];
2651         tmp0 = tmps[i0];
2652         tmp1 = tmps[i1];
2653         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2654         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2655       }
2656       if (j == nz - 1) {
2657         tmp0 = tmps[vi[j]];
2658         sum1 -= v1[j] * tmp0;
2659         sum2 -= v2[j + 1] * tmp0;
2660       }
2661 
2662       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2663       row--;
2664       sum2 -= v2[0] * tmp0;
2665       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2666       row--;
2667       break;
2668     case 3:
2669       sum1 = tmp[row];
2670       sum2 = tmp[row - 1];
2671       sum3 = tmp[row - 2];
2672       v2   = aa + ad[row] + 1;
2673       v3   = aa + ad[row - 1] + 1;
2674       for (j = 0; j < nz - 1; j += 2) {
2675         i0   = vi[j];
2676         i1   = vi[j + 1];
2677         tmp0 = tmps[i0];
2678         tmp1 = tmps[i1];
2679         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2680         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2681         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2682       }
2683       if (j == nz - 1) {
2684         tmp0 = tmps[vi[j]];
2685         sum1 -= v1[j] * tmp0;
2686         sum2 -= v2[j + 1] * tmp0;
2687         sum3 -= v3[j + 2] * tmp0;
2688       }
2689       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2690       row--;
2691       sum2 -= v2[0] * tmp0;
2692       sum3 -= v3[1] * tmp0;
2693       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2694       row--;
2695       sum3 -= v3[0] * tmp0;
2696       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2697       row--;
2698 
2699       break;
2700     case 4:
2701       sum1 = tmp[row];
2702       sum2 = tmp[row - 1];
2703       sum3 = tmp[row - 2];
2704       sum4 = tmp[row - 3];
2705       v2   = aa + ad[row] + 1;
2706       v3   = aa + ad[row - 1] + 1;
2707       v4   = aa + ad[row - 2] + 1;
2708 
2709       for (j = 0; j < nz - 1; j += 2) {
2710         i0   = vi[j];
2711         i1   = vi[j + 1];
2712         tmp0 = tmps[i0];
2713         tmp1 = tmps[i1];
2714         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2715         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2716         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2717         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2718       }
2719       if (j == nz - 1) {
2720         tmp0 = tmps[vi[j]];
2721         sum1 -= v1[j] * tmp0;
2722         sum2 -= v2[j + 1] * tmp0;
2723         sum3 -= v3[j + 2] * tmp0;
2724         sum4 -= v4[j + 3] * tmp0;
2725       }
2726 
2727       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2728       row--;
2729       sum2 -= v2[0] * tmp0;
2730       sum3 -= v3[1] * tmp0;
2731       sum4 -= v4[2] * tmp0;
2732       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2733       row--;
2734       sum3 -= v3[0] * tmp0;
2735       sum4 -= v4[1] * tmp0;
2736       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2737       row--;
2738       sum4 -= v4[0] * tmp0;
2739       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2740       row--;
2741       break;
2742     case 5:
2743       sum1 = tmp[row];
2744       sum2 = tmp[row - 1];
2745       sum3 = tmp[row - 2];
2746       sum4 = tmp[row - 3];
2747       sum5 = tmp[row - 4];
2748       v2   = aa + ad[row] + 1;
2749       v3   = aa + ad[row - 1] + 1;
2750       v4   = aa + ad[row - 2] + 1;
2751       v5   = aa + ad[row - 3] + 1;
2752       for (j = 0; j < nz - 1; j += 2) {
2753         i0   = vi[j];
2754         i1   = vi[j + 1];
2755         tmp0 = tmps[i0];
2756         tmp1 = tmps[i1];
2757         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2758         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2759         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2760         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2761         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2762       }
2763       if (j == nz - 1) {
2764         tmp0 = tmps[vi[j]];
2765         sum1 -= v1[j] * tmp0;
2766         sum2 -= v2[j + 1] * tmp0;
2767         sum3 -= v3[j + 2] * tmp0;
2768         sum4 -= v4[j + 3] * tmp0;
2769         sum5 -= v5[j + 4] * tmp0;
2770       }
2771 
2772       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2773       row--;
2774       sum2 -= v2[0] * tmp0;
2775       sum3 -= v3[1] * tmp0;
2776       sum4 -= v4[2] * tmp0;
2777       sum5 -= v5[3] * tmp0;
2778       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2779       row--;
2780       sum3 -= v3[0] * tmp0;
2781       sum4 -= v4[1] * tmp0;
2782       sum5 -= v5[2] * tmp0;
2783       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2784       row--;
2785       sum4 -= v4[0] * tmp0;
2786       sum5 -= v5[1] * tmp0;
2787       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2788       row--;
2789       sum5 -= v5[0] * tmp0;
2790       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2791       row--;
2792       break;
2793     default:
2794       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2795     }
2796   }
2797   PetscCall(ISRestoreIndices(isrow, &rout));
2798   PetscCall(ISRestoreIndices(iscol, &cout));
2799   PetscCall(VecRestoreArrayRead(bb, &b));
2800   PetscCall(VecRestoreArrayWrite(xx, &x));
2801   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2802   PetscFunctionReturn(PETSC_SUCCESS);
2803 }
2804 
2805 /*
2806      Makes a longer coloring[] array and calls the usual code with that
2807 */
2808 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2809 {
2810   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2811   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
2812   PetscInt        *colorused, i;
2813   ISColoringValue *newcolor;
2814 
2815   PetscFunctionBegin;
2816   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2817   PetscCall(PetscMalloc1(n + 1, &newcolor));
2818   /* loop over inodes, marking a color for each column*/
2819   row = 0;
2820   for (i = 0; i < m; i++) {
2821     for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2822   }
2823 
2824   /* eliminate unneeded colors */
2825   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2826   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2827 
2828   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2829   ncolors = colorused[5 * ncolors - 1];
2830   for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2831   PetscCall(PetscFree(colorused));
2832   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2833   PetscCall(PetscFree(coloring));
2834   PetscFunctionReturn(PETSC_SUCCESS);
2835 }
2836 
2837 #include <petsc/private/kernels/blockinvert.h>
2838 
2839 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2840 {
2841   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2842   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2843   MatScalar         *ibdiag, *bdiag, work[25], *t;
2844   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2845   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2846   const PetscScalar *xb, *b;
2847   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2848   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz;
2849   PetscInt           sz, k, ipvt[5];
2850   PetscBool          allowzeropivot, zeropivotdetected;
2851   const PetscInt    *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i;
2852 
2853   PetscFunctionBegin;
2854   allowzeropivot = PetscNot(A->erroriffailure);
2855   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2856   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2857   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2858 
2859   if (!a->inode.ibdiagvalid) {
2860     if (!a->inode.ibdiag) {
2861       /* calculate space needed for diagonal blocks */
2862       for (i = 0; i < m; i++) {
2863         nodesz = sizes[i + 1] - sizes[i];
2864         cnt += nodesz * nodesz;
2865       }
2866       a->inode.bdiagsize = cnt;
2867 
2868       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2869     }
2870 
2871     /* copy over the diagonal blocks and invert them */
2872     ibdiag = a->inode.ibdiag;
2873     bdiag  = a->inode.bdiag;
2874     cnt    = 0;
2875     for (i = 0, row = 0; i < m; i++) {
2876       nodesz = sizes[i + 1] - sizes[i];
2877       for (j = 0; j < nodesz; j++) {
2878         for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
2879       }
2880       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
2881 
2882       switch (nodesz) {
2883       case 1:
2884         /* Create matrix data structure */
2885         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2886           if (allowzeropivot) {
2887             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2888             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2889             A->factorerror_zeropivot_row   = row;
2890             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2891           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2892         }
2893         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2894         break;
2895       case 2:
2896         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2897         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2898         break;
2899       case 3:
2900         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2901         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2902         break;
2903       case 4:
2904         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2905         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2906         break;
2907       case 5:
2908         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2909         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2910         break;
2911       default:
2912         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2913       }
2914       cnt += nodesz * nodesz;
2915       row += nodesz;
2916     }
2917     a->inode.ibdiagvalid = PETSC_TRUE;
2918   }
2919   ibdiag = a->inode.ibdiag;
2920   bdiag  = a->inode.bdiag;
2921   t      = a->inode.ssor_work;
2922 
2923   PetscCall(VecGetArray(xx, &x));
2924   PetscCall(VecGetArrayRead(bb, &b));
2925   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2926   if (flag & SOR_ZERO_INITIAL_GUESS) {
2927     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2928       for (i = 0, row = 0; i < m; i++) {
2929         sz  = diag[row] - ii[row];
2930         v1  = a->a + ii[row];
2931         idx = a->j + ii[row];
2932 
2933         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2934         nodesz = sizes[i + 1] - sizes[i];
2935         switch (nodesz) {
2936         case 1:
2937 
2938           sum1 = b[row];
2939           for (n = 0; n < sz - 1; n += 2) {
2940             i1 = idx[0];
2941             i2 = idx[1];
2942             idx += 2;
2943             tmp0 = x[i1];
2944             tmp1 = x[i2];
2945             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2946             v1 += 2;
2947           }
2948 
2949           if (n == sz - 1) {
2950             tmp0 = x[*idx];
2951             sum1 -= *v1 * tmp0;
2952           }
2953           t[row]   = sum1;
2954           x[row++] = sum1 * (*ibdiag++);
2955           break;
2956         case 2:
2957           v2   = a->a + ii[row + 1];
2958           sum1 = b[row];
2959           sum2 = b[row + 1];
2960           for (n = 0; n < sz - 1; n += 2) {
2961             i1 = idx[0];
2962             i2 = idx[1];
2963             idx += 2;
2964             tmp0 = x[i1];
2965             tmp1 = x[i2];
2966             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2967             v1 += 2;
2968             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2969             v2 += 2;
2970           }
2971 
2972           if (n == sz - 1) {
2973             tmp0 = x[*idx];
2974             sum1 -= v1[0] * tmp0;
2975             sum2 -= v2[0] * tmp0;
2976           }
2977           t[row]     = sum1;
2978           t[row + 1] = sum2;
2979           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2980           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2981           ibdiag += 4;
2982           break;
2983         case 3:
2984           v2   = a->a + ii[row + 1];
2985           v3   = a->a + ii[row + 2];
2986           sum1 = b[row];
2987           sum2 = b[row + 1];
2988           sum3 = b[row + 2];
2989           for (n = 0; n < sz - 1; n += 2) {
2990             i1 = idx[0];
2991             i2 = idx[1];
2992             idx += 2;
2993             tmp0 = x[i1];
2994             tmp1 = x[i2];
2995             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2996             v1 += 2;
2997             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2998             v2 += 2;
2999             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3000             v3 += 2;
3001           }
3002 
3003           if (n == sz - 1) {
3004             tmp0 = x[*idx];
3005             sum1 -= v1[0] * tmp0;
3006             sum2 -= v2[0] * tmp0;
3007             sum3 -= v3[0] * tmp0;
3008           }
3009           t[row]     = sum1;
3010           t[row + 1] = sum2;
3011           t[row + 2] = sum3;
3012           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3013           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3014           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3015           ibdiag += 9;
3016           break;
3017         case 4:
3018           v2   = a->a + ii[row + 1];
3019           v3   = a->a + ii[row + 2];
3020           v4   = a->a + ii[row + 3];
3021           sum1 = b[row];
3022           sum2 = b[row + 1];
3023           sum3 = b[row + 2];
3024           sum4 = b[row + 3];
3025           for (n = 0; n < sz - 1; n += 2) {
3026             i1 = idx[0];
3027             i2 = idx[1];
3028             idx += 2;
3029             tmp0 = x[i1];
3030             tmp1 = x[i2];
3031             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3032             v1 += 2;
3033             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3034             v2 += 2;
3035             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3036             v3 += 2;
3037             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3038             v4 += 2;
3039           }
3040 
3041           if (n == sz - 1) {
3042             tmp0 = x[*idx];
3043             sum1 -= v1[0] * tmp0;
3044             sum2 -= v2[0] * tmp0;
3045             sum3 -= v3[0] * tmp0;
3046             sum4 -= v4[0] * tmp0;
3047           }
3048           t[row]     = sum1;
3049           t[row + 1] = sum2;
3050           t[row + 2] = sum3;
3051           t[row + 3] = sum4;
3052           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3053           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3054           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3055           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3056           ibdiag += 16;
3057           break;
3058         case 5:
3059           v2   = a->a + ii[row + 1];
3060           v3   = a->a + ii[row + 2];
3061           v4   = a->a + ii[row + 3];
3062           v5   = a->a + ii[row + 4];
3063           sum1 = b[row];
3064           sum2 = b[row + 1];
3065           sum3 = b[row + 2];
3066           sum4 = b[row + 3];
3067           sum5 = b[row + 4];
3068           for (n = 0; n < sz - 1; n += 2) {
3069             i1 = idx[0];
3070             i2 = idx[1];
3071             idx += 2;
3072             tmp0 = x[i1];
3073             tmp1 = x[i2];
3074             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3075             v1 += 2;
3076             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3077             v2 += 2;
3078             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3079             v3 += 2;
3080             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3081             v4 += 2;
3082             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3083             v5 += 2;
3084           }
3085 
3086           if (n == sz - 1) {
3087             tmp0 = x[*idx];
3088             sum1 -= v1[0] * tmp0;
3089             sum2 -= v2[0] * tmp0;
3090             sum3 -= v3[0] * tmp0;
3091             sum4 -= v4[0] * tmp0;
3092             sum5 -= v5[0] * tmp0;
3093           }
3094           t[row]     = sum1;
3095           t[row + 1] = sum2;
3096           t[row + 2] = sum3;
3097           t[row + 3] = sum4;
3098           t[row + 4] = sum5;
3099           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3100           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3101           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3102           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3103           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3104           ibdiag += 25;
3105           break;
3106         default:
3107           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3108         }
3109       }
3110 
3111       xb = t;
3112       PetscCall(PetscLogFlops(a->nz));
3113     } else xb = b;
3114     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3115       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3116       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3117         nodesz = sizes[i + 1] - sizes[i];
3118         ibdiag -= nodesz * nodesz;
3119         sz  = ii[row + 1] - diag[row] - 1;
3120         v1  = a->a + diag[row] + 1;
3121         idx = a->j + diag[row] + 1;
3122 
3123         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3124         switch (nodesz) {
3125         case 1:
3126 
3127           sum1 = xb[row];
3128           for (n = 0; n < sz - 1; n += 2) {
3129             i1 = idx[0];
3130             i2 = idx[1];
3131             idx += 2;
3132             tmp0 = x[i1];
3133             tmp1 = x[i2];
3134             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3135             v1 += 2;
3136           }
3137 
3138           if (n == sz - 1) {
3139             tmp0 = x[*idx];
3140             sum1 -= *v1 * tmp0;
3141           }
3142           x[row--] = sum1 * (*ibdiag);
3143           break;
3144 
3145         case 2:
3146 
3147           sum1 = xb[row];
3148           sum2 = xb[row - 1];
3149           /* note that sum1 is associated with the second of the two rows */
3150           v2 = a->a + diag[row - 1] + 2;
3151           for (n = 0; n < sz - 1; n += 2) {
3152             i1 = idx[0];
3153             i2 = idx[1];
3154             idx += 2;
3155             tmp0 = x[i1];
3156             tmp1 = x[i2];
3157             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3158             v1 += 2;
3159             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3160             v2 += 2;
3161           }
3162 
3163           if (n == sz - 1) {
3164             tmp0 = x[*idx];
3165             sum1 -= *v1 * tmp0;
3166             sum2 -= *v2 * tmp0;
3167           }
3168           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3169           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3170           break;
3171         case 3:
3172 
3173           sum1 = xb[row];
3174           sum2 = xb[row - 1];
3175           sum3 = xb[row - 2];
3176           v2   = a->a + diag[row - 1] + 2;
3177           v3   = a->a + diag[row - 2] + 3;
3178           for (n = 0; n < sz - 1; n += 2) {
3179             i1 = idx[0];
3180             i2 = idx[1];
3181             idx += 2;
3182             tmp0 = x[i1];
3183             tmp1 = x[i2];
3184             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3185             v1 += 2;
3186             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3187             v2 += 2;
3188             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3189             v3 += 2;
3190           }
3191 
3192           if (n == sz - 1) {
3193             tmp0 = x[*idx];
3194             sum1 -= *v1 * tmp0;
3195             sum2 -= *v2 * tmp0;
3196             sum3 -= *v3 * tmp0;
3197           }
3198           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3199           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3200           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3201           break;
3202         case 4:
3203 
3204           sum1 = xb[row];
3205           sum2 = xb[row - 1];
3206           sum3 = xb[row - 2];
3207           sum4 = xb[row - 3];
3208           v2   = a->a + diag[row - 1] + 2;
3209           v3   = a->a + diag[row - 2] + 3;
3210           v4   = a->a + diag[row - 3] + 4;
3211           for (n = 0; n < sz - 1; n += 2) {
3212             i1 = idx[0];
3213             i2 = idx[1];
3214             idx += 2;
3215             tmp0 = x[i1];
3216             tmp1 = x[i2];
3217             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3218             v1 += 2;
3219             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3220             v2 += 2;
3221             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3222             v3 += 2;
3223             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3224             v4 += 2;
3225           }
3226 
3227           if (n == sz - 1) {
3228             tmp0 = x[*idx];
3229             sum1 -= *v1 * tmp0;
3230             sum2 -= *v2 * tmp0;
3231             sum3 -= *v3 * tmp0;
3232             sum4 -= *v4 * tmp0;
3233           }
3234           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3235           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3236           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3237           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3238           break;
3239         case 5:
3240 
3241           sum1 = xb[row];
3242           sum2 = xb[row - 1];
3243           sum3 = xb[row - 2];
3244           sum4 = xb[row - 3];
3245           sum5 = xb[row - 4];
3246           v2   = a->a + diag[row - 1] + 2;
3247           v3   = a->a + diag[row - 2] + 3;
3248           v4   = a->a + diag[row - 3] + 4;
3249           v5   = a->a + diag[row - 4] + 5;
3250           for (n = 0; n < sz - 1; n += 2) {
3251             i1 = idx[0];
3252             i2 = idx[1];
3253             idx += 2;
3254             tmp0 = x[i1];
3255             tmp1 = x[i2];
3256             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3257             v1 += 2;
3258             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3259             v2 += 2;
3260             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3261             v3 += 2;
3262             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3263             v4 += 2;
3264             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3265             v5 += 2;
3266           }
3267 
3268           if (n == sz - 1) {
3269             tmp0 = x[*idx];
3270             sum1 -= *v1 * tmp0;
3271             sum2 -= *v2 * tmp0;
3272             sum3 -= *v3 * tmp0;
3273             sum4 -= *v4 * tmp0;
3274             sum5 -= *v5 * tmp0;
3275           }
3276           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3277           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3278           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3279           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3280           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3281           break;
3282         default:
3283           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3284         }
3285       }
3286 
3287       PetscCall(PetscLogFlops(a->nz));
3288     }
3289     its--;
3290   }
3291   while (its--) {
3292     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3293       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
3294         nodesz = sizes[i + 1] - sizes[i];
3295         sz     = diag[row] - ii[row];
3296         v1     = a->a + ii[row];
3297         idx    = a->j + ii[row];
3298         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3299         switch (nodesz) {
3300         case 1:
3301           sum1 = b[row];
3302           for (n = 0; n < sz - 1; n += 2) {
3303             i1 = idx[0];
3304             i2 = idx[1];
3305             idx += 2;
3306             tmp0 = x[i1];
3307             tmp1 = x[i2];
3308             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3309             v1 += 2;
3310           }
3311           if (n == sz - 1) {
3312             tmp0 = x[*idx++];
3313             sum1 -= *v1 * tmp0;
3314             v1++;
3315           }
3316           t[row] = sum1;
3317           sz     = ii[row + 1] - diag[row] - 1;
3318           idx    = a->j + diag[row] + 1;
3319           v1 += 1;
3320           for (n = 0; n < sz - 1; n += 2) {
3321             i1 = idx[0];
3322             i2 = idx[1];
3323             idx += 2;
3324             tmp0 = x[i1];
3325             tmp1 = x[i2];
3326             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3327             v1 += 2;
3328           }
3329           if (n == sz - 1) {
3330             tmp0 = x[*idx++];
3331             sum1 -= *v1 * tmp0;
3332           }
3333           /* in MatSOR_SeqAIJ this line would be
3334            *
3335            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3336            *
3337            * but omega == 1, so this becomes
3338            *
3339            * x[row] = sum1*(*ibdiag++);
3340            *
3341            */
3342           x[row] = sum1 * (*ibdiag);
3343           break;
3344         case 2:
3345           v2   = a->a + ii[row + 1];
3346           sum1 = b[row];
3347           sum2 = b[row + 1];
3348           for (n = 0; n < sz - 1; n += 2) {
3349             i1 = idx[0];
3350             i2 = idx[1];
3351             idx += 2;
3352             tmp0 = x[i1];
3353             tmp1 = x[i2];
3354             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3355             v1 += 2;
3356             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3357             v2 += 2;
3358           }
3359           if (n == sz - 1) {
3360             tmp0 = x[*idx++];
3361             sum1 -= v1[0] * tmp0;
3362             sum2 -= v2[0] * tmp0;
3363             v1++;
3364             v2++;
3365           }
3366           t[row]     = sum1;
3367           t[row + 1] = sum2;
3368           sz         = ii[row + 1] - diag[row] - 2;
3369           idx        = a->j + diag[row] + 2;
3370           v1 += 2;
3371           v2 += 2;
3372           for (n = 0; n < sz - 1; n += 2) {
3373             i1 = idx[0];
3374             i2 = idx[1];
3375             idx += 2;
3376             tmp0 = x[i1];
3377             tmp1 = x[i2];
3378             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3379             v1 += 2;
3380             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3381             v2 += 2;
3382           }
3383           if (n == sz - 1) {
3384             tmp0 = x[*idx];
3385             sum1 -= v1[0] * tmp0;
3386             sum2 -= v2[0] * tmp0;
3387           }
3388           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3389           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3390           break;
3391         case 3:
3392           v2   = a->a + ii[row + 1];
3393           v3   = a->a + ii[row + 2];
3394           sum1 = b[row];
3395           sum2 = b[row + 1];
3396           sum3 = b[row + 2];
3397           for (n = 0; n < sz - 1; n += 2) {
3398             i1 = idx[0];
3399             i2 = idx[1];
3400             idx += 2;
3401             tmp0 = x[i1];
3402             tmp1 = x[i2];
3403             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3404             v1 += 2;
3405             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3406             v2 += 2;
3407             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3408             v3 += 2;
3409           }
3410           if (n == sz - 1) {
3411             tmp0 = x[*idx++];
3412             sum1 -= v1[0] * tmp0;
3413             sum2 -= v2[0] * tmp0;
3414             sum3 -= v3[0] * tmp0;
3415             v1++;
3416             v2++;
3417             v3++;
3418           }
3419           t[row]     = sum1;
3420           t[row + 1] = sum2;
3421           t[row + 2] = sum3;
3422           sz         = ii[row + 1] - diag[row] - 3;
3423           idx        = a->j + diag[row] + 3;
3424           v1 += 3;
3425           v2 += 3;
3426           v3 += 3;
3427           for (n = 0; n < sz - 1; n += 2) {
3428             i1 = idx[0];
3429             i2 = idx[1];
3430             idx += 2;
3431             tmp0 = x[i1];
3432             tmp1 = x[i2];
3433             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3434             v1 += 2;
3435             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3436             v2 += 2;
3437             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3438             v3 += 2;
3439           }
3440           if (n == sz - 1) {
3441             tmp0 = x[*idx];
3442             sum1 -= v1[0] * tmp0;
3443             sum2 -= v2[0] * tmp0;
3444             sum3 -= v3[0] * tmp0;
3445           }
3446           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3447           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3448           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3449           break;
3450         case 4:
3451           v2   = a->a + ii[row + 1];
3452           v3   = a->a + ii[row + 2];
3453           v4   = a->a + ii[row + 3];
3454           sum1 = b[row];
3455           sum2 = b[row + 1];
3456           sum3 = b[row + 2];
3457           sum4 = b[row + 3];
3458           for (n = 0; n < sz - 1; n += 2) {
3459             i1 = idx[0];
3460             i2 = idx[1];
3461             idx += 2;
3462             tmp0 = x[i1];
3463             tmp1 = x[i2];
3464             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3465             v1 += 2;
3466             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3467             v2 += 2;
3468             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3469             v3 += 2;
3470             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3471             v4 += 2;
3472           }
3473           if (n == sz - 1) {
3474             tmp0 = x[*idx++];
3475             sum1 -= v1[0] * tmp0;
3476             sum2 -= v2[0] * tmp0;
3477             sum3 -= v3[0] * tmp0;
3478             sum4 -= v4[0] * tmp0;
3479             v1++;
3480             v2++;
3481             v3++;
3482             v4++;
3483           }
3484           t[row]     = sum1;
3485           t[row + 1] = sum2;
3486           t[row + 2] = sum3;
3487           t[row + 3] = sum4;
3488           sz         = ii[row + 1] - diag[row] - 4;
3489           idx        = a->j + diag[row] + 4;
3490           v1 += 4;
3491           v2 += 4;
3492           v3 += 4;
3493           v4 += 4;
3494           for (n = 0; n < sz - 1; n += 2) {
3495             i1 = idx[0];
3496             i2 = idx[1];
3497             idx += 2;
3498             tmp0 = x[i1];
3499             tmp1 = x[i2];
3500             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3501             v1 += 2;
3502             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3503             v2 += 2;
3504             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3505             v3 += 2;
3506             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3507             v4 += 2;
3508           }
3509           if (n == sz - 1) {
3510             tmp0 = x[*idx];
3511             sum1 -= v1[0] * tmp0;
3512             sum2 -= v2[0] * tmp0;
3513             sum3 -= v3[0] * tmp0;
3514             sum4 -= v4[0] * tmp0;
3515           }
3516           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3517           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3518           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3519           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3520           break;
3521         case 5:
3522           v2   = a->a + ii[row + 1];
3523           v3   = a->a + ii[row + 2];
3524           v4   = a->a + ii[row + 3];
3525           v5   = a->a + ii[row + 4];
3526           sum1 = b[row];
3527           sum2 = b[row + 1];
3528           sum3 = b[row + 2];
3529           sum4 = b[row + 3];
3530           sum5 = b[row + 4];
3531           for (n = 0; n < sz - 1; n += 2) {
3532             i1 = idx[0];
3533             i2 = idx[1];
3534             idx += 2;
3535             tmp0 = x[i1];
3536             tmp1 = x[i2];
3537             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3538             v1 += 2;
3539             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3540             v2 += 2;
3541             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3542             v3 += 2;
3543             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3544             v4 += 2;
3545             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3546             v5 += 2;
3547           }
3548           if (n == sz - 1) {
3549             tmp0 = x[*idx++];
3550             sum1 -= v1[0] * tmp0;
3551             sum2 -= v2[0] * tmp0;
3552             sum3 -= v3[0] * tmp0;
3553             sum4 -= v4[0] * tmp0;
3554             sum5 -= v5[0] * tmp0;
3555             v1++;
3556             v2++;
3557             v3++;
3558             v4++;
3559             v5++;
3560           }
3561           t[row]     = sum1;
3562           t[row + 1] = sum2;
3563           t[row + 2] = sum3;
3564           t[row + 3] = sum4;
3565           t[row + 4] = sum5;
3566           sz         = ii[row + 1] - diag[row] - 5;
3567           idx        = a->j + diag[row] + 5;
3568           v1 += 5;
3569           v2 += 5;
3570           v3 += 5;
3571           v4 += 5;
3572           v5 += 5;
3573           for (n = 0; n < sz - 1; n += 2) {
3574             i1 = idx[0];
3575             i2 = idx[1];
3576             idx += 2;
3577             tmp0 = x[i1];
3578             tmp1 = x[i2];
3579             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3580             v1 += 2;
3581             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3582             v2 += 2;
3583             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3584             v3 += 2;
3585             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3586             v4 += 2;
3587             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3588             v5 += 2;
3589           }
3590           if (n == sz - 1) {
3591             tmp0 = x[*idx];
3592             sum1 -= v1[0] * tmp0;
3593             sum2 -= v2[0] * tmp0;
3594             sum3 -= v3[0] * tmp0;
3595             sum4 -= v4[0] * tmp0;
3596             sum5 -= v5[0] * tmp0;
3597           }
3598           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3599           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3600           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3601           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3602           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3603           break;
3604         default:
3605           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3606         }
3607       }
3608       xb = t;
3609       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3610     } else xb = b;
3611 
3612     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3613       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3614       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3615         nodesz = sizes[i + 1] - sizes[i];
3616         ibdiag -= nodesz * nodesz;
3617 
3618         /* set RHS */
3619         if (xb == b) {
3620           /* whole (old way) */
3621           sz  = ii[row + 1] - ii[row];
3622           idx = a->j + ii[row];
3623           switch (nodesz) {
3624           case 5:
3625             v5 = a->a + ii[row - 4]; /* fall through */
3626           case 4:
3627             v4 = a->a + ii[row - 3]; /* fall through */
3628           case 3:
3629             v3 = a->a + ii[row - 2]; /* fall through */
3630           case 2:
3631             v2 = a->a + ii[row - 1]; /* fall through */
3632           case 1:
3633             v1 = a->a + ii[row];
3634             break;
3635           default:
3636             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3637           }
3638         } else {
3639           /* upper, no diag */
3640           sz  = ii[row + 1] - diag[row] - 1;
3641           idx = a->j + diag[row] + 1;
3642           switch (nodesz) {
3643           case 5:
3644             v5 = a->a + diag[row - 4] + 5; /* fall through */
3645           case 4:
3646             v4 = a->a + diag[row - 3] + 4; /* fall through */
3647           case 3:
3648             v3 = a->a + diag[row - 2] + 3; /* fall through */
3649           case 2:
3650             v2 = a->a + diag[row - 1] + 2; /* fall through */
3651           case 1:
3652             v1 = a->a + diag[row] + 1;
3653           }
3654         }
3655         /* set sum */
3656         switch (nodesz) {
3657         case 5:
3658           sum5 = xb[row - 4]; /* fall through */
3659         case 4:
3660           sum4 = xb[row - 3]; /* fall through */
3661         case 3:
3662           sum3 = xb[row - 2]; /* fall through */
3663         case 2:
3664           sum2 = xb[row - 1]; /* fall through */
3665         case 1:
3666           /* note that sum1 is associated with the last row */
3667           sum1 = xb[row];
3668         }
3669         /* do sums */
3670         for (n = 0; n < sz - 1; n += 2) {
3671           i1 = idx[0];
3672           i2 = idx[1];
3673           idx += 2;
3674           tmp0 = x[i1];
3675           tmp1 = x[i2];
3676           switch (nodesz) {
3677           case 5:
3678             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3679             v5 += 2; /* fall through */
3680           case 4:
3681             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3682             v4 += 2; /* fall through */
3683           case 3:
3684             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3685             v3 += 2; /* fall through */
3686           case 2:
3687             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3688             v2 += 2; /* fall through */
3689           case 1:
3690             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3691             v1 += 2;
3692           }
3693         }
3694         /* ragged edge */
3695         if (n == sz - 1) {
3696           tmp0 = x[*idx];
3697           switch (nodesz) {
3698           case 5:
3699             sum5 -= *v5 * tmp0; /* fall through */
3700           case 4:
3701             sum4 -= *v4 * tmp0; /* fall through */
3702           case 3:
3703             sum3 -= *v3 * tmp0; /* fall through */
3704           case 2:
3705             sum2 -= *v2 * tmp0; /* fall through */
3706           case 1:
3707             sum1 -= *v1 * tmp0;
3708           }
3709         }
3710         /* update */
3711         if (xb == b) {
3712           /* whole (old way) w/ diag */
3713           switch (nodesz) {
3714           case 5:
3715             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3716             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3717             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3718             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3719             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3720             break;
3721           case 4:
3722             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3723             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3724             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3725             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3726             break;
3727           case 3:
3728             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3729             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3730             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3731             break;
3732           case 2:
3733             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3734             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3735             break;
3736           case 1:
3737             x[row--] += sum1 * (*ibdiag);
3738             break;
3739           }
3740         } else {
3741           /* no diag so set =  */
3742           switch (nodesz) {
3743           case 5:
3744             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3745             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3746             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3747             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3748             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3749             break;
3750           case 4:
3751             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3752             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3753             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3754             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3755             break;
3756           case 3:
3757             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3758             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3759             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3760             break;
3761           case 2:
3762             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3763             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3764             break;
3765           case 1:
3766             x[row--] = sum1 * (*ibdiag);
3767             break;
3768           }
3769         }
3770       }
3771       if (xb == b) {
3772         PetscCall(PetscLogFlops(2.0 * a->nz));
3773       } else {
3774         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3775       }
3776     }
3777   }
3778   if (flag & SOR_EISENSTAT) {
3779     /*
3780           Apply  (U + D)^-1  where D is now the block diagonal
3781     */
3782     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3783     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3784       nodesz = sizes[i + 1] - sizes[i];
3785       ibdiag -= nodesz * nodesz;
3786       sz  = ii[row + 1] - diag[row] - 1;
3787       v1  = a->a + diag[row] + 1;
3788       idx = a->j + diag[row] + 1;
3789       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3790       switch (nodesz) {
3791       case 1:
3792 
3793         sum1 = b[row];
3794         for (n = 0; n < sz - 1; n += 2) {
3795           i1 = idx[0];
3796           i2 = idx[1];
3797           idx += 2;
3798           tmp0 = x[i1];
3799           tmp1 = x[i2];
3800           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3801           v1 += 2;
3802         }
3803 
3804         if (n == sz - 1) {
3805           tmp0 = x[*idx];
3806           sum1 -= *v1 * tmp0;
3807         }
3808         x[row] = sum1 * (*ibdiag);
3809         row--;
3810         break;
3811 
3812       case 2:
3813 
3814         sum1 = b[row];
3815         sum2 = b[row - 1];
3816         /* note that sum1 is associated with the second of the two rows */
3817         v2 = a->a + diag[row - 1] + 2;
3818         for (n = 0; n < sz - 1; n += 2) {
3819           i1 = idx[0];
3820           i2 = idx[1];
3821           idx += 2;
3822           tmp0 = x[i1];
3823           tmp1 = x[i2];
3824           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3825           v1 += 2;
3826           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3827           v2 += 2;
3828         }
3829 
3830         if (n == sz - 1) {
3831           tmp0 = x[*idx];
3832           sum1 -= *v1 * tmp0;
3833           sum2 -= *v2 * tmp0;
3834         }
3835         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3836         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3837         row -= 2;
3838         break;
3839       case 3:
3840 
3841         sum1 = b[row];
3842         sum2 = b[row - 1];
3843         sum3 = b[row - 2];
3844         v2   = a->a + diag[row - 1] + 2;
3845         v3   = a->a + diag[row - 2] + 3;
3846         for (n = 0; n < sz - 1; n += 2) {
3847           i1 = idx[0];
3848           i2 = idx[1];
3849           idx += 2;
3850           tmp0 = x[i1];
3851           tmp1 = x[i2];
3852           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3853           v1 += 2;
3854           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3855           v2 += 2;
3856           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3857           v3 += 2;
3858         }
3859 
3860         if (n == sz - 1) {
3861           tmp0 = x[*idx];
3862           sum1 -= *v1 * tmp0;
3863           sum2 -= *v2 * tmp0;
3864           sum3 -= *v3 * tmp0;
3865         }
3866         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3867         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3868         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3869         row -= 3;
3870         break;
3871       case 4:
3872 
3873         sum1 = b[row];
3874         sum2 = b[row - 1];
3875         sum3 = b[row - 2];
3876         sum4 = b[row - 3];
3877         v2   = a->a + diag[row - 1] + 2;
3878         v3   = a->a + diag[row - 2] + 3;
3879         v4   = a->a + diag[row - 3] + 4;
3880         for (n = 0; n < sz - 1; n += 2) {
3881           i1 = idx[0];
3882           i2 = idx[1];
3883           idx += 2;
3884           tmp0 = x[i1];
3885           tmp1 = x[i2];
3886           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3887           v1 += 2;
3888           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3889           v2 += 2;
3890           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3891           v3 += 2;
3892           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3893           v4 += 2;
3894         }
3895 
3896         if (n == sz - 1) {
3897           tmp0 = x[*idx];
3898           sum1 -= *v1 * tmp0;
3899           sum2 -= *v2 * tmp0;
3900           sum3 -= *v3 * tmp0;
3901           sum4 -= *v4 * tmp0;
3902         }
3903         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3904         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3905         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3906         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3907         row -= 4;
3908         break;
3909       case 5:
3910 
3911         sum1 = b[row];
3912         sum2 = b[row - 1];
3913         sum3 = b[row - 2];
3914         sum4 = b[row - 3];
3915         sum5 = b[row - 4];
3916         v2   = a->a + diag[row - 1] + 2;
3917         v3   = a->a + diag[row - 2] + 3;
3918         v4   = a->a + diag[row - 3] + 4;
3919         v5   = a->a + diag[row - 4] + 5;
3920         for (n = 0; n < sz - 1; n += 2) {
3921           i1 = idx[0];
3922           i2 = idx[1];
3923           idx += 2;
3924           tmp0 = x[i1];
3925           tmp1 = x[i2];
3926           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3927           v1 += 2;
3928           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3929           v2 += 2;
3930           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3931           v3 += 2;
3932           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3933           v4 += 2;
3934           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3935           v5 += 2;
3936         }
3937 
3938         if (n == sz - 1) {
3939           tmp0 = x[*idx];
3940           sum1 -= *v1 * tmp0;
3941           sum2 -= *v2 * tmp0;
3942           sum3 -= *v3 * tmp0;
3943           sum4 -= *v4 * tmp0;
3944           sum5 -= *v5 * tmp0;
3945         }
3946         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3947         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3948         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3949         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3950         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3951         row -= 5;
3952         break;
3953       default:
3954         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3955       }
3956     }
3957     PetscCall(PetscLogFlops(a->nz));
3958 
3959     /*
3960            t = b - D x    where D is the block diagonal
3961     */
3962     cnt = 0;
3963     for (i = 0, row = 0; i < m; i++) {
3964       nodesz = sizes[i + 1] - sizes[i];
3965       switch (nodesz) {
3966       case 1:
3967         t[row] = b[row] - bdiag[cnt++] * x[row];
3968         row++;
3969         break;
3970       case 2:
3971         x1         = x[row];
3972         x2         = x[row + 1];
3973         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3974         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3975         t[row]     = b[row] - tmp1;
3976         t[row + 1] = b[row + 1] - tmp2;
3977         row += 2;
3978         cnt += 4;
3979         break;
3980       case 3:
3981         x1         = x[row];
3982         x2         = x[row + 1];
3983         x3         = x[row + 2];
3984         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3985         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3986         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3987         t[row]     = b[row] - tmp1;
3988         t[row + 1] = b[row + 1] - tmp2;
3989         t[row + 2] = b[row + 2] - tmp3;
3990         row += 3;
3991         cnt += 9;
3992         break;
3993       case 4:
3994         x1         = x[row];
3995         x2         = x[row + 1];
3996         x3         = x[row + 2];
3997         x4         = x[row + 3];
3998         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3999         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4000         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4001         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4002         t[row]     = b[row] - tmp1;
4003         t[row + 1] = b[row + 1] - tmp2;
4004         t[row + 2] = b[row + 2] - tmp3;
4005         t[row + 3] = b[row + 3] - tmp4;
4006         row += 4;
4007         cnt += 16;
4008         break;
4009       case 5:
4010         x1         = x[row];
4011         x2         = x[row + 1];
4012         x3         = x[row + 2];
4013         x4         = x[row + 3];
4014         x5         = x[row + 4];
4015         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4016         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4017         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4018         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4019         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4020         t[row]     = b[row] - tmp1;
4021         t[row + 1] = b[row + 1] - tmp2;
4022         t[row + 2] = b[row + 2] - tmp3;
4023         t[row + 3] = b[row + 3] - tmp4;
4024         t[row + 4] = b[row + 4] - tmp5;
4025         row += 5;
4026         cnt += 25;
4027         break;
4028       default:
4029         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4030       }
4031     }
4032     PetscCall(PetscLogFlops(m));
4033 
4034     /*
4035           Apply (L + D)^-1 where D is the block diagonal
4036     */
4037     for (i = 0, row = 0; i < m; i++) {
4038       nodesz = sizes[i + 1] - sizes[i];
4039       sz     = diag[row] - ii[row];
4040       v1     = a->a + ii[row];
4041       idx    = a->j + ii[row];
4042       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4043       switch (nodesz) {
4044       case 1:
4045 
4046         sum1 = t[row];
4047         for (n = 0; n < sz - 1; n += 2) {
4048           i1 = idx[0];
4049           i2 = idx[1];
4050           idx += 2;
4051           tmp0 = t[i1];
4052           tmp1 = t[i2];
4053           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4054           v1 += 2;
4055         }
4056 
4057         if (n == sz - 1) {
4058           tmp0 = t[*idx];
4059           sum1 -= *v1 * tmp0;
4060         }
4061         x[row] += t[row] = sum1 * (*ibdiag++);
4062         row++;
4063         break;
4064       case 2:
4065         v2   = a->a + ii[row + 1];
4066         sum1 = t[row];
4067         sum2 = t[row + 1];
4068         for (n = 0; n < sz - 1; n += 2) {
4069           i1 = idx[0];
4070           i2 = idx[1];
4071           idx += 2;
4072           tmp0 = t[i1];
4073           tmp1 = t[i2];
4074           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4075           v1 += 2;
4076           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4077           v2 += 2;
4078         }
4079 
4080         if (n == sz - 1) {
4081           tmp0 = t[*idx];
4082           sum1 -= v1[0] * tmp0;
4083           sum2 -= v2[0] * tmp0;
4084         }
4085         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4086         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4087         ibdiag += 4;
4088         row += 2;
4089         break;
4090       case 3:
4091         v2   = a->a + ii[row + 1];
4092         v3   = a->a + ii[row + 2];
4093         sum1 = t[row];
4094         sum2 = t[row + 1];
4095         sum3 = t[row + 2];
4096         for (n = 0; n < sz - 1; n += 2) {
4097           i1 = idx[0];
4098           i2 = idx[1];
4099           idx += 2;
4100           tmp0 = t[i1];
4101           tmp1 = t[i2];
4102           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4103           v1 += 2;
4104           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4105           v2 += 2;
4106           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4107           v3 += 2;
4108         }
4109 
4110         if (n == sz - 1) {
4111           tmp0 = t[*idx];
4112           sum1 -= v1[0] * tmp0;
4113           sum2 -= v2[0] * tmp0;
4114           sum3 -= v3[0] * tmp0;
4115         }
4116         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4117         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4118         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4119         ibdiag += 9;
4120         row += 3;
4121         break;
4122       case 4:
4123         v2   = a->a + ii[row + 1];
4124         v3   = a->a + ii[row + 2];
4125         v4   = a->a + ii[row + 3];
4126         sum1 = t[row];
4127         sum2 = t[row + 1];
4128         sum3 = t[row + 2];
4129         sum4 = t[row + 3];
4130         for (n = 0; n < sz - 1; n += 2) {
4131           i1 = idx[0];
4132           i2 = idx[1];
4133           idx += 2;
4134           tmp0 = t[i1];
4135           tmp1 = t[i2];
4136           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4137           v1 += 2;
4138           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4139           v2 += 2;
4140           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4141           v3 += 2;
4142           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4143           v4 += 2;
4144         }
4145 
4146         if (n == sz - 1) {
4147           tmp0 = t[*idx];
4148           sum1 -= v1[0] * tmp0;
4149           sum2 -= v2[0] * tmp0;
4150           sum3 -= v3[0] * tmp0;
4151           sum4 -= v4[0] * tmp0;
4152         }
4153         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4154         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4155         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4156         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4157         ibdiag += 16;
4158         row += 4;
4159         break;
4160       case 5:
4161         v2   = a->a + ii[row + 1];
4162         v3   = a->a + ii[row + 2];
4163         v4   = a->a + ii[row + 3];
4164         v5   = a->a + ii[row + 4];
4165         sum1 = t[row];
4166         sum2 = t[row + 1];
4167         sum3 = t[row + 2];
4168         sum4 = t[row + 3];
4169         sum5 = t[row + 4];
4170         for (n = 0; n < sz - 1; n += 2) {
4171           i1 = idx[0];
4172           i2 = idx[1];
4173           idx += 2;
4174           tmp0 = t[i1];
4175           tmp1 = t[i2];
4176           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4177           v1 += 2;
4178           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4179           v2 += 2;
4180           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4181           v3 += 2;
4182           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4183           v4 += 2;
4184           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4185           v5 += 2;
4186         }
4187 
4188         if (n == sz - 1) {
4189           tmp0 = t[*idx];
4190           sum1 -= v1[0] * tmp0;
4191           sum2 -= v2[0] * tmp0;
4192           sum3 -= v3[0] * tmp0;
4193           sum4 -= v4[0] * tmp0;
4194           sum5 -= v5[0] * tmp0;
4195         }
4196         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4197         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4198         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4199         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4200         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4201         ibdiag += 25;
4202         row += 5;
4203         break;
4204       default:
4205         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4206       }
4207     }
4208     PetscCall(PetscLogFlops(a->nz));
4209   }
4210   PetscCall(VecRestoreArray(xx, &x));
4211   PetscCall(VecRestoreArrayRead(bb, &b));
4212   PetscFunctionReturn(PETSC_SUCCESS);
4213 }
4214 
4215 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4216 {
4217   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4218   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4219   const MatScalar   *bdiag = a->inode.bdiag;
4220   const PetscScalar *b;
4221   PetscInt           m = a->inode.node_count, cnt = 0, i, row, nodesz;
4222   const PetscInt    *sizes = a->inode.size_csr;
4223 
4224   PetscFunctionBegin;
4225   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4226   PetscCall(VecGetArray(xx, &x));
4227   PetscCall(VecGetArrayRead(bb, &b));
4228   cnt = 0;
4229   for (i = 0, row = 0; i < m; i++) {
4230     nodesz = sizes[i + 1] - sizes[i];
4231     switch (nodesz) {
4232     case 1:
4233       x[row] = b[row] * bdiag[cnt++];
4234       row++;
4235       break;
4236     case 2:
4237       x1       = b[row];
4238       x2       = b[row + 1];
4239       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4240       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4241       x[row++] = tmp1;
4242       x[row++] = tmp2;
4243       cnt += 4;
4244       break;
4245     case 3:
4246       x1       = b[row];
4247       x2       = b[row + 1];
4248       x3       = b[row + 2];
4249       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4250       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4251       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4252       x[row++] = tmp1;
4253       x[row++] = tmp2;
4254       x[row++] = tmp3;
4255       cnt += 9;
4256       break;
4257     case 4:
4258       x1       = b[row];
4259       x2       = b[row + 1];
4260       x3       = b[row + 2];
4261       x4       = b[row + 3];
4262       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4263       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4264       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4265       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4266       x[row++] = tmp1;
4267       x[row++] = tmp2;
4268       x[row++] = tmp3;
4269       x[row++] = tmp4;
4270       cnt += 16;
4271       break;
4272     case 5:
4273       x1       = b[row];
4274       x2       = b[row + 1];
4275       x3       = b[row + 2];
4276       x4       = b[row + 3];
4277       x5       = b[row + 4];
4278       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4279       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4280       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4281       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4282       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4283       x[row++] = tmp1;
4284       x[row++] = tmp2;
4285       x[row++] = tmp3;
4286       x[row++] = tmp4;
4287       x[row++] = tmp5;
4288       cnt += 25;
4289       break;
4290     default:
4291       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4292     }
4293   }
4294   PetscCall(PetscLogFlops(2.0 * cnt));
4295   PetscCall(VecRestoreArray(xx, &x));
4296   PetscCall(VecRestoreArrayRead(bb, &b));
4297   PetscFunctionReturn(PETSC_SUCCESS);
4298 }
4299 
4300 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4301 {
4302   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4303 
4304   PetscFunctionBegin;
4305   a->inode.node_count       = 0;
4306   a->inode.use              = PETSC_FALSE;
4307   a->inode.checked          = PETSC_FALSE;
4308   a->inode.mat_nonzerostate = -1;
4309   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4310   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4311   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4312   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4313   A->ops->coloringpatch     = NULL;
4314   A->ops->multdiagonalblock = NULL;
4315   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4316   PetscFunctionReturn(PETSC_SUCCESS);
4317 }
4318 
4319 /*
4320     samestructure indicates that the matrix has not changed its nonzero structure so we
4321     do not need to recompute the inodes
4322 */
4323 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4324 {
4325   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4326   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4327   PetscBool       flag;
4328   const PetscInt *idx, *idy, *ii;
4329 
4330   PetscFunctionBegin;
4331   if (!a->inode.use) {
4332     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4333     PetscCall(PetscFree(a->inode.size_csr));
4334     PetscFunctionReturn(PETSC_SUCCESS);
4335   }
4336   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4337 
4338   m = A->rmap->n;
4339   if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
4340   ns    = a->inode.size_csr;
4341   ns[0] = 0;
4342 
4343   i          = 0;
4344   node_count = 0;
4345   idx        = a->j;
4346   ii         = a->i;
4347   if (idx) {
4348     while (i < m) {            /* For each row */
4349       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4350       /* Limits the number of elements in a node to 'a->inode.limit' */
4351       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4352         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4353         if (nzy != nzx) break;
4354         idy += nzx; /* Same nonzero pattern */
4355         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4356         if (!flag) break;
4357       }
4358       ns[node_count + 1] = ns[node_count] + blk_size;
4359       node_count++;
4360       idx += blk_size * nzx;
4361       i = j;
4362     }
4363   }
4364   /* If not enough inodes found,, do not use inode version of the routines */
4365   if (!m || !idx || node_count > .8 * m) {
4366     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4367     PetscCall(PetscFree(a->inode.size_csr));
4368     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4369   } else {
4370     if (!A->factortype) {
4371       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4372       if (A->rmap->n == A->cmap->n) {
4373         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4374         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4375         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4376         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4377         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4378       }
4379     } else {
4380       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4381     }
4382     a->inode.node_count = node_count;
4383     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4384   }
4385   a->inode.checked          = PETSC_TRUE;
4386   a->inode.mat_nonzerostate = A->nonzerostate;
4387   PetscFunctionReturn(PETSC_SUCCESS);
4388 }
4389 
4390 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4391 {
4392   Mat         B = *C;
4393   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4394   PetscInt    m = A->rmap->n;
4395 
4396   PetscFunctionBegin;
4397   c->inode.use              = a->inode.use;
4398   c->inode.limit            = a->inode.limit;
4399   c->inode.max_limit        = a->inode.max_limit;
4400   c->inode.checked          = PETSC_FALSE;
4401   c->inode.size_csr         = NULL;
4402   c->inode.node_count       = 0;
4403   c->inode.ibdiagvalid      = PETSC_FALSE;
4404   c->inode.ibdiag           = NULL;
4405   c->inode.bdiag            = NULL;
4406   c->inode.mat_nonzerostate = -1;
4407   if (a->inode.use) {
4408     if (a->inode.checked && a->inode.size_csr) {
4409       PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4410       PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4411 
4412       c->inode.checked          = PETSC_TRUE;
4413       c->inode.node_count       = a->inode.node_count;
4414       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4415     }
4416     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4417     if (!B->factortype) {
4418       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4419       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4420       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4421       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4422       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4423       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4424     } else {
4425       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4426     }
4427   }
4428   PetscFunctionReturn(PETSC_SUCCESS);
4429 }
4430 
4431 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4432 {
4433   PetscInt        k;
4434   const PetscInt *vi;
4435 
4436   PetscFunctionBegin;
4437   vi = aj + ai[row];
4438   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4439   vi        = aj + adiag[row];
4440   cols[nzl] = vi[0];
4441   vi        = aj + adiag[row + 1] + 1;
4442   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4443   PetscFunctionReturn(PETSC_SUCCESS);
4444 }
4445 /*
4446    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4447    Modified from MatSeqAIJCheckInode().
4448 
4449    Input Parameters:
4450 .  Mat A - ILU or LU matrix factor
4451 
4452 */
4453 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4454 {
4455   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4456   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4457   PetscInt       *cols1, *cols2, *ns;
4458   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4459   PetscBool       flag;
4460 
4461   PetscFunctionBegin;
4462   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4463   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4464 
4465   m = A->rmap->n;
4466   if (a->inode.size_csr) ns = a->inode.size_csr;
4467   else PetscCall(PetscMalloc1(m + 1, &ns));
4468   ns[0] = 0;
4469 
4470   i          = 0;
4471   node_count = 0;
4472   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4473   while (i < m) {                       /* For each row */
4474     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4475     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4476     nzx  = nzl1 + nzu1 + 1;
4477     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4478 
4479     /* Limits the number of elements in a node to 'a->inode.limit' */
4480     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4481       nzl2 = ai[j + 1] - ai[j];
4482       nzu2 = adiag[j] - adiag[j + 1] - 1;
4483       nzy  = nzl2 + nzu2 + 1;
4484       if (nzy != nzx) break;
4485       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4486       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4487       if (!flag) break;
4488     }
4489     ns[node_count + 1] = ns[node_count] + blk_size;
4490     node_count++;
4491     i = j;
4492   }
4493   PetscCall(PetscFree2(cols1, cols2));
4494   /* If not enough inodes found,, do not use inode version of the routines */
4495   if (!m || node_count > .8 * m) {
4496     PetscCall(PetscFree(ns));
4497 
4498     a->inode.node_count = 0;
4499     a->inode.size_csr   = NULL;
4500     a->inode.use        = PETSC_FALSE;
4501 
4502     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4503   } else {
4504     A->ops->mult              = NULL;
4505     A->ops->sor               = NULL;
4506     A->ops->multadd           = NULL;
4507     A->ops->getrowij          = NULL;
4508     A->ops->restorerowij      = NULL;
4509     A->ops->getcolumnij       = NULL;
4510     A->ops->restorecolumnij   = NULL;
4511     A->ops->coloringpatch     = NULL;
4512     A->ops->multdiagonalblock = NULL;
4513     a->inode.node_count       = node_count;
4514     a->inode.size_csr         = ns;
4515     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4516   }
4517   a->inode.checked = PETSC_TRUE;
4518   PetscFunctionReturn(PETSC_SUCCESS);
4519 }
4520 
4521 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4522 {
4523   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4524 
4525   PetscFunctionBegin;
4526   a->inode.ibdiagvalid = PETSC_FALSE;
4527   PetscFunctionReturn(PETSC_SUCCESS);
4528 }
4529 
4530 /*
4531      This is really ugly. if inodes are used this replaces the
4532   permutations with ones that correspond to rows/cols of the matrix
4533   rather than inode blocks
4534 */
4535 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4536 {
4537   PetscFunctionBegin;
4538   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4539   PetscFunctionReturn(PETSC_SUCCESS);
4540 }
4541 
4542 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4543 {
4544   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4545   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4546   const PetscInt *ridx, *cidx;
4547   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
4548   PetscInt        nslim_col, *ns_col;
4549   IS              ris = *rperm, cis = *cperm;
4550 
4551   PetscFunctionBegin;
4552   if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS);       /* no inodes so return */
4553   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4554 
4555   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4556   PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4557   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4558 
4559   PetscCall(ISGetIndices(ris, &ridx));
4560   PetscCall(ISGetIndices(cis, &cidx));
4561 
4562   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4563   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
4564 
4565   /* Construct the permutations for rows*/
4566   for (i = 0, row = 0; i < nslim_row; ++i) {
4567     indx      = ridx[i];
4568     start_val = tns[indx];
4569     end_val   = tns[indx + 1];
4570     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4571   }
4572 
4573   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4574   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
4575 
4576   /* Construct permutations for columns */
4577   for (i = 0, col = 0; i < nslim_col; ++i) {
4578     indx      = cidx[i];
4579     start_val = tns[indx];
4580     end_val   = tns[indx + 1];
4581     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4582   }
4583 
4584   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4585   PetscCall(ISSetPermutation(*rperm));
4586   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4587   PetscCall(ISSetPermutation(*cperm));
4588 
4589   PetscCall(ISRestoreIndices(ris, &ridx));
4590   PetscCall(ISRestoreIndices(cis, &cidx));
4591 
4592   PetscCall(PetscFree(ns_col));
4593   PetscCall(PetscFree2(permr, permc));
4594   PetscCall(ISDestroy(&cis));
4595   PetscCall(ISDestroy(&ris));
4596   PetscCall(PetscFree(tns));
4597   PetscFunctionReturn(PETSC_SUCCESS);
4598 }
4599 
4600 /*@C
4601   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4602 
4603   Not Collective
4604 
4605   Input Parameter:
4606 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4607 
4608   Output Parameters:
4609 + node_count - no of inodes present in the matrix.
4610 . sizes      - an array of size `node_count`, with the sizes of each inode.
4611 - limit      - the max size used to generate the inodes.
4612 
4613   Level: advanced
4614 
4615   Note:
4616   It should be called after the matrix is assembled.
4617   The contents of the sizes[] array should not be changed.
4618   `NULL` may be passed for information not needed
4619 
4620 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4621 @*/
4622 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4623 {
4624   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4625 
4626   PetscFunctionBegin;
4627   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4628   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4629   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4630   PetscFunctionReturn(PETSC_SUCCESS);
4631 }
4632 
4633 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4634 {
4635   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4636 
4637   PetscFunctionBegin;
4638   if (node_count) *node_count = a->inode.node_count;
4639   if (sizes) *sizes = a->inode.size_csr;
4640   if (limit) *limit = a->inode.limit;
4641   PetscFunctionReturn(PETSC_SUCCESS);
4642 }
4643