xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 4d12350b4d1d562f24a95bd06d554a68a71c004f)
1 /*
2   This file provides high performance routines for the Inode format (compressed sparse row)
3   by taking advantage of rows with identical nonzero structure (I-nodes).
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #if defined(PETSC_HAVE_XMMINTRIN_H)
7   #include <xmmintrin.h>
8 #endif
9 
10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11 {
12   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
14 
15   PetscFunctionBegin;
16   n = A->cmap->n;
17   m = A->rmap->n;
18   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19   ns_row = a->inode.size_csr;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24     for (; count + 1 < n; count++, i++);
25     if (count < n) i++;
26     *size = i;
27     PetscFunctionReturn(PETSC_SUCCESS);
28   }
29   PetscCall(PetscMalloc1(n + 1, &ns_col));
30   ns_col[0] = 0;
31 
32   /* Use the same row structure wherever feasible. */
33   for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
34 
35   /* if m < n; pad up the remainder with inode_limit */
36   for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37   /* The last node is the odd ball. pad it up with the remaining rows; */
38   if (count < n) {
39     ns_col[i + 1] = ns_col[i] + (n - count);
40     i++;
41   } else if (count > n) {
42     /* Adjust for the over estimation */
43     ns_col[i] += n - count;
44   }
45   *size = i;
46   *ns   = ns_col;
47   PetscFunctionReturn(PETSC_SUCCESS);
48 }
49 
50 /*
51       This builds symmetric version of nonzero structure,
52 */
53 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54 {
55   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
56   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
57   PetscInt       *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
58   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
59 
60   PetscFunctionBegin;
61   nslim_row = a->inode.node_count;
62   m         = A->rmap->n;
63   n         = A->cmap->n;
64   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
65   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
66 
67   /* Use the row_inode as column_inode */
68   nslim_col = nslim_row;
69   ns_col    = ns_row;
70 
71   /* allocate space for reformatted inode structure */
72   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
73   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
74 
75   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
76     nsz = ns_col[i1 + 1] - ns_col[i1];
77     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
78   }
79   /* allocate space for row pointers */
80   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
81   *iia = ia;
82   PetscCall(PetscMalloc1(nslim_row + 1, &work));
83 
84   /* determine the number of columns in each row */
85   ia[0] = oshift;
86   for (i1 = 0; i1 < nslim_row; i1++) {
87     row  = ns_row[i1];
88     j    = aj + ai[row] + ishift;
89     jmax = aj + ai[row + 1] + ishift;
90     if (j == jmax) continue; /* empty row */
91     col = *j++ + ishift;
92     i2  = tvc[col];
93     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94       ia[i1 + 1]++;
95       ia[i2 + 1]++;
96       i2++; /* Start col of next node */
97       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98       i2 = tvc[col];
99     }
100     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101   }
102 
103   /* shift ia[i] to point to next row */
104   for (i1 = 1; i1 < nslim_row + 1; i1++) {
105     row = ia[i1 - 1];
106     ia[i1] += row;
107     work[i1 - 1] = row - oshift;
108   }
109 
110   /* allocate space for column pointers */
111   nz = ia[nslim_row] + (!ishift);
112   PetscCall(PetscMalloc1(nz, &ja));
113   *jja = ja;
114 
115   /* loop over lower triangular part putting into ja */
116   for (i1 = 0; i1 < nslim_row; i1++) {
117     row  = ns_row[i1];
118     j    = aj + ai[row] + ishift;
119     jmax = aj + ai[row + 1] + ishift;
120     if (j == jmax) continue; /* empty row */
121     col = *j++ + ishift;
122     i2  = tvc[col];
123     while (i2 < i1 && j < jmax) {
124       ja[work[i2]++] = i1 + oshift;
125       ja[work[i1]++] = i2 + oshift;
126       ++i2;
127       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128       i2 = tvc[col];
129     }
130     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131   }
132   PetscCall(PetscFree(work));
133   PetscCall(PetscFree2(tns, tvc));
134   PetscFunctionReturn(PETSC_SUCCESS);
135 }
136 
137 /*
138       This builds nonsymmetric version of nonzero structure,
139 */
140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141 {
142   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
143   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144   PetscInt       *tns, *tvc, nsz, i1, i2;
145   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
146 
147   PetscFunctionBegin;
148   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149   nslim_row = a->inode.node_count;
150   n         = A->cmap->n;
151 
152   /* Create The column_inode for this matrix */
153   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
154 
155   /* allocate space for reformatted column_inode structure */
156   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
158 
159   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160     nsz = ns_col[i1 + 1] - ns_col[i1];
161     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162   }
163   /* allocate space for row pointers */
164   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165   *iia = ia;
166   PetscCall(PetscMalloc1(nslim_row + 1, &work));
167 
168   /* determine the number of columns in each row */
169   ia[0] = oshift;
170   for (i1 = 0; i1 < nslim_row; i1++) {
171     row = ns_row[i1];
172     j   = aj + ai[row] + ishift;
173     nz  = ai[row + 1] - ai[row];
174     if (!nz) continue; /* empty row */
175     col = *j++ + ishift;
176     i2  = tvc[col];
177     while (nz-- > 0) { /* off-diagonal elements */
178       ia[i1 + 1]++;
179       i2++; /* Start col of next node */
180       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
181       if (nz > 0) i2 = tvc[col];
182     }
183   }
184 
185   /* shift ia[i] to point to next row */
186   for (i1 = 1; i1 < nslim_row + 1; i1++) {
187     row = ia[i1 - 1];
188     ia[i1] += row;
189     work[i1 - 1] = row - oshift;
190   }
191 
192   /* allocate space for column pointers */
193   nz = ia[nslim_row] + (!ishift);
194   PetscCall(PetscMalloc1(nz, &ja));
195   *jja = ja;
196 
197   /* loop over matrix putting into ja */
198   for (i1 = 0; i1 < nslim_row; i1++) {
199     row = ns_row[i1];
200     j   = aj + ai[row] + ishift;
201     nz  = ai[row + 1] - ai[row];
202     if (!nz) continue; /* empty row */
203     col = *j++ + ishift;
204     i2  = tvc[col];
205     while (nz-- > 0) {
206       ja[work[i1]++] = i2 + oshift;
207       ++i2;
208       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
209       if (nz > 0) i2 = tvc[col];
210     }
211   }
212   PetscCall(PetscFree(ns_col));
213   PetscCall(PetscFree(work));
214   PetscCall(PetscFree2(tns, tvc));
215   PetscFunctionReturn(PETSC_SUCCESS);
216 }
217 
218 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219 {
220   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
221 
222   PetscFunctionBegin;
223   if (n) *n = a->inode.node_count;
224   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
225   if (!blockcompressed) {
226     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
227   } else if (symmetric) {
228     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
229   } else {
230     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
231   }
232   PetscFunctionReturn(PETSC_SUCCESS);
233 }
234 
235 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236 {
237   PetscFunctionBegin;
238   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
239 
240   if (!blockcompressed) {
241     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
242   } else {
243     PetscCall(PetscFree(*ia));
244     PetscCall(PetscFree(*ja));
245   }
246   PetscFunctionReturn(PETSC_SUCCESS);
247 }
248 
249 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250 {
251   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253   PetscInt   *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
254 
255   PetscFunctionBegin;
256   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
257   nslim_row = a->inode.node_count;
258   n         = A->cmap->n;
259 
260   /* Create The column_inode for this matrix */
261   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
262 
263   /* allocate space for reformatted column_inode structure */
264   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
266 
267   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268     nsz = ns_col[i1 + 1] - ns_col[i1];
269     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
270   }
271   /* allocate space for column pointers */
272   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
273   *iia = ia;
274   PetscCall(PetscMalloc1(nslim_col + 1, &work));
275 
276   /* determine the number of columns in each row */
277   ia[0] = oshift;
278   for (i1 = 0; i1 < nslim_row; i1++) {
279     row = ns_row[i1];
280     j   = aj + ai[row] + ishift;
281     col = *j++ + ishift;
282     i2  = tvc[col];
283     nz  = ai[row + 1] - ai[row];
284     while (nz-- > 0) { /* off-diagonal elements */
285       /* ia[i1+1]++; */
286       ia[i2 + 1]++;
287       i2++;
288       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
289       if (nz > 0) i2 = tvc[col];
290     }
291   }
292 
293   /* shift ia[i] to point to next col */
294   for (i1 = 1; i1 < nslim_col + 1; i1++) {
295     col = ia[i1 - 1];
296     ia[i1] += col;
297     work[i1 - 1] = col - oshift;
298   }
299 
300   /* allocate space for column pointers */
301   nz = ia[nslim_col] + (!ishift);
302   PetscCall(PetscMalloc1(nz, &ja));
303   *jja = ja;
304 
305   /* loop over matrix putting into ja */
306   for (i1 = 0; i1 < nslim_row; i1++) {
307     row = ns_row[i1];
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row + 1] - ai[row];
312     while (nz-- > 0) {
313       /* ja[work[i1]++] = i2 + oshift; */
314       ja[work[i2]++] = i1 + oshift;
315       i2++;
316       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320   PetscCall(PetscFree(ns_col));
321   PetscCall(PetscFree(work));
322   PetscCall(PetscFree2(tns, tvc));
323   PetscFunctionReturn(PETSC_SUCCESS);
324 }
325 
326 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327 {
328   PetscFunctionBegin;
329   PetscCall(MatCreateColInode_Private(A, n, NULL));
330   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
331 
332   if (!blockcompressed) {
333     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
334   } else if (symmetric) {
335     /* Since the indices are symmetric it doesn't matter */
336     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
337   } else {
338     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
339   }
340   PetscFunctionReturn(PETSC_SUCCESS);
341 }
342 
343 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344 {
345   PetscFunctionBegin;
346   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
347   if (!blockcompressed) {
348     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
349   } else {
350     PetscCall(PetscFree(*ia));
351     PetscCall(PetscFree(*ja));
352   }
353   PetscFunctionReturn(PETSC_SUCCESS);
354 }
355 
356 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357 {
358   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
359   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
360   PetscScalar       *y;
361   const PetscScalar *x;
362   const MatScalar   *v1, *v2, *v3, *v4, *v5;
363   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
364   const PetscInt    *idx, *ns, *ii;
365 
366 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
367   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
368 #endif
369 
370   PetscFunctionBegin;
371   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
372   node_max = a->inode.node_count;
373   ns       = a->inode.size_csr; /* Node Size array */
374   PetscCall(VecGetArrayRead(xx, &x));
375   PetscCall(VecGetArray(yy, &y));
376   idx = a->j;
377   v1  = a->a;
378   ii  = a->i;
379 
380   for (i = 0, row = 0; i < node_max; ++i) {
381     nsz = ns[i + 1] - ns[i];
382     n   = ii[1] - ii[0];
383     nonzerorow += (n > 0) * nsz;
384     ii += nsz;
385     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
386     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
387     sz = n;                                                                /* No of non zeros in this row */
388                                                                            /* Switch on the size of Node */
389     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
390     case 1:
391       sum1 = 0.;
392 
393       for (n = 0; n < sz - 1; n += 2) {
394         i1 = idx[0]; /* The instructions are ordered to */
395         i2 = idx[1]; /* make the compiler's job easy */
396         idx += 2;
397         tmp0 = x[i1];
398         tmp1 = x[i2];
399         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
400         v1 += 2;
401       }
402 
403       if (n == sz - 1) { /* Take care of the last nonzero  */
404         tmp0 = x[*idx++];
405         sum1 += *v1++ * tmp0;
406       }
407       y[row++] = sum1;
408       break;
409     case 2:
410       sum1 = 0.;
411       sum2 = 0.;
412       v2   = v1 + n;
413 
414       for (n = 0; n < sz - 1; n += 2) {
415         i1 = idx[0];
416         i2 = idx[1];
417         idx += 2;
418         tmp0 = x[i1];
419         tmp1 = x[i2];
420         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
421         v1 += 2;
422         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
423         v2 += 2;
424       }
425       if (n == sz - 1) {
426         tmp0 = x[*idx++];
427         sum1 += *v1++ * tmp0;
428         sum2 += *v2++ * tmp0;
429       }
430       y[row++] = sum1;
431       y[row++] = sum2;
432       v1       = v2; /* Since the next block to be processed starts there*/
433       idx += sz;
434       break;
435     case 3:
436       sum1 = 0.;
437       sum2 = 0.;
438       sum3 = 0.;
439       v2   = v1 + n;
440       v3   = v2 + n;
441 
442       for (n = 0; n < sz - 1; n += 2) {
443         i1 = idx[0];
444         i2 = idx[1];
445         idx += 2;
446         tmp0 = x[i1];
447         tmp1 = x[i2];
448         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
449         v1 += 2;
450         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
451         v2 += 2;
452         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
453         v3 += 2;
454       }
455       if (n == sz - 1) {
456         tmp0 = x[*idx++];
457         sum1 += *v1++ * tmp0;
458         sum2 += *v2++ * tmp0;
459         sum3 += *v3++ * tmp0;
460       }
461       y[row++] = sum1;
462       y[row++] = sum2;
463       y[row++] = sum3;
464       v1       = v3; /* Since the next block to be processed starts there*/
465       idx += 2 * sz;
466       break;
467     case 4:
468       sum1 = 0.;
469       sum2 = 0.;
470       sum3 = 0.;
471       sum4 = 0.;
472       v2   = v1 + n;
473       v3   = v2 + n;
474       v4   = v3 + n;
475 
476       for (n = 0; n < sz - 1; n += 2) {
477         i1 = idx[0];
478         i2 = idx[1];
479         idx += 2;
480         tmp0 = x[i1];
481         tmp1 = x[i2];
482         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
483         v1 += 2;
484         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
485         v2 += 2;
486         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
487         v3 += 2;
488         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
489         v4 += 2;
490       }
491       if (n == sz - 1) {
492         tmp0 = x[*idx++];
493         sum1 += *v1++ * tmp0;
494         sum2 += *v2++ * tmp0;
495         sum3 += *v3++ * tmp0;
496         sum4 += *v4++ * tmp0;
497       }
498       y[row++] = sum1;
499       y[row++] = sum2;
500       y[row++] = sum3;
501       y[row++] = sum4;
502       v1       = v4; /* Since the next block to be processed starts there*/
503       idx += 3 * sz;
504       break;
505     case 5:
506       sum1 = 0.;
507       sum2 = 0.;
508       sum3 = 0.;
509       sum4 = 0.;
510       sum5 = 0.;
511       v2   = v1 + n;
512       v3   = v2 + n;
513       v4   = v3 + n;
514       v5   = v4 + n;
515 
516       for (n = 0; n < sz - 1; n += 2) {
517         i1 = idx[0];
518         i2 = idx[1];
519         idx += 2;
520         tmp0 = x[i1];
521         tmp1 = x[i2];
522         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
523         v1 += 2;
524         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
525         v2 += 2;
526         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
527         v3 += 2;
528         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
529         v4 += 2;
530         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
531         v5 += 2;
532       }
533       if (n == sz - 1) {
534         tmp0 = x[*idx++];
535         sum1 += *v1++ * tmp0;
536         sum2 += *v2++ * tmp0;
537         sum3 += *v3++ * tmp0;
538         sum4 += *v4++ * tmp0;
539         sum5 += *v5++ * tmp0;
540       }
541       y[row++] = sum1;
542       y[row++] = sum2;
543       y[row++] = sum3;
544       y[row++] = sum4;
545       y[row++] = sum5;
546       v1       = v5; /* Since the next block to be processed starts there */
547       idx += 4 * sz;
548       break;
549     default:
550       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
551     }
552   }
553   PetscCall(VecRestoreArrayRead(xx, &x));
554   PetscCall(VecRestoreArray(yy, &y));
555   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
556   PetscFunctionReturn(PETSC_SUCCESS);
557 }
558 
559 /* Almost same code as the MatMult_SeqAIJ_Inode() */
560 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
561 {
562   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
563   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
564   const MatScalar   *v1, *v2, *v3, *v4, *v5;
565   const PetscScalar *x;
566   PetscScalar       *y, *z, *zt;
567   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
568   const PetscInt    *idx, *ns, *ii;
569 
570   PetscFunctionBegin;
571   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
572   node_max = a->inode.node_count;
573   ns       = a->inode.size_csr; /* Node Size array */
574 
575   PetscCall(VecGetArrayRead(xx, &x));
576   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
577   zt = z;
578 
579   idx = a->j;
580   v1  = a->a;
581   ii  = a->i;
582 
583   for (i = 0; i < node_max; ++i) {
584     row = ns[i];
585     nsz = ns[i + 1] - ns[i];
586     n   = ii[1] - ii[0];
587     ii += nsz;
588     sz = n;        /* No of non zeros in this row */
589                    /* Switch on the size of Node */
590     switch (nsz) { /* Each loop in 'case' is unrolled */
591     case 1:
592       sum1 = *zt++;
593 
594       for (n = 0; n < sz - 1; n += 2) {
595         i1 = idx[0]; /* The instructions are ordered to */
596         i2 = idx[1]; /* make the compiler's job easy */
597         idx += 2;
598         tmp0 = x[i1];
599         tmp1 = x[i2];
600         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
601         v1 += 2;
602       }
603 
604       if (n == sz - 1) { /* Take care of the last nonzero  */
605         tmp0 = x[*idx++];
606         sum1 += *v1++ * tmp0;
607       }
608       y[row++] = sum1;
609       break;
610     case 2:
611       sum1 = *zt++;
612       sum2 = *zt++;
613       v2   = v1 + n;
614 
615       for (n = 0; n < sz - 1; n += 2) {
616         i1 = idx[0];
617         i2 = idx[1];
618         idx += 2;
619         tmp0 = x[i1];
620         tmp1 = x[i2];
621         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
622         v1 += 2;
623         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
624         v2 += 2;
625       }
626       if (n == sz - 1) {
627         tmp0 = x[*idx++];
628         sum1 += *v1++ * tmp0;
629         sum2 += *v2++ * tmp0;
630       }
631       y[row++] = sum1;
632       y[row++] = sum2;
633       v1       = v2; /* Since the next block to be processed starts there*/
634       idx += sz;
635       break;
636     case 3:
637       sum1 = *zt++;
638       sum2 = *zt++;
639       sum3 = *zt++;
640       v2   = v1 + n;
641       v3   = v2 + n;
642 
643       for (n = 0; n < sz - 1; n += 2) {
644         i1 = idx[0];
645         i2 = idx[1];
646         idx += 2;
647         tmp0 = x[i1];
648         tmp1 = x[i2];
649         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
650         v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
652         v2 += 2;
653         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
654         v3 += 2;
655       }
656       if (n == sz - 1) {
657         tmp0 = x[*idx++];
658         sum1 += *v1++ * tmp0;
659         sum2 += *v2++ * tmp0;
660         sum3 += *v3++ * tmp0;
661       }
662       y[row++] = sum1;
663       y[row++] = sum2;
664       y[row++] = sum3;
665       v1       = v3; /* Since the next block to be processed starts there*/
666       idx += 2 * sz;
667       break;
668     case 4:
669       sum1 = *zt++;
670       sum2 = *zt++;
671       sum3 = *zt++;
672       sum4 = *zt++;
673       v2   = v1 + n;
674       v3   = v2 + n;
675       v4   = v3 + n;
676 
677       for (n = 0; n < sz - 1; n += 2) {
678         i1 = idx[0];
679         i2 = idx[1];
680         idx += 2;
681         tmp0 = x[i1];
682         tmp1 = x[i2];
683         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
684         v1 += 2;
685         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
686         v2 += 2;
687         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
688         v3 += 2;
689         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
690         v4 += 2;
691       }
692       if (n == sz - 1) {
693         tmp0 = x[*idx++];
694         sum1 += *v1++ * tmp0;
695         sum2 += *v2++ * tmp0;
696         sum3 += *v3++ * tmp0;
697         sum4 += *v4++ * tmp0;
698       }
699       y[row++] = sum1;
700       y[row++] = sum2;
701       y[row++] = sum3;
702       y[row++] = sum4;
703       v1       = v4; /* Since the next block to be processed starts there*/
704       idx += 3 * sz;
705       break;
706     case 5:
707       sum1 = *zt++;
708       sum2 = *zt++;
709       sum3 = *zt++;
710       sum4 = *zt++;
711       sum5 = *zt++;
712       v2   = v1 + n;
713       v3   = v2 + n;
714       v4   = v3 + n;
715       v5   = v4 + n;
716 
717       for (n = 0; n < sz - 1; n += 2) {
718         i1 = idx[0];
719         i2 = idx[1];
720         idx += 2;
721         tmp0 = x[i1];
722         tmp1 = x[i2];
723         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
724         v1 += 2;
725         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
726         v2 += 2;
727         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
728         v3 += 2;
729         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
730         v4 += 2;
731         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
732         v5 += 2;
733       }
734       if (n == sz - 1) {
735         tmp0 = x[*idx++];
736         sum1 += *v1++ * tmp0;
737         sum2 += *v2++ * tmp0;
738         sum3 += *v3++ * tmp0;
739         sum4 += *v4++ * tmp0;
740         sum5 += *v5++ * tmp0;
741       }
742       y[row++] = sum1;
743       y[row++] = sum2;
744       y[row++] = sum3;
745       y[row++] = sum4;
746       y[row++] = sum5;
747       v1       = v5; /* Since the next block to be processed starts there */
748       idx += 4 * sz;
749       break;
750     default:
751       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
752     }
753   }
754   PetscCall(VecRestoreArrayRead(xx, &x));
755   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
756   PetscCall(PetscLogFlops(2.0 * a->nz));
757   PetscFunctionReturn(PETSC_SUCCESS);
758 }
759 
760 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
761 {
762   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
763   IS                 iscol = a->col, isrow = a->row;
764   const PetscInt    *r, *c, *rout, *cout;
765   PetscInt           i, j, n = A->rmap->n, nz;
766   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
767   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
768   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
769   PetscScalar        sum1, sum2, sum3, sum4, sum5;
770   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
771   const PetscScalar *b;
772 
773   PetscFunctionBegin;
774   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
775   node_max = a->inode.node_count;
776   ns       = a->inode.size_csr; /* Node Size array */
777 
778   PetscCall(VecGetArrayRead(bb, &b));
779   PetscCall(VecGetArrayWrite(xx, &x));
780   tmp = a->solve_work;
781 
782   PetscCall(ISGetIndices(isrow, &rout));
783   r = rout;
784   PetscCall(ISGetIndices(iscol, &cout));
785   c = cout + (n - 1);
786 
787   /* forward solve the lower triangular */
788   tmps = tmp;
789   aa   = a_a;
790   aj   = a_j;
791   ad   = a->diag;
792 
793   for (i = 0, row = 0; i < node_max; ++i) {
794     row = ns[i];
795     nsz = ns[i + 1] - ns[i];
796     aii = ai[row];
797     v1  = aa + aii;
798     vi  = aj + aii;
799     nz  = ad[row] - aii;
800     if (i < node_max - 1) {
801       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
802       * but our indexing to determine its size could. */
803       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
804       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
805       PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
806       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
807     }
808 
809     switch (nsz) { /* Each loop in 'case' is unrolled */
810     case 1:
811       sum1 = b[*r++];
812       for (j = 0; j < nz - 1; j += 2) {
813         i0 = vi[0];
814         i1 = vi[1];
815         vi += 2;
816         tmp0 = tmps[i0];
817         tmp1 = tmps[i1];
818         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
819         v1 += 2;
820       }
821       if (j == nz - 1) {
822         tmp0 = tmps[*vi++];
823         sum1 -= *v1++ * tmp0;
824       }
825       tmp[row++] = sum1;
826       break;
827     case 2:
828       sum1 = b[*r++];
829       sum2 = b[*r++];
830       v2   = aa + ai[row + 1];
831 
832       for (j = 0; j < nz - 1; j += 2) {
833         i0 = vi[0];
834         i1 = vi[1];
835         vi += 2;
836         tmp0 = tmps[i0];
837         tmp1 = tmps[i1];
838         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
839         v1 += 2;
840         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
841         v2 += 2;
842       }
843       if (j == nz - 1) {
844         tmp0 = tmps[*vi++];
845         sum1 -= *v1++ * tmp0;
846         sum2 -= *v2++ * tmp0;
847       }
848       sum2 -= *v2++ * sum1;
849       tmp[row++] = sum1;
850       tmp[row++] = sum2;
851       break;
852     case 3:
853       sum1 = b[*r++];
854       sum2 = b[*r++];
855       sum3 = b[*r++];
856       v2   = aa + ai[row + 1];
857       v3   = aa + ai[row + 2];
858 
859       for (j = 0; j < nz - 1; j += 2) {
860         i0 = vi[0];
861         i1 = vi[1];
862         vi += 2;
863         tmp0 = tmps[i0];
864         tmp1 = tmps[i1];
865         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
866         v1 += 2;
867         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
868         v2 += 2;
869         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
870         v3 += 2;
871       }
872       if (j == nz - 1) {
873         tmp0 = tmps[*vi++];
874         sum1 -= *v1++ * tmp0;
875         sum2 -= *v2++ * tmp0;
876         sum3 -= *v3++ * tmp0;
877       }
878       sum2 -= *v2++ * sum1;
879       sum3 -= *v3++ * sum1;
880       sum3 -= *v3++ * sum2;
881 
882       tmp[row++] = sum1;
883       tmp[row++] = sum2;
884       tmp[row++] = sum3;
885       break;
886 
887     case 4:
888       sum1 = b[*r++];
889       sum2 = b[*r++];
890       sum3 = b[*r++];
891       sum4 = b[*r++];
892       v2   = aa + ai[row + 1];
893       v3   = aa + ai[row + 2];
894       v4   = aa + ai[row + 3];
895 
896       for (j = 0; j < nz - 1; j += 2) {
897         i0 = vi[0];
898         i1 = vi[1];
899         vi += 2;
900         tmp0 = tmps[i0];
901         tmp1 = tmps[i1];
902         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
903         v1 += 2;
904         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
905         v2 += 2;
906         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
907         v3 += 2;
908         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
909         v4 += 2;
910       }
911       if (j == nz - 1) {
912         tmp0 = tmps[*vi++];
913         sum1 -= *v1++ * tmp0;
914         sum2 -= *v2++ * tmp0;
915         sum3 -= *v3++ * tmp0;
916         sum4 -= *v4++ * tmp0;
917       }
918       sum2 -= *v2++ * sum1;
919       sum3 -= *v3++ * sum1;
920       sum4 -= *v4++ * sum1;
921       sum3 -= *v3++ * sum2;
922       sum4 -= *v4++ * sum2;
923       sum4 -= *v4++ * sum3;
924 
925       tmp[row++] = sum1;
926       tmp[row++] = sum2;
927       tmp[row++] = sum3;
928       tmp[row++] = sum4;
929       break;
930     case 5:
931       sum1 = b[*r++];
932       sum2 = b[*r++];
933       sum3 = b[*r++];
934       sum4 = b[*r++];
935       sum5 = b[*r++];
936       v2   = aa + ai[row + 1];
937       v3   = aa + ai[row + 2];
938       v4   = aa + ai[row + 3];
939       v5   = aa + ai[row + 4];
940 
941       for (j = 0; j < nz - 1; j += 2) {
942         i0 = vi[0];
943         i1 = vi[1];
944         vi += 2;
945         tmp0 = tmps[i0];
946         tmp1 = tmps[i1];
947         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
948         v1 += 2;
949         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
950         v2 += 2;
951         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
952         v3 += 2;
953         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
954         v4 += 2;
955         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
956         v5 += 2;
957       }
958       if (j == nz - 1) {
959         tmp0 = tmps[*vi++];
960         sum1 -= *v1++ * tmp0;
961         sum2 -= *v2++ * tmp0;
962         sum3 -= *v3++ * tmp0;
963         sum4 -= *v4++ * tmp0;
964         sum5 -= *v5++ * tmp0;
965       }
966 
967       sum2 -= *v2++ * sum1;
968       sum3 -= *v3++ * sum1;
969       sum4 -= *v4++ * sum1;
970       sum5 -= *v5++ * sum1;
971       sum3 -= *v3++ * sum2;
972       sum4 -= *v4++ * sum2;
973       sum5 -= *v5++ * sum2;
974       sum4 -= *v4++ * sum3;
975       sum5 -= *v5++ * sum3;
976       sum5 -= *v5++ * sum4;
977 
978       tmp[row++] = sum1;
979       tmp[row++] = sum2;
980       tmp[row++] = sum3;
981       tmp[row++] = sum4;
982       tmp[row++] = sum5;
983       break;
984     default:
985       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
986     }
987   }
988   /* backward solve the upper triangular */
989   for (i = node_max - 1; i >= 0; i--) {
990     row = ns[i + 1];
991     nsz = ns[i + 1] - ns[i];
992     aii = ai[row + 1] - 1;
993     v1  = aa + aii;
994     vi  = aj + aii;
995     nz  = aii - ad[row];
996     switch (nsz) { /* Each loop in 'case' is unrolled */
997     case 1:
998       sum1 = tmp[row];
999 
1000       for (j = nz; j > 1; j -= 2) {
1001         vi -= 2;
1002         i0   = vi[2];
1003         i1   = vi[1];
1004         tmp0 = tmps[i0];
1005         tmp1 = tmps[i1];
1006         v1 -= 2;
1007         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008       }
1009       if (j == 1) {
1010         tmp0 = tmps[*vi--];
1011         sum1 -= *v1-- * tmp0;
1012       }
1013       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1014       row--;
1015       break;
1016     case 2:
1017       sum1 = tmp[row];
1018       sum2 = tmp[row - 1];
1019       v2   = aa + ai[row] - 1;
1020       for (j = nz; j > 1; j -= 2) {
1021         vi -= 2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1 -= 2;
1027         v2 -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j == 1) {
1032         tmp0 = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1038       row--;
1039       sum2 -= *v2-- * tmp0;
1040       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1041       row--;
1042       break;
1043     case 3:
1044       sum1 = tmp[row];
1045       sum2 = tmp[row - 1];
1046       sum3 = tmp[row - 2];
1047       v2   = aa + ai[row] - 1;
1048       v3   = aa + ai[row - 1] - 1;
1049       for (j = nz; j > 1; j -= 2) {
1050         vi -= 2;
1051         i0   = vi[2];
1052         i1   = vi[1];
1053         tmp0 = tmps[i0];
1054         tmp1 = tmps[i1];
1055         v1 -= 2;
1056         v2 -= 2;
1057         v3 -= 2;
1058         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1059         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1060         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1061       }
1062       if (j == 1) {
1063         tmp0 = tmps[*vi--];
1064         sum1 -= *v1-- * tmp0;
1065         sum2 -= *v2-- * tmp0;
1066         sum3 -= *v3-- * tmp0;
1067       }
1068       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1069       row--;
1070       sum2 -= *v2-- * tmp0;
1071       sum3 -= *v3-- * tmp0;
1072       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1073       row--;
1074       sum3 -= *v3-- * tmp0;
1075       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1076       row--;
1077 
1078       break;
1079     case 4:
1080       sum1 = tmp[row];
1081       sum2 = tmp[row - 1];
1082       sum3 = tmp[row - 2];
1083       sum4 = tmp[row - 3];
1084       v2   = aa + ai[row] - 1;
1085       v3   = aa + ai[row - 1] - 1;
1086       v4   = aa + ai[row - 2] - 1;
1087 
1088       for (j = nz; j > 1; j -= 2) {
1089         vi -= 2;
1090         i0   = vi[2];
1091         i1   = vi[1];
1092         tmp0 = tmps[i0];
1093         tmp1 = tmps[i1];
1094         v1 -= 2;
1095         v2 -= 2;
1096         v3 -= 2;
1097         v4 -= 2;
1098         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1099         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1100         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1101         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1102       }
1103       if (j == 1) {
1104         tmp0 = tmps[*vi--];
1105         sum1 -= *v1-- * tmp0;
1106         sum2 -= *v2-- * tmp0;
1107         sum3 -= *v3-- * tmp0;
1108         sum4 -= *v4-- * tmp0;
1109       }
1110 
1111       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1112       row--;
1113       sum2 -= *v2-- * tmp0;
1114       sum3 -= *v3-- * tmp0;
1115       sum4 -= *v4-- * tmp0;
1116       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1117       row--;
1118       sum3 -= *v3-- * tmp0;
1119       sum4 -= *v4-- * tmp0;
1120       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1121       row--;
1122       sum4 -= *v4-- * tmp0;
1123       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1124       row--;
1125       break;
1126     case 5:
1127       sum1 = tmp[row];
1128       sum2 = tmp[row - 1];
1129       sum3 = tmp[row - 2];
1130       sum4 = tmp[row - 3];
1131       sum5 = tmp[row - 4];
1132       v2   = aa + ai[row] - 1;
1133       v3   = aa + ai[row - 1] - 1;
1134       v4   = aa + ai[row - 2] - 1;
1135       v5   = aa + ai[row - 3] - 1;
1136       for (j = nz; j > 1; j -= 2) {
1137         vi -= 2;
1138         i0   = vi[2];
1139         i1   = vi[1];
1140         tmp0 = tmps[i0];
1141         tmp1 = tmps[i1];
1142         v1 -= 2;
1143         v2 -= 2;
1144         v3 -= 2;
1145         v4 -= 2;
1146         v5 -= 2;
1147         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1148         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1149         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1150         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1151         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1152       }
1153       if (j == 1) {
1154         tmp0 = tmps[*vi--];
1155         sum1 -= *v1-- * tmp0;
1156         sum2 -= *v2-- * tmp0;
1157         sum3 -= *v3-- * tmp0;
1158         sum4 -= *v4-- * tmp0;
1159         sum5 -= *v5-- * tmp0;
1160       }
1161 
1162       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1163       row--;
1164       sum2 -= *v2-- * tmp0;
1165       sum3 -= *v3-- * tmp0;
1166       sum4 -= *v4-- * tmp0;
1167       sum5 -= *v5-- * tmp0;
1168       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1169       row--;
1170       sum3 -= *v3-- * tmp0;
1171       sum4 -= *v4-- * tmp0;
1172       sum5 -= *v5-- * tmp0;
1173       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1174       row--;
1175       sum4 -= *v4-- * tmp0;
1176       sum5 -= *v5-- * tmp0;
1177       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1178       row--;
1179       sum5 -= *v5-- * tmp0;
1180       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1181       row--;
1182       break;
1183     default:
1184       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1185     }
1186   }
1187   PetscCall(ISRestoreIndices(isrow, &rout));
1188   PetscCall(ISRestoreIndices(iscol, &cout));
1189   PetscCall(VecRestoreArrayRead(bb, &b));
1190   PetscCall(VecRestoreArrayWrite(xx, &x));
1191   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1192   PetscFunctionReturn(PETSC_SUCCESS);
1193 }
1194 
1195 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1196 {
1197   Mat              C = B;
1198   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1199   IS               isrow = b->row, isicol = b->icol;
1200   const PetscInt  *r, *ic, *ics;
1201   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1202   PetscInt         i, j, k, nz, nzL, row, *pj;
1203   const PetscInt  *ajtmp, *bjtmp;
1204   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1205   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1206   FactorShiftCtx   sctx;
1207   const PetscInt  *ddiag;
1208   PetscReal        rs;
1209   MatScalar        d;
1210   PetscInt         inod, nodesz, node_max, col;
1211   const PetscInt  *ns;
1212   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1213 
1214   PetscFunctionBegin;
1215   /* MatPivotSetUp(): initialize shift context sctx */
1216   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1217 
1218   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1219     ddiag          = a->diag;
1220     sctx.shift_top = info->zeropivot;
1221     for (i = 0; i < n; i++) {
1222       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1223       d  = (aa)[ddiag[i]];
1224       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1225       v  = aa + ai[i];
1226       nz = ai[i + 1] - ai[i];
1227       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1228       if (rs > sctx.shift_top) sctx.shift_top = rs;
1229     }
1230     sctx.shift_top *= 1.1;
1231     sctx.nshift_max = 5;
1232     sctx.shift_lo   = 0.;
1233     sctx.shift_hi   = 1.;
1234   }
1235 
1236   PetscCall(ISGetIndices(isrow, &r));
1237   PetscCall(ISGetIndices(isicol, &ic));
1238 
1239   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1240   ics = ic;
1241 
1242   node_max = a->inode.node_count;
1243   ns       = a->inode.size_csr;
1244   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1245 
1246   /* If max inode size > 4, split it into two inodes.*/
1247   /* also map the inode sizes according to the ordering */
1248   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1249   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1250     nodesz = ns[i + 1] - ns[i];
1251     if (nodesz > 4) {
1252       tmp_vec1[j] = 4;
1253       ++j;
1254       tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
1255     } else {
1256       tmp_vec1[j] = nodesz;
1257     }
1258   }
1259   /* Use the correct node_max */
1260   node_max = j;
1261 
1262   /* Now reorder the inode info based on mat re-ordering info */
1263   /* First create a row -> inode_size_array_index map */
1264   PetscCall(PetscMalloc1(n + 1, &nsmap));
1265   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1266   tmp_vec2[0] = 0;
1267   for (i = 0, row = 0; i < node_max; i++) {
1268     nodesz = tmp_vec1[i];
1269     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1270   }
1271   /* Using nsmap, create a reordered ns structure */
1272   for (i = 0, j = 0; i < node_max; i++) {
1273     nodesz          = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1274     tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
1275     j += nodesz;
1276   }
1277   PetscCall(PetscFree(nsmap));
1278   PetscCall(PetscFree(tmp_vec1));
1279 
1280   /* Now use the correct ns */
1281   ns = tmp_vec2;
1282 
1283   do {
1284     sctx.newshift = PETSC_FALSE;
1285     /* Now loop over each block-row, and do the factorization */
1286     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1287       nodesz = ns[inod + 1] - ns[inod];
1288 
1289       switch (nodesz) {
1290       case 1:
1291         /* zero rtmp1 */
1292         /* L part */
1293         nz    = bi[i + 1] - bi[i];
1294         bjtmp = bj + bi[i];
1295         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1296 
1297         /* U part */
1298         nz    = bdiag[i] - bdiag[i + 1];
1299         bjtmp = bj + bdiag[i + 1] + 1;
1300         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1301 
1302         /* load in initial (unfactored row) */
1303         nz    = ai[r[i] + 1] - ai[r[i]];
1304         ajtmp = aj + ai[r[i]];
1305         v     = aa + ai[r[i]];
1306         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1307 
1308         /* ZeropivotApply() */
1309         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1310 
1311         /* elimination */
1312         bjtmp = bj + bi[i];
1313         row   = *bjtmp++;
1314         nzL   = bi[i + 1] - bi[i];
1315         for (k = 0; k < nzL; k++) {
1316           pc = rtmp1 + row;
1317           if (*pc != 0.0) {
1318             pv   = b->a + bdiag[row];
1319             mul1 = *pc * (*pv);
1320             *pc  = mul1;
1321             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1322             pv   = b->a + bdiag[row + 1] + 1;
1323             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1324             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1325             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1326           }
1327           row = *bjtmp++;
1328         }
1329 
1330         /* finished row so stick it into b->a */
1331         rs = 0.0;
1332         /* L part */
1333         pv = b->a + bi[i];
1334         pj = b->j + bi[i];
1335         nz = bi[i + 1] - bi[i];
1336         for (j = 0; j < nz; j++) {
1337           pv[j] = rtmp1[pj[j]];
1338           rs += PetscAbsScalar(pv[j]);
1339         }
1340 
1341         /* U part */
1342         pv = b->a + bdiag[i + 1] + 1;
1343         pj = b->j + bdiag[i + 1] + 1;
1344         nz = bdiag[i] - bdiag[i + 1] - 1;
1345         for (j = 0; j < nz; j++) {
1346           pv[j] = rtmp1[pj[j]];
1347           rs += PetscAbsScalar(pv[j]);
1348         }
1349 
1350         /* Check zero pivot */
1351         sctx.rs = rs;
1352         sctx.pv = rtmp1[i];
1353         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1354         if (sctx.newshift) break;
1355 
1356         /* Mark diagonal and invert diagonal for simpler triangular solves */
1357         pv  = b->a + bdiag[i];
1358         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1359         break;
1360 
1361       case 2:
1362         /* zero rtmp1 and rtmp2 */
1363         /* L part */
1364         nz    = bi[i + 1] - bi[i];
1365         bjtmp = bj + bi[i];
1366         for (j = 0; j < nz; j++) {
1367           col        = bjtmp[j];
1368           rtmp1[col] = 0.0;
1369           rtmp2[col] = 0.0;
1370         }
1371 
1372         /* U part */
1373         nz    = bdiag[i] - bdiag[i + 1];
1374         bjtmp = bj + bdiag[i + 1] + 1;
1375         for (j = 0; j < nz; j++) {
1376           col        = bjtmp[j];
1377           rtmp1[col] = 0.0;
1378           rtmp2[col] = 0.0;
1379         }
1380 
1381         /* load in initial (unfactored row) */
1382         nz    = ai[r[i] + 1] - ai[r[i]];
1383         ajtmp = aj + ai[r[i]];
1384         v1    = aa + ai[r[i]];
1385         v2    = aa + ai[r[i] + 1];
1386         for (j = 0; j < nz; j++) {
1387           col        = ics[ajtmp[j]];
1388           rtmp1[col] = v1[j];
1389           rtmp2[col] = v2[j];
1390         }
1391         /* ZeropivotApply(): shift the diagonal of the matrix  */
1392         rtmp1[i] += sctx.shift_amount;
1393         rtmp2[i + 1] += sctx.shift_amount;
1394 
1395         /* elimination */
1396         bjtmp = bj + bi[i];
1397         row   = *bjtmp++; /* pivot row */
1398         nzL   = bi[i + 1] - bi[i];
1399         for (k = 0; k < nzL; k++) {
1400           pc1 = rtmp1 + row;
1401           pc2 = rtmp2 + row;
1402           if (*pc1 != 0.0 || *pc2 != 0.0) {
1403             pv   = b->a + bdiag[row];
1404             mul1 = *pc1 * (*pv);
1405             mul2 = *pc2 * (*pv);
1406             *pc1 = mul1;
1407             *pc2 = mul2;
1408 
1409             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1410             pv = b->a + bdiag[row + 1] + 1;
1411             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1412             for (j = 0; j < nz; j++) {
1413               col = pj[j];
1414               rtmp1[col] -= mul1 * pv[j];
1415               rtmp2[col] -= mul2 * pv[j];
1416             }
1417             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1418           }
1419           row = *bjtmp++;
1420         }
1421 
1422         /* finished row i; check zero pivot, then stick row i into b->a */
1423         rs = 0.0;
1424         /* L part */
1425         pc1 = b->a + bi[i];
1426         pj  = b->j + bi[i];
1427         nz  = bi[i + 1] - bi[i];
1428         for (j = 0; j < nz; j++) {
1429           col    = pj[j];
1430           pc1[j] = rtmp1[col];
1431           rs += PetscAbsScalar(pc1[j]);
1432         }
1433         /* U part */
1434         pc1 = b->a + bdiag[i + 1] + 1;
1435         pj  = b->j + bdiag[i + 1] + 1;
1436         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1437         for (j = 0; j < nz; j++) {
1438           col    = pj[j];
1439           pc1[j] = rtmp1[col];
1440           rs += PetscAbsScalar(pc1[j]);
1441         }
1442 
1443         sctx.rs = rs;
1444         sctx.pv = rtmp1[i];
1445         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1446         if (sctx.newshift) break;
1447         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1448         *pc1 = 1.0 / sctx.pv;
1449 
1450         /* Now take care of diagonal 2x2 block. */
1451         pc2 = rtmp2 + i;
1452         if (*pc2 != 0.0) {
1453           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1454           *pc2 = mul1;                        /* insert L entry */
1455           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1456           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1457           for (j = 0; j < nz; j++) {
1458             col = pj[j];
1459             rtmp2[col] -= mul1 * rtmp1[col];
1460           }
1461           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1462         }
1463 
1464         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1465         rs = 0.0;
1466         /* L part */
1467         pc2 = b->a + bi[i + 1];
1468         pj  = b->j + bi[i + 1];
1469         nz  = bi[i + 2] - bi[i + 1];
1470         for (j = 0; j < nz; j++) {
1471           col    = pj[j];
1472           pc2[j] = rtmp2[col];
1473           rs += PetscAbsScalar(pc2[j]);
1474         }
1475         /* U part */
1476         pc2 = b->a + bdiag[i + 2] + 1;
1477         pj  = b->j + bdiag[i + 2] + 1;
1478         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1479         for (j = 0; j < nz; j++) {
1480           col    = pj[j];
1481           pc2[j] = rtmp2[col];
1482           rs += PetscAbsScalar(pc2[j]);
1483         }
1484 
1485         sctx.rs = rs;
1486         sctx.pv = rtmp2[i + 1];
1487         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1488         if (sctx.newshift) break;
1489         pc2  = b->a + bdiag[i + 1];
1490         *pc2 = 1.0 / sctx.pv;
1491         break;
1492 
1493       case 3:
1494         /* zero rtmp */
1495         /* L part */
1496         nz    = bi[i + 1] - bi[i];
1497         bjtmp = bj + bi[i];
1498         for (j = 0; j < nz; j++) {
1499           col        = bjtmp[j];
1500           rtmp1[col] = 0.0;
1501           rtmp2[col] = 0.0;
1502           rtmp3[col] = 0.0;
1503         }
1504 
1505         /* U part */
1506         nz    = bdiag[i] - bdiag[i + 1];
1507         bjtmp = bj + bdiag[i + 1] + 1;
1508         for (j = 0; j < nz; j++) {
1509           col        = bjtmp[j];
1510           rtmp1[col] = 0.0;
1511           rtmp2[col] = 0.0;
1512           rtmp3[col] = 0.0;
1513         }
1514 
1515         /* load in initial (unfactored row) */
1516         nz    = ai[r[i] + 1] - ai[r[i]];
1517         ajtmp = aj + ai[r[i]];
1518         v1    = aa + ai[r[i]];
1519         v2    = aa + ai[r[i] + 1];
1520         v3    = aa + ai[r[i] + 2];
1521         for (j = 0; j < nz; j++) {
1522           col        = ics[ajtmp[j]];
1523           rtmp1[col] = v1[j];
1524           rtmp2[col] = v2[j];
1525           rtmp3[col] = v3[j];
1526         }
1527         /* ZeropivotApply(): shift the diagonal of the matrix  */
1528         rtmp1[i] += sctx.shift_amount;
1529         rtmp2[i + 1] += sctx.shift_amount;
1530         rtmp3[i + 2] += sctx.shift_amount;
1531 
1532         /* elimination */
1533         bjtmp = bj + bi[i];
1534         row   = *bjtmp++; /* pivot row */
1535         nzL   = bi[i + 1] - bi[i];
1536         for (k = 0; k < nzL; k++) {
1537           pc1 = rtmp1 + row;
1538           pc2 = rtmp2 + row;
1539           pc3 = rtmp3 + row;
1540           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1541             pv   = b->a + bdiag[row];
1542             mul1 = *pc1 * (*pv);
1543             mul2 = *pc2 * (*pv);
1544             mul3 = *pc3 * (*pv);
1545             *pc1 = mul1;
1546             *pc2 = mul2;
1547             *pc3 = mul3;
1548 
1549             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1550             pv = b->a + bdiag[row + 1] + 1;
1551             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1552             for (j = 0; j < nz; j++) {
1553               col = pj[j];
1554               rtmp1[col] -= mul1 * pv[j];
1555               rtmp2[col] -= mul2 * pv[j];
1556               rtmp3[col] -= mul3 * pv[j];
1557             }
1558             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1559           }
1560           row = *bjtmp++;
1561         }
1562 
1563         /* finished row i; check zero pivot, then stick row i into b->a */
1564         rs = 0.0;
1565         /* L part */
1566         pc1 = b->a + bi[i];
1567         pj  = b->j + bi[i];
1568         nz  = bi[i + 1] - bi[i];
1569         for (j = 0; j < nz; j++) {
1570           col    = pj[j];
1571           pc1[j] = rtmp1[col];
1572           rs += PetscAbsScalar(pc1[j]);
1573         }
1574         /* U part */
1575         pc1 = b->a + bdiag[i + 1] + 1;
1576         pj  = b->j + bdiag[i + 1] + 1;
1577         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1578         for (j = 0; j < nz; j++) {
1579           col    = pj[j];
1580           pc1[j] = rtmp1[col];
1581           rs += PetscAbsScalar(pc1[j]);
1582         }
1583 
1584         sctx.rs = rs;
1585         sctx.pv = rtmp1[i];
1586         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1587         if (sctx.newshift) break;
1588         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1589         *pc1 = 1.0 / sctx.pv;
1590 
1591         /* Now take care of 1st column of diagonal 3x3 block. */
1592         pc2 = rtmp2 + i;
1593         pc3 = rtmp3 + i;
1594         if (*pc2 != 0.0 || *pc3 != 0.0) {
1595           mul2 = (*pc2) * (*pc1);
1596           *pc2 = mul2;
1597           mul3 = (*pc3) * (*pc1);
1598           *pc3 = mul3;
1599           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1600           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1601           for (j = 0; j < nz; j++) {
1602             col = pj[j];
1603             rtmp2[col] -= mul2 * rtmp1[col];
1604             rtmp3[col] -= mul3 * rtmp1[col];
1605           }
1606           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1607         }
1608 
1609         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1610         rs = 0.0;
1611         /* L part */
1612         pc2 = b->a + bi[i + 1];
1613         pj  = b->j + bi[i + 1];
1614         nz  = bi[i + 2] - bi[i + 1];
1615         for (j = 0; j < nz; j++) {
1616           col    = pj[j];
1617           pc2[j] = rtmp2[col];
1618           rs += PetscAbsScalar(pc2[j]);
1619         }
1620         /* U part */
1621         pc2 = b->a + bdiag[i + 2] + 1;
1622         pj  = b->j + bdiag[i + 2] + 1;
1623         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1624         for (j = 0; j < nz; j++) {
1625           col    = pj[j];
1626           pc2[j] = rtmp2[col];
1627           rs += PetscAbsScalar(pc2[j]);
1628         }
1629 
1630         sctx.rs = rs;
1631         sctx.pv = rtmp2[i + 1];
1632         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1633         if (sctx.newshift) break;
1634         pc2  = b->a + bdiag[i + 1];
1635         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1636 
1637         /* Now take care of 2nd column of diagonal 3x3 block. */
1638         pc3 = rtmp3 + i + 1;
1639         if (*pc3 != 0.0) {
1640           mul3 = (*pc3) * (*pc2);
1641           *pc3 = mul3;
1642           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1643           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1644           for (j = 0; j < nz; j++) {
1645             col = pj[j];
1646             rtmp3[col] -= mul3 * rtmp2[col];
1647           }
1648           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1649         }
1650 
1651         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1652         rs = 0.0;
1653         /* L part */
1654         pc3 = b->a + bi[i + 2];
1655         pj  = b->j + bi[i + 2];
1656         nz  = bi[i + 3] - bi[i + 2];
1657         for (j = 0; j < nz; j++) {
1658           col    = pj[j];
1659           pc3[j] = rtmp3[col];
1660           rs += PetscAbsScalar(pc3[j]);
1661         }
1662         /* U part */
1663         pc3 = b->a + bdiag[i + 3] + 1;
1664         pj  = b->j + bdiag[i + 3] + 1;
1665         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1666         for (j = 0; j < nz; j++) {
1667           col    = pj[j];
1668           pc3[j] = rtmp3[col];
1669           rs += PetscAbsScalar(pc3[j]);
1670         }
1671 
1672         sctx.rs = rs;
1673         sctx.pv = rtmp3[i + 2];
1674         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1675         if (sctx.newshift) break;
1676         pc3  = b->a + bdiag[i + 2];
1677         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1678         break;
1679       case 4:
1680         /* zero rtmp */
1681         /* L part */
1682         nz    = bi[i + 1] - bi[i];
1683         bjtmp = bj + bi[i];
1684         for (j = 0; j < nz; j++) {
1685           col        = bjtmp[j];
1686           rtmp1[col] = 0.0;
1687           rtmp2[col] = 0.0;
1688           rtmp3[col] = 0.0;
1689           rtmp4[col] = 0.0;
1690         }
1691 
1692         /* U part */
1693         nz    = bdiag[i] - bdiag[i + 1];
1694         bjtmp = bj + bdiag[i + 1] + 1;
1695         for (j = 0; j < nz; j++) {
1696           col        = bjtmp[j];
1697           rtmp1[col] = 0.0;
1698           rtmp2[col] = 0.0;
1699           rtmp3[col] = 0.0;
1700           rtmp4[col] = 0.0;
1701         }
1702 
1703         /* load in initial (unfactored row) */
1704         nz    = ai[r[i] + 1] - ai[r[i]];
1705         ajtmp = aj + ai[r[i]];
1706         v1    = aa + ai[r[i]];
1707         v2    = aa + ai[r[i] + 1];
1708         v3    = aa + ai[r[i] + 2];
1709         v4    = aa + ai[r[i] + 3];
1710         for (j = 0; j < nz; j++) {
1711           col        = ics[ajtmp[j]];
1712           rtmp1[col] = v1[j];
1713           rtmp2[col] = v2[j];
1714           rtmp3[col] = v3[j];
1715           rtmp4[col] = v4[j];
1716         }
1717         /* ZeropivotApply(): shift the diagonal of the matrix  */
1718         rtmp1[i] += sctx.shift_amount;
1719         rtmp2[i + 1] += sctx.shift_amount;
1720         rtmp3[i + 2] += sctx.shift_amount;
1721         rtmp4[i + 3] += sctx.shift_amount;
1722 
1723         /* elimination */
1724         bjtmp = bj + bi[i];
1725         row   = *bjtmp++; /* pivot row */
1726         nzL   = bi[i + 1] - bi[i];
1727         for (k = 0; k < nzL; k++) {
1728           pc1 = rtmp1 + row;
1729           pc2 = rtmp2 + row;
1730           pc3 = rtmp3 + row;
1731           pc4 = rtmp4 + row;
1732           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1733             pv   = b->a + bdiag[row];
1734             mul1 = *pc1 * (*pv);
1735             mul2 = *pc2 * (*pv);
1736             mul3 = *pc3 * (*pv);
1737             mul4 = *pc4 * (*pv);
1738             *pc1 = mul1;
1739             *pc2 = mul2;
1740             *pc3 = mul3;
1741             *pc4 = mul4;
1742 
1743             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1744             pv = b->a + bdiag[row + 1] + 1;
1745             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1746             for (j = 0; j < nz; j++) {
1747               col = pj[j];
1748               rtmp1[col] -= mul1 * pv[j];
1749               rtmp2[col] -= mul2 * pv[j];
1750               rtmp3[col] -= mul3 * pv[j];
1751               rtmp4[col] -= mul4 * pv[j];
1752             }
1753             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1754           }
1755           row = *bjtmp++;
1756         }
1757 
1758         /* finished row i; check zero pivot, then stick row i into b->a */
1759         rs = 0.0;
1760         /* L part */
1761         pc1 = b->a + bi[i];
1762         pj  = b->j + bi[i];
1763         nz  = bi[i + 1] - bi[i];
1764         for (j = 0; j < nz; j++) {
1765           col    = pj[j];
1766           pc1[j] = rtmp1[col];
1767           rs += PetscAbsScalar(pc1[j]);
1768         }
1769         /* U part */
1770         pc1 = b->a + bdiag[i + 1] + 1;
1771         pj  = b->j + bdiag[i + 1] + 1;
1772         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1773         for (j = 0; j < nz; j++) {
1774           col    = pj[j];
1775           pc1[j] = rtmp1[col];
1776           rs += PetscAbsScalar(pc1[j]);
1777         }
1778 
1779         sctx.rs = rs;
1780         sctx.pv = rtmp1[i];
1781         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1782         if (sctx.newshift) break;
1783         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1784         *pc1 = 1.0 / sctx.pv;
1785 
1786         /* Now take care of 1st column of diagonal 4x4 block. */
1787         pc2 = rtmp2 + i;
1788         pc3 = rtmp3 + i;
1789         pc4 = rtmp4 + i;
1790         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1791           mul2 = (*pc2) * (*pc1);
1792           *pc2 = mul2;
1793           mul3 = (*pc3) * (*pc1);
1794           *pc3 = mul3;
1795           mul4 = (*pc4) * (*pc1);
1796           *pc4 = mul4;
1797           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1798           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1799           for (j = 0; j < nz; j++) {
1800             col = pj[j];
1801             rtmp2[col] -= mul2 * rtmp1[col];
1802             rtmp3[col] -= mul3 * rtmp1[col];
1803             rtmp4[col] -= mul4 * rtmp1[col];
1804           }
1805           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1806         }
1807 
1808         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1809         rs = 0.0;
1810         /* L part */
1811         pc2 = b->a + bi[i + 1];
1812         pj  = b->j + bi[i + 1];
1813         nz  = bi[i + 2] - bi[i + 1];
1814         for (j = 0; j < nz; j++) {
1815           col    = pj[j];
1816           pc2[j] = rtmp2[col];
1817           rs += PetscAbsScalar(pc2[j]);
1818         }
1819         /* U part */
1820         pc2 = b->a + bdiag[i + 2] + 1;
1821         pj  = b->j + bdiag[i + 2] + 1;
1822         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1823         for (j = 0; j < nz; j++) {
1824           col    = pj[j];
1825           pc2[j] = rtmp2[col];
1826           rs += PetscAbsScalar(pc2[j]);
1827         }
1828 
1829         sctx.rs = rs;
1830         sctx.pv = rtmp2[i + 1];
1831         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1832         if (sctx.newshift) break;
1833         pc2  = b->a + bdiag[i + 1];
1834         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1835 
1836         /* Now take care of 2nd column of diagonal 4x4 block. */
1837         pc3 = rtmp3 + i + 1;
1838         pc4 = rtmp4 + i + 1;
1839         if (*pc3 != 0.0 || *pc4 != 0.0) {
1840           mul3 = (*pc3) * (*pc2);
1841           *pc3 = mul3;
1842           mul4 = (*pc4) * (*pc2);
1843           *pc4 = mul4;
1844           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1845           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1846           for (j = 0; j < nz; j++) {
1847             col = pj[j];
1848             rtmp3[col] -= mul3 * rtmp2[col];
1849             rtmp4[col] -= mul4 * rtmp2[col];
1850           }
1851           PetscCall(PetscLogFlops(4.0 * nz));
1852         }
1853 
1854         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1855         rs = 0.0;
1856         /* L part */
1857         pc3 = b->a + bi[i + 2];
1858         pj  = b->j + bi[i + 2];
1859         nz  = bi[i + 3] - bi[i + 2];
1860         for (j = 0; j < nz; j++) {
1861           col    = pj[j];
1862           pc3[j] = rtmp3[col];
1863           rs += PetscAbsScalar(pc3[j]);
1864         }
1865         /* U part */
1866         pc3 = b->a + bdiag[i + 3] + 1;
1867         pj  = b->j + bdiag[i + 3] + 1;
1868         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1869         for (j = 0; j < nz; j++) {
1870           col    = pj[j];
1871           pc3[j] = rtmp3[col];
1872           rs += PetscAbsScalar(pc3[j]);
1873         }
1874 
1875         sctx.rs = rs;
1876         sctx.pv = rtmp3[i + 2];
1877         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1878         if (sctx.newshift) break;
1879         pc3  = b->a + bdiag[i + 2];
1880         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1881 
1882         /* Now take care of 3rd column of diagonal 4x4 block. */
1883         pc4 = rtmp4 + i + 2;
1884         if (*pc4 != 0.0) {
1885           mul4 = (*pc4) * (*pc3);
1886           *pc4 = mul4;
1887           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1888           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1889           for (j = 0; j < nz; j++) {
1890             col = pj[j];
1891             rtmp4[col] -= mul4 * rtmp3[col];
1892           }
1893           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1894         }
1895 
1896         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1897         rs = 0.0;
1898         /* L part */
1899         pc4 = b->a + bi[i + 3];
1900         pj  = b->j + bi[i + 3];
1901         nz  = bi[i + 4] - bi[i + 3];
1902         for (j = 0; j < nz; j++) {
1903           col    = pj[j];
1904           pc4[j] = rtmp4[col];
1905           rs += PetscAbsScalar(pc4[j]);
1906         }
1907         /* U part */
1908         pc4 = b->a + bdiag[i + 4] + 1;
1909         pj  = b->j + bdiag[i + 4] + 1;
1910         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1911         for (j = 0; j < nz; j++) {
1912           col    = pj[j];
1913           pc4[j] = rtmp4[col];
1914           rs += PetscAbsScalar(pc4[j]);
1915         }
1916 
1917         sctx.rs = rs;
1918         sctx.pv = rtmp4[i + 3];
1919         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1920         if (sctx.newshift) break;
1921         pc4  = b->a + bdiag[i + 3];
1922         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1923         break;
1924 
1925       default:
1926         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1927       }
1928       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1929       i += nodesz;              /* Update the row */
1930     }
1931 
1932     /* MatPivotRefine() */
1933     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1934       /*
1935        * if no shift in this attempt & shifting & started shifting & can refine,
1936        * then try lower shift
1937        */
1938       sctx.shift_hi       = sctx.shift_fraction;
1939       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1940       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1941       sctx.newshift       = PETSC_TRUE;
1942       sctx.nshift++;
1943     }
1944   } while (sctx.newshift);
1945 
1946   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1947   PetscCall(PetscFree(tmp_vec2));
1948   PetscCall(ISRestoreIndices(isicol, &ic));
1949   PetscCall(ISRestoreIndices(isrow, &r));
1950 
1951   if (b->inode.size_csr) {
1952     C->ops->solve = MatSolve_SeqAIJ_Inode;
1953   } else {
1954     C->ops->solve = MatSolve_SeqAIJ;
1955   }
1956   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1957   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1958   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1959   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1960   C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1961   C->assembled              = PETSC_TRUE;
1962   C->preallocated           = PETSC_TRUE;
1963 
1964   PetscCall(PetscLogFlops(C->cmap->n));
1965 
1966   /* MatShiftView(A,info,&sctx) */
1967   if (sctx.nshift) {
1968     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1969       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1970     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1971       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1972     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1973       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1974     }
1975   }
1976   PetscFunctionReturn(PETSC_SUCCESS);
1977 }
1978 
1979 #if 0
1980 // unused
1981 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1982 {
1983   Mat              C = B;
1984   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1985   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1986   const PetscInt  *r, *ic, *c, *ics;
1987   PetscInt         n = A->rmap->n, *bi = b->i;
1988   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1989   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1990   PetscInt        *ai = a->i, *aj = a->j;
1991   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1992   PetscScalar      mul1, mul2, mul3, tmp;
1993   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1994   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1995   PetscReal        rs = 0.0;
1996   FactorShiftCtx   sctx;
1997 
1998   PetscFunctionBegin;
1999   sctx.shift_top      = 0;
2000   sctx.nshift_max     = 0;
2001   sctx.shift_lo       = 0;
2002   sctx.shift_hi       = 0;
2003   sctx.shift_fraction = 0;
2004 
2005   /* if both shift schemes are chosen by user, only use info->shiftpd */
2006   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2007     sctx.shift_top = 0;
2008     for (i = 0; i < n; i++) {
2009       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2010       rs    = 0.0;
2011       ajtmp = aj + ai[i];
2012       rtmp1 = aa + ai[i];
2013       nz    = ai[i + 1] - ai[i];
2014       for (j = 0; j < nz; j++) {
2015         if (*ajtmp != i) {
2016           rs += PetscAbsScalar(*rtmp1++);
2017         } else {
2018           rs -= PetscRealPart(*rtmp1++);
2019         }
2020         ajtmp++;
2021       }
2022       if (rs > sctx.shift_top) sctx.shift_top = rs;
2023     }
2024     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2025     sctx.shift_top *= 1.1;
2026     sctx.nshift_max = 5;
2027     sctx.shift_lo   = 0.;
2028     sctx.shift_hi   = 1.;
2029   }
2030   sctx.shift_amount = 0;
2031   sctx.nshift       = 0;
2032 
2033   PetscCall(ISGetIndices(isrow, &r));
2034   PetscCall(ISGetIndices(iscol, &c));
2035   PetscCall(ISGetIndices(isicol, &ic));
2036   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2037   ics = ic;
2038 
2039   node_max = a->inode.node_count;
2040   ns       = a->inode.size;
2041   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2042 
2043   /* If max inode size > 3, split it into two inodes.*/
2044   /* also map the inode sizes according to the ordering */
2045   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2046   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2047     if (ns[i] > 3) {
2048       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2049       ++j;
2050       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2051     } else {
2052       tmp_vec1[j] = ns[i];
2053     }
2054   }
2055   /* Use the correct node_max */
2056   node_max = j;
2057 
2058   /* Now reorder the inode info based on mat re-ordering info */
2059   /* First create a row -> inode_size_array_index map */
2060   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2061   for (i = 0, row = 0; i < node_max; i++) {
2062     nodesz = tmp_vec1[i];
2063     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2064   }
2065   /* Using nsmap, create a reordered ns structure */
2066   for (i = 0, j = 0; i < node_max; i++) {
2067     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2068     tmp_vec2[i] = nodesz;
2069     j += nodesz;
2070   }
2071   PetscCall(PetscFree2(nsmap, tmp_vec1));
2072   /* Now use the correct ns */
2073   ns = tmp_vec2;
2074 
2075   do {
2076     sctx.newshift = PETSC_FALSE;
2077     /* Now loop over each block-row, and do the factorization */
2078     for (i = 0, row = 0; i < node_max; i++) {
2079       nodesz = ns[i];
2080       nz     = bi[row + 1] - bi[row];
2081       bjtmp  = bj + bi[row];
2082 
2083       switch (nodesz) {
2084       case 1:
2085         for (j = 0; j < nz; j++) {
2086           idx         = bjtmp[j];
2087           rtmp11[idx] = 0.0;
2088         }
2089 
2090         /* load in initial (unfactored row) */
2091         idx    = r[row];
2092         nz_tmp = ai[idx + 1] - ai[idx];
2093         ajtmp  = aj + ai[idx];
2094         v1     = aa + ai[idx];
2095 
2096         for (j = 0; j < nz_tmp; j++) {
2097           idx         = ics[ajtmp[j]];
2098           rtmp11[idx] = v1[j];
2099         }
2100         rtmp11[ics[r[row]]] += sctx.shift_amount;
2101 
2102         prow = *bjtmp++;
2103         while (prow < row) {
2104           pc1 = rtmp11 + prow;
2105           if (*pc1 != 0.0) {
2106             pv     = ba + bd[prow];
2107             pj     = nbj + bd[prow];
2108             mul1   = *pc1 * *pv++;
2109             *pc1   = mul1;
2110             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2111             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2112             for (j = 0; j < nz_tmp; j++) {
2113               tmp = pv[j];
2114               idx = pj[j];
2115               rtmp11[idx] -= mul1 * tmp;
2116             }
2117           }
2118           prow = *bjtmp++;
2119         }
2120         pj  = bj + bi[row];
2121         pc1 = ba + bi[row];
2122 
2123         sctx.pv     = rtmp11[row];
2124         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2125         rs          = 0.0;
2126         for (j = 0; j < nz; j++) {
2127           idx    = pj[j];
2128           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2129           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2130         }
2131         sctx.rs = rs;
2132         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2133         if (sctx.newshift) goto endofwhile;
2134         break;
2135 
2136       case 2:
2137         for (j = 0; j < nz; j++) {
2138           idx         = bjtmp[j];
2139           rtmp11[idx] = 0.0;
2140           rtmp22[idx] = 0.0;
2141         }
2142 
2143         /* load in initial (unfactored row) */
2144         idx    = r[row];
2145         nz_tmp = ai[idx + 1] - ai[idx];
2146         ajtmp  = aj + ai[idx];
2147         v1     = aa + ai[idx];
2148         v2     = aa + ai[idx + 1];
2149         for (j = 0; j < nz_tmp; j++) {
2150           idx         = ics[ajtmp[j]];
2151           rtmp11[idx] = v1[j];
2152           rtmp22[idx] = v2[j];
2153         }
2154         rtmp11[ics[r[row]]] += sctx.shift_amount;
2155         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2156 
2157         prow = *bjtmp++;
2158         while (prow < row) {
2159           pc1 = rtmp11 + prow;
2160           pc2 = rtmp22 + prow;
2161           if (*pc1 != 0.0 || *pc2 != 0.0) {
2162             pv   = ba + bd[prow];
2163             pj   = nbj + bd[prow];
2164             mul1 = *pc1 * *pv;
2165             mul2 = *pc2 * *pv;
2166             ++pv;
2167             *pc1 = mul1;
2168             *pc2 = mul2;
2169 
2170             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2171             for (j = 0; j < nz_tmp; j++) {
2172               tmp = pv[j];
2173               idx = pj[j];
2174               rtmp11[idx] -= mul1 * tmp;
2175               rtmp22[idx] -= mul2 * tmp;
2176             }
2177             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2178           }
2179           prow = *bjtmp++;
2180         }
2181 
2182         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2183         pc1 = rtmp11 + prow;
2184         pc2 = rtmp22 + prow;
2185 
2186         sctx.pv = *pc1;
2187         pj      = bj + bi[prow];
2188         rs      = 0.0;
2189         for (j = 0; j < nz; j++) {
2190           idx = pj[j];
2191           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2192         }
2193         sctx.rs = rs;
2194         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2195         if (sctx.newshift) goto endofwhile;
2196 
2197         if (*pc2 != 0.0) {
2198           pj     = nbj + bd[prow];
2199           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2200           *pc2   = mul2;
2201           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2202           for (j = 0; j < nz_tmp; j++) {
2203             idx = pj[j];
2204             tmp = rtmp11[idx];
2205             rtmp22[idx] -= mul2 * tmp;
2206           }
2207           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2208         }
2209 
2210         pj  = bj + bi[row];
2211         pc1 = ba + bi[row];
2212         pc2 = ba + bi[row + 1];
2213 
2214         sctx.pv         = rtmp22[row + 1];
2215         rs              = 0.0;
2216         rtmp11[row]     = 1.0 / rtmp11[row];
2217         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2218         /* copy row entries from dense representation to sparse */
2219         for (j = 0; j < nz; j++) {
2220           idx    = pj[j];
2221           pc1[j] = rtmp11[idx];
2222           pc2[j] = rtmp22[idx];
2223           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2224         }
2225         sctx.rs = rs;
2226         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2227         if (sctx.newshift) goto endofwhile;
2228         break;
2229 
2230       case 3:
2231         for (j = 0; j < nz; j++) {
2232           idx         = bjtmp[j];
2233           rtmp11[idx] = 0.0;
2234           rtmp22[idx] = 0.0;
2235           rtmp33[idx] = 0.0;
2236         }
2237         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2238         idx    = r[row];
2239         nz_tmp = ai[idx + 1] - ai[idx];
2240         ajtmp  = aj + ai[idx];
2241         v1     = aa + ai[idx];
2242         v2     = aa + ai[idx + 1];
2243         v3     = aa + ai[idx + 2];
2244         for (j = 0; j < nz_tmp; j++) {
2245           idx         = ics[ajtmp[j]];
2246           rtmp11[idx] = v1[j];
2247           rtmp22[idx] = v2[j];
2248           rtmp33[idx] = v3[j];
2249         }
2250         rtmp11[ics[r[row]]] += sctx.shift_amount;
2251         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2252         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2253 
2254         /* loop over all pivot row blocks above this row block */
2255         prow = *bjtmp++;
2256         while (prow < row) {
2257           pc1 = rtmp11 + prow;
2258           pc2 = rtmp22 + prow;
2259           pc3 = rtmp33 + prow;
2260           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2261             pv   = ba + bd[prow];
2262             pj   = nbj + bd[prow];
2263             mul1 = *pc1 * *pv;
2264             mul2 = *pc2 * *pv;
2265             mul3 = *pc3 * *pv;
2266             ++pv;
2267             *pc1 = mul1;
2268             *pc2 = mul2;
2269             *pc3 = mul3;
2270 
2271             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2272             /* update this row based on pivot row */
2273             for (j = 0; j < nz_tmp; j++) {
2274               tmp = pv[j];
2275               idx = pj[j];
2276               rtmp11[idx] -= mul1 * tmp;
2277               rtmp22[idx] -= mul2 * tmp;
2278               rtmp33[idx] -= mul3 * tmp;
2279             }
2280             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2281           }
2282           prow = *bjtmp++;
2283         }
2284 
2285         /* Now take care of diagonal 3x3 block in this set of rows */
2286         /* note: prow = row here */
2287         pc1 = rtmp11 + prow;
2288         pc2 = rtmp22 + prow;
2289         pc3 = rtmp33 + prow;
2290 
2291         sctx.pv = *pc1;
2292         pj      = bj + bi[prow];
2293         rs      = 0.0;
2294         for (j = 0; j < nz; j++) {
2295           idx = pj[j];
2296           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2297         }
2298         sctx.rs = rs;
2299         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2300         if (sctx.newshift) goto endofwhile;
2301 
2302         if (*pc2 != 0.0 || *pc3 != 0.0) {
2303           mul2   = (*pc2) / (*pc1);
2304           mul3   = (*pc3) / (*pc1);
2305           *pc2   = mul2;
2306           *pc3   = mul3;
2307           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2308           pj     = nbj + bd[prow];
2309           for (j = 0; j < nz_tmp; j++) {
2310             idx = pj[j];
2311             tmp = rtmp11[idx];
2312             rtmp22[idx] -= mul2 * tmp;
2313             rtmp33[idx] -= mul3 * tmp;
2314           }
2315           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2316         }
2317         ++prow;
2318 
2319         pc2     = rtmp22 + prow;
2320         pc3     = rtmp33 + prow;
2321         sctx.pv = *pc2;
2322         pj      = bj + bi[prow];
2323         rs      = 0.0;
2324         for (j = 0; j < nz; j++) {
2325           idx = pj[j];
2326           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2327         }
2328         sctx.rs = rs;
2329         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2330         if (sctx.newshift) goto endofwhile;
2331 
2332         if (*pc3 != 0.0) {
2333           mul3   = (*pc3) / (*pc2);
2334           *pc3   = mul3;
2335           pj     = nbj + bd[prow];
2336           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2337           for (j = 0; j < nz_tmp; j++) {
2338             idx = pj[j];
2339             tmp = rtmp22[idx];
2340             rtmp33[idx] -= mul3 * tmp;
2341           }
2342           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2343         }
2344 
2345         pj  = bj + bi[row];
2346         pc1 = ba + bi[row];
2347         pc2 = ba + bi[row + 1];
2348         pc3 = ba + bi[row + 2];
2349 
2350         sctx.pv         = rtmp33[row + 2];
2351         rs              = 0.0;
2352         rtmp11[row]     = 1.0 / rtmp11[row];
2353         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2354         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2355         /* copy row entries from dense representation to sparse */
2356         for (j = 0; j < nz; j++) {
2357           idx    = pj[j];
2358           pc1[j] = rtmp11[idx];
2359           pc2[j] = rtmp22[idx];
2360           pc3[j] = rtmp33[idx];
2361           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2362         }
2363 
2364         sctx.rs = rs;
2365         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2366         if (sctx.newshift) goto endofwhile;
2367         break;
2368 
2369       default:
2370         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2371       }
2372       row += nodesz; /* Update the row */
2373     }
2374   endofwhile:;
2375   } while (sctx.newshift);
2376   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2377   PetscCall(PetscFree(tmp_vec2));
2378   PetscCall(ISRestoreIndices(isicol, &ic));
2379   PetscCall(ISRestoreIndices(isrow, &r));
2380   PetscCall(ISRestoreIndices(iscol, &c));
2381 
2382   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2383   /* do not set solve add, since MatSolve_Inode + Add is faster */
2384   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2385   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2386   C->assembled              = PETSC_TRUE;
2387   C->preallocated           = PETSC_TRUE;
2388   if (sctx.nshift) {
2389     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2390       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2391     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2392       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2393     }
2394   }
2395   PetscCall(PetscLogFlops(C->cmap->n));
2396   PetscCall(MatSeqAIJCheckInode(C));
2397   PetscFunctionReturn(PETSC_SUCCESS);
2398 }
2399 #endif
2400 
2401 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2402 {
2403   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2404   IS                 iscol = a->col, isrow = a->row;
2405   const PetscInt    *r, *c, *rout, *cout;
2406   PetscInt           i, j;
2407   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2408   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2409   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2410   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2411   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2412   const PetscScalar *b;
2413 
2414   PetscFunctionBegin;
2415   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2416   node_max = a->inode.node_count;
2417   ns       = a->inode.size_csr; /* Node Size array */
2418 
2419   PetscCall(VecGetArrayRead(bb, &b));
2420   PetscCall(VecGetArrayWrite(xx, &x));
2421   tmp = a->solve_work;
2422 
2423   PetscCall(ISGetIndices(isrow, &rout));
2424   r = rout;
2425   PetscCall(ISGetIndices(iscol, &cout));
2426   c = cout;
2427 
2428   /* forward solve the lower triangular */
2429   tmps = tmp;
2430   aa   = a_a;
2431   aj   = a_j;
2432   ad   = a->diag;
2433 
2434   for (i = 0; i < node_max; ++i) {
2435     row = ns[i];
2436     nsz = ns[i + 1] - ns[i];
2437     aii = ai[row];
2438     v1  = aa + aii;
2439     vi  = aj + aii;
2440     nz  = ai[row + 1] - ai[row];
2441 
2442     if (i < node_max - 1) {
2443       /* Prefetch the indices for the next block */
2444       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2445       /* Prefetch the data for the next block */
2446       PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2447     }
2448 
2449     switch (nsz) { /* Each loop in 'case' is unrolled */
2450     case 1:
2451       sum1 = b[r[row]];
2452       for (j = 0; j < nz - 1; j += 2) {
2453         i0   = vi[j];
2454         i1   = vi[j + 1];
2455         tmp0 = tmps[i0];
2456         tmp1 = tmps[i1];
2457         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2458       }
2459       if (j == nz - 1) {
2460         tmp0 = tmps[vi[j]];
2461         sum1 -= v1[j] * tmp0;
2462       }
2463       tmp[row++] = sum1;
2464       break;
2465     case 2:
2466       sum1 = b[r[row]];
2467       sum2 = b[r[row + 1]];
2468       v2   = aa + ai[row + 1];
2469 
2470       for (j = 0; j < nz - 1; j += 2) {
2471         i0   = vi[j];
2472         i1   = vi[j + 1];
2473         tmp0 = tmps[i0];
2474         tmp1 = tmps[i1];
2475         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2476         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2477       }
2478       if (j == nz - 1) {
2479         tmp0 = tmps[vi[j]];
2480         sum1 -= v1[j] * tmp0;
2481         sum2 -= v2[j] * tmp0;
2482       }
2483       sum2 -= v2[nz] * sum1;
2484       tmp[row++] = sum1;
2485       tmp[row++] = sum2;
2486       break;
2487     case 3:
2488       sum1 = b[r[row]];
2489       sum2 = b[r[row + 1]];
2490       sum3 = b[r[row + 2]];
2491       v2   = aa + ai[row + 1];
2492       v3   = aa + ai[row + 2];
2493 
2494       for (j = 0; j < nz - 1; j += 2) {
2495         i0   = vi[j];
2496         i1   = vi[j + 1];
2497         tmp0 = tmps[i0];
2498         tmp1 = tmps[i1];
2499         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2500         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2501         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2502       }
2503       if (j == nz - 1) {
2504         tmp0 = tmps[vi[j]];
2505         sum1 -= v1[j] * tmp0;
2506         sum2 -= v2[j] * tmp0;
2507         sum3 -= v3[j] * tmp0;
2508       }
2509       sum2 -= v2[nz] * sum1;
2510       sum3 -= v3[nz] * sum1;
2511       sum3 -= v3[nz + 1] * sum2;
2512       tmp[row++] = sum1;
2513       tmp[row++] = sum2;
2514       tmp[row++] = sum3;
2515       break;
2516 
2517     case 4:
2518       sum1 = b[r[row]];
2519       sum2 = b[r[row + 1]];
2520       sum3 = b[r[row + 2]];
2521       sum4 = b[r[row + 3]];
2522       v2   = aa + ai[row + 1];
2523       v3   = aa + ai[row + 2];
2524       v4   = aa + ai[row + 3];
2525 
2526       for (j = 0; j < nz - 1; j += 2) {
2527         i0   = vi[j];
2528         i1   = vi[j + 1];
2529         tmp0 = tmps[i0];
2530         tmp1 = tmps[i1];
2531         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2532         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2533         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2534         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2535       }
2536       if (j == nz - 1) {
2537         tmp0 = tmps[vi[j]];
2538         sum1 -= v1[j] * tmp0;
2539         sum2 -= v2[j] * tmp0;
2540         sum3 -= v3[j] * tmp0;
2541         sum4 -= v4[j] * tmp0;
2542       }
2543       sum2 -= v2[nz] * sum1;
2544       sum3 -= v3[nz] * sum1;
2545       sum4 -= v4[nz] * sum1;
2546       sum3 -= v3[nz + 1] * sum2;
2547       sum4 -= v4[nz + 1] * sum2;
2548       sum4 -= v4[nz + 2] * sum3;
2549 
2550       tmp[row++] = sum1;
2551       tmp[row++] = sum2;
2552       tmp[row++] = sum3;
2553       tmp[row++] = sum4;
2554       break;
2555     case 5:
2556       sum1 = b[r[row]];
2557       sum2 = b[r[row + 1]];
2558       sum3 = b[r[row + 2]];
2559       sum4 = b[r[row + 3]];
2560       sum5 = b[r[row + 4]];
2561       v2   = aa + ai[row + 1];
2562       v3   = aa + ai[row + 2];
2563       v4   = aa + ai[row + 3];
2564       v5   = aa + ai[row + 4];
2565 
2566       for (j = 0; j < nz - 1; j += 2) {
2567         i0   = vi[j];
2568         i1   = vi[j + 1];
2569         tmp0 = tmps[i0];
2570         tmp1 = tmps[i1];
2571         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2572         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2573         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2574         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2575         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2576       }
2577       if (j == nz - 1) {
2578         tmp0 = tmps[vi[j]];
2579         sum1 -= v1[j] * tmp0;
2580         sum2 -= v2[j] * tmp0;
2581         sum3 -= v3[j] * tmp0;
2582         sum4 -= v4[j] * tmp0;
2583         sum5 -= v5[j] * tmp0;
2584       }
2585 
2586       sum2 -= v2[nz] * sum1;
2587       sum3 -= v3[nz] * sum1;
2588       sum4 -= v4[nz] * sum1;
2589       sum5 -= v5[nz] * sum1;
2590       sum3 -= v3[nz + 1] * sum2;
2591       sum4 -= v4[nz + 1] * sum2;
2592       sum5 -= v5[nz + 1] * sum2;
2593       sum4 -= v4[nz + 2] * sum3;
2594       sum5 -= v5[nz + 2] * sum3;
2595       sum5 -= v5[nz + 3] * sum4;
2596 
2597       tmp[row++] = sum1;
2598       tmp[row++] = sum2;
2599       tmp[row++] = sum3;
2600       tmp[row++] = sum4;
2601       tmp[row++] = sum5;
2602       break;
2603     default:
2604       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2605     }
2606   }
2607   /* backward solve the upper triangular */
2608   for (i = node_max - 1; i >= 0; i--) {
2609     row = ns[i + 1] - 1;
2610     nsz = ns[i + 1] - ns[i];
2611     aii = ad[row + 1] + 1;
2612     v1  = aa + aii;
2613     vi  = aj + aii;
2614     nz  = ad[row] - ad[row + 1] - 1;
2615 
2616     if (i > 0) {
2617       /* Prefetch the indices for the next block */
2618       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2619       /* Prefetch the data for the next block */
2620       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2621     }
2622 
2623     switch (nsz) { /* Each loop in 'case' is unrolled */
2624     case 1:
2625       sum1 = tmp[row];
2626 
2627       for (j = 0; j < nz - 1; j += 2) {
2628         i0   = vi[j];
2629         i1   = vi[j + 1];
2630         tmp0 = tmps[i0];
2631         tmp1 = tmps[i1];
2632         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2633       }
2634       if (j == nz - 1) {
2635         tmp0 = tmps[vi[j]];
2636         sum1 -= v1[j] * tmp0;
2637       }
2638       x[c[row]] = tmp[row] = sum1 * v1[nz];
2639       row--;
2640       break;
2641     case 2:
2642       sum1 = tmp[row];
2643       sum2 = tmp[row - 1];
2644       v2   = aa + ad[row] + 1;
2645       for (j = 0; j < nz - 1; j += 2) {
2646         i0   = vi[j];
2647         i1   = vi[j + 1];
2648         tmp0 = tmps[i0];
2649         tmp1 = tmps[i1];
2650         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2651         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2652       }
2653       if (j == nz - 1) {
2654         tmp0 = tmps[vi[j]];
2655         sum1 -= v1[j] * tmp0;
2656         sum2 -= v2[j + 1] * tmp0;
2657       }
2658 
2659       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2660       row--;
2661       sum2 -= v2[0] * tmp0;
2662       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2663       row--;
2664       break;
2665     case 3:
2666       sum1 = tmp[row];
2667       sum2 = tmp[row - 1];
2668       sum3 = tmp[row - 2];
2669       v2   = aa + ad[row] + 1;
2670       v3   = aa + ad[row - 1] + 1;
2671       for (j = 0; j < nz - 1; j += 2) {
2672         i0   = vi[j];
2673         i1   = vi[j + 1];
2674         tmp0 = tmps[i0];
2675         tmp1 = tmps[i1];
2676         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2677         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2678         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2679       }
2680       if (j == nz - 1) {
2681         tmp0 = tmps[vi[j]];
2682         sum1 -= v1[j] * tmp0;
2683         sum2 -= v2[j + 1] * tmp0;
2684         sum3 -= v3[j + 2] * tmp0;
2685       }
2686       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2687       row--;
2688       sum2 -= v2[0] * tmp0;
2689       sum3 -= v3[1] * tmp0;
2690       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2691       row--;
2692       sum3 -= v3[0] * tmp0;
2693       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2694       row--;
2695 
2696       break;
2697     case 4:
2698       sum1 = tmp[row];
2699       sum2 = tmp[row - 1];
2700       sum3 = tmp[row - 2];
2701       sum4 = tmp[row - 3];
2702       v2   = aa + ad[row] + 1;
2703       v3   = aa + ad[row - 1] + 1;
2704       v4   = aa + ad[row - 2] + 1;
2705 
2706       for (j = 0; j < nz - 1; j += 2) {
2707         i0   = vi[j];
2708         i1   = vi[j + 1];
2709         tmp0 = tmps[i0];
2710         tmp1 = tmps[i1];
2711         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2712         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2713         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2714         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2715       }
2716       if (j == nz - 1) {
2717         tmp0 = tmps[vi[j]];
2718         sum1 -= v1[j] * tmp0;
2719         sum2 -= v2[j + 1] * tmp0;
2720         sum3 -= v3[j + 2] * tmp0;
2721         sum4 -= v4[j + 3] * tmp0;
2722       }
2723 
2724       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2725       row--;
2726       sum2 -= v2[0] * tmp0;
2727       sum3 -= v3[1] * tmp0;
2728       sum4 -= v4[2] * tmp0;
2729       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2730       row--;
2731       sum3 -= v3[0] * tmp0;
2732       sum4 -= v4[1] * tmp0;
2733       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2734       row--;
2735       sum4 -= v4[0] * tmp0;
2736       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2737       row--;
2738       break;
2739     case 5:
2740       sum1 = tmp[row];
2741       sum2 = tmp[row - 1];
2742       sum3 = tmp[row - 2];
2743       sum4 = tmp[row - 3];
2744       sum5 = tmp[row - 4];
2745       v2   = aa + ad[row] + 1;
2746       v3   = aa + ad[row - 1] + 1;
2747       v4   = aa + ad[row - 2] + 1;
2748       v5   = aa + ad[row - 3] + 1;
2749       for (j = 0; j < nz - 1; j += 2) {
2750         i0   = vi[j];
2751         i1   = vi[j + 1];
2752         tmp0 = tmps[i0];
2753         tmp1 = tmps[i1];
2754         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2755         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2756         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2757         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2758         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2759       }
2760       if (j == nz - 1) {
2761         tmp0 = tmps[vi[j]];
2762         sum1 -= v1[j] * tmp0;
2763         sum2 -= v2[j + 1] * tmp0;
2764         sum3 -= v3[j + 2] * tmp0;
2765         sum4 -= v4[j + 3] * tmp0;
2766         sum5 -= v5[j + 4] * tmp0;
2767       }
2768 
2769       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2770       row--;
2771       sum2 -= v2[0] * tmp0;
2772       sum3 -= v3[1] * tmp0;
2773       sum4 -= v4[2] * tmp0;
2774       sum5 -= v5[3] * tmp0;
2775       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2776       row--;
2777       sum3 -= v3[0] * tmp0;
2778       sum4 -= v4[1] * tmp0;
2779       sum5 -= v5[2] * tmp0;
2780       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2781       row--;
2782       sum4 -= v4[0] * tmp0;
2783       sum5 -= v5[1] * tmp0;
2784       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2785       row--;
2786       sum5 -= v5[0] * tmp0;
2787       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2788       row--;
2789       break;
2790     default:
2791       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2792     }
2793   }
2794   PetscCall(ISRestoreIndices(isrow, &rout));
2795   PetscCall(ISRestoreIndices(iscol, &cout));
2796   PetscCall(VecRestoreArrayRead(bb, &b));
2797   PetscCall(VecRestoreArrayWrite(xx, &x));
2798   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2799   PetscFunctionReturn(PETSC_SUCCESS);
2800 }
2801 
2802 /*
2803      Makes a longer coloring[] array and calls the usual code with that
2804 */
2805 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2806 {
2807   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2808   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
2809   PetscInt        *colorused, i;
2810   ISColoringValue *newcolor;
2811 
2812   PetscFunctionBegin;
2813   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2814   PetscCall(PetscMalloc1(n + 1, &newcolor));
2815   /* loop over inodes, marking a color for each column*/
2816   row = 0;
2817   for (i = 0; i < m; i++) {
2818     for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2819   }
2820 
2821   /* eliminate unneeded colors */
2822   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2823   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2824 
2825   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2826   ncolors = colorused[5 * ncolors - 1];
2827   for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2828   PetscCall(PetscFree(colorused));
2829   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2830   PetscCall(PetscFree(coloring));
2831   PetscFunctionReturn(PETSC_SUCCESS);
2832 }
2833 
2834 #include <petsc/private/kernels/blockinvert.h>
2835 
2836 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2837 {
2838   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2839   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2840   MatScalar         *ibdiag, *bdiag, work[25], *t;
2841   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2842   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2843   const PetscScalar *xb, *b;
2844   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2845   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz;
2846   PetscInt           sz, k, ipvt[5];
2847   PetscBool          allowzeropivot, zeropivotdetected;
2848   const PetscInt    *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i;
2849 
2850   PetscFunctionBegin;
2851   allowzeropivot = PetscNot(A->erroriffailure);
2852   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2853   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2854   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2855 
2856   if (!a->inode.ibdiagvalid) {
2857     if (!a->inode.ibdiag) {
2858       /* calculate space needed for diagonal blocks */
2859       for (i = 0; i < m; i++) {
2860         nodesz = sizes[i + 1] - sizes[i];
2861         cnt += nodesz * nodesz;
2862       }
2863       a->inode.bdiagsize = cnt;
2864 
2865       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2866     }
2867 
2868     /* copy over the diagonal blocks and invert them */
2869     ibdiag = a->inode.ibdiag;
2870     bdiag  = a->inode.bdiag;
2871     cnt    = 0;
2872     for (i = 0, row = 0; i < m; i++) {
2873       nodesz = sizes[i + 1] - sizes[i];
2874       for (j = 0; j < nodesz; j++) {
2875         for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
2876       }
2877       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
2878 
2879       switch (nodesz) {
2880       case 1:
2881         /* Create matrix data structure */
2882         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2883           if (allowzeropivot) {
2884             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2885             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2886             A->factorerror_zeropivot_row   = row;
2887             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2888           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2889         }
2890         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2891         break;
2892       case 2:
2893         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2894         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2895         break;
2896       case 3:
2897         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2898         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2899         break;
2900       case 4:
2901         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2902         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2903         break;
2904       case 5:
2905         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2906         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2907         break;
2908       default:
2909         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2910       }
2911       cnt += nodesz * nodesz;
2912       row += nodesz;
2913     }
2914     a->inode.ibdiagvalid = PETSC_TRUE;
2915   }
2916   ibdiag = a->inode.ibdiag;
2917   bdiag  = a->inode.bdiag;
2918   t      = a->inode.ssor_work;
2919 
2920   PetscCall(VecGetArray(xx, &x));
2921   PetscCall(VecGetArrayRead(bb, &b));
2922   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2923   if (flag & SOR_ZERO_INITIAL_GUESS) {
2924     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2925       for (i = 0, row = 0; i < m; i++) {
2926         sz  = diag[row] - ii[row];
2927         v1  = a->a + ii[row];
2928         idx = a->j + ii[row];
2929 
2930         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2931         nodesz = sizes[i + 1] - sizes[i];
2932         switch (nodesz) {
2933         case 1:
2934 
2935           sum1 = b[row];
2936           for (n = 0; n < sz - 1; n += 2) {
2937             i1 = idx[0];
2938             i2 = idx[1];
2939             idx += 2;
2940             tmp0 = x[i1];
2941             tmp1 = x[i2];
2942             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2943             v1 += 2;
2944           }
2945 
2946           if (n == sz - 1) {
2947             tmp0 = x[*idx];
2948             sum1 -= *v1 * tmp0;
2949           }
2950           t[row]   = sum1;
2951           x[row++] = sum1 * (*ibdiag++);
2952           break;
2953         case 2:
2954           v2   = a->a + ii[row + 1];
2955           sum1 = b[row];
2956           sum2 = b[row + 1];
2957           for (n = 0; n < sz - 1; n += 2) {
2958             i1 = idx[0];
2959             i2 = idx[1];
2960             idx += 2;
2961             tmp0 = x[i1];
2962             tmp1 = x[i2];
2963             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2964             v1 += 2;
2965             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2966             v2 += 2;
2967           }
2968 
2969           if (n == sz - 1) {
2970             tmp0 = x[*idx];
2971             sum1 -= v1[0] * tmp0;
2972             sum2 -= v2[0] * tmp0;
2973           }
2974           t[row]     = sum1;
2975           t[row + 1] = sum2;
2976           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2977           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2978           ibdiag += 4;
2979           break;
2980         case 3:
2981           v2   = a->a + ii[row + 1];
2982           v3   = a->a + ii[row + 2];
2983           sum1 = b[row];
2984           sum2 = b[row + 1];
2985           sum3 = b[row + 2];
2986           for (n = 0; n < sz - 1; n += 2) {
2987             i1 = idx[0];
2988             i2 = idx[1];
2989             idx += 2;
2990             tmp0 = x[i1];
2991             tmp1 = x[i2];
2992             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2993             v1 += 2;
2994             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2995             v2 += 2;
2996             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2997             v3 += 2;
2998           }
2999 
3000           if (n == sz - 1) {
3001             tmp0 = x[*idx];
3002             sum1 -= v1[0] * tmp0;
3003             sum2 -= v2[0] * tmp0;
3004             sum3 -= v3[0] * tmp0;
3005           }
3006           t[row]     = sum1;
3007           t[row + 1] = sum2;
3008           t[row + 2] = sum3;
3009           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3010           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3011           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3012           ibdiag += 9;
3013           break;
3014         case 4:
3015           v2   = a->a + ii[row + 1];
3016           v3   = a->a + ii[row + 2];
3017           v4   = a->a + ii[row + 3];
3018           sum1 = b[row];
3019           sum2 = b[row + 1];
3020           sum3 = b[row + 2];
3021           sum4 = b[row + 3];
3022           for (n = 0; n < sz - 1; n += 2) {
3023             i1 = idx[0];
3024             i2 = idx[1];
3025             idx += 2;
3026             tmp0 = x[i1];
3027             tmp1 = x[i2];
3028             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3029             v1 += 2;
3030             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3031             v2 += 2;
3032             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3033             v3 += 2;
3034             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3035             v4 += 2;
3036           }
3037 
3038           if (n == sz - 1) {
3039             tmp0 = x[*idx];
3040             sum1 -= v1[0] * tmp0;
3041             sum2 -= v2[0] * tmp0;
3042             sum3 -= v3[0] * tmp0;
3043             sum4 -= v4[0] * tmp0;
3044           }
3045           t[row]     = sum1;
3046           t[row + 1] = sum2;
3047           t[row + 2] = sum3;
3048           t[row + 3] = sum4;
3049           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3050           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3051           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3052           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3053           ibdiag += 16;
3054           break;
3055         case 5:
3056           v2   = a->a + ii[row + 1];
3057           v3   = a->a + ii[row + 2];
3058           v4   = a->a + ii[row + 3];
3059           v5   = a->a + ii[row + 4];
3060           sum1 = b[row];
3061           sum2 = b[row + 1];
3062           sum3 = b[row + 2];
3063           sum4 = b[row + 3];
3064           sum5 = b[row + 4];
3065           for (n = 0; n < sz - 1; n += 2) {
3066             i1 = idx[0];
3067             i2 = idx[1];
3068             idx += 2;
3069             tmp0 = x[i1];
3070             tmp1 = x[i2];
3071             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3072             v1 += 2;
3073             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3074             v2 += 2;
3075             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3076             v3 += 2;
3077             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3078             v4 += 2;
3079             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3080             v5 += 2;
3081           }
3082 
3083           if (n == sz - 1) {
3084             tmp0 = x[*idx];
3085             sum1 -= v1[0] * tmp0;
3086             sum2 -= v2[0] * tmp0;
3087             sum3 -= v3[0] * tmp0;
3088             sum4 -= v4[0] * tmp0;
3089             sum5 -= v5[0] * tmp0;
3090           }
3091           t[row]     = sum1;
3092           t[row + 1] = sum2;
3093           t[row + 2] = sum3;
3094           t[row + 3] = sum4;
3095           t[row + 4] = sum5;
3096           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3097           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3098           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3099           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3100           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3101           ibdiag += 25;
3102           break;
3103         default:
3104           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3105         }
3106       }
3107 
3108       xb = t;
3109       PetscCall(PetscLogFlops(a->nz));
3110     } else xb = b;
3111     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3112       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3113       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3114         nodesz = sizes[i + 1] - sizes[i];
3115         ibdiag -= nodesz * nodesz;
3116         sz  = ii[row + 1] - diag[row] - 1;
3117         v1  = a->a + diag[row] + 1;
3118         idx = a->j + diag[row] + 1;
3119 
3120         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3121         switch (nodesz) {
3122         case 1:
3123 
3124           sum1 = xb[row];
3125           for (n = 0; n < sz - 1; n += 2) {
3126             i1 = idx[0];
3127             i2 = idx[1];
3128             idx += 2;
3129             tmp0 = x[i1];
3130             tmp1 = x[i2];
3131             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3132             v1 += 2;
3133           }
3134 
3135           if (n == sz - 1) {
3136             tmp0 = x[*idx];
3137             sum1 -= *v1 * tmp0;
3138           }
3139           x[row--] = sum1 * (*ibdiag);
3140           break;
3141 
3142         case 2:
3143 
3144           sum1 = xb[row];
3145           sum2 = xb[row - 1];
3146           /* note that sum1 is associated with the second of the two rows */
3147           v2 = a->a + diag[row - 1] + 2;
3148           for (n = 0; n < sz - 1; n += 2) {
3149             i1 = idx[0];
3150             i2 = idx[1];
3151             idx += 2;
3152             tmp0 = x[i1];
3153             tmp1 = x[i2];
3154             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3155             v1 += 2;
3156             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3157             v2 += 2;
3158           }
3159 
3160           if (n == sz - 1) {
3161             tmp0 = x[*idx];
3162             sum1 -= *v1 * tmp0;
3163             sum2 -= *v2 * tmp0;
3164           }
3165           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3166           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3167           break;
3168         case 3:
3169 
3170           sum1 = xb[row];
3171           sum2 = xb[row - 1];
3172           sum3 = xb[row - 2];
3173           v2   = a->a + diag[row - 1] + 2;
3174           v3   = a->a + diag[row - 2] + 3;
3175           for (n = 0; n < sz - 1; n += 2) {
3176             i1 = idx[0];
3177             i2 = idx[1];
3178             idx += 2;
3179             tmp0 = x[i1];
3180             tmp1 = x[i2];
3181             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3182             v1 += 2;
3183             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3184             v2 += 2;
3185             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3186             v3 += 2;
3187           }
3188 
3189           if (n == sz - 1) {
3190             tmp0 = x[*idx];
3191             sum1 -= *v1 * tmp0;
3192             sum2 -= *v2 * tmp0;
3193             sum3 -= *v3 * tmp0;
3194           }
3195           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3196           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3197           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3198           break;
3199         case 4:
3200 
3201           sum1 = xb[row];
3202           sum2 = xb[row - 1];
3203           sum3 = xb[row - 2];
3204           sum4 = xb[row - 3];
3205           v2   = a->a + diag[row - 1] + 2;
3206           v3   = a->a + diag[row - 2] + 3;
3207           v4   = a->a + diag[row - 3] + 4;
3208           for (n = 0; n < sz - 1; n += 2) {
3209             i1 = idx[0];
3210             i2 = idx[1];
3211             idx += 2;
3212             tmp0 = x[i1];
3213             tmp1 = x[i2];
3214             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3215             v1 += 2;
3216             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3217             v2 += 2;
3218             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3219             v3 += 2;
3220             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3221             v4 += 2;
3222           }
3223 
3224           if (n == sz - 1) {
3225             tmp0 = x[*idx];
3226             sum1 -= *v1 * tmp0;
3227             sum2 -= *v2 * tmp0;
3228             sum3 -= *v3 * tmp0;
3229             sum4 -= *v4 * tmp0;
3230           }
3231           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3232           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3233           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3234           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3235           break;
3236         case 5:
3237 
3238           sum1 = xb[row];
3239           sum2 = xb[row - 1];
3240           sum3 = xb[row - 2];
3241           sum4 = xb[row - 3];
3242           sum5 = xb[row - 4];
3243           v2   = a->a + diag[row - 1] + 2;
3244           v3   = a->a + diag[row - 2] + 3;
3245           v4   = a->a + diag[row - 3] + 4;
3246           v5   = a->a + diag[row - 4] + 5;
3247           for (n = 0; n < sz - 1; n += 2) {
3248             i1 = idx[0];
3249             i2 = idx[1];
3250             idx += 2;
3251             tmp0 = x[i1];
3252             tmp1 = x[i2];
3253             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3254             v1 += 2;
3255             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3256             v2 += 2;
3257             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3258             v3 += 2;
3259             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3260             v4 += 2;
3261             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3262             v5 += 2;
3263           }
3264 
3265           if (n == sz - 1) {
3266             tmp0 = x[*idx];
3267             sum1 -= *v1 * tmp0;
3268             sum2 -= *v2 * tmp0;
3269             sum3 -= *v3 * tmp0;
3270             sum4 -= *v4 * tmp0;
3271             sum5 -= *v5 * tmp0;
3272           }
3273           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3274           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3275           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3276           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3277           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3278           break;
3279         default:
3280           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3281         }
3282       }
3283 
3284       PetscCall(PetscLogFlops(a->nz));
3285     }
3286     its--;
3287   }
3288   while (its--) {
3289     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3290       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
3291         nodesz = sizes[i + 1] - sizes[i];
3292         sz     = diag[row] - ii[row];
3293         v1     = a->a + ii[row];
3294         idx    = a->j + ii[row];
3295         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3296         switch (nodesz) {
3297         case 1:
3298           sum1 = b[row];
3299           for (n = 0; n < sz - 1; n += 2) {
3300             i1 = idx[0];
3301             i2 = idx[1];
3302             idx += 2;
3303             tmp0 = x[i1];
3304             tmp1 = x[i2];
3305             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3306             v1 += 2;
3307           }
3308           if (n == sz - 1) {
3309             tmp0 = x[*idx++];
3310             sum1 -= *v1 * tmp0;
3311             v1++;
3312           }
3313           t[row] = sum1;
3314           sz     = ii[row + 1] - diag[row] - 1;
3315           idx    = a->j + diag[row] + 1;
3316           v1 += 1;
3317           for (n = 0; n < sz - 1; n += 2) {
3318             i1 = idx[0];
3319             i2 = idx[1];
3320             idx += 2;
3321             tmp0 = x[i1];
3322             tmp1 = x[i2];
3323             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3324             v1 += 2;
3325           }
3326           if (n == sz - 1) {
3327             tmp0 = x[*idx++];
3328             sum1 -= *v1 * tmp0;
3329           }
3330           /* in MatSOR_SeqAIJ this line would be
3331            *
3332            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3333            *
3334            * but omega == 1, so this becomes
3335            *
3336            * x[row] = sum1*(*ibdiag++);
3337            *
3338            */
3339           x[row] = sum1 * (*ibdiag);
3340           break;
3341         case 2:
3342           v2   = a->a + ii[row + 1];
3343           sum1 = b[row];
3344           sum2 = b[row + 1];
3345           for (n = 0; n < sz - 1; n += 2) {
3346             i1 = idx[0];
3347             i2 = idx[1];
3348             idx += 2;
3349             tmp0 = x[i1];
3350             tmp1 = x[i2];
3351             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3352             v1 += 2;
3353             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3354             v2 += 2;
3355           }
3356           if (n == sz - 1) {
3357             tmp0 = x[*idx++];
3358             sum1 -= v1[0] * tmp0;
3359             sum2 -= v2[0] * tmp0;
3360             v1++;
3361             v2++;
3362           }
3363           t[row]     = sum1;
3364           t[row + 1] = sum2;
3365           sz         = ii[row + 1] - diag[row] - 2;
3366           idx        = a->j + diag[row] + 2;
3367           v1 += 2;
3368           v2 += 2;
3369           for (n = 0; n < sz - 1; n += 2) {
3370             i1 = idx[0];
3371             i2 = idx[1];
3372             idx += 2;
3373             tmp0 = x[i1];
3374             tmp1 = x[i2];
3375             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3376             v1 += 2;
3377             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3378             v2 += 2;
3379           }
3380           if (n == sz - 1) {
3381             tmp0 = x[*idx];
3382             sum1 -= v1[0] * tmp0;
3383             sum2 -= v2[0] * tmp0;
3384           }
3385           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3386           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3387           break;
3388         case 3:
3389           v2   = a->a + ii[row + 1];
3390           v3   = a->a + ii[row + 2];
3391           sum1 = b[row];
3392           sum2 = b[row + 1];
3393           sum3 = b[row + 2];
3394           for (n = 0; n < sz - 1; n += 2) {
3395             i1 = idx[0];
3396             i2 = idx[1];
3397             idx += 2;
3398             tmp0 = x[i1];
3399             tmp1 = x[i2];
3400             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3401             v1 += 2;
3402             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3403             v2 += 2;
3404             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3405             v3 += 2;
3406           }
3407           if (n == sz - 1) {
3408             tmp0 = x[*idx++];
3409             sum1 -= v1[0] * tmp0;
3410             sum2 -= v2[0] * tmp0;
3411             sum3 -= v3[0] * tmp0;
3412             v1++;
3413             v2++;
3414             v3++;
3415           }
3416           t[row]     = sum1;
3417           t[row + 1] = sum2;
3418           t[row + 2] = sum3;
3419           sz         = ii[row + 1] - diag[row] - 3;
3420           idx        = a->j + diag[row] + 3;
3421           v1 += 3;
3422           v2 += 3;
3423           v3 += 3;
3424           for (n = 0; n < sz - 1; n += 2) {
3425             i1 = idx[0];
3426             i2 = idx[1];
3427             idx += 2;
3428             tmp0 = x[i1];
3429             tmp1 = x[i2];
3430             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3431             v1 += 2;
3432             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3433             v2 += 2;
3434             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3435             v3 += 2;
3436           }
3437           if (n == sz - 1) {
3438             tmp0 = x[*idx];
3439             sum1 -= v1[0] * tmp0;
3440             sum2 -= v2[0] * tmp0;
3441             sum3 -= v3[0] * tmp0;
3442           }
3443           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3444           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3445           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3446           break;
3447         case 4:
3448           v2   = a->a + ii[row + 1];
3449           v3   = a->a + ii[row + 2];
3450           v4   = a->a + ii[row + 3];
3451           sum1 = b[row];
3452           sum2 = b[row + 1];
3453           sum3 = b[row + 2];
3454           sum4 = b[row + 3];
3455           for (n = 0; n < sz - 1; n += 2) {
3456             i1 = idx[0];
3457             i2 = idx[1];
3458             idx += 2;
3459             tmp0 = x[i1];
3460             tmp1 = x[i2];
3461             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3462             v1 += 2;
3463             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3464             v2 += 2;
3465             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3466             v3 += 2;
3467             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3468             v4 += 2;
3469           }
3470           if (n == sz - 1) {
3471             tmp0 = x[*idx++];
3472             sum1 -= v1[0] * tmp0;
3473             sum2 -= v2[0] * tmp0;
3474             sum3 -= v3[0] * tmp0;
3475             sum4 -= v4[0] * tmp0;
3476             v1++;
3477             v2++;
3478             v3++;
3479             v4++;
3480           }
3481           t[row]     = sum1;
3482           t[row + 1] = sum2;
3483           t[row + 2] = sum3;
3484           t[row + 3] = sum4;
3485           sz         = ii[row + 1] - diag[row] - 4;
3486           idx        = a->j + diag[row] + 4;
3487           v1 += 4;
3488           v2 += 4;
3489           v3 += 4;
3490           v4 += 4;
3491           for (n = 0; n < sz - 1; n += 2) {
3492             i1 = idx[0];
3493             i2 = idx[1];
3494             idx += 2;
3495             tmp0 = x[i1];
3496             tmp1 = x[i2];
3497             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3498             v1 += 2;
3499             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3500             v2 += 2;
3501             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3502             v3 += 2;
3503             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3504             v4 += 2;
3505           }
3506           if (n == sz - 1) {
3507             tmp0 = x[*idx];
3508             sum1 -= v1[0] * tmp0;
3509             sum2 -= v2[0] * tmp0;
3510             sum3 -= v3[0] * tmp0;
3511             sum4 -= v4[0] * tmp0;
3512           }
3513           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3514           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3515           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3516           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3517           break;
3518         case 5:
3519           v2   = a->a + ii[row + 1];
3520           v3   = a->a + ii[row + 2];
3521           v4   = a->a + ii[row + 3];
3522           v5   = a->a + ii[row + 4];
3523           sum1 = b[row];
3524           sum2 = b[row + 1];
3525           sum3 = b[row + 2];
3526           sum4 = b[row + 3];
3527           sum5 = b[row + 4];
3528           for (n = 0; n < sz - 1; n += 2) {
3529             i1 = idx[0];
3530             i2 = idx[1];
3531             idx += 2;
3532             tmp0 = x[i1];
3533             tmp1 = x[i2];
3534             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3535             v1 += 2;
3536             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3537             v2 += 2;
3538             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3539             v3 += 2;
3540             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3541             v4 += 2;
3542             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3543             v5 += 2;
3544           }
3545           if (n == sz - 1) {
3546             tmp0 = x[*idx++];
3547             sum1 -= v1[0] * tmp0;
3548             sum2 -= v2[0] * tmp0;
3549             sum3 -= v3[0] * tmp0;
3550             sum4 -= v4[0] * tmp0;
3551             sum5 -= v5[0] * tmp0;
3552             v1++;
3553             v2++;
3554             v3++;
3555             v4++;
3556             v5++;
3557           }
3558           t[row]     = sum1;
3559           t[row + 1] = sum2;
3560           t[row + 2] = sum3;
3561           t[row + 3] = sum4;
3562           t[row + 4] = sum5;
3563           sz         = ii[row + 1] - diag[row] - 5;
3564           idx        = a->j + diag[row] + 5;
3565           v1 += 5;
3566           v2 += 5;
3567           v3 += 5;
3568           v4 += 5;
3569           v5 += 5;
3570           for (n = 0; n < sz - 1; n += 2) {
3571             i1 = idx[0];
3572             i2 = idx[1];
3573             idx += 2;
3574             tmp0 = x[i1];
3575             tmp1 = x[i2];
3576             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3577             v1 += 2;
3578             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3579             v2 += 2;
3580             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3581             v3 += 2;
3582             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3583             v4 += 2;
3584             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3585             v5 += 2;
3586           }
3587           if (n == sz - 1) {
3588             tmp0 = x[*idx];
3589             sum1 -= v1[0] * tmp0;
3590             sum2 -= v2[0] * tmp0;
3591             sum3 -= v3[0] * tmp0;
3592             sum4 -= v4[0] * tmp0;
3593             sum5 -= v5[0] * tmp0;
3594           }
3595           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3596           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3597           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3598           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3599           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3600           break;
3601         default:
3602           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3603         }
3604       }
3605       xb = t;
3606       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3607     } else xb = b;
3608 
3609     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3610       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3611       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3612         nodesz = sizes[i + 1] - sizes[i];
3613         ibdiag -= nodesz * nodesz;
3614 
3615         /* set RHS */
3616         if (xb == b) {
3617           /* whole (old way) */
3618           sz  = ii[row + 1] - ii[row];
3619           idx = a->j + ii[row];
3620           switch (nodesz) {
3621           case 5:
3622             v5 = a->a + ii[row - 4]; /* fall through */
3623           case 4:
3624             v4 = a->a + ii[row - 3]; /* fall through */
3625           case 3:
3626             v3 = a->a + ii[row - 2]; /* fall through */
3627           case 2:
3628             v2 = a->a + ii[row - 1]; /* fall through */
3629           case 1:
3630             v1 = a->a + ii[row];
3631             break;
3632           default:
3633             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3634           }
3635         } else {
3636           /* upper, no diag */
3637           sz  = ii[row + 1] - diag[row] - 1;
3638           idx = a->j + diag[row] + 1;
3639           switch (nodesz) {
3640           case 5:
3641             v5 = a->a + diag[row - 4] + 5; /* fall through */
3642           case 4:
3643             v4 = a->a + diag[row - 3] + 4; /* fall through */
3644           case 3:
3645             v3 = a->a + diag[row - 2] + 3; /* fall through */
3646           case 2:
3647             v2 = a->a + diag[row - 1] + 2; /* fall through */
3648           case 1:
3649             v1 = a->a + diag[row] + 1;
3650           }
3651         }
3652         /* set sum */
3653         switch (nodesz) {
3654         case 5:
3655           sum5 = xb[row - 4]; /* fall through */
3656         case 4:
3657           sum4 = xb[row - 3]; /* fall through */
3658         case 3:
3659           sum3 = xb[row - 2]; /* fall through */
3660         case 2:
3661           sum2 = xb[row - 1]; /* fall through */
3662         case 1:
3663           /* note that sum1 is associated with the last row */
3664           sum1 = xb[row];
3665         }
3666         /* do sums */
3667         for (n = 0; n < sz - 1; n += 2) {
3668           i1 = idx[0];
3669           i2 = idx[1];
3670           idx += 2;
3671           tmp0 = x[i1];
3672           tmp1 = x[i2];
3673           switch (nodesz) {
3674           case 5:
3675             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3676             v5 += 2; /* fall through */
3677           case 4:
3678             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3679             v4 += 2; /* fall through */
3680           case 3:
3681             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3682             v3 += 2; /* fall through */
3683           case 2:
3684             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3685             v2 += 2; /* fall through */
3686           case 1:
3687             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3688             v1 += 2;
3689           }
3690         }
3691         /* ragged edge */
3692         if (n == sz - 1) {
3693           tmp0 = x[*idx];
3694           switch (nodesz) {
3695           case 5:
3696             sum5 -= *v5 * tmp0; /* fall through */
3697           case 4:
3698             sum4 -= *v4 * tmp0; /* fall through */
3699           case 3:
3700             sum3 -= *v3 * tmp0; /* fall through */
3701           case 2:
3702             sum2 -= *v2 * tmp0; /* fall through */
3703           case 1:
3704             sum1 -= *v1 * tmp0;
3705           }
3706         }
3707         /* update */
3708         if (xb == b) {
3709           /* whole (old way) w/ diag */
3710           switch (nodesz) {
3711           case 5:
3712             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3713             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3714             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3715             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3716             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3717             break;
3718           case 4:
3719             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3720             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3721             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3722             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3723             break;
3724           case 3:
3725             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3726             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3727             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3728             break;
3729           case 2:
3730             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3731             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3732             break;
3733           case 1:
3734             x[row--] += sum1 * (*ibdiag);
3735             break;
3736           }
3737         } else {
3738           /* no diag so set =  */
3739           switch (nodesz) {
3740           case 5:
3741             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3742             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3743             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3744             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3745             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3746             break;
3747           case 4:
3748             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3749             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3750             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3751             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3752             break;
3753           case 3:
3754             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3755             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3756             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3757             break;
3758           case 2:
3759             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3760             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3761             break;
3762           case 1:
3763             x[row--] = sum1 * (*ibdiag);
3764             break;
3765           }
3766         }
3767       }
3768       if (xb == b) {
3769         PetscCall(PetscLogFlops(2.0 * a->nz));
3770       } else {
3771         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3772       }
3773     }
3774   }
3775   if (flag & SOR_EISENSTAT) {
3776     /*
3777           Apply  (U + D)^-1  where D is now the block diagonal
3778     */
3779     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3780     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3781       nodesz = sizes[i + 1] - sizes[i];
3782       ibdiag -= nodesz * nodesz;
3783       sz  = ii[row + 1] - diag[row] - 1;
3784       v1  = a->a + diag[row] + 1;
3785       idx = a->j + diag[row] + 1;
3786       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3787       switch (nodesz) {
3788       case 1:
3789 
3790         sum1 = b[row];
3791         for (n = 0; n < sz - 1; n += 2) {
3792           i1 = idx[0];
3793           i2 = idx[1];
3794           idx += 2;
3795           tmp0 = x[i1];
3796           tmp1 = x[i2];
3797           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3798           v1 += 2;
3799         }
3800 
3801         if (n == sz - 1) {
3802           tmp0 = x[*idx];
3803           sum1 -= *v1 * tmp0;
3804         }
3805         x[row] = sum1 * (*ibdiag);
3806         row--;
3807         break;
3808 
3809       case 2:
3810 
3811         sum1 = b[row];
3812         sum2 = b[row - 1];
3813         /* note that sum1 is associated with the second of the two rows */
3814         v2 = a->a + diag[row - 1] + 2;
3815         for (n = 0; n < sz - 1; n += 2) {
3816           i1 = idx[0];
3817           i2 = idx[1];
3818           idx += 2;
3819           tmp0 = x[i1];
3820           tmp1 = x[i2];
3821           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3822           v1 += 2;
3823           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3824           v2 += 2;
3825         }
3826 
3827         if (n == sz - 1) {
3828           tmp0 = x[*idx];
3829           sum1 -= *v1 * tmp0;
3830           sum2 -= *v2 * tmp0;
3831         }
3832         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3833         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3834         row -= 2;
3835         break;
3836       case 3:
3837 
3838         sum1 = b[row];
3839         sum2 = b[row - 1];
3840         sum3 = b[row - 2];
3841         v2   = a->a + diag[row - 1] + 2;
3842         v3   = a->a + diag[row - 2] + 3;
3843         for (n = 0; n < sz - 1; n += 2) {
3844           i1 = idx[0];
3845           i2 = idx[1];
3846           idx += 2;
3847           tmp0 = x[i1];
3848           tmp1 = x[i2];
3849           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3850           v1 += 2;
3851           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3852           v2 += 2;
3853           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3854           v3 += 2;
3855         }
3856 
3857         if (n == sz - 1) {
3858           tmp0 = x[*idx];
3859           sum1 -= *v1 * tmp0;
3860           sum2 -= *v2 * tmp0;
3861           sum3 -= *v3 * tmp0;
3862         }
3863         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3864         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3865         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3866         row -= 3;
3867         break;
3868       case 4:
3869 
3870         sum1 = b[row];
3871         sum2 = b[row - 1];
3872         sum3 = b[row - 2];
3873         sum4 = b[row - 3];
3874         v2   = a->a + diag[row - 1] + 2;
3875         v3   = a->a + diag[row - 2] + 3;
3876         v4   = a->a + diag[row - 3] + 4;
3877         for (n = 0; n < sz - 1; n += 2) {
3878           i1 = idx[0];
3879           i2 = idx[1];
3880           idx += 2;
3881           tmp0 = x[i1];
3882           tmp1 = x[i2];
3883           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3884           v1 += 2;
3885           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3886           v2 += 2;
3887           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3888           v3 += 2;
3889           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3890           v4 += 2;
3891         }
3892 
3893         if (n == sz - 1) {
3894           tmp0 = x[*idx];
3895           sum1 -= *v1 * tmp0;
3896           sum2 -= *v2 * tmp0;
3897           sum3 -= *v3 * tmp0;
3898           sum4 -= *v4 * tmp0;
3899         }
3900         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3901         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3902         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3903         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3904         row -= 4;
3905         break;
3906       case 5:
3907 
3908         sum1 = b[row];
3909         sum2 = b[row - 1];
3910         sum3 = b[row - 2];
3911         sum4 = b[row - 3];
3912         sum5 = b[row - 4];
3913         v2   = a->a + diag[row - 1] + 2;
3914         v3   = a->a + diag[row - 2] + 3;
3915         v4   = a->a + diag[row - 3] + 4;
3916         v5   = a->a + diag[row - 4] + 5;
3917         for (n = 0; n < sz - 1; n += 2) {
3918           i1 = idx[0];
3919           i2 = idx[1];
3920           idx += 2;
3921           tmp0 = x[i1];
3922           tmp1 = x[i2];
3923           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3924           v1 += 2;
3925           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3926           v2 += 2;
3927           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3928           v3 += 2;
3929           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3930           v4 += 2;
3931           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3932           v5 += 2;
3933         }
3934 
3935         if (n == sz - 1) {
3936           tmp0 = x[*idx];
3937           sum1 -= *v1 * tmp0;
3938           sum2 -= *v2 * tmp0;
3939           sum3 -= *v3 * tmp0;
3940           sum4 -= *v4 * tmp0;
3941           sum5 -= *v5 * tmp0;
3942         }
3943         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3944         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3945         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3946         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3947         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3948         row -= 5;
3949         break;
3950       default:
3951         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3952       }
3953     }
3954     PetscCall(PetscLogFlops(a->nz));
3955 
3956     /*
3957            t = b - D x    where D is the block diagonal
3958     */
3959     cnt = 0;
3960     for (i = 0, row = 0; i < m; i++) {
3961       nodesz = sizes[i + 1] - sizes[i];
3962       switch (nodesz) {
3963       case 1:
3964         t[row] = b[row] - bdiag[cnt++] * x[row];
3965         row++;
3966         break;
3967       case 2:
3968         x1         = x[row];
3969         x2         = x[row + 1];
3970         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3971         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3972         t[row]     = b[row] - tmp1;
3973         t[row + 1] = b[row + 1] - tmp2;
3974         row += 2;
3975         cnt += 4;
3976         break;
3977       case 3:
3978         x1         = x[row];
3979         x2         = x[row + 1];
3980         x3         = x[row + 2];
3981         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3982         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3983         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3984         t[row]     = b[row] - tmp1;
3985         t[row + 1] = b[row + 1] - tmp2;
3986         t[row + 2] = b[row + 2] - tmp3;
3987         row += 3;
3988         cnt += 9;
3989         break;
3990       case 4:
3991         x1         = x[row];
3992         x2         = x[row + 1];
3993         x3         = x[row + 2];
3994         x4         = x[row + 3];
3995         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3996         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3997         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3998         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3999         t[row]     = b[row] - tmp1;
4000         t[row + 1] = b[row + 1] - tmp2;
4001         t[row + 2] = b[row + 2] - tmp3;
4002         t[row + 3] = b[row + 3] - tmp4;
4003         row += 4;
4004         cnt += 16;
4005         break;
4006       case 5:
4007         x1         = x[row];
4008         x2         = x[row + 1];
4009         x3         = x[row + 2];
4010         x4         = x[row + 3];
4011         x5         = x[row + 4];
4012         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4013         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4014         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4015         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4016         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4017         t[row]     = b[row] - tmp1;
4018         t[row + 1] = b[row + 1] - tmp2;
4019         t[row + 2] = b[row + 2] - tmp3;
4020         t[row + 3] = b[row + 3] - tmp4;
4021         t[row + 4] = b[row + 4] - tmp5;
4022         row += 5;
4023         cnt += 25;
4024         break;
4025       default:
4026         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4027       }
4028     }
4029     PetscCall(PetscLogFlops(m));
4030 
4031     /*
4032           Apply (L + D)^-1 where D is the block diagonal
4033     */
4034     for (i = 0, row = 0; i < m; i++) {
4035       nodesz = sizes[i + 1] - sizes[i];
4036       sz     = diag[row] - ii[row];
4037       v1     = a->a + ii[row];
4038       idx    = a->j + ii[row];
4039       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4040       switch (nodesz) {
4041       case 1:
4042 
4043         sum1 = t[row];
4044         for (n = 0; n < sz - 1; n += 2) {
4045           i1 = idx[0];
4046           i2 = idx[1];
4047           idx += 2;
4048           tmp0 = t[i1];
4049           tmp1 = t[i2];
4050           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4051           v1 += 2;
4052         }
4053 
4054         if (n == sz - 1) {
4055           tmp0 = t[*idx];
4056           sum1 -= *v1 * tmp0;
4057         }
4058         x[row] += t[row] = sum1 * (*ibdiag++);
4059         row++;
4060         break;
4061       case 2:
4062         v2   = a->a + ii[row + 1];
4063         sum1 = t[row];
4064         sum2 = t[row + 1];
4065         for (n = 0; n < sz - 1; n += 2) {
4066           i1 = idx[0];
4067           i2 = idx[1];
4068           idx += 2;
4069           tmp0 = t[i1];
4070           tmp1 = t[i2];
4071           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4072           v1 += 2;
4073           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4074           v2 += 2;
4075         }
4076 
4077         if (n == sz - 1) {
4078           tmp0 = t[*idx];
4079           sum1 -= v1[0] * tmp0;
4080           sum2 -= v2[0] * tmp0;
4081         }
4082         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4083         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4084         ibdiag += 4;
4085         row += 2;
4086         break;
4087       case 3:
4088         v2   = a->a + ii[row + 1];
4089         v3   = a->a + ii[row + 2];
4090         sum1 = t[row];
4091         sum2 = t[row + 1];
4092         sum3 = t[row + 2];
4093         for (n = 0; n < sz - 1; n += 2) {
4094           i1 = idx[0];
4095           i2 = idx[1];
4096           idx += 2;
4097           tmp0 = t[i1];
4098           tmp1 = t[i2];
4099           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4100           v1 += 2;
4101           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4102           v2 += 2;
4103           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4104           v3 += 2;
4105         }
4106 
4107         if (n == sz - 1) {
4108           tmp0 = t[*idx];
4109           sum1 -= v1[0] * tmp0;
4110           sum2 -= v2[0] * tmp0;
4111           sum3 -= v3[0] * tmp0;
4112         }
4113         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4114         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4115         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4116         ibdiag += 9;
4117         row += 3;
4118         break;
4119       case 4:
4120         v2   = a->a + ii[row + 1];
4121         v3   = a->a + ii[row + 2];
4122         v4   = a->a + ii[row + 3];
4123         sum1 = t[row];
4124         sum2 = t[row + 1];
4125         sum3 = t[row + 2];
4126         sum4 = t[row + 3];
4127         for (n = 0; n < sz - 1; n += 2) {
4128           i1 = idx[0];
4129           i2 = idx[1];
4130           idx += 2;
4131           tmp0 = t[i1];
4132           tmp1 = t[i2];
4133           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4134           v1 += 2;
4135           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4136           v2 += 2;
4137           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4138           v3 += 2;
4139           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4140           v4 += 2;
4141         }
4142 
4143         if (n == sz - 1) {
4144           tmp0 = t[*idx];
4145           sum1 -= v1[0] * tmp0;
4146           sum2 -= v2[0] * tmp0;
4147           sum3 -= v3[0] * tmp0;
4148           sum4 -= v4[0] * tmp0;
4149         }
4150         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4151         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4152         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4153         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4154         ibdiag += 16;
4155         row += 4;
4156         break;
4157       case 5:
4158         v2   = a->a + ii[row + 1];
4159         v3   = a->a + ii[row + 2];
4160         v4   = a->a + ii[row + 3];
4161         v5   = a->a + ii[row + 4];
4162         sum1 = t[row];
4163         sum2 = t[row + 1];
4164         sum3 = t[row + 2];
4165         sum4 = t[row + 3];
4166         sum5 = t[row + 4];
4167         for (n = 0; n < sz - 1; n += 2) {
4168           i1 = idx[0];
4169           i2 = idx[1];
4170           idx += 2;
4171           tmp0 = t[i1];
4172           tmp1 = t[i2];
4173           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4174           v1 += 2;
4175           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4176           v2 += 2;
4177           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4178           v3 += 2;
4179           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4180           v4 += 2;
4181           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4182           v5 += 2;
4183         }
4184 
4185         if (n == sz - 1) {
4186           tmp0 = t[*idx];
4187           sum1 -= v1[0] * tmp0;
4188           sum2 -= v2[0] * tmp0;
4189           sum3 -= v3[0] * tmp0;
4190           sum4 -= v4[0] * tmp0;
4191           sum5 -= v5[0] * tmp0;
4192         }
4193         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4194         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4195         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4196         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4197         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4198         ibdiag += 25;
4199         row += 5;
4200         break;
4201       default:
4202         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4203       }
4204     }
4205     PetscCall(PetscLogFlops(a->nz));
4206   }
4207   PetscCall(VecRestoreArray(xx, &x));
4208   PetscCall(VecRestoreArrayRead(bb, &b));
4209   PetscFunctionReturn(PETSC_SUCCESS);
4210 }
4211 
4212 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4213 {
4214   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4215   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4216   const MatScalar   *bdiag = a->inode.bdiag;
4217   const PetscScalar *b;
4218   PetscInt           m = a->inode.node_count, cnt = 0, i, row, nodesz;
4219   const PetscInt    *sizes = a->inode.size_csr;
4220 
4221   PetscFunctionBegin;
4222   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4223   PetscCall(VecGetArray(xx, &x));
4224   PetscCall(VecGetArrayRead(bb, &b));
4225   cnt = 0;
4226   for (i = 0, row = 0; i < m; i++) {
4227     nodesz = sizes[i + 1] - sizes[i];
4228     switch (nodesz) {
4229     case 1:
4230       x[row] = b[row] * bdiag[cnt++];
4231       row++;
4232       break;
4233     case 2:
4234       x1       = b[row];
4235       x2       = b[row + 1];
4236       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4237       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4238       x[row++] = tmp1;
4239       x[row++] = tmp2;
4240       cnt += 4;
4241       break;
4242     case 3:
4243       x1       = b[row];
4244       x2       = b[row + 1];
4245       x3       = b[row + 2];
4246       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4247       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4248       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4249       x[row++] = tmp1;
4250       x[row++] = tmp2;
4251       x[row++] = tmp3;
4252       cnt += 9;
4253       break;
4254     case 4:
4255       x1       = b[row];
4256       x2       = b[row + 1];
4257       x3       = b[row + 2];
4258       x4       = b[row + 3];
4259       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4260       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4261       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4262       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4263       x[row++] = tmp1;
4264       x[row++] = tmp2;
4265       x[row++] = tmp3;
4266       x[row++] = tmp4;
4267       cnt += 16;
4268       break;
4269     case 5:
4270       x1       = b[row];
4271       x2       = b[row + 1];
4272       x3       = b[row + 2];
4273       x4       = b[row + 3];
4274       x5       = b[row + 4];
4275       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4276       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4277       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4278       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4279       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4280       x[row++] = tmp1;
4281       x[row++] = tmp2;
4282       x[row++] = tmp3;
4283       x[row++] = tmp4;
4284       x[row++] = tmp5;
4285       cnt += 25;
4286       break;
4287     default:
4288       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4289     }
4290   }
4291   PetscCall(PetscLogFlops(2.0 * cnt));
4292   PetscCall(VecRestoreArray(xx, &x));
4293   PetscCall(VecRestoreArrayRead(bb, &b));
4294   PetscFunctionReturn(PETSC_SUCCESS);
4295 }
4296 
4297 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4298 {
4299   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4300 
4301   PetscFunctionBegin;
4302   a->inode.node_count       = 0;
4303   a->inode.use              = PETSC_FALSE;
4304   a->inode.checked          = PETSC_FALSE;
4305   a->inode.mat_nonzerostate = -1;
4306   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4307   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4308   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4309   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4310   A->ops->coloringpatch     = NULL;
4311   A->ops->multdiagonalblock = NULL;
4312   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4313   PetscFunctionReturn(PETSC_SUCCESS);
4314 }
4315 
4316 /*
4317     samestructure indicates that the matrix has not changed its nonzero structure so we
4318     do not need to recompute the inodes
4319 */
4320 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4321 {
4322   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4323   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4324   PetscBool       flag;
4325   const PetscInt *idx, *idy, *ii;
4326 
4327   PetscFunctionBegin;
4328   if (!a->inode.use) {
4329     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4330     PetscCall(PetscFree(a->inode.size_csr));
4331     PetscFunctionReturn(PETSC_SUCCESS);
4332   }
4333   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4334 
4335   m = A->rmap->n;
4336   if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
4337   ns    = a->inode.size_csr;
4338   ns[0] = 0;
4339 
4340   i          = 0;
4341   node_count = 0;
4342   idx        = a->j;
4343   ii         = a->i;
4344   if (idx) {
4345     while (i < m) {            /* For each row */
4346       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4347       /* Limits the number of elements in a node to 'a->inode.limit' */
4348       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4349         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4350         if (nzy != nzx) break;
4351         idy += nzx; /* Same nonzero pattern */
4352         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4353         if (!flag) break;
4354       }
4355       ns[node_count + 1] = ns[node_count] + blk_size;
4356       node_count++;
4357       idx += blk_size * nzx;
4358       i = j;
4359     }
4360   }
4361   /* If not enough inodes found,, do not use inode version of the routines */
4362   if (!m || !idx || node_count > .8 * m) {
4363     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4364     PetscCall(PetscFree(a->inode.size_csr));
4365     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4366   } else {
4367     if (!A->factortype) {
4368       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4369       if (A->rmap->n == A->cmap->n) {
4370         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4371         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4372         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4373         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4374         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4375       }
4376     } else {
4377       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4378     }
4379     a->inode.node_count = node_count;
4380     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4381   }
4382   a->inode.checked          = PETSC_TRUE;
4383   a->inode.mat_nonzerostate = A->nonzerostate;
4384   PetscFunctionReturn(PETSC_SUCCESS);
4385 }
4386 
4387 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4388 {
4389   Mat         B = *C;
4390   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4391   PetscInt    m = A->rmap->n;
4392 
4393   PetscFunctionBegin;
4394   c->inode.use              = a->inode.use;
4395   c->inode.limit            = a->inode.limit;
4396   c->inode.max_limit        = a->inode.max_limit;
4397   c->inode.checked          = PETSC_FALSE;
4398   c->inode.size_csr         = NULL;
4399   c->inode.node_count       = 0;
4400   c->inode.ibdiagvalid      = PETSC_FALSE;
4401   c->inode.ibdiag           = NULL;
4402   c->inode.bdiag            = NULL;
4403   c->inode.mat_nonzerostate = -1;
4404   if (a->inode.use) {
4405     if (a->inode.checked && a->inode.size_csr) {
4406       PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4407       PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4408 
4409       c->inode.checked          = PETSC_TRUE;
4410       c->inode.node_count       = a->inode.node_count;
4411       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4412     }
4413     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4414     if (!B->factortype) {
4415       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4416       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4417       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4418       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4419       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4420       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4421     } else {
4422       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4423     }
4424   }
4425   PetscFunctionReturn(PETSC_SUCCESS);
4426 }
4427 
4428 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4429 {
4430   PetscInt        k;
4431   const PetscInt *vi;
4432 
4433   PetscFunctionBegin;
4434   vi = aj + ai[row];
4435   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4436   vi        = aj + adiag[row];
4437   cols[nzl] = vi[0];
4438   vi        = aj + adiag[row + 1] + 1;
4439   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4440   PetscFunctionReturn(PETSC_SUCCESS);
4441 }
4442 /*
4443    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4444    Modified from MatSeqAIJCheckInode().
4445 
4446    Input Parameters:
4447 .  Mat A - ILU or LU matrix factor
4448 
4449 */
4450 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4451 {
4452   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4453   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4454   PetscInt       *cols1, *cols2, *ns;
4455   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4456   PetscBool       flag;
4457 
4458   PetscFunctionBegin;
4459   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4460   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4461 
4462   m = A->rmap->n;
4463   if (a->inode.size_csr) ns = a->inode.size_csr;
4464   else PetscCall(PetscMalloc1(m + 1, &ns));
4465   ns[0] = 0;
4466 
4467   i          = 0;
4468   node_count = 0;
4469   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4470   while (i < m) {                       /* For each row */
4471     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4472     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4473     nzx  = nzl1 + nzu1 + 1;
4474     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4475 
4476     /* Limits the number of elements in a node to 'a->inode.limit' */
4477     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4478       nzl2 = ai[j + 1] - ai[j];
4479       nzu2 = adiag[j] - adiag[j + 1] - 1;
4480       nzy  = nzl2 + nzu2 + 1;
4481       if (nzy != nzx) break;
4482       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4483       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4484       if (!flag) break;
4485     }
4486     ns[node_count + 1] = ns[node_count] + blk_size;
4487     node_count++;
4488     i = j;
4489   }
4490   PetscCall(PetscFree2(cols1, cols2));
4491   /* If not enough inodes found,, do not use inode version of the routines */
4492   if (!m || node_count > .8 * m) {
4493     PetscCall(PetscFree(ns));
4494 
4495     a->inode.node_count = 0;
4496     a->inode.size_csr   = NULL;
4497     a->inode.use        = PETSC_FALSE;
4498 
4499     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4500   } else {
4501     A->ops->mult              = NULL;
4502     A->ops->sor               = NULL;
4503     A->ops->multadd           = NULL;
4504     A->ops->getrowij          = NULL;
4505     A->ops->restorerowij      = NULL;
4506     A->ops->getcolumnij       = NULL;
4507     A->ops->restorecolumnij   = NULL;
4508     A->ops->coloringpatch     = NULL;
4509     A->ops->multdiagonalblock = NULL;
4510     a->inode.node_count       = node_count;
4511     a->inode.size_csr         = ns;
4512     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4513   }
4514   a->inode.checked = PETSC_TRUE;
4515   PetscFunctionReturn(PETSC_SUCCESS);
4516 }
4517 
4518 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4519 {
4520   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4521 
4522   PetscFunctionBegin;
4523   a->inode.ibdiagvalid = PETSC_FALSE;
4524   PetscFunctionReturn(PETSC_SUCCESS);
4525 }
4526 
4527 /*
4528      This is really ugly. if inodes are used this replaces the
4529   permutations with ones that correspond to rows/cols of the matrix
4530   rather than inode blocks
4531 */
4532 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4533 {
4534   PetscFunctionBegin;
4535   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4536   PetscFunctionReturn(PETSC_SUCCESS);
4537 }
4538 
4539 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4540 {
4541   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4542   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4543   const PetscInt *ridx, *cidx;
4544   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
4545   PetscInt        nslim_col, *ns_col;
4546   IS              ris = *rperm, cis = *cperm;
4547 
4548   PetscFunctionBegin;
4549   if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS);       /* no inodes so return */
4550   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4551 
4552   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4553   PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4554   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4555 
4556   PetscCall(ISGetIndices(ris, &ridx));
4557   PetscCall(ISGetIndices(cis, &cidx));
4558 
4559   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4560   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
4561 
4562   /* Construct the permutations for rows*/
4563   for (i = 0, row = 0; i < nslim_row; ++i) {
4564     indx      = ridx[i];
4565     start_val = tns[indx];
4566     end_val   = tns[indx + 1];
4567     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4568   }
4569 
4570   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4571   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
4572 
4573   /* Construct permutations for columns */
4574   for (i = 0, col = 0; i < nslim_col; ++i) {
4575     indx      = cidx[i];
4576     start_val = tns[indx];
4577     end_val   = tns[indx + 1];
4578     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4579   }
4580 
4581   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4582   PetscCall(ISSetPermutation(*rperm));
4583   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4584   PetscCall(ISSetPermutation(*cperm));
4585 
4586   PetscCall(ISRestoreIndices(ris, &ridx));
4587   PetscCall(ISRestoreIndices(cis, &cidx));
4588 
4589   PetscCall(PetscFree(ns_col));
4590   PetscCall(PetscFree2(permr, permc));
4591   PetscCall(ISDestroy(&cis));
4592   PetscCall(ISDestroy(&ris));
4593   PetscCall(PetscFree(tns));
4594   PetscFunctionReturn(PETSC_SUCCESS);
4595 }
4596 
4597 /*@C
4598   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4599 
4600   Not Collective
4601 
4602   Input Parameter:
4603 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4604 
4605   Output Parameters:
4606 + node_count - no of inodes present in the matrix.
4607 . sizes      - an array of size `node_count`, with the sizes of each inode.
4608 - limit      - the max size used to generate the inodes.
4609 
4610   Level: advanced
4611 
4612   Note:
4613   It should be called after the matrix is assembled.
4614   The contents of the sizes[] array should not be changed.
4615   `NULL` may be passed for information not needed
4616 
4617 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4618 @*/
4619 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4620 {
4621   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4622 
4623   PetscFunctionBegin;
4624   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4625   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4626   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4627   PetscFunctionReturn(PETSC_SUCCESS);
4628 }
4629 
4630 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4631 {
4632   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4633 
4634   PetscFunctionBegin;
4635   if (node_count) *node_count = a->inode.node_count;
4636   if (sizes) *sizes = a->inode.size_csr;
4637   if (limit) *limit = a->inode.limit;
4638   PetscFunctionReturn(PETSC_SUCCESS);
4639 }
4640