xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 07425a8d4172ec73b7b53c5ce4d6ba1b92fe45cf)
1 /*
2   This file provides high performance routines for the Inode format (compressed sparse row)
3   by taking advantage of rows with identical nonzero structure (I-nodes).
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #if defined(PETSC_HAVE_XMMINTRIN_H)
7   #include <xmmintrin.h>
8 #endif
9 
10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11 {
12   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
14 
15   PetscFunctionBegin;
16   n = A->cmap->n;
17   m = A->rmap->n;
18   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19   ns_row = a->inode.size_csr;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24     for (; count + 1 < n; count++, i++);
25     if (count < n) i++;
26     *size = i;
27     PetscFunctionReturn(PETSC_SUCCESS);
28   }
29   PetscCall(PetscMalloc1(n + 1, &ns_col));
30   ns_col[0] = 0;
31 
32   /* Use the same row structure wherever feasible. */
33   for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
34 
35   /* if m < n; pad up the remainder with inode_limit */
36   for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37   /* The last node is the odd ball. pad it up with the remaining rows; */
38   if (count < n) {
39     ns_col[i + 1] = ns_col[i] + (n - count);
40     i++;
41   } else if (count > n) {
42     /* Adjust for the over estimation */
43     ns_col[i] += n - count;
44   }
45   *size = i;
46   *ns   = ns_col;
47   PetscFunctionReturn(PETSC_SUCCESS);
48 }
49 
50 /*
51       This builds symmetric version of nonzero structure,
52 */
53 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54 {
55   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
56   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
57   PetscInt       *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
58   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
59 
60   PetscFunctionBegin;
61   nslim_row = a->inode.node_count;
62   m         = A->rmap->n;
63   n         = A->cmap->n;
64   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
65   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
66 
67   /* Use the row_inode as column_inode */
68   nslim_col = nslim_row;
69   ns_col    = ns_row;
70 
71   /* allocate space for reformatted inode structure */
72   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
73   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
74 
75   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
76     nsz = ns_col[i1 + 1] - ns_col[i1];
77     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
78   }
79   /* allocate space for row pointers */
80   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
81   *iia = ia;
82   PetscCall(PetscMalloc1(nslim_row + 1, &work));
83 
84   /* determine the number of columns in each row */
85   ia[0] = oshift;
86   for (i1 = 0; i1 < nslim_row; i1++) {
87     row  = ns_row[i1];
88     j    = aj + ai[row] + ishift;
89     jmax = aj + ai[row + 1] + ishift;
90     if (j == jmax) continue; /* empty row */
91     col = *j++ + ishift;
92     i2  = tvc[col];
93     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94       ia[i1 + 1]++;
95       ia[i2 + 1]++;
96       i2++; /* Start col of next node */
97       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98       i2 = tvc[col];
99     }
100     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101   }
102 
103   /* shift ia[i] to point to next row */
104   for (i1 = 1; i1 < nslim_row + 1; i1++) {
105     row = ia[i1 - 1];
106     ia[i1] += row;
107     work[i1 - 1] = row - oshift;
108   }
109 
110   /* allocate space for column pointers */
111   nz = ia[nslim_row] + (!ishift);
112   PetscCall(PetscMalloc1(nz, &ja));
113   *jja = ja;
114 
115   /* loop over lower triangular part putting into ja */
116   for (i1 = 0; i1 < nslim_row; i1++) {
117     row  = ns_row[i1];
118     j    = aj + ai[row] + ishift;
119     jmax = aj + ai[row + 1] + ishift;
120     if (j == jmax) continue; /* empty row */
121     col = *j++ + ishift;
122     i2  = tvc[col];
123     while (i2 < i1 && j < jmax) {
124       ja[work[i2]++] = i1 + oshift;
125       ja[work[i1]++] = i2 + oshift;
126       ++i2;
127       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128       i2 = tvc[col];
129     }
130     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131   }
132   PetscCall(PetscFree(work));
133   PetscCall(PetscFree2(tns, tvc));
134   PetscFunctionReturn(PETSC_SUCCESS);
135 }
136 
137 /*
138       This builds nonsymmetric version of nonzero structure,
139 */
140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141 {
142   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
143   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144   PetscInt       *tns, *tvc, nsz, i1, i2;
145   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
146 
147   PetscFunctionBegin;
148   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149   nslim_row = a->inode.node_count;
150   n         = A->cmap->n;
151 
152   /* Create The column_inode for this matrix */
153   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
154 
155   /* allocate space for reformatted column_inode structure */
156   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
158 
159   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160     nsz = ns_col[i1 + 1] - ns_col[i1];
161     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162   }
163   /* allocate space for row pointers */
164   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165   *iia = ia;
166   PetscCall(PetscMalloc1(nslim_row + 1, &work));
167 
168   /* determine the number of columns in each row */
169   ia[0] = oshift;
170   for (i1 = 0; i1 < nslim_row; i1++) {
171     row = ns_row[i1];
172     j   = aj + ai[row] + ishift;
173     nz  = ai[row + 1] - ai[row];
174     if (!nz) continue; /* empty row */
175     col = *j++ + ishift;
176     i2  = tvc[col];
177     while (nz-- > 0) { /* off-diagonal elements */
178       ia[i1 + 1]++;
179       i2++; /* Start col of next node */
180       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
181       if (nz > 0) i2 = tvc[col];
182     }
183   }
184 
185   /* shift ia[i] to point to next row */
186   for (i1 = 1; i1 < nslim_row + 1; i1++) {
187     row = ia[i1 - 1];
188     ia[i1] += row;
189     work[i1 - 1] = row - oshift;
190   }
191 
192   /* allocate space for column pointers */
193   nz = ia[nslim_row] + (!ishift);
194   PetscCall(PetscMalloc1(nz, &ja));
195   *jja = ja;
196 
197   /* loop over matrix putting into ja */
198   for (i1 = 0; i1 < nslim_row; i1++) {
199     row = ns_row[i1];
200     j   = aj + ai[row] + ishift;
201     nz  = ai[row + 1] - ai[row];
202     if (!nz) continue; /* empty row */
203     col = *j++ + ishift;
204     i2  = tvc[col];
205     while (nz-- > 0) {
206       ja[work[i1]++] = i2 + oshift;
207       ++i2;
208       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
209       if (nz > 0) i2 = tvc[col];
210     }
211   }
212   PetscCall(PetscFree(ns_col));
213   PetscCall(PetscFree(work));
214   PetscCall(PetscFree2(tns, tvc));
215   PetscFunctionReturn(PETSC_SUCCESS);
216 }
217 
218 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219 {
220   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
221 
222   PetscFunctionBegin;
223   if (n) *n = a->inode.node_count;
224   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
225   if (!blockcompressed) {
226     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
227   } else if (symmetric) {
228     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
229   } else {
230     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
231   }
232   PetscFunctionReturn(PETSC_SUCCESS);
233 }
234 
235 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236 {
237   PetscFunctionBegin;
238   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
239 
240   if (!blockcompressed) {
241     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
242   } else {
243     PetscCall(PetscFree(*ia));
244     PetscCall(PetscFree(*ja));
245   }
246   PetscFunctionReturn(PETSC_SUCCESS);
247 }
248 
249 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250 {
251   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253   PetscInt   *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
254 
255   PetscFunctionBegin;
256   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
257   nslim_row = a->inode.node_count;
258   n         = A->cmap->n;
259 
260   /* Create The column_inode for this matrix */
261   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
262 
263   /* allocate space for reformatted column_inode structure */
264   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
266 
267   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268     nsz = ns_col[i1 + 1] - ns_col[i1];
269     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
270   }
271   /* allocate space for column pointers */
272   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
273   *iia = ia;
274   PetscCall(PetscMalloc1(nslim_col + 1, &work));
275 
276   /* determine the number of columns in each row */
277   ia[0] = oshift;
278   for (i1 = 0; i1 < nslim_row; i1++) {
279     row = ns_row[i1];
280     j   = aj + ai[row] + ishift;
281     col = *j++ + ishift;
282     i2  = tvc[col];
283     nz  = ai[row + 1] - ai[row];
284     while (nz-- > 0) { /* off-diagonal elements */
285       /* ia[i1+1]++; */
286       ia[i2 + 1]++;
287       i2++;
288       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
289       if (nz > 0) i2 = tvc[col];
290     }
291   }
292 
293   /* shift ia[i] to point to next col */
294   for (i1 = 1; i1 < nslim_col + 1; i1++) {
295     col = ia[i1 - 1];
296     ia[i1] += col;
297     work[i1 - 1] = col - oshift;
298   }
299 
300   /* allocate space for column pointers */
301   nz = ia[nslim_col] + (!ishift);
302   PetscCall(PetscMalloc1(nz, &ja));
303   *jja = ja;
304 
305   /* loop over matrix putting into ja */
306   for (i1 = 0; i1 < nslim_row; i1++) {
307     row = ns_row[i1];
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row + 1] - ai[row];
312     while (nz-- > 0) {
313       /* ja[work[i1]++] = i2 + oshift; */
314       ja[work[i2]++] = i1 + oshift;
315       i2++;
316       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320   PetscCall(PetscFree(ns_col));
321   PetscCall(PetscFree(work));
322   PetscCall(PetscFree2(tns, tvc));
323   PetscFunctionReturn(PETSC_SUCCESS);
324 }
325 
326 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327 {
328   PetscFunctionBegin;
329   PetscCall(MatCreateColInode_Private(A, n, NULL));
330   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
331 
332   if (!blockcompressed) {
333     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
334   } else if (symmetric) {
335     /* Since the indices are symmetric it doesn't matter */
336     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
337   } else {
338     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
339   }
340   PetscFunctionReturn(PETSC_SUCCESS);
341 }
342 
343 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344 {
345   PetscFunctionBegin;
346   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
347   if (!blockcompressed) {
348     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
349   } else {
350     PetscCall(PetscFree(*ia));
351     PetscCall(PetscFree(*ja));
352   }
353   PetscFunctionReturn(PETSC_SUCCESS);
354 }
355 
356 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357 {
358   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
359   PetscScalar       *y;
360   const PetscScalar *x;
361   PetscInt           row, node_max, nonzerorow = 0;
362   PetscInt          *ns;
363 
364   PetscFunctionBegin;
365   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
366   node_max = a->inode.node_count;
367   ns       = a->inode.size_csr; /* Node Size array */
368   PetscCall(VecGetArrayRead(xx, &x));
369   PetscCall(VecGetArray(yy, &y));
370 
371   PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372   for (PetscInt i = 0; i < node_max; ++i) {
373     PetscInt         i1, i2, nsz, n, sz;
374     const MatScalar *v1, *v2, *v3, *v4, *v5;
375     PetscScalar      sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376     const PetscInt  *idx;
377 
378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380 #endif
381     row = ns[i];
382     nsz = ns[i + 1] - ns[i];
383     n   = a->i[row + 1] - a->i[row];
384     nonzerorow += (n > 0) * nsz;
385 
386     idx = &a->j[a->i[row]];
387     v1  = &a->a[a->i[row]];
388     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
389     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
390     sz = n;                                                                /* No of non zeros in this row */
391                                                                            /* Switch on the size of Node */
392     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
393     case 1:
394       sum1 = 0.;
395 
396       for (n = 0; n < sz - 1; n += 2) {
397         i1 = idx[0]; /* The instructions are ordered to */
398         i2 = idx[1]; /* make the compiler's job easy */
399         idx += 2;
400         tmp0 = x[i1];
401         tmp1 = x[i2];
402         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
403         v1 += 2;
404       }
405 
406       if (n == sz - 1) { /* Take care of the last nonzero  */
407         tmp0 = x[*idx++];
408         sum1 += *v1++ * tmp0;
409       }
410       y[row++] = sum1;
411       break;
412     case 2:
413       sum1 = 0.;
414       sum2 = 0.;
415       v2   = v1 + n;
416 
417       for (n = 0; n < sz - 1; n += 2) {
418         i1 = idx[0];
419         i2 = idx[1];
420         idx += 2;
421         tmp0 = x[i1];
422         tmp1 = x[i2];
423         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
424         v1 += 2;
425         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
426         v2 += 2;
427       }
428       if (n == sz - 1) {
429         tmp0 = x[*idx++];
430         sum1 += *v1++ * tmp0;
431         sum2 += *v2++ * tmp0;
432       }
433       y[row++] = sum1;
434       y[row++] = sum2;
435       v1       = v2; /* Since the next block to be processed starts there*/
436       idx += sz;
437       break;
438     case 3:
439       sum1 = 0.;
440       sum2 = 0.;
441       sum3 = 0.;
442       v2   = v1 + n;
443       v3   = v2 + n;
444 
445       for (n = 0; n < sz - 1; n += 2) {
446         i1 = idx[0];
447         i2 = idx[1];
448         idx += 2;
449         tmp0 = x[i1];
450         tmp1 = x[i2];
451         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
452         v1 += 2;
453         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
454         v2 += 2;
455         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
456         v3 += 2;
457       }
458       if (n == sz - 1) {
459         tmp0 = x[*idx++];
460         sum1 += *v1++ * tmp0;
461         sum2 += *v2++ * tmp0;
462         sum3 += *v3++ * tmp0;
463       }
464       y[row++] = sum1;
465       y[row++] = sum2;
466       y[row++] = sum3;
467       v1       = v3; /* Since the next block to be processed starts there*/
468       idx += 2 * sz;
469       break;
470     case 4:
471       sum1 = 0.;
472       sum2 = 0.;
473       sum3 = 0.;
474       sum4 = 0.;
475       v2   = v1 + n;
476       v3   = v2 + n;
477       v4   = v3 + n;
478 
479       for (n = 0; n < sz - 1; n += 2) {
480         i1 = idx[0];
481         i2 = idx[1];
482         idx += 2;
483         tmp0 = x[i1];
484         tmp1 = x[i2];
485         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
486         v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
488         v2 += 2;
489         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
490         v3 += 2;
491         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
492         v4 += 2;
493       }
494       if (n == sz - 1) {
495         tmp0 = x[*idx++];
496         sum1 += *v1++ * tmp0;
497         sum2 += *v2++ * tmp0;
498         sum3 += *v3++ * tmp0;
499         sum4 += *v4++ * tmp0;
500       }
501       y[row++] = sum1;
502       y[row++] = sum2;
503       y[row++] = sum3;
504       y[row++] = sum4;
505       v1       = v4; /* Since the next block to be processed starts there*/
506       idx += 3 * sz;
507       break;
508     case 5:
509       sum1 = 0.;
510       sum2 = 0.;
511       sum3 = 0.;
512       sum4 = 0.;
513       sum5 = 0.;
514       v2   = v1 + n;
515       v3   = v2 + n;
516       v4   = v3 + n;
517       v5   = v4 + n;
518 
519       for (n = 0; n < sz - 1; n += 2) {
520         i1 = idx[0];
521         i2 = idx[1];
522         idx += 2;
523         tmp0 = x[i1];
524         tmp1 = x[i2];
525         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
526         v1 += 2;
527         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
528         v2 += 2;
529         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
530         v3 += 2;
531         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
532         v4 += 2;
533         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
534         v5 += 2;
535       }
536       if (n == sz - 1) {
537         tmp0 = x[*idx++];
538         sum1 += *v1++ * tmp0;
539         sum2 += *v2++ * tmp0;
540         sum3 += *v3++ * tmp0;
541         sum4 += *v4++ * tmp0;
542         sum5 += *v5++ * tmp0;
543       }
544       y[row++] = sum1;
545       y[row++] = sum2;
546       y[row++] = sum3;
547       y[row++] = sum4;
548       y[row++] = sum5;
549       v1       = v5; /* Since the next block to be processed starts there */
550       idx += 4 * sz;
551       break;
552     default:
553       SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
554     }
555   }
556   PetscCall(VecRestoreArrayRead(xx, &x));
557   PetscCall(VecRestoreArray(yy, &y));
558   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
559   PetscFunctionReturn(PETSC_SUCCESS);
560 }
561 
562 /* Almost same code as the MatMult_SeqAIJ_Inode() */
563 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564 {
565   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
566   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
567   const MatScalar   *v1, *v2, *v3, *v4, *v5;
568   const PetscScalar *x;
569   PetscScalar       *y, *z, *zt;
570   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
571   const PetscInt    *idx, *ns, *ii;
572 
573   PetscFunctionBegin;
574   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
575   node_max = a->inode.node_count;
576   ns       = a->inode.size_csr; /* Node Size array */
577 
578   PetscCall(VecGetArrayRead(xx, &x));
579   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
580   zt = z;
581 
582   idx = a->j;
583   v1  = a->a;
584   ii  = a->i;
585 
586   for (i = 0; i < node_max; ++i) {
587     row = ns[i];
588     nsz = ns[i + 1] - ns[i];
589     n   = ii[1] - ii[0];
590     ii += nsz;
591     sz = n;        /* No of non zeros in this row */
592                    /* Switch on the size of Node */
593     switch (nsz) { /* Each loop in 'case' is unrolled */
594     case 1:
595       sum1 = *zt++;
596 
597       for (n = 0; n < sz - 1; n += 2) {
598         i1 = idx[0]; /* The instructions are ordered to */
599         i2 = idx[1]; /* make the compiler's job easy */
600         idx += 2;
601         tmp0 = x[i1];
602         tmp1 = x[i2];
603         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
604         v1 += 2;
605       }
606 
607       if (n == sz - 1) { /* Take care of the last nonzero  */
608         tmp0 = x[*idx++];
609         sum1 += *v1++ * tmp0;
610       }
611       y[row++] = sum1;
612       break;
613     case 2:
614       sum1 = *zt++;
615       sum2 = *zt++;
616       v2   = v1 + n;
617 
618       for (n = 0; n < sz - 1; n += 2) {
619         i1 = idx[0];
620         i2 = idx[1];
621         idx += 2;
622         tmp0 = x[i1];
623         tmp1 = x[i2];
624         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
625         v1 += 2;
626         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
627         v2 += 2;
628       }
629       if (n == sz - 1) {
630         tmp0 = x[*idx++];
631         sum1 += *v1++ * tmp0;
632         sum2 += *v2++ * tmp0;
633       }
634       y[row++] = sum1;
635       y[row++] = sum2;
636       v1       = v2; /* Since the next block to be processed starts there*/
637       idx += sz;
638       break;
639     case 3:
640       sum1 = *zt++;
641       sum2 = *zt++;
642       sum3 = *zt++;
643       v2   = v1 + n;
644       v3   = v2 + n;
645 
646       for (n = 0; n < sz - 1; n += 2) {
647         i1 = idx[0];
648         i2 = idx[1];
649         idx += 2;
650         tmp0 = x[i1];
651         tmp1 = x[i2];
652         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
653         v1 += 2;
654         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
655         v2 += 2;
656         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
657         v3 += 2;
658       }
659       if (n == sz - 1) {
660         tmp0 = x[*idx++];
661         sum1 += *v1++ * tmp0;
662         sum2 += *v2++ * tmp0;
663         sum3 += *v3++ * tmp0;
664       }
665       y[row++] = sum1;
666       y[row++] = sum2;
667       y[row++] = sum3;
668       v1       = v3; /* Since the next block to be processed starts there*/
669       idx += 2 * sz;
670       break;
671     case 4:
672       sum1 = *zt++;
673       sum2 = *zt++;
674       sum3 = *zt++;
675       sum4 = *zt++;
676       v2   = v1 + n;
677       v3   = v2 + n;
678       v4   = v3 + n;
679 
680       for (n = 0; n < sz - 1; n += 2) {
681         i1 = idx[0];
682         i2 = idx[1];
683         idx += 2;
684         tmp0 = x[i1];
685         tmp1 = x[i2];
686         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
687         v1 += 2;
688         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
689         v2 += 2;
690         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
691         v3 += 2;
692         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
693         v4 += 2;
694       }
695       if (n == sz - 1) {
696         tmp0 = x[*idx++];
697         sum1 += *v1++ * tmp0;
698         sum2 += *v2++ * tmp0;
699         sum3 += *v3++ * tmp0;
700         sum4 += *v4++ * tmp0;
701       }
702       y[row++] = sum1;
703       y[row++] = sum2;
704       y[row++] = sum3;
705       y[row++] = sum4;
706       v1       = v4; /* Since the next block to be processed starts there*/
707       idx += 3 * sz;
708       break;
709     case 5:
710       sum1 = *zt++;
711       sum2 = *zt++;
712       sum3 = *zt++;
713       sum4 = *zt++;
714       sum5 = *zt++;
715       v2   = v1 + n;
716       v3   = v2 + n;
717       v4   = v3 + n;
718       v5   = v4 + n;
719 
720       for (n = 0; n < sz - 1; n += 2) {
721         i1 = idx[0];
722         i2 = idx[1];
723         idx += 2;
724         tmp0 = x[i1];
725         tmp1 = x[i2];
726         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
727         v1 += 2;
728         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
729         v2 += 2;
730         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
731         v3 += 2;
732         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
733         v4 += 2;
734         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
735         v5 += 2;
736       }
737       if (n == sz - 1) {
738         tmp0 = x[*idx++];
739         sum1 += *v1++ * tmp0;
740         sum2 += *v2++ * tmp0;
741         sum3 += *v3++ * tmp0;
742         sum4 += *v4++ * tmp0;
743         sum5 += *v5++ * tmp0;
744       }
745       y[row++] = sum1;
746       y[row++] = sum2;
747       y[row++] = sum3;
748       y[row++] = sum4;
749       y[row++] = sum5;
750       v1       = v5; /* Since the next block to be processed starts there */
751       idx += 4 * sz;
752       break;
753     default:
754       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
755     }
756   }
757   PetscCall(VecRestoreArrayRead(xx, &x));
758   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
759   PetscCall(PetscLogFlops(2.0 * a->nz));
760   PetscFunctionReturn(PETSC_SUCCESS);
761 }
762 
763 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764 {
765   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
766   IS                 iscol = a->col, isrow = a->row;
767   const PetscInt    *r, *c, *rout, *cout;
768   PetscInt           i, j, n = A->rmap->n, nz;
769   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
770   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
772   PetscScalar        sum1, sum2, sum3, sum4, sum5;
773   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774   const PetscScalar *b;
775 
776   PetscFunctionBegin;
777   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
778   node_max = a->inode.node_count;
779   ns       = a->inode.size_csr; /* Node Size array */
780 
781   PetscCall(VecGetArrayRead(bb, &b));
782   PetscCall(VecGetArrayWrite(xx, &x));
783   tmp = a->solve_work;
784 
785   PetscCall(ISGetIndices(isrow, &rout));
786   r = rout;
787   PetscCall(ISGetIndices(iscol, &cout));
788   c = cout + (n - 1);
789 
790   /* forward solve the lower triangular */
791   tmps = tmp;
792   aa   = a_a;
793   aj   = a_j;
794   ad   = a->diag;
795 
796   for (i = 0, row = 0; i < node_max; ++i) {
797     row = ns[i];
798     nsz = ns[i + 1] - ns[i];
799     aii = ai[row];
800     v1  = aa + aii;
801     vi  = aj + aii;
802     nz  = ad[row] - aii;
803     if (i < node_max - 1) {
804       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
805       * but our indexing to determine its size could. */
806       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
807       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
808       PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
809       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
810     }
811 
812     switch (nsz) { /* Each loop in 'case' is unrolled */
813     case 1:
814       sum1 = b[*r++];
815       for (j = 0; j < nz - 1; j += 2) {
816         i0 = vi[0];
817         i1 = vi[1];
818         vi += 2;
819         tmp0 = tmps[i0];
820         tmp1 = tmps[i1];
821         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
822         v1 += 2;
823       }
824       if (j == nz - 1) {
825         tmp0 = tmps[*vi++];
826         sum1 -= *v1++ * tmp0;
827       }
828       tmp[row++] = sum1;
829       break;
830     case 2:
831       sum1 = b[*r++];
832       sum2 = b[*r++];
833       v2   = aa + ai[row + 1];
834 
835       for (j = 0; j < nz - 1; j += 2) {
836         i0 = vi[0];
837         i1 = vi[1];
838         vi += 2;
839         tmp0 = tmps[i0];
840         tmp1 = tmps[i1];
841         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
842         v1 += 2;
843         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
844         v2 += 2;
845       }
846       if (j == nz - 1) {
847         tmp0 = tmps[*vi++];
848         sum1 -= *v1++ * tmp0;
849         sum2 -= *v2++ * tmp0;
850       }
851       sum2 -= *v2++ * sum1;
852       tmp[row++] = sum1;
853       tmp[row++] = sum2;
854       break;
855     case 3:
856       sum1 = b[*r++];
857       sum2 = b[*r++];
858       sum3 = b[*r++];
859       v2   = aa + ai[row + 1];
860       v3   = aa + ai[row + 2];
861 
862       for (j = 0; j < nz - 1; j += 2) {
863         i0 = vi[0];
864         i1 = vi[1];
865         vi += 2;
866         tmp0 = tmps[i0];
867         tmp1 = tmps[i1];
868         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
869         v1 += 2;
870         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
871         v2 += 2;
872         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
873         v3 += 2;
874       }
875       if (j == nz - 1) {
876         tmp0 = tmps[*vi++];
877         sum1 -= *v1++ * tmp0;
878         sum2 -= *v2++ * tmp0;
879         sum3 -= *v3++ * tmp0;
880       }
881       sum2 -= *v2++ * sum1;
882       sum3 -= *v3++ * sum1;
883       sum3 -= *v3++ * sum2;
884 
885       tmp[row++] = sum1;
886       tmp[row++] = sum2;
887       tmp[row++] = sum3;
888       break;
889 
890     case 4:
891       sum1 = b[*r++];
892       sum2 = b[*r++];
893       sum3 = b[*r++];
894       sum4 = b[*r++];
895       v2   = aa + ai[row + 1];
896       v3   = aa + ai[row + 2];
897       v4   = aa + ai[row + 3];
898 
899       for (j = 0; j < nz - 1; j += 2) {
900         i0 = vi[0];
901         i1 = vi[1];
902         vi += 2;
903         tmp0 = tmps[i0];
904         tmp1 = tmps[i1];
905         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
906         v1 += 2;
907         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
908         v2 += 2;
909         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
910         v3 += 2;
911         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
912         v4 += 2;
913       }
914       if (j == nz - 1) {
915         tmp0 = tmps[*vi++];
916         sum1 -= *v1++ * tmp0;
917         sum2 -= *v2++ * tmp0;
918         sum3 -= *v3++ * tmp0;
919         sum4 -= *v4++ * tmp0;
920       }
921       sum2 -= *v2++ * sum1;
922       sum3 -= *v3++ * sum1;
923       sum4 -= *v4++ * sum1;
924       sum3 -= *v3++ * sum2;
925       sum4 -= *v4++ * sum2;
926       sum4 -= *v4++ * sum3;
927 
928       tmp[row++] = sum1;
929       tmp[row++] = sum2;
930       tmp[row++] = sum3;
931       tmp[row++] = sum4;
932       break;
933     case 5:
934       sum1 = b[*r++];
935       sum2 = b[*r++];
936       sum3 = b[*r++];
937       sum4 = b[*r++];
938       sum5 = b[*r++];
939       v2   = aa + ai[row + 1];
940       v3   = aa + ai[row + 2];
941       v4   = aa + ai[row + 3];
942       v5   = aa + ai[row + 4];
943 
944       for (j = 0; j < nz - 1; j += 2) {
945         i0 = vi[0];
946         i1 = vi[1];
947         vi += 2;
948         tmp0 = tmps[i0];
949         tmp1 = tmps[i1];
950         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
951         v1 += 2;
952         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
953         v2 += 2;
954         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
955         v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
957         v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
959         v5 += 2;
960       }
961       if (j == nz - 1) {
962         tmp0 = tmps[*vi++];
963         sum1 -= *v1++ * tmp0;
964         sum2 -= *v2++ * tmp0;
965         sum3 -= *v3++ * tmp0;
966         sum4 -= *v4++ * tmp0;
967         sum5 -= *v5++ * tmp0;
968       }
969 
970       sum2 -= *v2++ * sum1;
971       sum3 -= *v3++ * sum1;
972       sum4 -= *v4++ * sum1;
973       sum5 -= *v5++ * sum1;
974       sum3 -= *v3++ * sum2;
975       sum4 -= *v4++ * sum2;
976       sum5 -= *v5++ * sum2;
977       sum4 -= *v4++ * sum3;
978       sum5 -= *v5++ * sum3;
979       sum5 -= *v5++ * sum4;
980 
981       tmp[row++] = sum1;
982       tmp[row++] = sum2;
983       tmp[row++] = sum3;
984       tmp[row++] = sum4;
985       tmp[row++] = sum5;
986       break;
987     default:
988       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
989     }
990   }
991   /* backward solve the upper triangular */
992   for (i = node_max - 1; i >= 0; i--) {
993     row = ns[i + 1];
994     nsz = ns[i + 1] - ns[i];
995     aii = ai[row + 1] - 1;
996     v1  = aa + aii;
997     vi  = aj + aii;
998     nz  = aii - ad[row];
999     switch (nsz) { /* Each loop in 'case' is unrolled */
1000     case 1:
1001       sum1 = tmp[row];
1002 
1003       for (j = nz; j > 1; j -= 2) {
1004         vi -= 2;
1005         i0   = vi[2];
1006         i1   = vi[1];
1007         tmp0 = tmps[i0];
1008         tmp1 = tmps[i1];
1009         v1 -= 2;
1010         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1011       }
1012       if (j == 1) {
1013         tmp0 = tmps[*vi--];
1014         sum1 -= *v1-- * tmp0;
1015       }
1016       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1017       row--;
1018       break;
1019     case 2:
1020       sum1 = tmp[row];
1021       sum2 = tmp[row - 1];
1022       v2   = aa + ai[row] - 1;
1023       for (j = nz; j > 1; j -= 2) {
1024         vi -= 2;
1025         i0   = vi[2];
1026         i1   = vi[1];
1027         tmp0 = tmps[i0];
1028         tmp1 = tmps[i1];
1029         v1 -= 2;
1030         v2 -= 2;
1031         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1032         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1033       }
1034       if (j == 1) {
1035         tmp0 = tmps[*vi--];
1036         sum1 -= *v1-- * tmp0;
1037         sum2 -= *v2-- * tmp0;
1038       }
1039 
1040       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1041       row--;
1042       sum2 -= *v2-- * tmp0;
1043       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1044       row--;
1045       break;
1046     case 3:
1047       sum1 = tmp[row];
1048       sum2 = tmp[row - 1];
1049       sum3 = tmp[row - 2];
1050       v2   = aa + ai[row] - 1;
1051       v3   = aa + ai[row - 1] - 1;
1052       for (j = nz; j > 1; j -= 2) {
1053         vi -= 2;
1054         i0   = vi[2];
1055         i1   = vi[1];
1056         tmp0 = tmps[i0];
1057         tmp1 = tmps[i1];
1058         v1 -= 2;
1059         v2 -= 2;
1060         v3 -= 2;
1061         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064       }
1065       if (j == 1) {
1066         tmp0 = tmps[*vi--];
1067         sum1 -= *v1-- * tmp0;
1068         sum2 -= *v2-- * tmp0;
1069         sum3 -= *v3-- * tmp0;
1070       }
1071       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1072       row--;
1073       sum2 -= *v2-- * tmp0;
1074       sum3 -= *v3-- * tmp0;
1075       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1076       row--;
1077       sum3 -= *v3-- * tmp0;
1078       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1079       row--;
1080 
1081       break;
1082     case 4:
1083       sum1 = tmp[row];
1084       sum2 = tmp[row - 1];
1085       sum3 = tmp[row - 2];
1086       sum4 = tmp[row - 3];
1087       v2   = aa + ai[row] - 1;
1088       v3   = aa + ai[row - 1] - 1;
1089       v4   = aa + ai[row - 2] - 1;
1090 
1091       for (j = nz; j > 1; j -= 2) {
1092         vi -= 2;
1093         i0   = vi[2];
1094         i1   = vi[1];
1095         tmp0 = tmps[i0];
1096         tmp1 = tmps[i1];
1097         v1 -= 2;
1098         v2 -= 2;
1099         v3 -= 2;
1100         v4 -= 2;
1101         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1102         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1103         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1104         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1105       }
1106       if (j == 1) {
1107         tmp0 = tmps[*vi--];
1108         sum1 -= *v1-- * tmp0;
1109         sum2 -= *v2-- * tmp0;
1110         sum3 -= *v3-- * tmp0;
1111         sum4 -= *v4-- * tmp0;
1112       }
1113 
1114       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1115       row--;
1116       sum2 -= *v2-- * tmp0;
1117       sum3 -= *v3-- * tmp0;
1118       sum4 -= *v4-- * tmp0;
1119       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1120       row--;
1121       sum3 -= *v3-- * tmp0;
1122       sum4 -= *v4-- * tmp0;
1123       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1124       row--;
1125       sum4 -= *v4-- * tmp0;
1126       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1127       row--;
1128       break;
1129     case 5:
1130       sum1 = tmp[row];
1131       sum2 = tmp[row - 1];
1132       sum3 = tmp[row - 2];
1133       sum4 = tmp[row - 3];
1134       sum5 = tmp[row - 4];
1135       v2   = aa + ai[row] - 1;
1136       v3   = aa + ai[row - 1] - 1;
1137       v4   = aa + ai[row - 2] - 1;
1138       v5   = aa + ai[row - 3] - 1;
1139       for (j = nz; j > 1; j -= 2) {
1140         vi -= 2;
1141         i0   = vi[2];
1142         i1   = vi[1];
1143         tmp0 = tmps[i0];
1144         tmp1 = tmps[i1];
1145         v1 -= 2;
1146         v2 -= 2;
1147         v3 -= 2;
1148         v4 -= 2;
1149         v5 -= 2;
1150         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1151         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1152         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1153         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1154         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1155       }
1156       if (j == 1) {
1157         tmp0 = tmps[*vi--];
1158         sum1 -= *v1-- * tmp0;
1159         sum2 -= *v2-- * tmp0;
1160         sum3 -= *v3-- * tmp0;
1161         sum4 -= *v4-- * tmp0;
1162         sum5 -= *v5-- * tmp0;
1163       }
1164 
1165       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1166       row--;
1167       sum2 -= *v2-- * tmp0;
1168       sum3 -= *v3-- * tmp0;
1169       sum4 -= *v4-- * tmp0;
1170       sum5 -= *v5-- * tmp0;
1171       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1172       row--;
1173       sum3 -= *v3-- * tmp0;
1174       sum4 -= *v4-- * tmp0;
1175       sum5 -= *v5-- * tmp0;
1176       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1177       row--;
1178       sum4 -= *v4-- * tmp0;
1179       sum5 -= *v5-- * tmp0;
1180       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1181       row--;
1182       sum5 -= *v5-- * tmp0;
1183       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1184       row--;
1185       break;
1186     default:
1187       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1188     }
1189   }
1190   PetscCall(ISRestoreIndices(isrow, &rout));
1191   PetscCall(ISRestoreIndices(iscol, &cout));
1192   PetscCall(VecRestoreArrayRead(bb, &b));
1193   PetscCall(VecRestoreArrayWrite(xx, &x));
1194   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1195   PetscFunctionReturn(PETSC_SUCCESS);
1196 }
1197 
1198 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199 {
1200   Mat              C = B;
1201   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1202   IS               isrow = b->row, isicol = b->icol;
1203   const PetscInt  *r, *ic, *ics;
1204   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1205   PetscInt         i, j, k, nz, nzL, row, *pj;
1206   const PetscInt  *ajtmp, *bjtmp;
1207   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1208   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1209   FactorShiftCtx   sctx;
1210   const PetscInt  *ddiag;
1211   PetscReal        rs;
1212   MatScalar        d;
1213   PetscInt         inod, nodesz, node_max, col;
1214   const PetscInt  *ns;
1215   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1216 
1217   PetscFunctionBegin;
1218   /* MatPivotSetUp(): initialize shift context sctx */
1219   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1220 
1221   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1222     ddiag          = a->diag;
1223     sctx.shift_top = info->zeropivot;
1224     for (i = 0; i < n; i++) {
1225       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1226       d  = (aa)[ddiag[i]];
1227       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1228       v  = aa + ai[i];
1229       nz = ai[i + 1] - ai[i];
1230       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1231       if (rs > sctx.shift_top) sctx.shift_top = rs;
1232     }
1233     sctx.shift_top *= 1.1;
1234     sctx.nshift_max = 5;
1235     sctx.shift_lo   = 0.;
1236     sctx.shift_hi   = 1.;
1237   }
1238 
1239   PetscCall(ISGetIndices(isrow, &r));
1240   PetscCall(ISGetIndices(isicol, &ic));
1241 
1242   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1243   ics = ic;
1244 
1245   node_max = a->inode.node_count;
1246   ns       = a->inode.size_csr;
1247   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1248 
1249   /* If max inode size > 4, split it into two inodes.*/
1250   /* also map the inode sizes according to the ordering */
1251   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1252   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1253     nodesz = ns[i + 1] - ns[i];
1254     if (nodesz > 4) {
1255       tmp_vec1[j] = 4;
1256       ++j;
1257       tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
1258     } else {
1259       tmp_vec1[j] = nodesz;
1260     }
1261   }
1262   /* Use the correct node_max */
1263   node_max = j;
1264 
1265   /* Now reorder the inode info based on mat re-ordering info */
1266   /* First create a row -> inode_size_array_index map */
1267   PetscCall(PetscMalloc1(n + 1, &nsmap));
1268   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1269   tmp_vec2[0] = 0;
1270   for (i = 0, row = 0; i < node_max; i++) {
1271     nodesz = tmp_vec1[i];
1272     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1273   }
1274   /* Using nsmap, create a reordered ns structure */
1275   for (i = 0, j = 0; i < node_max; i++) {
1276     nodesz          = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1277     tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
1278     j += nodesz;
1279   }
1280   PetscCall(PetscFree(nsmap));
1281   PetscCall(PetscFree(tmp_vec1));
1282 
1283   /* Now use the correct ns */
1284   ns = tmp_vec2;
1285 
1286   do {
1287     sctx.newshift = PETSC_FALSE;
1288     /* Now loop over each block-row, and do the factorization */
1289     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1290       nodesz = ns[inod + 1] - ns[inod];
1291 
1292       switch (nodesz) {
1293       case 1:
1294         /* zero rtmp1 */
1295         /* L part */
1296         nz    = bi[i + 1] - bi[i];
1297         bjtmp = bj + bi[i];
1298         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1299 
1300         /* U part */
1301         nz    = bdiag[i] - bdiag[i + 1];
1302         bjtmp = bj + bdiag[i + 1] + 1;
1303         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1304 
1305         /* load in initial (unfactored row) */
1306         nz    = ai[r[i] + 1] - ai[r[i]];
1307         ajtmp = aj + ai[r[i]];
1308         v     = aa + ai[r[i]];
1309         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1310 
1311         /* ZeropivotApply() */
1312         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1313 
1314         /* elimination */
1315         bjtmp = bj + bi[i];
1316         row   = *bjtmp++;
1317         nzL   = bi[i + 1] - bi[i];
1318         for (k = 0; k < nzL; k++) {
1319           pc = rtmp1 + row;
1320           if (*pc != 0.0) {
1321             pv   = b->a + bdiag[row];
1322             mul1 = *pc * (*pv);
1323             *pc  = mul1;
1324             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1325             pv   = b->a + bdiag[row + 1] + 1;
1326             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1328             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1329           }
1330           row = *bjtmp++;
1331         }
1332 
1333         /* finished row so stick it into b->a */
1334         rs = 0.0;
1335         /* L part */
1336         pv = b->a + bi[i];
1337         pj = b->j + bi[i];
1338         nz = bi[i + 1] - bi[i];
1339         for (j = 0; j < nz; j++) {
1340           pv[j] = rtmp1[pj[j]];
1341           rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* U part */
1345         pv = b->a + bdiag[i + 1] + 1;
1346         pj = b->j + bdiag[i + 1] + 1;
1347         nz = bdiag[i] - bdiag[i + 1] - 1;
1348         for (j = 0; j < nz; j++) {
1349           pv[j] = rtmp1[pj[j]];
1350           rs += PetscAbsScalar(pv[j]);
1351         }
1352 
1353         /* Check zero pivot */
1354         sctx.rs = rs;
1355         sctx.pv = rtmp1[i];
1356         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1357         if (sctx.newshift) break;
1358 
1359         /* Mark diagonal and invert diagonal for simpler triangular solves */
1360         pv  = b->a + bdiag[i];
1361         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1362         break;
1363 
1364       case 2:
1365         /* zero rtmp1 and rtmp2 */
1366         /* L part */
1367         nz    = bi[i + 1] - bi[i];
1368         bjtmp = bj + bi[i];
1369         for (j = 0; j < nz; j++) {
1370           col        = bjtmp[j];
1371           rtmp1[col] = 0.0;
1372           rtmp2[col] = 0.0;
1373         }
1374 
1375         /* U part */
1376         nz    = bdiag[i] - bdiag[i + 1];
1377         bjtmp = bj + bdiag[i + 1] + 1;
1378         for (j = 0; j < nz; j++) {
1379           col        = bjtmp[j];
1380           rtmp1[col] = 0.0;
1381           rtmp2[col] = 0.0;
1382         }
1383 
1384         /* load in initial (unfactored row) */
1385         nz    = ai[r[i] + 1] - ai[r[i]];
1386         ajtmp = aj + ai[r[i]];
1387         v1    = aa + ai[r[i]];
1388         v2    = aa + ai[r[i + 1]];
1389         for (j = 0; j < nz; j++) {
1390           col        = ics[ajtmp[j]];
1391           rtmp1[col] = v1[j];
1392           rtmp2[col] = v2[j];
1393         }
1394         /* ZeropivotApply(): shift the diagonal of the matrix  */
1395         rtmp1[i] += sctx.shift_amount;
1396         rtmp2[i + 1] += sctx.shift_amount;
1397 
1398         /* elimination */
1399         bjtmp = bj + bi[i];
1400         row   = *bjtmp++; /* pivot row */
1401         nzL   = bi[i + 1] - bi[i];
1402         for (k = 0; k < nzL; k++) {
1403           pc1 = rtmp1 + row;
1404           pc2 = rtmp2 + row;
1405           if (*pc1 != 0.0 || *pc2 != 0.0) {
1406             pv   = b->a + bdiag[row];
1407             mul1 = *pc1 * (*pv);
1408             mul2 = *pc2 * (*pv);
1409             *pc1 = mul1;
1410             *pc2 = mul2;
1411 
1412             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1413             pv = b->a + bdiag[row + 1] + 1;
1414             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1415             for (j = 0; j < nz; j++) {
1416               col = pj[j];
1417               rtmp1[col] -= mul1 * pv[j];
1418               rtmp2[col] -= mul2 * pv[j];
1419             }
1420             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1421           }
1422           row = *bjtmp++;
1423         }
1424 
1425         /* finished row i; check zero pivot, then stick row i into b->a */
1426         rs = 0.0;
1427         /* L part */
1428         pc1 = b->a + bi[i];
1429         pj  = b->j + bi[i];
1430         nz  = bi[i + 1] - bi[i];
1431         for (j = 0; j < nz; j++) {
1432           col    = pj[j];
1433           pc1[j] = rtmp1[col];
1434           rs += PetscAbsScalar(pc1[j]);
1435         }
1436         /* U part */
1437         pc1 = b->a + bdiag[i + 1] + 1;
1438         pj  = b->j + bdiag[i + 1] + 1;
1439         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1440         for (j = 0; j < nz; j++) {
1441           col    = pj[j];
1442           pc1[j] = rtmp1[col];
1443           rs += PetscAbsScalar(pc1[j]);
1444         }
1445 
1446         sctx.rs = rs;
1447         sctx.pv = rtmp1[i];
1448         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1449         if (sctx.newshift) break;
1450         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1451         *pc1 = 1.0 / sctx.pv;
1452 
1453         /* Now take care of diagonal 2x2 block. */
1454         pc2 = rtmp2 + i;
1455         if (*pc2 != 0.0) {
1456           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1457           *pc2 = mul1;                        /* insert L entry */
1458           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1459           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460           for (j = 0; j < nz; j++) {
1461             col = pj[j];
1462             rtmp2[col] -= mul1 * rtmp1[col];
1463           }
1464           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465         }
1466 
1467         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468         rs = 0.0;
1469         /* L part */
1470         pc2 = b->a + bi[i + 1];
1471         pj  = b->j + bi[i + 1];
1472         nz  = bi[i + 2] - bi[i + 1];
1473         for (j = 0; j < nz; j++) {
1474           col    = pj[j];
1475           pc2[j] = rtmp2[col];
1476           rs += PetscAbsScalar(pc2[j]);
1477         }
1478         /* U part */
1479         pc2 = b->a + bdiag[i + 2] + 1;
1480         pj  = b->j + bdiag[i + 2] + 1;
1481         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482         for (j = 0; j < nz; j++) {
1483           col    = pj[j];
1484           pc2[j] = rtmp2[col];
1485           rs += PetscAbsScalar(pc2[j]);
1486         }
1487 
1488         sctx.rs = rs;
1489         sctx.pv = rtmp2[i + 1];
1490         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1491         if (sctx.newshift) break;
1492         pc2  = b->a + bdiag[i + 1];
1493         *pc2 = 1.0 / sctx.pv;
1494         break;
1495 
1496       case 3:
1497         /* zero rtmp */
1498         /* L part */
1499         nz    = bi[i + 1] - bi[i];
1500         bjtmp = bj + bi[i];
1501         for (j = 0; j < nz; j++) {
1502           col        = bjtmp[j];
1503           rtmp1[col] = 0.0;
1504           rtmp2[col] = 0.0;
1505           rtmp3[col] = 0.0;
1506         }
1507 
1508         /* U part */
1509         nz    = bdiag[i] - bdiag[i + 1];
1510         bjtmp = bj + bdiag[i + 1] + 1;
1511         for (j = 0; j < nz; j++) {
1512           col        = bjtmp[j];
1513           rtmp1[col] = 0.0;
1514           rtmp2[col] = 0.0;
1515           rtmp3[col] = 0.0;
1516         }
1517 
1518         /* load in initial (unfactored row) */
1519         nz    = ai[r[i] + 1] - ai[r[i]];
1520         ajtmp = aj + ai[r[i]];
1521         v1    = aa + ai[r[i]];
1522         v2    = aa + ai[r[i + 1]];
1523         v3    = aa + ai[r[i + 2]];
1524         for (j = 0; j < nz; j++) {
1525           col        = ics[ajtmp[j]];
1526           rtmp1[col] = v1[j];
1527           rtmp2[col] = v2[j];
1528           rtmp3[col] = v3[j];
1529         }
1530         /* ZeropivotApply(): shift the diagonal of the matrix  */
1531         rtmp1[i] += sctx.shift_amount;
1532         rtmp2[i + 1] += sctx.shift_amount;
1533         rtmp3[i + 2] += sctx.shift_amount;
1534 
1535         /* elimination */
1536         bjtmp = bj + bi[i];
1537         row   = *bjtmp++; /* pivot row */
1538         nzL   = bi[i + 1] - bi[i];
1539         for (k = 0; k < nzL; k++) {
1540           pc1 = rtmp1 + row;
1541           pc2 = rtmp2 + row;
1542           pc3 = rtmp3 + row;
1543           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1544             pv   = b->a + bdiag[row];
1545             mul1 = *pc1 * (*pv);
1546             mul2 = *pc2 * (*pv);
1547             mul3 = *pc3 * (*pv);
1548             *pc1 = mul1;
1549             *pc2 = mul2;
1550             *pc3 = mul3;
1551 
1552             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1553             pv = b->a + bdiag[row + 1] + 1;
1554             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1555             for (j = 0; j < nz; j++) {
1556               col = pj[j];
1557               rtmp1[col] -= mul1 * pv[j];
1558               rtmp2[col] -= mul2 * pv[j];
1559               rtmp3[col] -= mul3 * pv[j];
1560             }
1561             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1562           }
1563           row = *bjtmp++;
1564         }
1565 
1566         /* finished row i; check zero pivot, then stick row i into b->a */
1567         rs = 0.0;
1568         /* L part */
1569         pc1 = b->a + bi[i];
1570         pj  = b->j + bi[i];
1571         nz  = bi[i + 1] - bi[i];
1572         for (j = 0; j < nz; j++) {
1573           col    = pj[j];
1574           pc1[j] = rtmp1[col];
1575           rs += PetscAbsScalar(pc1[j]);
1576         }
1577         /* U part */
1578         pc1 = b->a + bdiag[i + 1] + 1;
1579         pj  = b->j + bdiag[i + 1] + 1;
1580         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581         for (j = 0; j < nz; j++) {
1582           col    = pj[j];
1583           pc1[j] = rtmp1[col];
1584           rs += PetscAbsScalar(pc1[j]);
1585         }
1586 
1587         sctx.rs = rs;
1588         sctx.pv = rtmp1[i];
1589         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1590         if (sctx.newshift) break;
1591         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1592         *pc1 = 1.0 / sctx.pv;
1593 
1594         /* Now take care of 1st column of diagonal 3x3 block. */
1595         pc2 = rtmp2 + i;
1596         pc3 = rtmp3 + i;
1597         if (*pc2 != 0.0 || *pc3 != 0.0) {
1598           mul2 = (*pc2) * (*pc1);
1599           *pc2 = mul2;
1600           mul3 = (*pc3) * (*pc1);
1601           *pc3 = mul3;
1602           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1603           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1604           for (j = 0; j < nz; j++) {
1605             col = pj[j];
1606             rtmp2[col] -= mul2 * rtmp1[col];
1607             rtmp3[col] -= mul3 * rtmp1[col];
1608           }
1609           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1610         }
1611 
1612         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613         rs = 0.0;
1614         /* L part */
1615         pc2 = b->a + bi[i + 1];
1616         pj  = b->j + bi[i + 1];
1617         nz  = bi[i + 2] - bi[i + 1];
1618         for (j = 0; j < nz; j++) {
1619           col    = pj[j];
1620           pc2[j] = rtmp2[col];
1621           rs += PetscAbsScalar(pc2[j]);
1622         }
1623         /* U part */
1624         pc2 = b->a + bdiag[i + 2] + 1;
1625         pj  = b->j + bdiag[i + 2] + 1;
1626         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627         for (j = 0; j < nz; j++) {
1628           col    = pj[j];
1629           pc2[j] = rtmp2[col];
1630           rs += PetscAbsScalar(pc2[j]);
1631         }
1632 
1633         sctx.rs = rs;
1634         sctx.pv = rtmp2[i + 1];
1635         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1636         if (sctx.newshift) break;
1637         pc2  = b->a + bdiag[i + 1];
1638         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1639 
1640         /* Now take care of 2nd column of diagonal 3x3 block. */
1641         pc3 = rtmp3 + i + 1;
1642         if (*pc3 != 0.0) {
1643           mul3 = (*pc3) * (*pc2);
1644           *pc3 = mul3;
1645           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1646           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1647           for (j = 0; j < nz; j++) {
1648             col = pj[j];
1649             rtmp3[col] -= mul3 * rtmp2[col];
1650           }
1651           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1652         }
1653 
1654         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1655         rs = 0.0;
1656         /* L part */
1657         pc3 = b->a + bi[i + 2];
1658         pj  = b->j + bi[i + 2];
1659         nz  = bi[i + 3] - bi[i + 2];
1660         for (j = 0; j < nz; j++) {
1661           col    = pj[j];
1662           pc3[j] = rtmp3[col];
1663           rs += PetscAbsScalar(pc3[j]);
1664         }
1665         /* U part */
1666         pc3 = b->a + bdiag[i + 3] + 1;
1667         pj  = b->j + bdiag[i + 3] + 1;
1668         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1669         for (j = 0; j < nz; j++) {
1670           col    = pj[j];
1671           pc3[j] = rtmp3[col];
1672           rs += PetscAbsScalar(pc3[j]);
1673         }
1674 
1675         sctx.rs = rs;
1676         sctx.pv = rtmp3[i + 2];
1677         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1678         if (sctx.newshift) break;
1679         pc3  = b->a + bdiag[i + 2];
1680         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1681         break;
1682       case 4:
1683         /* zero rtmp */
1684         /* L part */
1685         nz    = bi[i + 1] - bi[i];
1686         bjtmp = bj + bi[i];
1687         for (j = 0; j < nz; j++) {
1688           col        = bjtmp[j];
1689           rtmp1[col] = 0.0;
1690           rtmp2[col] = 0.0;
1691           rtmp3[col] = 0.0;
1692           rtmp4[col] = 0.0;
1693         }
1694 
1695         /* U part */
1696         nz    = bdiag[i] - bdiag[i + 1];
1697         bjtmp = bj + bdiag[i + 1] + 1;
1698         for (j = 0; j < nz; j++) {
1699           col        = bjtmp[j];
1700           rtmp1[col] = 0.0;
1701           rtmp2[col] = 0.0;
1702           rtmp3[col] = 0.0;
1703           rtmp4[col] = 0.0;
1704         }
1705 
1706         /* load in initial (unfactored row) */
1707         nz    = ai[r[i] + 1] - ai[r[i]];
1708         ajtmp = aj + ai[r[i]];
1709         v1    = aa + ai[r[i]];
1710         v2    = aa + ai[r[i + 1]];
1711         v3    = aa + ai[r[i + 2]];
1712         v4    = aa + ai[r[i + 3]];
1713         for (j = 0; j < nz; j++) {
1714           col        = ics[ajtmp[j]];
1715           rtmp1[col] = v1[j];
1716           rtmp2[col] = v2[j];
1717           rtmp3[col] = v3[j];
1718           rtmp4[col] = v4[j];
1719         }
1720         /* ZeropivotApply(): shift the diagonal of the matrix  */
1721         rtmp1[i] += sctx.shift_amount;
1722         rtmp2[i + 1] += sctx.shift_amount;
1723         rtmp3[i + 2] += sctx.shift_amount;
1724         rtmp4[i + 3] += sctx.shift_amount;
1725 
1726         /* elimination */
1727         bjtmp = bj + bi[i];
1728         row   = *bjtmp++; /* pivot row */
1729         nzL   = bi[i + 1] - bi[i];
1730         for (k = 0; k < nzL; k++) {
1731           pc1 = rtmp1 + row;
1732           pc2 = rtmp2 + row;
1733           pc3 = rtmp3 + row;
1734           pc4 = rtmp4 + row;
1735           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1736             pv   = b->a + bdiag[row];
1737             mul1 = *pc1 * (*pv);
1738             mul2 = *pc2 * (*pv);
1739             mul3 = *pc3 * (*pv);
1740             mul4 = *pc4 * (*pv);
1741             *pc1 = mul1;
1742             *pc2 = mul2;
1743             *pc3 = mul3;
1744             *pc4 = mul4;
1745 
1746             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1747             pv = b->a + bdiag[row + 1] + 1;
1748             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1749             for (j = 0; j < nz; j++) {
1750               col = pj[j];
1751               rtmp1[col] -= mul1 * pv[j];
1752               rtmp2[col] -= mul2 * pv[j];
1753               rtmp3[col] -= mul3 * pv[j];
1754               rtmp4[col] -= mul4 * pv[j];
1755             }
1756             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1757           }
1758           row = *bjtmp++;
1759         }
1760 
1761         /* finished row i; check zero pivot, then stick row i into b->a */
1762         rs = 0.0;
1763         /* L part */
1764         pc1 = b->a + bi[i];
1765         pj  = b->j + bi[i];
1766         nz  = bi[i + 1] - bi[i];
1767         for (j = 0; j < nz; j++) {
1768           col    = pj[j];
1769           pc1[j] = rtmp1[col];
1770           rs += PetscAbsScalar(pc1[j]);
1771         }
1772         /* U part */
1773         pc1 = b->a + bdiag[i + 1] + 1;
1774         pj  = b->j + bdiag[i + 1] + 1;
1775         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1776         for (j = 0; j < nz; j++) {
1777           col    = pj[j];
1778           pc1[j] = rtmp1[col];
1779           rs += PetscAbsScalar(pc1[j]);
1780         }
1781 
1782         sctx.rs = rs;
1783         sctx.pv = rtmp1[i];
1784         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1785         if (sctx.newshift) break;
1786         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1787         *pc1 = 1.0 / sctx.pv;
1788 
1789         /* Now take care of 1st column of diagonal 4x4 block. */
1790         pc2 = rtmp2 + i;
1791         pc3 = rtmp3 + i;
1792         pc4 = rtmp4 + i;
1793         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1794           mul2 = (*pc2) * (*pc1);
1795           *pc2 = mul2;
1796           mul3 = (*pc3) * (*pc1);
1797           *pc3 = mul3;
1798           mul4 = (*pc4) * (*pc1);
1799           *pc4 = mul4;
1800           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1801           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1802           for (j = 0; j < nz; j++) {
1803             col = pj[j];
1804             rtmp2[col] -= mul2 * rtmp1[col];
1805             rtmp3[col] -= mul3 * rtmp1[col];
1806             rtmp4[col] -= mul4 * rtmp1[col];
1807           }
1808           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1809         }
1810 
1811         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1812         rs = 0.0;
1813         /* L part */
1814         pc2 = b->a + bi[i + 1];
1815         pj  = b->j + bi[i + 1];
1816         nz  = bi[i + 2] - bi[i + 1];
1817         for (j = 0; j < nz; j++) {
1818           col    = pj[j];
1819           pc2[j] = rtmp2[col];
1820           rs += PetscAbsScalar(pc2[j]);
1821         }
1822         /* U part */
1823         pc2 = b->a + bdiag[i + 2] + 1;
1824         pj  = b->j + bdiag[i + 2] + 1;
1825         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1826         for (j = 0; j < nz; j++) {
1827           col    = pj[j];
1828           pc2[j] = rtmp2[col];
1829           rs += PetscAbsScalar(pc2[j]);
1830         }
1831 
1832         sctx.rs = rs;
1833         sctx.pv = rtmp2[i + 1];
1834         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1835         if (sctx.newshift) break;
1836         pc2  = b->a + bdiag[i + 1];
1837         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1838 
1839         /* Now take care of 2nd column of diagonal 4x4 block. */
1840         pc3 = rtmp3 + i + 1;
1841         pc4 = rtmp4 + i + 1;
1842         if (*pc3 != 0.0 || *pc4 != 0.0) {
1843           mul3 = (*pc3) * (*pc2);
1844           *pc3 = mul3;
1845           mul4 = (*pc4) * (*pc2);
1846           *pc4 = mul4;
1847           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1848           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1849           for (j = 0; j < nz; j++) {
1850             col = pj[j];
1851             rtmp3[col] -= mul3 * rtmp2[col];
1852             rtmp4[col] -= mul4 * rtmp2[col];
1853           }
1854           PetscCall(PetscLogFlops(4.0 * nz));
1855         }
1856 
1857         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1858         rs = 0.0;
1859         /* L part */
1860         pc3 = b->a + bi[i + 2];
1861         pj  = b->j + bi[i + 2];
1862         nz  = bi[i + 3] - bi[i + 2];
1863         for (j = 0; j < nz; j++) {
1864           col    = pj[j];
1865           pc3[j] = rtmp3[col];
1866           rs += PetscAbsScalar(pc3[j]);
1867         }
1868         /* U part */
1869         pc3 = b->a + bdiag[i + 3] + 1;
1870         pj  = b->j + bdiag[i + 3] + 1;
1871         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1872         for (j = 0; j < nz; j++) {
1873           col    = pj[j];
1874           pc3[j] = rtmp3[col];
1875           rs += PetscAbsScalar(pc3[j]);
1876         }
1877 
1878         sctx.rs = rs;
1879         sctx.pv = rtmp3[i + 2];
1880         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1881         if (sctx.newshift) break;
1882         pc3  = b->a + bdiag[i + 2];
1883         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1884 
1885         /* Now take care of 3rd column of diagonal 4x4 block. */
1886         pc4 = rtmp4 + i + 2;
1887         if (*pc4 != 0.0) {
1888           mul4 = (*pc4) * (*pc3);
1889           *pc4 = mul4;
1890           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1891           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1892           for (j = 0; j < nz; j++) {
1893             col = pj[j];
1894             rtmp4[col] -= mul4 * rtmp3[col];
1895           }
1896           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1897         }
1898 
1899         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1900         rs = 0.0;
1901         /* L part */
1902         pc4 = b->a + bi[i + 3];
1903         pj  = b->j + bi[i + 3];
1904         nz  = bi[i + 4] - bi[i + 3];
1905         for (j = 0; j < nz; j++) {
1906           col    = pj[j];
1907           pc4[j] = rtmp4[col];
1908           rs += PetscAbsScalar(pc4[j]);
1909         }
1910         /* U part */
1911         pc4 = b->a + bdiag[i + 4] + 1;
1912         pj  = b->j + bdiag[i + 4] + 1;
1913         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1914         for (j = 0; j < nz; j++) {
1915           col    = pj[j];
1916           pc4[j] = rtmp4[col];
1917           rs += PetscAbsScalar(pc4[j]);
1918         }
1919 
1920         sctx.rs = rs;
1921         sctx.pv = rtmp4[i + 3];
1922         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1923         if (sctx.newshift) break;
1924         pc4  = b->a + bdiag[i + 3];
1925         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1926         break;
1927 
1928       default:
1929         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1930       }
1931       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1932       i += nodesz;              /* Update the row */
1933     }
1934 
1935     /* MatPivotRefine() */
1936     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1937       /*
1938        * if no shift in this attempt & shifting & started shifting & can refine,
1939        * then try lower shift
1940        */
1941       sctx.shift_hi       = sctx.shift_fraction;
1942       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1943       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1944       sctx.newshift       = PETSC_TRUE;
1945       sctx.nshift++;
1946     }
1947   } while (sctx.newshift);
1948 
1949   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1950   PetscCall(PetscFree(tmp_vec2));
1951   PetscCall(ISRestoreIndices(isicol, &ic));
1952   PetscCall(ISRestoreIndices(isrow, &r));
1953 
1954   if (b->inode.size_csr) {
1955     C->ops->solve = MatSolve_SeqAIJ_Inode;
1956   } else {
1957     C->ops->solve = MatSolve_SeqAIJ;
1958   }
1959   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1960   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1961   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1962   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1963   C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1964   C->assembled              = PETSC_TRUE;
1965   C->preallocated           = PETSC_TRUE;
1966 
1967   PetscCall(PetscLogFlops(C->cmap->n));
1968 
1969   /* MatShiftView(A,info,&sctx) */
1970   if (sctx.nshift) {
1971     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1972       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1974       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1976       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1977     }
1978   }
1979   PetscFunctionReturn(PETSC_SUCCESS);
1980 }
1981 
1982 #if 0
1983 // unused
1984 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1985 {
1986   Mat              C = B;
1987   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1988   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1989   const PetscInt  *r, *ic, *c, *ics;
1990   PetscInt         n = A->rmap->n, *bi = b->i;
1991   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1992   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1993   PetscInt        *ai = a->i, *aj = a->j;
1994   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1995   PetscScalar      mul1, mul2, mul3, tmp;
1996   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1997   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1998   PetscReal        rs = 0.0;
1999   FactorShiftCtx   sctx;
2000 
2001   PetscFunctionBegin;
2002   sctx.shift_top      = 0;
2003   sctx.nshift_max     = 0;
2004   sctx.shift_lo       = 0;
2005   sctx.shift_hi       = 0;
2006   sctx.shift_fraction = 0;
2007 
2008   /* if both shift schemes are chosen by user, only use info->shiftpd */
2009   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2010     sctx.shift_top = 0;
2011     for (i = 0; i < n; i++) {
2012       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2013       rs    = 0.0;
2014       ajtmp = aj + ai[i];
2015       rtmp1 = aa + ai[i];
2016       nz    = ai[i + 1] - ai[i];
2017       for (j = 0; j < nz; j++) {
2018         if (*ajtmp != i) {
2019           rs += PetscAbsScalar(*rtmp1++);
2020         } else {
2021           rs -= PetscRealPart(*rtmp1++);
2022         }
2023         ajtmp++;
2024       }
2025       if (rs > sctx.shift_top) sctx.shift_top = rs;
2026     }
2027     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2028     sctx.shift_top *= 1.1;
2029     sctx.nshift_max = 5;
2030     sctx.shift_lo   = 0.;
2031     sctx.shift_hi   = 1.;
2032   }
2033   sctx.shift_amount = 0;
2034   sctx.nshift       = 0;
2035 
2036   PetscCall(ISGetIndices(isrow, &r));
2037   PetscCall(ISGetIndices(iscol, &c));
2038   PetscCall(ISGetIndices(isicol, &ic));
2039   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2040   ics = ic;
2041 
2042   node_max = a->inode.node_count;
2043   ns       = a->inode.size;
2044   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2045 
2046   /* If max inode size > 3, split it into two inodes.*/
2047   /* also map the inode sizes according to the ordering */
2048   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2049   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2050     if (ns[i] > 3) {
2051       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2052       ++j;
2053       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2054     } else {
2055       tmp_vec1[j] = ns[i];
2056     }
2057   }
2058   /* Use the correct node_max */
2059   node_max = j;
2060 
2061   /* Now reorder the inode info based on mat re-ordering info */
2062   /* First create a row -> inode_size_array_index map */
2063   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2064   for (i = 0, row = 0; i < node_max; i++) {
2065     nodesz = tmp_vec1[i];
2066     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2067   }
2068   /* Using nsmap, create a reordered ns structure */
2069   for (i = 0, j = 0; i < node_max; i++) {
2070     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2071     tmp_vec2[i] = nodesz;
2072     j += nodesz;
2073   }
2074   PetscCall(PetscFree2(nsmap, tmp_vec1));
2075   /* Now use the correct ns */
2076   ns = tmp_vec2;
2077 
2078   do {
2079     sctx.newshift = PETSC_FALSE;
2080     /* Now loop over each block-row, and do the factorization */
2081     for (i = 0, row = 0; i < node_max; i++) {
2082       nodesz = ns[i];
2083       nz     = bi[row + 1] - bi[row];
2084       bjtmp  = bj + bi[row];
2085 
2086       switch (nodesz) {
2087       case 1:
2088         for (j = 0; j < nz; j++) {
2089           idx         = bjtmp[j];
2090           rtmp11[idx] = 0.0;
2091         }
2092 
2093         /* load in initial (unfactored row) */
2094         idx    = r[row];
2095         nz_tmp = ai[idx + 1] - ai[idx];
2096         ajtmp  = aj + ai[idx];
2097         v1     = aa + ai[idx];
2098 
2099         for (j = 0; j < nz_tmp; j++) {
2100           idx         = ics[ajtmp[j]];
2101           rtmp11[idx] = v1[j];
2102         }
2103         rtmp11[ics[r[row]]] += sctx.shift_amount;
2104 
2105         prow = *bjtmp++;
2106         while (prow < row) {
2107           pc1 = rtmp11 + prow;
2108           if (*pc1 != 0.0) {
2109             pv     = ba + bd[prow];
2110             pj     = nbj + bd[prow];
2111             mul1   = *pc1 * *pv++;
2112             *pc1   = mul1;
2113             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2114             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2115             for (j = 0; j < nz_tmp; j++) {
2116               tmp = pv[j];
2117               idx = pj[j];
2118               rtmp11[idx] -= mul1 * tmp;
2119             }
2120           }
2121           prow = *bjtmp++;
2122         }
2123         pj  = bj + bi[row];
2124         pc1 = ba + bi[row];
2125 
2126         sctx.pv     = rtmp11[row];
2127         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2128         rs          = 0.0;
2129         for (j = 0; j < nz; j++) {
2130           idx    = pj[j];
2131           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2132           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2133         }
2134         sctx.rs = rs;
2135         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2136         if (sctx.newshift) goto endofwhile;
2137         break;
2138 
2139       case 2:
2140         for (j = 0; j < nz; j++) {
2141           idx         = bjtmp[j];
2142           rtmp11[idx] = 0.0;
2143           rtmp22[idx] = 0.0;
2144         }
2145 
2146         /* load in initial (unfactored row) */
2147         idx    = r[row];
2148         nz_tmp = ai[idx + 1] - ai[idx];
2149         ajtmp  = aj + ai[idx];
2150         v1     = aa + ai[idx];
2151         v2     = aa + ai[idx + 1];
2152         for (j = 0; j < nz_tmp; j++) {
2153           idx         = ics[ajtmp[j]];
2154           rtmp11[idx] = v1[j];
2155           rtmp22[idx] = v2[j];
2156         }
2157         rtmp11[ics[r[row]]] += sctx.shift_amount;
2158         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2159 
2160         prow = *bjtmp++;
2161         while (prow < row) {
2162           pc1 = rtmp11 + prow;
2163           pc2 = rtmp22 + prow;
2164           if (*pc1 != 0.0 || *pc2 != 0.0) {
2165             pv   = ba + bd[prow];
2166             pj   = nbj + bd[prow];
2167             mul1 = *pc1 * *pv;
2168             mul2 = *pc2 * *pv;
2169             ++pv;
2170             *pc1 = mul1;
2171             *pc2 = mul2;
2172 
2173             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2174             for (j = 0; j < nz_tmp; j++) {
2175               tmp = pv[j];
2176               idx = pj[j];
2177               rtmp11[idx] -= mul1 * tmp;
2178               rtmp22[idx] -= mul2 * tmp;
2179             }
2180             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2181           }
2182           prow = *bjtmp++;
2183         }
2184 
2185         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2186         pc1 = rtmp11 + prow;
2187         pc2 = rtmp22 + prow;
2188 
2189         sctx.pv = *pc1;
2190         pj      = bj + bi[prow];
2191         rs      = 0.0;
2192         for (j = 0; j < nz; j++) {
2193           idx = pj[j];
2194           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2195         }
2196         sctx.rs = rs;
2197         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2198         if (sctx.newshift) goto endofwhile;
2199 
2200         if (*pc2 != 0.0) {
2201           pj     = nbj + bd[prow];
2202           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2203           *pc2   = mul2;
2204           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2205           for (j = 0; j < nz_tmp; j++) {
2206             idx = pj[j];
2207             tmp = rtmp11[idx];
2208             rtmp22[idx] -= mul2 * tmp;
2209           }
2210           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2211         }
2212 
2213         pj  = bj + bi[row];
2214         pc1 = ba + bi[row];
2215         pc2 = ba + bi[row + 1];
2216 
2217         sctx.pv         = rtmp22[row + 1];
2218         rs              = 0.0;
2219         rtmp11[row]     = 1.0 / rtmp11[row];
2220         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2221         /* copy row entries from dense representation to sparse */
2222         for (j = 0; j < nz; j++) {
2223           idx    = pj[j];
2224           pc1[j] = rtmp11[idx];
2225           pc2[j] = rtmp22[idx];
2226           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2227         }
2228         sctx.rs = rs;
2229         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2230         if (sctx.newshift) goto endofwhile;
2231         break;
2232 
2233       case 3:
2234         for (j = 0; j < nz; j++) {
2235           idx         = bjtmp[j];
2236           rtmp11[idx] = 0.0;
2237           rtmp22[idx] = 0.0;
2238           rtmp33[idx] = 0.0;
2239         }
2240         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2241         idx    = r[row];
2242         nz_tmp = ai[idx + 1] - ai[idx];
2243         ajtmp  = aj + ai[idx];
2244         v1     = aa + ai[idx];
2245         v2     = aa + ai[idx + 1];
2246         v3     = aa + ai[idx + 2];
2247         for (j = 0; j < nz_tmp; j++) {
2248           idx         = ics[ajtmp[j]];
2249           rtmp11[idx] = v1[j];
2250           rtmp22[idx] = v2[j];
2251           rtmp33[idx] = v3[j];
2252         }
2253         rtmp11[ics[r[row]]] += sctx.shift_amount;
2254         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2255         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2256 
2257         /* loop over all pivot row blocks above this row block */
2258         prow = *bjtmp++;
2259         while (prow < row) {
2260           pc1 = rtmp11 + prow;
2261           pc2 = rtmp22 + prow;
2262           pc3 = rtmp33 + prow;
2263           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2264             pv   = ba + bd[prow];
2265             pj   = nbj + bd[prow];
2266             mul1 = *pc1 * *pv;
2267             mul2 = *pc2 * *pv;
2268             mul3 = *pc3 * *pv;
2269             ++pv;
2270             *pc1 = mul1;
2271             *pc2 = mul2;
2272             *pc3 = mul3;
2273 
2274             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2275             /* update this row based on pivot row */
2276             for (j = 0; j < nz_tmp; j++) {
2277               tmp = pv[j];
2278               idx = pj[j];
2279               rtmp11[idx] -= mul1 * tmp;
2280               rtmp22[idx] -= mul2 * tmp;
2281               rtmp33[idx] -= mul3 * tmp;
2282             }
2283             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2284           }
2285           prow = *bjtmp++;
2286         }
2287 
2288         /* Now take care of diagonal 3x3 block in this set of rows */
2289         /* note: prow = row here */
2290         pc1 = rtmp11 + prow;
2291         pc2 = rtmp22 + prow;
2292         pc3 = rtmp33 + prow;
2293 
2294         sctx.pv = *pc1;
2295         pj      = bj + bi[prow];
2296         rs      = 0.0;
2297         for (j = 0; j < nz; j++) {
2298           idx = pj[j];
2299           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2300         }
2301         sctx.rs = rs;
2302         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2303         if (sctx.newshift) goto endofwhile;
2304 
2305         if (*pc2 != 0.0 || *pc3 != 0.0) {
2306           mul2   = (*pc2) / (*pc1);
2307           mul3   = (*pc3) / (*pc1);
2308           *pc2   = mul2;
2309           *pc3   = mul3;
2310           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2311           pj     = nbj + bd[prow];
2312           for (j = 0; j < nz_tmp; j++) {
2313             idx = pj[j];
2314             tmp = rtmp11[idx];
2315             rtmp22[idx] -= mul2 * tmp;
2316             rtmp33[idx] -= mul3 * tmp;
2317           }
2318           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2319         }
2320         ++prow;
2321 
2322         pc2     = rtmp22 + prow;
2323         pc3     = rtmp33 + prow;
2324         sctx.pv = *pc2;
2325         pj      = bj + bi[prow];
2326         rs      = 0.0;
2327         for (j = 0; j < nz; j++) {
2328           idx = pj[j];
2329           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2330         }
2331         sctx.rs = rs;
2332         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2333         if (sctx.newshift) goto endofwhile;
2334 
2335         if (*pc3 != 0.0) {
2336           mul3   = (*pc3) / (*pc2);
2337           *pc3   = mul3;
2338           pj     = nbj + bd[prow];
2339           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2340           for (j = 0; j < nz_tmp; j++) {
2341             idx = pj[j];
2342             tmp = rtmp22[idx];
2343             rtmp33[idx] -= mul3 * tmp;
2344           }
2345           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2346         }
2347 
2348         pj  = bj + bi[row];
2349         pc1 = ba + bi[row];
2350         pc2 = ba + bi[row + 1];
2351         pc3 = ba + bi[row + 2];
2352 
2353         sctx.pv         = rtmp33[row + 2];
2354         rs              = 0.0;
2355         rtmp11[row]     = 1.0 / rtmp11[row];
2356         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2357         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2358         /* copy row entries from dense representation to sparse */
2359         for (j = 0; j < nz; j++) {
2360           idx    = pj[j];
2361           pc1[j] = rtmp11[idx];
2362           pc2[j] = rtmp22[idx];
2363           pc3[j] = rtmp33[idx];
2364           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2365         }
2366 
2367         sctx.rs = rs;
2368         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2369         if (sctx.newshift) goto endofwhile;
2370         break;
2371 
2372       default:
2373         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2374       }
2375       row += nodesz; /* Update the row */
2376     }
2377   endofwhile:;
2378   } while (sctx.newshift);
2379   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2380   PetscCall(PetscFree(tmp_vec2));
2381   PetscCall(ISRestoreIndices(isicol, &ic));
2382   PetscCall(ISRestoreIndices(isrow, &r));
2383   PetscCall(ISRestoreIndices(iscol, &c));
2384 
2385   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2386   /* do not set solve add, since MatSolve_Inode + Add is faster */
2387   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2388   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2389   C->assembled              = PETSC_TRUE;
2390   C->preallocated           = PETSC_TRUE;
2391   if (sctx.nshift) {
2392     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2393       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2394     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2395       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2396     }
2397   }
2398   PetscCall(PetscLogFlops(C->cmap->n));
2399   PetscCall(MatSeqAIJCheckInode(C));
2400   PetscFunctionReturn(PETSC_SUCCESS);
2401 }
2402 #endif
2403 
2404 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2405 {
2406   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2407   IS                 iscol = a->col, isrow = a->row;
2408   const PetscInt    *r, *c, *rout, *cout;
2409   PetscInt           i, j;
2410   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2411   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2412   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2413   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2414   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2415   const PetscScalar *b;
2416 
2417   PetscFunctionBegin;
2418   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2419   node_max = a->inode.node_count;
2420   ns       = a->inode.size_csr; /* Node Size array */
2421 
2422   PetscCall(VecGetArrayRead(bb, &b));
2423   PetscCall(VecGetArrayWrite(xx, &x));
2424   tmp = a->solve_work;
2425 
2426   PetscCall(ISGetIndices(isrow, &rout));
2427   r = rout;
2428   PetscCall(ISGetIndices(iscol, &cout));
2429   c = cout;
2430 
2431   /* forward solve the lower triangular */
2432   tmps = tmp;
2433   aa   = a_a;
2434   aj   = a_j;
2435   ad   = a->diag;
2436 
2437   for (i = 0; i < node_max; ++i) {
2438     row = ns[i];
2439     nsz = ns[i + 1] - ns[i];
2440     aii = ai[row];
2441     v1  = aa + aii;
2442     vi  = aj + aii;
2443     nz  = ai[row + 1] - ai[row];
2444 
2445     if (i < node_max - 1) {
2446       /* Prefetch the indices for the next block */
2447       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2448       /* Prefetch the data for the next block */
2449       PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2450     }
2451 
2452     switch (nsz) { /* Each loop in 'case' is unrolled */
2453     case 1:
2454       sum1 = b[r[row]];
2455       for (j = 0; j < nz - 1; j += 2) {
2456         i0   = vi[j];
2457         i1   = vi[j + 1];
2458         tmp0 = tmps[i0];
2459         tmp1 = tmps[i1];
2460         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2461       }
2462       if (j == nz - 1) {
2463         tmp0 = tmps[vi[j]];
2464         sum1 -= v1[j] * tmp0;
2465       }
2466       tmp[row++] = sum1;
2467       break;
2468     case 2:
2469       sum1 = b[r[row]];
2470       sum2 = b[r[row + 1]];
2471       v2   = aa + ai[row + 1];
2472 
2473       for (j = 0; j < nz - 1; j += 2) {
2474         i0   = vi[j];
2475         i1   = vi[j + 1];
2476         tmp0 = tmps[i0];
2477         tmp1 = tmps[i1];
2478         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2479         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2480       }
2481       if (j == nz - 1) {
2482         tmp0 = tmps[vi[j]];
2483         sum1 -= v1[j] * tmp0;
2484         sum2 -= v2[j] * tmp0;
2485       }
2486       sum2 -= v2[nz] * sum1;
2487       tmp[row++] = sum1;
2488       tmp[row++] = sum2;
2489       break;
2490     case 3:
2491       sum1 = b[r[row]];
2492       sum2 = b[r[row + 1]];
2493       sum3 = b[r[row + 2]];
2494       v2   = aa + ai[row + 1];
2495       v3   = aa + ai[row + 2];
2496 
2497       for (j = 0; j < nz - 1; j += 2) {
2498         i0   = vi[j];
2499         i1   = vi[j + 1];
2500         tmp0 = tmps[i0];
2501         tmp1 = tmps[i1];
2502         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2503         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2504         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2505       }
2506       if (j == nz - 1) {
2507         tmp0 = tmps[vi[j]];
2508         sum1 -= v1[j] * tmp0;
2509         sum2 -= v2[j] * tmp0;
2510         sum3 -= v3[j] * tmp0;
2511       }
2512       sum2 -= v2[nz] * sum1;
2513       sum3 -= v3[nz] * sum1;
2514       sum3 -= v3[nz + 1] * sum2;
2515       tmp[row++] = sum1;
2516       tmp[row++] = sum2;
2517       tmp[row++] = sum3;
2518       break;
2519 
2520     case 4:
2521       sum1 = b[r[row]];
2522       sum2 = b[r[row + 1]];
2523       sum3 = b[r[row + 2]];
2524       sum4 = b[r[row + 3]];
2525       v2   = aa + ai[row + 1];
2526       v3   = aa + ai[row + 2];
2527       v4   = aa + ai[row + 3];
2528 
2529       for (j = 0; j < nz - 1; j += 2) {
2530         i0   = vi[j];
2531         i1   = vi[j + 1];
2532         tmp0 = tmps[i0];
2533         tmp1 = tmps[i1];
2534         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2535         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2536         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2537         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2538       }
2539       if (j == nz - 1) {
2540         tmp0 = tmps[vi[j]];
2541         sum1 -= v1[j] * tmp0;
2542         sum2 -= v2[j] * tmp0;
2543         sum3 -= v3[j] * tmp0;
2544         sum4 -= v4[j] * tmp0;
2545       }
2546       sum2 -= v2[nz] * sum1;
2547       sum3 -= v3[nz] * sum1;
2548       sum4 -= v4[nz] * sum1;
2549       sum3 -= v3[nz + 1] * sum2;
2550       sum4 -= v4[nz + 1] * sum2;
2551       sum4 -= v4[nz + 2] * sum3;
2552 
2553       tmp[row++] = sum1;
2554       tmp[row++] = sum2;
2555       tmp[row++] = sum3;
2556       tmp[row++] = sum4;
2557       break;
2558     case 5:
2559       sum1 = b[r[row]];
2560       sum2 = b[r[row + 1]];
2561       sum3 = b[r[row + 2]];
2562       sum4 = b[r[row + 3]];
2563       sum5 = b[r[row + 4]];
2564       v2   = aa + ai[row + 1];
2565       v3   = aa + ai[row + 2];
2566       v4   = aa + ai[row + 3];
2567       v5   = aa + ai[row + 4];
2568 
2569       for (j = 0; j < nz - 1; j += 2) {
2570         i0   = vi[j];
2571         i1   = vi[j + 1];
2572         tmp0 = tmps[i0];
2573         tmp1 = tmps[i1];
2574         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2575         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2576         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2577         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2578         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2579       }
2580       if (j == nz - 1) {
2581         tmp0 = tmps[vi[j]];
2582         sum1 -= v1[j] * tmp0;
2583         sum2 -= v2[j] * tmp0;
2584         sum3 -= v3[j] * tmp0;
2585         sum4 -= v4[j] * tmp0;
2586         sum5 -= v5[j] * tmp0;
2587       }
2588 
2589       sum2 -= v2[nz] * sum1;
2590       sum3 -= v3[nz] * sum1;
2591       sum4 -= v4[nz] * sum1;
2592       sum5 -= v5[nz] * sum1;
2593       sum3 -= v3[nz + 1] * sum2;
2594       sum4 -= v4[nz + 1] * sum2;
2595       sum5 -= v5[nz + 1] * sum2;
2596       sum4 -= v4[nz + 2] * sum3;
2597       sum5 -= v5[nz + 2] * sum3;
2598       sum5 -= v5[nz + 3] * sum4;
2599 
2600       tmp[row++] = sum1;
2601       tmp[row++] = sum2;
2602       tmp[row++] = sum3;
2603       tmp[row++] = sum4;
2604       tmp[row++] = sum5;
2605       break;
2606     default:
2607       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2608     }
2609   }
2610   /* backward solve the upper triangular */
2611   for (i = node_max - 1; i >= 0; i--) {
2612     row = ns[i + 1] - 1;
2613     nsz = ns[i + 1] - ns[i];
2614     aii = ad[row + 1] + 1;
2615     v1  = aa + aii;
2616     vi  = aj + aii;
2617     nz  = ad[row] - ad[row + 1] - 1;
2618 
2619     if (i > 0) {
2620       /* Prefetch the indices for the next block */
2621       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2622       /* Prefetch the data for the next block */
2623       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2624     }
2625 
2626     switch (nsz) { /* Each loop in 'case' is unrolled */
2627     case 1:
2628       sum1 = tmp[row];
2629 
2630       for (j = 0; j < nz - 1; j += 2) {
2631         i0   = vi[j];
2632         i1   = vi[j + 1];
2633         tmp0 = tmps[i0];
2634         tmp1 = tmps[i1];
2635         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2636       }
2637       if (j == nz - 1) {
2638         tmp0 = tmps[vi[j]];
2639         sum1 -= v1[j] * tmp0;
2640       }
2641       x[c[row]] = tmp[row] = sum1 * v1[nz];
2642       row--;
2643       break;
2644     case 2:
2645       sum1 = tmp[row];
2646       sum2 = tmp[row - 1];
2647       v2   = aa + ad[row] + 1;
2648       for (j = 0; j < nz - 1; j += 2) {
2649         i0   = vi[j];
2650         i1   = vi[j + 1];
2651         tmp0 = tmps[i0];
2652         tmp1 = tmps[i1];
2653         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2654         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2655       }
2656       if (j == nz - 1) {
2657         tmp0 = tmps[vi[j]];
2658         sum1 -= v1[j] * tmp0;
2659         sum2 -= v2[j + 1] * tmp0;
2660       }
2661 
2662       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2663       row--;
2664       sum2 -= v2[0] * tmp0;
2665       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2666       row--;
2667       break;
2668     case 3:
2669       sum1 = tmp[row];
2670       sum2 = tmp[row - 1];
2671       sum3 = tmp[row - 2];
2672       v2   = aa + ad[row] + 1;
2673       v3   = aa + ad[row - 1] + 1;
2674       for (j = 0; j < nz - 1; j += 2) {
2675         i0   = vi[j];
2676         i1   = vi[j + 1];
2677         tmp0 = tmps[i0];
2678         tmp1 = tmps[i1];
2679         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2680         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2681         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2682       }
2683       if (j == nz - 1) {
2684         tmp0 = tmps[vi[j]];
2685         sum1 -= v1[j] * tmp0;
2686         sum2 -= v2[j + 1] * tmp0;
2687         sum3 -= v3[j + 2] * tmp0;
2688       }
2689       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2690       row--;
2691       sum2 -= v2[0] * tmp0;
2692       sum3 -= v3[1] * tmp0;
2693       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2694       row--;
2695       sum3 -= v3[0] * tmp0;
2696       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2697       row--;
2698 
2699       break;
2700     case 4:
2701       sum1 = tmp[row];
2702       sum2 = tmp[row - 1];
2703       sum3 = tmp[row - 2];
2704       sum4 = tmp[row - 3];
2705       v2   = aa + ad[row] + 1;
2706       v3   = aa + ad[row - 1] + 1;
2707       v4   = aa + ad[row - 2] + 1;
2708 
2709       for (j = 0; j < nz - 1; j += 2) {
2710         i0   = vi[j];
2711         i1   = vi[j + 1];
2712         tmp0 = tmps[i0];
2713         tmp1 = tmps[i1];
2714         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2715         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2716         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2717         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2718       }
2719       if (j == nz - 1) {
2720         tmp0 = tmps[vi[j]];
2721         sum1 -= v1[j] * tmp0;
2722         sum2 -= v2[j + 1] * tmp0;
2723         sum3 -= v3[j + 2] * tmp0;
2724         sum4 -= v4[j + 3] * tmp0;
2725       }
2726 
2727       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2728       row--;
2729       sum2 -= v2[0] * tmp0;
2730       sum3 -= v3[1] * tmp0;
2731       sum4 -= v4[2] * tmp0;
2732       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2733       row--;
2734       sum3 -= v3[0] * tmp0;
2735       sum4 -= v4[1] * tmp0;
2736       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2737       row--;
2738       sum4 -= v4[0] * tmp0;
2739       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2740       row--;
2741       break;
2742     case 5:
2743       sum1 = tmp[row];
2744       sum2 = tmp[row - 1];
2745       sum3 = tmp[row - 2];
2746       sum4 = tmp[row - 3];
2747       sum5 = tmp[row - 4];
2748       v2   = aa + ad[row] + 1;
2749       v3   = aa + ad[row - 1] + 1;
2750       v4   = aa + ad[row - 2] + 1;
2751       v5   = aa + ad[row - 3] + 1;
2752       for (j = 0; j < nz - 1; j += 2) {
2753         i0   = vi[j];
2754         i1   = vi[j + 1];
2755         tmp0 = tmps[i0];
2756         tmp1 = tmps[i1];
2757         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2758         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2759         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2760         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2761         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2762       }
2763       if (j == nz - 1) {
2764         tmp0 = tmps[vi[j]];
2765         sum1 -= v1[j] * tmp0;
2766         sum2 -= v2[j + 1] * tmp0;
2767         sum3 -= v3[j + 2] * tmp0;
2768         sum4 -= v4[j + 3] * tmp0;
2769         sum5 -= v5[j + 4] * tmp0;
2770       }
2771 
2772       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2773       row--;
2774       sum2 -= v2[0] * tmp0;
2775       sum3 -= v3[1] * tmp0;
2776       sum4 -= v4[2] * tmp0;
2777       sum5 -= v5[3] * tmp0;
2778       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2779       row--;
2780       sum3 -= v3[0] * tmp0;
2781       sum4 -= v4[1] * tmp0;
2782       sum5 -= v5[2] * tmp0;
2783       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2784       row--;
2785       sum4 -= v4[0] * tmp0;
2786       sum5 -= v5[1] * tmp0;
2787       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2788       row--;
2789       sum5 -= v5[0] * tmp0;
2790       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2791       row--;
2792       break;
2793     default:
2794       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2795     }
2796   }
2797   PetscCall(ISRestoreIndices(isrow, &rout));
2798   PetscCall(ISRestoreIndices(iscol, &cout));
2799   PetscCall(VecRestoreArrayRead(bb, &b));
2800   PetscCall(VecRestoreArrayWrite(xx, &x));
2801   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2802   PetscFunctionReturn(PETSC_SUCCESS);
2803 }
2804 
2805 /*
2806      Makes a longer coloring[] array and calls the usual code with that
2807 */
2808 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2809 {
2810   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2811   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
2812   PetscInt        *colorused, i;
2813   ISColoringValue *newcolor;
2814 
2815   PetscFunctionBegin;
2816   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2817   PetscCall(PetscMalloc1(n + 1, &newcolor));
2818   /* loop over inodes, marking a color for each column*/
2819   row = 0;
2820   for (i = 0; i < m; i++) {
2821     for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2822   }
2823 
2824   /* eliminate unneeded colors */
2825   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2826   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2827 
2828   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2829   ncolors = colorused[5 * ncolors - 1];
2830   for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2831   PetscCall(PetscFree(colorused));
2832   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2833   PetscCall(PetscFree(coloring));
2834   PetscFunctionReturn(PETSC_SUCCESS);
2835 }
2836 
2837 #include <petsc/private/kernels/blockinvert.h>
2838 
2839 /*
2840    Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
2841 */
2842 static PetscErrorCode MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A, PetscScalar omega, PetscScalar fshift)
2843 {
2844   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data;
2845   MatScalar       *ibdiag, *bdiag, work[25];
2846   const MatScalar *v         = a->a;
2847   PetscReal        zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2848   PetscInt         m = a->inode.node_count, cnt = 0, i, j, row, nodesz;
2849   PetscInt         k, ipvt[5];
2850   PetscBool        allowzeropivot = PetscNot(A->erroriffailure), zeropivotdetected;
2851   const PetscInt  *sizes = a->inode.size_csr, *diag = a->diag;
2852 
2853   PetscFunctionBegin;
2854   if (a->idiagState == ((PetscObject)A)->state) PetscFunctionReturn(PETSC_SUCCESS);
2855   PetscCall(MatMarkDiagonal_SeqAIJ(A));
2856   diag = a->diag;
2857   if (!a->inode.ibdiag) {
2858     /* calculate space needed for diagonal blocks */
2859     for (i = 0; i < m; i++) {
2860       nodesz = sizes[i + 1] - sizes[i];
2861       cnt += nodesz * nodesz;
2862     }
2863     a->inode.bdiagsize = cnt;
2864     PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2865   }
2866 
2867   /* copy over the diagonal blocks and invert them */
2868   ibdiag = a->inode.ibdiag;
2869   bdiag  = a->inode.bdiag;
2870   cnt    = 0;
2871   for (i = 0, row = 0; i < m; i++) {
2872     nodesz = sizes[i + 1] - sizes[i];
2873     for (j = 0; j < nodesz; j++) {
2874       for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
2875     }
2876     PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
2877 
2878     switch (nodesz) {
2879     case 1:
2880       /* Create matrix data structure */
2881       if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2882         PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2883         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2884         A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2885         A->factorerror_zeropivot_row   = row;
2886         PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2887       }
2888       ibdiag[cnt] = 1.0 / ibdiag[cnt];
2889       break;
2890     case 2:
2891       PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2892       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2893       break;
2894     case 3:
2895       PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2896       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2897       break;
2898     case 4:
2899       PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2900       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2901       break;
2902     case 5:
2903       PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2904       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2905       break;
2906     default:
2907       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2908     }
2909     cnt += nodesz * nodesz;
2910     row += nodesz;
2911   }
2912   a->inode.ibdiagState = ((PetscObject)A)->state;
2913   PetscFunctionReturn(PETSC_SUCCESS);
2914 }
2915 
2916 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2917 {
2918   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2919   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2920   MatScalar         *ibdiag, *bdiag, *t;
2921   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2922   const MatScalar   *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2923   const PetscScalar *xb, *b;
2924   PetscInt           n, m = a->inode.node_count, cnt = 0, i, row, i1, i2, nodesz;
2925   PetscInt           sz;
2926   const PetscInt    *sizes = a->inode.size_csr, *idx, *diag, *ii = a->i;
2927 
2928   PetscFunctionBegin;
2929   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2930   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2931   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2932   PetscCall(MatInvertDiagonalForSOR_SeqAIJ_Inode(A, omega, fshift));
2933   diag = a->diag;
2934 
2935   ibdiag = a->inode.ibdiag;
2936   bdiag  = a->inode.bdiag;
2937   t      = a->inode.ssor_work;
2938 
2939   PetscCall(VecGetArray(xx, &x));
2940   PetscCall(VecGetArrayRead(bb, &b));
2941   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2942   if (flag & SOR_ZERO_INITIAL_GUESS) {
2943     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2944       for (i = 0, row = 0; i < m; i++) {
2945         sz  = diag[row] - ii[row];
2946         v1  = a->a + ii[row];
2947         idx = a->j + ii[row];
2948 
2949         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2950         nodesz = sizes[i + 1] - sizes[i];
2951         switch (nodesz) {
2952         case 1:
2953 
2954           sum1 = b[row];
2955           for (n = 0; n < sz - 1; n += 2) {
2956             i1 = idx[0];
2957             i2 = idx[1];
2958             idx += 2;
2959             tmp0 = x[i1];
2960             tmp1 = x[i2];
2961             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2962             v1 += 2;
2963           }
2964 
2965           if (n == sz - 1) {
2966             tmp0 = x[*idx];
2967             sum1 -= *v1 * tmp0;
2968           }
2969           t[row]   = sum1;
2970           x[row++] = sum1 * (*ibdiag++);
2971           break;
2972         case 2:
2973           v2   = a->a + ii[row + 1];
2974           sum1 = b[row];
2975           sum2 = b[row + 1];
2976           for (n = 0; n < sz - 1; n += 2) {
2977             i1 = idx[0];
2978             i2 = idx[1];
2979             idx += 2;
2980             tmp0 = x[i1];
2981             tmp1 = x[i2];
2982             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2983             v1 += 2;
2984             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2985             v2 += 2;
2986           }
2987 
2988           if (n == sz - 1) {
2989             tmp0 = x[*idx];
2990             sum1 -= v1[0] * tmp0;
2991             sum2 -= v2[0] * tmp0;
2992           }
2993           t[row]     = sum1;
2994           t[row + 1] = sum2;
2995           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2996           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2997           ibdiag += 4;
2998           break;
2999         case 3:
3000           v2   = a->a + ii[row + 1];
3001           v3   = a->a + ii[row + 2];
3002           sum1 = b[row];
3003           sum2 = b[row + 1];
3004           sum3 = b[row + 2];
3005           for (n = 0; n < sz - 1; n += 2) {
3006             i1 = idx[0];
3007             i2 = idx[1];
3008             idx += 2;
3009             tmp0 = x[i1];
3010             tmp1 = x[i2];
3011             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3012             v1 += 2;
3013             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3014             v2 += 2;
3015             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3016             v3 += 2;
3017           }
3018 
3019           if (n == sz - 1) {
3020             tmp0 = x[*idx];
3021             sum1 -= v1[0] * tmp0;
3022             sum2 -= v2[0] * tmp0;
3023             sum3 -= v3[0] * tmp0;
3024           }
3025           t[row]     = sum1;
3026           t[row + 1] = sum2;
3027           t[row + 2] = sum3;
3028           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3029           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3030           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3031           ibdiag += 9;
3032           break;
3033         case 4:
3034           v2   = a->a + ii[row + 1];
3035           v3   = a->a + ii[row + 2];
3036           v4   = a->a + ii[row + 3];
3037           sum1 = b[row];
3038           sum2 = b[row + 1];
3039           sum3 = b[row + 2];
3040           sum4 = b[row + 3];
3041           for (n = 0; n < sz - 1; n += 2) {
3042             i1 = idx[0];
3043             i2 = idx[1];
3044             idx += 2;
3045             tmp0 = x[i1];
3046             tmp1 = x[i2];
3047             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3048             v1 += 2;
3049             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3050             v2 += 2;
3051             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3052             v3 += 2;
3053             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3054             v4 += 2;
3055           }
3056 
3057           if (n == sz - 1) {
3058             tmp0 = x[*idx];
3059             sum1 -= v1[0] * tmp0;
3060             sum2 -= v2[0] * tmp0;
3061             sum3 -= v3[0] * tmp0;
3062             sum4 -= v4[0] * tmp0;
3063           }
3064           t[row]     = sum1;
3065           t[row + 1] = sum2;
3066           t[row + 2] = sum3;
3067           t[row + 3] = sum4;
3068           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3069           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3070           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3071           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3072           ibdiag += 16;
3073           break;
3074         case 5:
3075           v2   = a->a + ii[row + 1];
3076           v3   = a->a + ii[row + 2];
3077           v4   = a->a + ii[row + 3];
3078           v5   = a->a + ii[row + 4];
3079           sum1 = b[row];
3080           sum2 = b[row + 1];
3081           sum3 = b[row + 2];
3082           sum4 = b[row + 3];
3083           sum5 = b[row + 4];
3084           for (n = 0; n < sz - 1; n += 2) {
3085             i1 = idx[0];
3086             i2 = idx[1];
3087             idx += 2;
3088             tmp0 = x[i1];
3089             tmp1 = x[i2];
3090             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3091             v1 += 2;
3092             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3093             v2 += 2;
3094             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3095             v3 += 2;
3096             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3097             v4 += 2;
3098             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3099             v5 += 2;
3100           }
3101 
3102           if (n == sz - 1) {
3103             tmp0 = x[*idx];
3104             sum1 -= v1[0] * tmp0;
3105             sum2 -= v2[0] * tmp0;
3106             sum3 -= v3[0] * tmp0;
3107             sum4 -= v4[0] * tmp0;
3108             sum5 -= v5[0] * tmp0;
3109           }
3110           t[row]     = sum1;
3111           t[row + 1] = sum2;
3112           t[row + 2] = sum3;
3113           t[row + 3] = sum4;
3114           t[row + 4] = sum5;
3115           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3116           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3117           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3118           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3119           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3120           ibdiag += 25;
3121           break;
3122         default:
3123           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3124         }
3125       }
3126 
3127       xb = t;
3128       PetscCall(PetscLogFlops(a->nz));
3129     } else xb = b;
3130     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3131       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3132       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3133         nodesz = sizes[i + 1] - sizes[i];
3134         ibdiag -= nodesz * nodesz;
3135         sz  = ii[row + 1] - diag[row] - 1;
3136         v1  = a->a + diag[row] + 1;
3137         idx = a->j + diag[row] + 1;
3138 
3139         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3140         switch (nodesz) {
3141         case 1:
3142 
3143           sum1 = xb[row];
3144           for (n = 0; n < sz - 1; n += 2) {
3145             i1 = idx[0];
3146             i2 = idx[1];
3147             idx += 2;
3148             tmp0 = x[i1];
3149             tmp1 = x[i2];
3150             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3151             v1 += 2;
3152           }
3153 
3154           if (n == sz - 1) {
3155             tmp0 = x[*idx];
3156             sum1 -= *v1 * tmp0;
3157           }
3158           x[row--] = sum1 * (*ibdiag);
3159           break;
3160 
3161         case 2:
3162 
3163           sum1 = xb[row];
3164           sum2 = xb[row - 1];
3165           /* note that sum1 is associated with the second of the two rows */
3166           v2 = a->a + diag[row - 1] + 2;
3167           for (n = 0; n < sz - 1; n += 2) {
3168             i1 = idx[0];
3169             i2 = idx[1];
3170             idx += 2;
3171             tmp0 = x[i1];
3172             tmp1 = x[i2];
3173             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3174             v1 += 2;
3175             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3176             v2 += 2;
3177           }
3178 
3179           if (n == sz - 1) {
3180             tmp0 = x[*idx];
3181             sum1 -= *v1 * tmp0;
3182             sum2 -= *v2 * tmp0;
3183           }
3184           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3185           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3186           break;
3187         case 3:
3188 
3189           sum1 = xb[row];
3190           sum2 = xb[row - 1];
3191           sum3 = xb[row - 2];
3192           v2   = a->a + diag[row - 1] + 2;
3193           v3   = a->a + diag[row - 2] + 3;
3194           for (n = 0; n < sz - 1; n += 2) {
3195             i1 = idx[0];
3196             i2 = idx[1];
3197             idx += 2;
3198             tmp0 = x[i1];
3199             tmp1 = x[i2];
3200             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3201             v1 += 2;
3202             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3203             v2 += 2;
3204             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3205             v3 += 2;
3206           }
3207 
3208           if (n == sz - 1) {
3209             tmp0 = x[*idx];
3210             sum1 -= *v1 * tmp0;
3211             sum2 -= *v2 * tmp0;
3212             sum3 -= *v3 * tmp0;
3213           }
3214           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3215           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3216           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3217           break;
3218         case 4:
3219 
3220           sum1 = xb[row];
3221           sum2 = xb[row - 1];
3222           sum3 = xb[row - 2];
3223           sum4 = xb[row - 3];
3224           v2   = a->a + diag[row - 1] + 2;
3225           v3   = a->a + diag[row - 2] + 3;
3226           v4   = a->a + diag[row - 3] + 4;
3227           for (n = 0; n < sz - 1; n += 2) {
3228             i1 = idx[0];
3229             i2 = idx[1];
3230             idx += 2;
3231             tmp0 = x[i1];
3232             tmp1 = x[i2];
3233             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3234             v1 += 2;
3235             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3236             v2 += 2;
3237             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3238             v3 += 2;
3239             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3240             v4 += 2;
3241           }
3242 
3243           if (n == sz - 1) {
3244             tmp0 = x[*idx];
3245             sum1 -= *v1 * tmp0;
3246             sum2 -= *v2 * tmp0;
3247             sum3 -= *v3 * tmp0;
3248             sum4 -= *v4 * tmp0;
3249           }
3250           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3251           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3252           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3253           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3254           break;
3255         case 5:
3256 
3257           sum1 = xb[row];
3258           sum2 = xb[row - 1];
3259           sum3 = xb[row - 2];
3260           sum4 = xb[row - 3];
3261           sum5 = xb[row - 4];
3262           v2   = a->a + diag[row - 1] + 2;
3263           v3   = a->a + diag[row - 2] + 3;
3264           v4   = a->a + diag[row - 3] + 4;
3265           v5   = a->a + diag[row - 4] + 5;
3266           for (n = 0; n < sz - 1; n += 2) {
3267             i1 = idx[0];
3268             i2 = idx[1];
3269             idx += 2;
3270             tmp0 = x[i1];
3271             tmp1 = x[i2];
3272             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3273             v1 += 2;
3274             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3275             v2 += 2;
3276             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3277             v3 += 2;
3278             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3279             v4 += 2;
3280             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3281             v5 += 2;
3282           }
3283 
3284           if (n == sz - 1) {
3285             tmp0 = x[*idx];
3286             sum1 -= *v1 * tmp0;
3287             sum2 -= *v2 * tmp0;
3288             sum3 -= *v3 * tmp0;
3289             sum4 -= *v4 * tmp0;
3290             sum5 -= *v5 * tmp0;
3291           }
3292           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3293           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3294           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3295           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3296           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3297           break;
3298         default:
3299           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3300         }
3301       }
3302 
3303       PetscCall(PetscLogFlops(a->nz));
3304     }
3305     its--;
3306   }
3307   while (its--) {
3308     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3309       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
3310         nodesz = sizes[i + 1] - sizes[i];
3311         sz     = diag[row] - ii[row];
3312         v1     = a->a + ii[row];
3313         idx    = a->j + ii[row];
3314         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3315         switch (nodesz) {
3316         case 1:
3317           sum1 = b[row];
3318           for (n = 0; n < sz - 1; n += 2) {
3319             i1 = idx[0];
3320             i2 = idx[1];
3321             idx += 2;
3322             tmp0 = x[i1];
3323             tmp1 = x[i2];
3324             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3325             v1 += 2;
3326           }
3327           if (n == sz - 1) {
3328             tmp0 = x[*idx++];
3329             sum1 -= *v1 * tmp0;
3330             v1++;
3331           }
3332           t[row] = sum1;
3333           sz     = ii[row + 1] - diag[row] - 1;
3334           idx    = a->j + diag[row] + 1;
3335           v1 += 1;
3336           for (n = 0; n < sz - 1; n += 2) {
3337             i1 = idx[0];
3338             i2 = idx[1];
3339             idx += 2;
3340             tmp0 = x[i1];
3341             tmp1 = x[i2];
3342             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3343             v1 += 2;
3344           }
3345           if (n == sz - 1) {
3346             tmp0 = x[*idx++];
3347             sum1 -= *v1 * tmp0;
3348           }
3349           /* in MatSOR_SeqAIJ this line would be
3350            *
3351            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3352            *
3353            * but omega == 1, so this becomes
3354            *
3355            * x[row] = sum1*(*ibdiag++);
3356            *
3357            */
3358           x[row] = sum1 * (*ibdiag);
3359           break;
3360         case 2:
3361           v2   = a->a + ii[row + 1];
3362           sum1 = b[row];
3363           sum2 = b[row + 1];
3364           for (n = 0; n < sz - 1; n += 2) {
3365             i1 = idx[0];
3366             i2 = idx[1];
3367             idx += 2;
3368             tmp0 = x[i1];
3369             tmp1 = x[i2];
3370             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3371             v1 += 2;
3372             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3373             v2 += 2;
3374           }
3375           if (n == sz - 1) {
3376             tmp0 = x[*idx++];
3377             sum1 -= v1[0] * tmp0;
3378             sum2 -= v2[0] * tmp0;
3379             v1++;
3380             v2++;
3381           }
3382           t[row]     = sum1;
3383           t[row + 1] = sum2;
3384           sz         = ii[row + 1] - diag[row] - 2;
3385           idx        = a->j + diag[row] + 2;
3386           v1 += 2;
3387           v2 += 2;
3388           for (n = 0; n < sz - 1; n += 2) {
3389             i1 = idx[0];
3390             i2 = idx[1];
3391             idx += 2;
3392             tmp0 = x[i1];
3393             tmp1 = x[i2];
3394             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3395             v1 += 2;
3396             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3397             v2 += 2;
3398           }
3399           if (n == sz - 1) {
3400             tmp0 = x[*idx];
3401             sum1 -= v1[0] * tmp0;
3402             sum2 -= v2[0] * tmp0;
3403           }
3404           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3405           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3406           break;
3407         case 3:
3408           v2   = a->a + ii[row + 1];
3409           v3   = a->a + ii[row + 2];
3410           sum1 = b[row];
3411           sum2 = b[row + 1];
3412           sum3 = b[row + 2];
3413           for (n = 0; n < sz - 1; n += 2) {
3414             i1 = idx[0];
3415             i2 = idx[1];
3416             idx += 2;
3417             tmp0 = x[i1];
3418             tmp1 = x[i2];
3419             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3420             v1 += 2;
3421             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3422             v2 += 2;
3423             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3424             v3 += 2;
3425           }
3426           if (n == sz - 1) {
3427             tmp0 = x[*idx++];
3428             sum1 -= v1[0] * tmp0;
3429             sum2 -= v2[0] * tmp0;
3430             sum3 -= v3[0] * tmp0;
3431             v1++;
3432             v2++;
3433             v3++;
3434           }
3435           t[row]     = sum1;
3436           t[row + 1] = sum2;
3437           t[row + 2] = sum3;
3438           sz         = ii[row + 1] - diag[row] - 3;
3439           idx        = a->j + diag[row] + 3;
3440           v1 += 3;
3441           v2 += 3;
3442           v3 += 3;
3443           for (n = 0; n < sz - 1; n += 2) {
3444             i1 = idx[0];
3445             i2 = idx[1];
3446             idx += 2;
3447             tmp0 = x[i1];
3448             tmp1 = x[i2];
3449             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3450             v1 += 2;
3451             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3452             v2 += 2;
3453             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3454             v3 += 2;
3455           }
3456           if (n == sz - 1) {
3457             tmp0 = x[*idx];
3458             sum1 -= v1[0] * tmp0;
3459             sum2 -= v2[0] * tmp0;
3460             sum3 -= v3[0] * tmp0;
3461           }
3462           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3463           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3464           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3465           break;
3466         case 4:
3467           v2   = a->a + ii[row + 1];
3468           v3   = a->a + ii[row + 2];
3469           v4   = a->a + ii[row + 3];
3470           sum1 = b[row];
3471           sum2 = b[row + 1];
3472           sum3 = b[row + 2];
3473           sum4 = b[row + 3];
3474           for (n = 0; n < sz - 1; n += 2) {
3475             i1 = idx[0];
3476             i2 = idx[1];
3477             idx += 2;
3478             tmp0 = x[i1];
3479             tmp1 = x[i2];
3480             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3481             v1 += 2;
3482             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3483             v2 += 2;
3484             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3485             v3 += 2;
3486             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3487             v4 += 2;
3488           }
3489           if (n == sz - 1) {
3490             tmp0 = x[*idx++];
3491             sum1 -= v1[0] * tmp0;
3492             sum2 -= v2[0] * tmp0;
3493             sum3 -= v3[0] * tmp0;
3494             sum4 -= v4[0] * tmp0;
3495             v1++;
3496             v2++;
3497             v3++;
3498             v4++;
3499           }
3500           t[row]     = sum1;
3501           t[row + 1] = sum2;
3502           t[row + 2] = sum3;
3503           t[row + 3] = sum4;
3504           sz         = ii[row + 1] - diag[row] - 4;
3505           idx        = a->j + diag[row] + 4;
3506           v1 += 4;
3507           v2 += 4;
3508           v3 += 4;
3509           v4 += 4;
3510           for (n = 0; n < sz - 1; n += 2) {
3511             i1 = idx[0];
3512             i2 = idx[1];
3513             idx += 2;
3514             tmp0 = x[i1];
3515             tmp1 = x[i2];
3516             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3517             v1 += 2;
3518             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3519             v2 += 2;
3520             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3521             v3 += 2;
3522             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3523             v4 += 2;
3524           }
3525           if (n == sz - 1) {
3526             tmp0 = x[*idx];
3527             sum1 -= v1[0] * tmp0;
3528             sum2 -= v2[0] * tmp0;
3529             sum3 -= v3[0] * tmp0;
3530             sum4 -= v4[0] * tmp0;
3531           }
3532           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3533           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3534           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3535           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3536           break;
3537         case 5:
3538           v2   = a->a + ii[row + 1];
3539           v3   = a->a + ii[row + 2];
3540           v4   = a->a + ii[row + 3];
3541           v5   = a->a + ii[row + 4];
3542           sum1 = b[row];
3543           sum2 = b[row + 1];
3544           sum3 = b[row + 2];
3545           sum4 = b[row + 3];
3546           sum5 = b[row + 4];
3547           for (n = 0; n < sz - 1; n += 2) {
3548             i1 = idx[0];
3549             i2 = idx[1];
3550             idx += 2;
3551             tmp0 = x[i1];
3552             tmp1 = x[i2];
3553             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3554             v1 += 2;
3555             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3556             v2 += 2;
3557             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3558             v3 += 2;
3559             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3560             v4 += 2;
3561             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3562             v5 += 2;
3563           }
3564           if (n == sz - 1) {
3565             tmp0 = x[*idx++];
3566             sum1 -= v1[0] * tmp0;
3567             sum2 -= v2[0] * tmp0;
3568             sum3 -= v3[0] * tmp0;
3569             sum4 -= v4[0] * tmp0;
3570             sum5 -= v5[0] * tmp0;
3571             v1++;
3572             v2++;
3573             v3++;
3574             v4++;
3575             v5++;
3576           }
3577           t[row]     = sum1;
3578           t[row + 1] = sum2;
3579           t[row + 2] = sum3;
3580           t[row + 3] = sum4;
3581           t[row + 4] = sum5;
3582           sz         = ii[row + 1] - diag[row] - 5;
3583           idx        = a->j + diag[row] + 5;
3584           v1 += 5;
3585           v2 += 5;
3586           v3 += 5;
3587           v4 += 5;
3588           v5 += 5;
3589           for (n = 0; n < sz - 1; n += 2) {
3590             i1 = idx[0];
3591             i2 = idx[1];
3592             idx += 2;
3593             tmp0 = x[i1];
3594             tmp1 = x[i2];
3595             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3596             v1 += 2;
3597             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3598             v2 += 2;
3599             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3600             v3 += 2;
3601             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3602             v4 += 2;
3603             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3604             v5 += 2;
3605           }
3606           if (n == sz - 1) {
3607             tmp0 = x[*idx];
3608             sum1 -= v1[0] * tmp0;
3609             sum2 -= v2[0] * tmp0;
3610             sum3 -= v3[0] * tmp0;
3611             sum4 -= v4[0] * tmp0;
3612             sum5 -= v5[0] * tmp0;
3613           }
3614           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3615           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3616           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3617           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3618           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3619           break;
3620         default:
3621           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3622         }
3623       }
3624       xb = t;
3625       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3626     } else xb = b;
3627 
3628     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3629       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3630       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3631         nodesz = sizes[i + 1] - sizes[i];
3632         ibdiag -= nodesz * nodesz;
3633 
3634         /* set RHS */
3635         if (xb == b) {
3636           /* whole (old way) */
3637           sz  = ii[row + 1] - ii[row];
3638           idx = a->j + ii[row];
3639           switch (nodesz) {
3640           case 5:
3641             v5 = a->a + ii[row - 4]; /* fall through */
3642           case 4:
3643             v4 = a->a + ii[row - 3]; /* fall through */
3644           case 3:
3645             v3 = a->a + ii[row - 2]; /* fall through */
3646           case 2:
3647             v2 = a->a + ii[row - 1]; /* fall through */
3648           case 1:
3649             v1 = a->a + ii[row];
3650             break;
3651           default:
3652             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3653           }
3654         } else {
3655           /* upper, no diag */
3656           sz  = ii[row + 1] - diag[row] - 1;
3657           idx = a->j + diag[row] + 1;
3658           switch (nodesz) {
3659           case 5:
3660             v5 = a->a + diag[row - 4] + 5; /* fall through */
3661           case 4:
3662             v4 = a->a + diag[row - 3] + 4; /* fall through */
3663           case 3:
3664             v3 = a->a + diag[row - 2] + 3; /* fall through */
3665           case 2:
3666             v2 = a->a + diag[row - 1] + 2; /* fall through */
3667           case 1:
3668             v1 = a->a + diag[row] + 1;
3669           }
3670         }
3671         /* set sum */
3672         switch (nodesz) {
3673         case 5:
3674           sum5 = xb[row - 4]; /* fall through */
3675         case 4:
3676           sum4 = xb[row - 3]; /* fall through */
3677         case 3:
3678           sum3 = xb[row - 2]; /* fall through */
3679         case 2:
3680           sum2 = xb[row - 1]; /* fall through */
3681         case 1:
3682           /* note that sum1 is associated with the last row */
3683           sum1 = xb[row];
3684         }
3685         /* do sums */
3686         for (n = 0; n < sz - 1; n += 2) {
3687           i1 = idx[0];
3688           i2 = idx[1];
3689           idx += 2;
3690           tmp0 = x[i1];
3691           tmp1 = x[i2];
3692           switch (nodesz) {
3693           case 5:
3694             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3695             v5 += 2; /* fall through */
3696           case 4:
3697             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3698             v4 += 2; /* fall through */
3699           case 3:
3700             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3701             v3 += 2; /* fall through */
3702           case 2:
3703             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3704             v2 += 2; /* fall through */
3705           case 1:
3706             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3707             v1 += 2;
3708           }
3709         }
3710         /* ragged edge */
3711         if (n == sz - 1) {
3712           tmp0 = x[*idx];
3713           switch (nodesz) {
3714           case 5:
3715             sum5 -= *v5 * tmp0; /* fall through */
3716           case 4:
3717             sum4 -= *v4 * tmp0; /* fall through */
3718           case 3:
3719             sum3 -= *v3 * tmp0; /* fall through */
3720           case 2:
3721             sum2 -= *v2 * tmp0; /* fall through */
3722           case 1:
3723             sum1 -= *v1 * tmp0;
3724           }
3725         }
3726         /* update */
3727         if (xb == b) {
3728           /* whole (old way) w/ diag */
3729           switch (nodesz) {
3730           case 5:
3731             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3732             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3733             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3734             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3735             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3736             break;
3737           case 4:
3738             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3739             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3740             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3741             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3742             break;
3743           case 3:
3744             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3745             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3746             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3747             break;
3748           case 2:
3749             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3750             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3751             break;
3752           case 1:
3753             x[row--] += sum1 * (*ibdiag);
3754             break;
3755           }
3756         } else {
3757           /* no diag so set =  */
3758           switch (nodesz) {
3759           case 5:
3760             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3761             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3762             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3763             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3764             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3765             break;
3766           case 4:
3767             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3768             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3769             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3770             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3771             break;
3772           case 3:
3773             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3774             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3775             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3776             break;
3777           case 2:
3778             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3779             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3780             break;
3781           case 1:
3782             x[row--] = sum1 * (*ibdiag);
3783             break;
3784           }
3785         }
3786       }
3787       if (xb == b) {
3788         PetscCall(PetscLogFlops(2.0 * a->nz));
3789       } else {
3790         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3791       }
3792     }
3793   }
3794   if (flag & SOR_EISENSTAT) {
3795     /*
3796           Apply  (U + D)^-1  where D is now the block diagonal
3797     */
3798     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3799     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3800       nodesz = sizes[i + 1] - sizes[i];
3801       ibdiag -= nodesz * nodesz;
3802       sz  = ii[row + 1] - diag[row] - 1;
3803       v1  = a->a + diag[row] + 1;
3804       idx = a->j + diag[row] + 1;
3805       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3806       switch (nodesz) {
3807       case 1:
3808 
3809         sum1 = b[row];
3810         for (n = 0; n < sz - 1; n += 2) {
3811           i1 = idx[0];
3812           i2 = idx[1];
3813           idx += 2;
3814           tmp0 = x[i1];
3815           tmp1 = x[i2];
3816           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3817           v1 += 2;
3818         }
3819 
3820         if (n == sz - 1) {
3821           tmp0 = x[*idx];
3822           sum1 -= *v1 * tmp0;
3823         }
3824         x[row] = sum1 * (*ibdiag);
3825         row--;
3826         break;
3827 
3828       case 2:
3829 
3830         sum1 = b[row];
3831         sum2 = b[row - 1];
3832         /* note that sum1 is associated with the second of the two rows */
3833         v2 = a->a + diag[row - 1] + 2;
3834         for (n = 0; n < sz - 1; n += 2) {
3835           i1 = idx[0];
3836           i2 = idx[1];
3837           idx += 2;
3838           tmp0 = x[i1];
3839           tmp1 = x[i2];
3840           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3841           v1 += 2;
3842           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3843           v2 += 2;
3844         }
3845 
3846         if (n == sz - 1) {
3847           tmp0 = x[*idx];
3848           sum1 -= *v1 * tmp0;
3849           sum2 -= *v2 * tmp0;
3850         }
3851         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3852         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3853         row -= 2;
3854         break;
3855       case 3:
3856 
3857         sum1 = b[row];
3858         sum2 = b[row - 1];
3859         sum3 = b[row - 2];
3860         v2   = a->a + diag[row - 1] + 2;
3861         v3   = a->a + diag[row - 2] + 3;
3862         for (n = 0; n < sz - 1; n += 2) {
3863           i1 = idx[0];
3864           i2 = idx[1];
3865           idx += 2;
3866           tmp0 = x[i1];
3867           tmp1 = x[i2];
3868           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3869           v1 += 2;
3870           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3871           v2 += 2;
3872           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3873           v3 += 2;
3874         }
3875 
3876         if (n == sz - 1) {
3877           tmp0 = x[*idx];
3878           sum1 -= *v1 * tmp0;
3879           sum2 -= *v2 * tmp0;
3880           sum3 -= *v3 * tmp0;
3881         }
3882         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3883         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3884         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3885         row -= 3;
3886         break;
3887       case 4:
3888 
3889         sum1 = b[row];
3890         sum2 = b[row - 1];
3891         sum3 = b[row - 2];
3892         sum4 = b[row - 3];
3893         v2   = a->a + diag[row - 1] + 2;
3894         v3   = a->a + diag[row - 2] + 3;
3895         v4   = a->a + diag[row - 3] + 4;
3896         for (n = 0; n < sz - 1; n += 2) {
3897           i1 = idx[0];
3898           i2 = idx[1];
3899           idx += 2;
3900           tmp0 = x[i1];
3901           tmp1 = x[i2];
3902           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3903           v1 += 2;
3904           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3905           v2 += 2;
3906           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3907           v3 += 2;
3908           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3909           v4 += 2;
3910         }
3911 
3912         if (n == sz - 1) {
3913           tmp0 = x[*idx];
3914           sum1 -= *v1 * tmp0;
3915           sum2 -= *v2 * tmp0;
3916           sum3 -= *v3 * tmp0;
3917           sum4 -= *v4 * tmp0;
3918         }
3919         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3920         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3921         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3922         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3923         row -= 4;
3924         break;
3925       case 5:
3926 
3927         sum1 = b[row];
3928         sum2 = b[row - 1];
3929         sum3 = b[row - 2];
3930         sum4 = b[row - 3];
3931         sum5 = b[row - 4];
3932         v2   = a->a + diag[row - 1] + 2;
3933         v3   = a->a + diag[row - 2] + 3;
3934         v4   = a->a + diag[row - 3] + 4;
3935         v5   = a->a + diag[row - 4] + 5;
3936         for (n = 0; n < sz - 1; n += 2) {
3937           i1 = idx[0];
3938           i2 = idx[1];
3939           idx += 2;
3940           tmp0 = x[i1];
3941           tmp1 = x[i2];
3942           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3943           v1 += 2;
3944           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3945           v2 += 2;
3946           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3947           v3 += 2;
3948           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3949           v4 += 2;
3950           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3951           v5 += 2;
3952         }
3953 
3954         if (n == sz - 1) {
3955           tmp0 = x[*idx];
3956           sum1 -= *v1 * tmp0;
3957           sum2 -= *v2 * tmp0;
3958           sum3 -= *v3 * tmp0;
3959           sum4 -= *v4 * tmp0;
3960           sum5 -= *v5 * tmp0;
3961         }
3962         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3963         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3964         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3965         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3966         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3967         row -= 5;
3968         break;
3969       default:
3970         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3971       }
3972     }
3973     PetscCall(PetscLogFlops(a->nz));
3974 
3975     /*
3976            t = b - D x    where D is the block diagonal
3977     */
3978     cnt = 0;
3979     for (i = 0, row = 0; i < m; i++) {
3980       nodesz = sizes[i + 1] - sizes[i];
3981       switch (nodesz) {
3982       case 1:
3983         t[row] = b[row] - bdiag[cnt++] * x[row];
3984         row++;
3985         break;
3986       case 2:
3987         x1         = x[row];
3988         x2         = x[row + 1];
3989         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3990         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3991         t[row]     = b[row] - tmp1;
3992         t[row + 1] = b[row + 1] - tmp2;
3993         row += 2;
3994         cnt += 4;
3995         break;
3996       case 3:
3997         x1         = x[row];
3998         x2         = x[row + 1];
3999         x3         = x[row + 2];
4000         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4001         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4002         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4003         t[row]     = b[row] - tmp1;
4004         t[row + 1] = b[row + 1] - tmp2;
4005         t[row + 2] = b[row + 2] - tmp3;
4006         row += 3;
4007         cnt += 9;
4008         break;
4009       case 4:
4010         x1         = x[row];
4011         x2         = x[row + 1];
4012         x3         = x[row + 2];
4013         x4         = x[row + 3];
4014         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4015         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4016         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4017         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4018         t[row]     = b[row] - tmp1;
4019         t[row + 1] = b[row + 1] - tmp2;
4020         t[row + 2] = b[row + 2] - tmp3;
4021         t[row + 3] = b[row + 3] - tmp4;
4022         row += 4;
4023         cnt += 16;
4024         break;
4025       case 5:
4026         x1         = x[row];
4027         x2         = x[row + 1];
4028         x3         = x[row + 2];
4029         x4         = x[row + 3];
4030         x5         = x[row + 4];
4031         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4032         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4033         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4034         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4035         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4036         t[row]     = b[row] - tmp1;
4037         t[row + 1] = b[row + 1] - tmp2;
4038         t[row + 2] = b[row + 2] - tmp3;
4039         t[row + 3] = b[row + 3] - tmp4;
4040         t[row + 4] = b[row + 4] - tmp5;
4041         row += 5;
4042         cnt += 25;
4043         break;
4044       default:
4045         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4046       }
4047     }
4048     PetscCall(PetscLogFlops(m));
4049 
4050     /*
4051           Apply (L + D)^-1 where D is the block diagonal
4052     */
4053     for (i = 0, row = 0; i < m; i++) {
4054       nodesz = sizes[i + 1] - sizes[i];
4055       sz     = diag[row] - ii[row];
4056       v1     = a->a + ii[row];
4057       idx    = a->j + ii[row];
4058       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4059       switch (nodesz) {
4060       case 1:
4061 
4062         sum1 = t[row];
4063         for (n = 0; n < sz - 1; n += 2) {
4064           i1 = idx[0];
4065           i2 = idx[1];
4066           idx += 2;
4067           tmp0 = t[i1];
4068           tmp1 = t[i2];
4069           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4070           v1 += 2;
4071         }
4072 
4073         if (n == sz - 1) {
4074           tmp0 = t[*idx];
4075           sum1 -= *v1 * tmp0;
4076         }
4077         x[row] += t[row] = sum1 * (*ibdiag++);
4078         row++;
4079         break;
4080       case 2:
4081         v2   = a->a + ii[row + 1];
4082         sum1 = t[row];
4083         sum2 = t[row + 1];
4084         for (n = 0; n < sz - 1; n += 2) {
4085           i1 = idx[0];
4086           i2 = idx[1];
4087           idx += 2;
4088           tmp0 = t[i1];
4089           tmp1 = t[i2];
4090           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4091           v1 += 2;
4092           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4093           v2 += 2;
4094         }
4095 
4096         if (n == sz - 1) {
4097           tmp0 = t[*idx];
4098           sum1 -= v1[0] * tmp0;
4099           sum2 -= v2[0] * tmp0;
4100         }
4101         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4102         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4103         ibdiag += 4;
4104         row += 2;
4105         break;
4106       case 3:
4107         v2   = a->a + ii[row + 1];
4108         v3   = a->a + ii[row + 2];
4109         sum1 = t[row];
4110         sum2 = t[row + 1];
4111         sum3 = t[row + 2];
4112         for (n = 0; n < sz - 1; n += 2) {
4113           i1 = idx[0];
4114           i2 = idx[1];
4115           idx += 2;
4116           tmp0 = t[i1];
4117           tmp1 = t[i2];
4118           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4119           v1 += 2;
4120           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4121           v2 += 2;
4122           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4123           v3 += 2;
4124         }
4125 
4126         if (n == sz - 1) {
4127           tmp0 = t[*idx];
4128           sum1 -= v1[0] * tmp0;
4129           sum2 -= v2[0] * tmp0;
4130           sum3 -= v3[0] * tmp0;
4131         }
4132         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4133         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4134         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4135         ibdiag += 9;
4136         row += 3;
4137         break;
4138       case 4:
4139         v2   = a->a + ii[row + 1];
4140         v3   = a->a + ii[row + 2];
4141         v4   = a->a + ii[row + 3];
4142         sum1 = t[row];
4143         sum2 = t[row + 1];
4144         sum3 = t[row + 2];
4145         sum4 = t[row + 3];
4146         for (n = 0; n < sz - 1; n += 2) {
4147           i1 = idx[0];
4148           i2 = idx[1];
4149           idx += 2;
4150           tmp0 = t[i1];
4151           tmp1 = t[i2];
4152           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4153           v1 += 2;
4154           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4155           v2 += 2;
4156           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4157           v3 += 2;
4158           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4159           v4 += 2;
4160         }
4161 
4162         if (n == sz - 1) {
4163           tmp0 = t[*idx];
4164           sum1 -= v1[0] * tmp0;
4165           sum2 -= v2[0] * tmp0;
4166           sum3 -= v3[0] * tmp0;
4167           sum4 -= v4[0] * tmp0;
4168         }
4169         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4170         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4171         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4172         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4173         ibdiag += 16;
4174         row += 4;
4175         break;
4176       case 5:
4177         v2   = a->a + ii[row + 1];
4178         v3   = a->a + ii[row + 2];
4179         v4   = a->a + ii[row + 3];
4180         v5   = a->a + ii[row + 4];
4181         sum1 = t[row];
4182         sum2 = t[row + 1];
4183         sum3 = t[row + 2];
4184         sum4 = t[row + 3];
4185         sum5 = t[row + 4];
4186         for (n = 0; n < sz - 1; n += 2) {
4187           i1 = idx[0];
4188           i2 = idx[1];
4189           idx += 2;
4190           tmp0 = t[i1];
4191           tmp1 = t[i2];
4192           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4193           v1 += 2;
4194           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4195           v2 += 2;
4196           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4197           v3 += 2;
4198           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4199           v4 += 2;
4200           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4201           v5 += 2;
4202         }
4203 
4204         if (n == sz - 1) {
4205           tmp0 = t[*idx];
4206           sum1 -= v1[0] * tmp0;
4207           sum2 -= v2[0] * tmp0;
4208           sum3 -= v3[0] * tmp0;
4209           sum4 -= v4[0] * tmp0;
4210           sum5 -= v5[0] * tmp0;
4211         }
4212         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4213         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4214         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4215         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4216         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4217         ibdiag += 25;
4218         row += 5;
4219         break;
4220       default:
4221         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4222       }
4223     }
4224     PetscCall(PetscLogFlops(a->nz));
4225   }
4226   PetscCall(VecRestoreArray(xx, &x));
4227   PetscCall(VecRestoreArrayRead(bb, &b));
4228   PetscFunctionReturn(PETSC_SUCCESS);
4229 }
4230 
4231 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4232 {
4233   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4234   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4235   const MatScalar   *bdiag = a->inode.bdiag;
4236   const PetscScalar *b;
4237   PetscInt           m = a->inode.node_count, cnt = 0, i, row, nodesz;
4238   const PetscInt    *sizes = a->inode.size_csr;
4239 
4240   PetscFunctionBegin;
4241   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4242   PetscCall(VecGetArray(xx, &x));
4243   PetscCall(VecGetArrayRead(bb, &b));
4244   cnt = 0;
4245   for (i = 0, row = 0; i < m; i++) {
4246     nodesz = sizes[i + 1] - sizes[i];
4247     switch (nodesz) {
4248     case 1:
4249       x[row] = b[row] * bdiag[cnt++];
4250       row++;
4251       break;
4252     case 2:
4253       x1       = b[row];
4254       x2       = b[row + 1];
4255       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4256       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4257       x[row++] = tmp1;
4258       x[row++] = tmp2;
4259       cnt += 4;
4260       break;
4261     case 3:
4262       x1       = b[row];
4263       x2       = b[row + 1];
4264       x3       = b[row + 2];
4265       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4266       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4267       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4268       x[row++] = tmp1;
4269       x[row++] = tmp2;
4270       x[row++] = tmp3;
4271       cnt += 9;
4272       break;
4273     case 4:
4274       x1       = b[row];
4275       x2       = b[row + 1];
4276       x3       = b[row + 2];
4277       x4       = b[row + 3];
4278       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4279       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4280       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4281       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4282       x[row++] = tmp1;
4283       x[row++] = tmp2;
4284       x[row++] = tmp3;
4285       x[row++] = tmp4;
4286       cnt += 16;
4287       break;
4288     case 5:
4289       x1       = b[row];
4290       x2       = b[row + 1];
4291       x3       = b[row + 2];
4292       x4       = b[row + 3];
4293       x5       = b[row + 4];
4294       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4295       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4296       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4297       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4298       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4299       x[row++] = tmp1;
4300       x[row++] = tmp2;
4301       x[row++] = tmp3;
4302       x[row++] = tmp4;
4303       x[row++] = tmp5;
4304       cnt += 25;
4305       break;
4306     default:
4307       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4308     }
4309   }
4310   PetscCall(PetscLogFlops(2.0 * cnt));
4311   PetscCall(VecRestoreArray(xx, &x));
4312   PetscCall(VecRestoreArrayRead(bb, &b));
4313   PetscFunctionReturn(PETSC_SUCCESS);
4314 }
4315 
4316 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4317 {
4318   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4319 
4320   PetscFunctionBegin;
4321   a->inode.node_count       = 0;
4322   a->inode.use              = PETSC_FALSE;
4323   a->inode.checked          = PETSC_FALSE;
4324   a->inode.mat_nonzerostate = -1;
4325   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4326   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4327   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4328   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4329   A->ops->coloringpatch     = NULL;
4330   A->ops->multdiagonalblock = NULL;
4331   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4332   PetscFunctionReturn(PETSC_SUCCESS);
4333 }
4334 
4335 /*
4336     samestructure indicates that the matrix has not changed its nonzero structure so we
4337     do not need to recompute the inodes
4338 */
4339 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4340 {
4341   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4342   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4343   PetscBool       flag;
4344   const PetscInt *idx, *idy, *ii;
4345 
4346   PetscFunctionBegin;
4347   if (!a->inode.use) {
4348     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4349     PetscCall(PetscFree(a->inode.size_csr));
4350     PetscFunctionReturn(PETSC_SUCCESS);
4351   }
4352   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4353 
4354   m = A->rmap->n;
4355   if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
4356   ns    = a->inode.size_csr;
4357   ns[0] = 0;
4358 
4359   i          = 0;
4360   node_count = 0;
4361   idx        = a->j;
4362   ii         = a->i;
4363   if (idx) {
4364     while (i < m) {            /* For each row */
4365       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4366       /* Limits the number of elements in a node to 'a->inode.limit' */
4367       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4368         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4369         if (nzy != nzx) break;
4370         idy += nzx; /* Same nonzero pattern */
4371         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4372         if (!flag) break;
4373       }
4374       ns[node_count + 1] = ns[node_count] + blk_size;
4375       node_count++;
4376       idx += blk_size * nzx;
4377       i = j;
4378     }
4379   }
4380   /* If not enough inodes found,, do not use inode version of the routines */
4381   if (!m || !idx || node_count > .8 * m) {
4382     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4383     PetscCall(PetscFree(a->inode.size_csr));
4384     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4385   } else {
4386     if (!A->factortype) {
4387       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4388       if (A->rmap->n == A->cmap->n) {
4389         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4390         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4391         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4392         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4393         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4394       }
4395     } else {
4396       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4397     }
4398     a->inode.node_count = node_count;
4399     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4400   }
4401   a->inode.checked          = PETSC_TRUE;
4402   a->inode.mat_nonzerostate = A->nonzerostate;
4403   PetscFunctionReturn(PETSC_SUCCESS);
4404 }
4405 
4406 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4407 {
4408   Mat         B = *C;
4409   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4410   PetscInt    m = A->rmap->n;
4411 
4412   PetscFunctionBegin;
4413   c->inode.use              = a->inode.use;
4414   c->inode.limit            = a->inode.limit;
4415   c->inode.max_limit        = a->inode.max_limit;
4416   c->inode.checked          = PETSC_FALSE;
4417   c->inode.size_csr         = NULL;
4418   c->inode.node_count       = 0;
4419   c->inode.ibdiag           = NULL;
4420   c->inode.bdiag            = NULL;
4421   c->inode.mat_nonzerostate = -1;
4422   if (a->inode.use) {
4423     if (a->inode.checked && a->inode.size_csr) {
4424       PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4425       PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4426 
4427       c->inode.checked          = PETSC_TRUE;
4428       c->inode.node_count       = a->inode.node_count;
4429       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4430     }
4431     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4432     if (!B->factortype) {
4433       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4434       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4435       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4436       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4437       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4438       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4439     } else {
4440       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4441     }
4442   }
4443   PetscFunctionReturn(PETSC_SUCCESS);
4444 }
4445 
4446 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4447 {
4448   PetscInt        k;
4449   const PetscInt *vi;
4450 
4451   PetscFunctionBegin;
4452   vi = aj + ai[row];
4453   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4454   vi        = aj + adiag[row];
4455   cols[nzl] = vi[0];
4456   vi        = aj + adiag[row + 1] + 1;
4457   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4458   PetscFunctionReturn(PETSC_SUCCESS);
4459 }
4460 /*
4461    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4462    Modified from MatSeqAIJCheckInode().
4463 
4464    Input Parameters:
4465 .  Mat A - ILU or LU matrix factor
4466 
4467 */
4468 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4469 {
4470   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4471   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4472   PetscInt       *cols1, *cols2, *ns;
4473   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4474   PetscBool       flag;
4475 
4476   PetscFunctionBegin;
4477   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4478   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4479 
4480   m = A->rmap->n;
4481   if (a->inode.size_csr) ns = a->inode.size_csr;
4482   else PetscCall(PetscMalloc1(m + 1, &ns));
4483   ns[0] = 0;
4484 
4485   i          = 0;
4486   node_count = 0;
4487   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4488   while (i < m) {                       /* For each row */
4489     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4490     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4491     nzx  = nzl1 + nzu1 + 1;
4492     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4493 
4494     /* Limits the number of elements in a node to 'a->inode.limit' */
4495     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4496       nzl2 = ai[j + 1] - ai[j];
4497       nzu2 = adiag[j] - adiag[j + 1] - 1;
4498       nzy  = nzl2 + nzu2 + 1;
4499       if (nzy != nzx) break;
4500       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4501       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4502       if (!flag) break;
4503     }
4504     ns[node_count + 1] = ns[node_count] + blk_size;
4505     node_count++;
4506     i = j;
4507   }
4508   PetscCall(PetscFree2(cols1, cols2));
4509   /* If not enough inodes found,, do not use inode version of the routines */
4510   if (!m || node_count > .8 * m) {
4511     PetscCall(PetscFree(ns));
4512 
4513     a->inode.node_count = 0;
4514     a->inode.size_csr   = NULL;
4515     a->inode.use        = PETSC_FALSE;
4516 
4517     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4518   } else {
4519     A->ops->mult              = NULL;
4520     A->ops->sor               = NULL;
4521     A->ops->multadd           = NULL;
4522     A->ops->getrowij          = NULL;
4523     A->ops->restorerowij      = NULL;
4524     A->ops->getcolumnij       = NULL;
4525     A->ops->restorecolumnij   = NULL;
4526     A->ops->coloringpatch     = NULL;
4527     A->ops->multdiagonalblock = NULL;
4528     a->inode.node_count       = node_count;
4529     a->inode.size_csr         = ns;
4530     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4531   }
4532   a->inode.checked = PETSC_TRUE;
4533   PetscFunctionReturn(PETSC_SUCCESS);
4534 }
4535 
4536 /*
4537      This is really ugly. if inodes are used this replaces the
4538   permutations with ones that correspond to rows/cols of the matrix
4539   rather than inode blocks
4540 */
4541 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4542 {
4543   PetscFunctionBegin;
4544   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4545   PetscFunctionReturn(PETSC_SUCCESS);
4546 }
4547 
4548 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4549 {
4550   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4551   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4552   const PetscInt *ridx, *cidx;
4553   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
4554   PetscInt        nslim_col, *ns_col;
4555   IS              ris = *rperm, cis = *cperm;
4556 
4557   PetscFunctionBegin;
4558   if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS);       /* no inodes so return */
4559   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4560 
4561   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4562   PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4563   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4564 
4565   PetscCall(ISGetIndices(ris, &ridx));
4566   PetscCall(ISGetIndices(cis, &cidx));
4567 
4568   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4569   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
4570 
4571   /* Construct the permutations for rows*/
4572   for (i = 0, row = 0; i < nslim_row; ++i) {
4573     indx      = ridx[i];
4574     start_val = tns[indx];
4575     end_val   = tns[indx + 1];
4576     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4577   }
4578 
4579   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4580   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
4581 
4582   /* Construct permutations for columns */
4583   for (i = 0, col = 0; i < nslim_col; ++i) {
4584     indx      = cidx[i];
4585     start_val = tns[indx];
4586     end_val   = tns[indx + 1];
4587     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4588   }
4589 
4590   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4591   PetscCall(ISSetPermutation(*rperm));
4592   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4593   PetscCall(ISSetPermutation(*cperm));
4594 
4595   PetscCall(ISRestoreIndices(ris, &ridx));
4596   PetscCall(ISRestoreIndices(cis, &cidx));
4597 
4598   PetscCall(PetscFree(ns_col));
4599   PetscCall(PetscFree2(permr, permc));
4600   PetscCall(ISDestroy(&cis));
4601   PetscCall(ISDestroy(&ris));
4602   PetscCall(PetscFree(tns));
4603   PetscFunctionReturn(PETSC_SUCCESS);
4604 }
4605 
4606 /*@C
4607   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4608 
4609   Not Collective
4610 
4611   Input Parameter:
4612 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4613 
4614   Output Parameters:
4615 + node_count - no of inodes present in the matrix.
4616 . sizes      - an array of size `node_count`, with the sizes of each inode.
4617 - limit      - the max size used to generate the inodes.
4618 
4619   Level: advanced
4620 
4621   Note:
4622   It should be called after the matrix is assembled.
4623   The contents of the sizes[] array should not be changed.
4624   `NULL` may be passed for information not needed
4625 
4626 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4627 @*/
4628 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4629 {
4630   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4631 
4632   PetscFunctionBegin;
4633   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4634   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4635   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4636   PetscFunctionReturn(PETSC_SUCCESS);
4637 }
4638 
4639 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4640 {
4641   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4642 
4643   PetscFunctionBegin;
4644   if (node_count) *node_count = a->inode.node_count;
4645   if (sizes) *sizes = a->inode.size_csr;
4646   if (limit) *limit = a->inode.limit;
4647   PetscFunctionReturn(PETSC_SUCCESS);
4648 }
4649