xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 57d508425293f0bb93f59574d14951d8faac9af8)
1 /*
2   This file provides high performance routines for the Inode format (compressed sparse row)
3   by taking advantage of rows with identical nonzero structure (I-nodes).
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #if defined(PETSC_HAVE_XMMINTRIN_H)
7   #include <xmmintrin.h>
8 #endif
9 
10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11 {
12   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
14 
15   PetscFunctionBegin;
16   n = A->cmap->n;
17   m = A->rmap->n;
18   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19   ns_row = a->inode.size_csr;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24     for (; count + 1 < n; count++, i++);
25     if (count < n) i++;
26     *size = i;
27     PetscFunctionReturn(PETSC_SUCCESS);
28   }
29   PetscCall(PetscMalloc1(n + 1, &ns_col));
30   ns_col[0] = 0;
31 
32   /* Use the same row structure wherever feasible. */
33   for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
34 
35   /* if m < n; pad up the remainder with inode_limit */
36   for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37   /* The last node is the odd ball. pad it up with the remaining rows; */
38   if (count < n) {
39     ns_col[i + 1] = ns_col[i] + (n - count);
40     i++;
41   } else if (count > n) {
42     /* Adjust for the over estimation */
43     ns_col[i] += n - count;
44   }
45   *size = i;
46   *ns   = ns_col;
47   PetscFunctionReturn(PETSC_SUCCESS);
48 }
49 
50 /*
51       This builds symmetric version of nonzero structure,
52 */
53 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54 {
55   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
56   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
57   PetscInt       *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
58   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
59 
60   PetscFunctionBegin;
61   nslim_row = a->inode.node_count;
62   m         = A->rmap->n;
63   n         = A->cmap->n;
64   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
65   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
66 
67   /* Use the row_inode as column_inode */
68   nslim_col = nslim_row;
69   ns_col    = ns_row;
70 
71   /* allocate space for reformatted inode structure */
72   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
73   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
74 
75   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
76     nsz = ns_col[i1 + 1] - ns_col[i1];
77     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
78   }
79   /* allocate space for row pointers */
80   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
81   *iia = ia;
82   PetscCall(PetscMalloc1(nslim_row + 1, &work));
83 
84   /* determine the number of columns in each row */
85   ia[0] = oshift;
86   for (i1 = 0; i1 < nslim_row; i1++) {
87     row  = ns_row[i1];
88     j    = aj + ai[row] + ishift;
89     jmax = aj + ai[row + 1] + ishift;
90     if (j == jmax) continue; /* empty row */
91     col = *j++ + ishift;
92     i2  = tvc[col];
93     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94       ia[i1 + 1]++;
95       ia[i2 + 1]++;
96       i2++; /* Start col of next node */
97       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98       i2 = tvc[col];
99     }
100     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101   }
102 
103   /* shift ia[i] to point to next row */
104   for (i1 = 1; i1 < nslim_row + 1; i1++) {
105     row = ia[i1 - 1];
106     ia[i1] += row;
107     work[i1 - 1] = row - oshift;
108   }
109 
110   /* allocate space for column pointers */
111   nz = ia[nslim_row] + (!ishift);
112   PetscCall(PetscMalloc1(nz, &ja));
113   *jja = ja;
114 
115   /* loop over lower triangular part putting into ja */
116   for (i1 = 0; i1 < nslim_row; i1++) {
117     row  = ns_row[i1];
118     j    = aj + ai[row] + ishift;
119     jmax = aj + ai[row + 1] + ishift;
120     if (j == jmax) continue; /* empty row */
121     col = *j++ + ishift;
122     i2  = tvc[col];
123     while (i2 < i1 && j < jmax) {
124       ja[work[i2]++] = i1 + oshift;
125       ja[work[i1]++] = i2 + oshift;
126       ++i2;
127       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128       i2 = tvc[col];
129     }
130     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131   }
132   PetscCall(PetscFree(work));
133   PetscCall(PetscFree2(tns, tvc));
134   PetscFunctionReturn(PETSC_SUCCESS);
135 }
136 
137 /*
138       This builds nonsymmetric version of nonzero structure,
139 */
140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141 {
142   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
143   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144   PetscInt       *tns, *tvc, nsz, i1, i2;
145   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
146 
147   PetscFunctionBegin;
148   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149   nslim_row = a->inode.node_count;
150   n         = A->cmap->n;
151 
152   /* Create The column_inode for this matrix */
153   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
154 
155   /* allocate space for reformatted column_inode structure */
156   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
158 
159   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160     nsz = ns_col[i1 + 1] - ns_col[i1];
161     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162   }
163   /* allocate space for row pointers */
164   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165   *iia = ia;
166   PetscCall(PetscMalloc1(nslim_row + 1, &work));
167 
168   /* determine the number of columns in each row */
169   ia[0] = oshift;
170   for (i1 = 0; i1 < nslim_row; i1++) {
171     row = ns_row[i1];
172     j   = aj + ai[row] + ishift;
173     nz  = ai[row + 1] - ai[row];
174     if (!nz) continue; /* empty row */
175     col = *j++ + ishift;
176     i2  = tvc[col];
177     while (nz-- > 0) { /* off-diagonal elements */
178       ia[i1 + 1]++;
179       i2++; /* Start col of next node */
180       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
181       if (nz > 0) i2 = tvc[col];
182     }
183   }
184 
185   /* shift ia[i] to point to next row */
186   for (i1 = 1; i1 < nslim_row + 1; i1++) {
187     row = ia[i1 - 1];
188     ia[i1] += row;
189     work[i1 - 1] = row - oshift;
190   }
191 
192   /* allocate space for column pointers */
193   nz = ia[nslim_row] + (!ishift);
194   PetscCall(PetscMalloc1(nz, &ja));
195   *jja = ja;
196 
197   /* loop over matrix putting into ja */
198   for (i1 = 0; i1 < nslim_row; i1++) {
199     row = ns_row[i1];
200     j   = aj + ai[row] + ishift;
201     nz  = ai[row + 1] - ai[row];
202     if (!nz) continue; /* empty row */
203     col = *j++ + ishift;
204     i2  = tvc[col];
205     while (nz-- > 0) {
206       ja[work[i1]++] = i2 + oshift;
207       ++i2;
208       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
209       if (nz > 0) i2 = tvc[col];
210     }
211   }
212   PetscCall(PetscFree(ns_col));
213   PetscCall(PetscFree(work));
214   PetscCall(PetscFree2(tns, tvc));
215   PetscFunctionReturn(PETSC_SUCCESS);
216 }
217 
218 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219 {
220   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
221 
222   PetscFunctionBegin;
223   if (n) *n = a->inode.node_count;
224   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
225   if (!blockcompressed) {
226     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
227   } else if (symmetric) {
228     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
229   } else {
230     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
231   }
232   PetscFunctionReturn(PETSC_SUCCESS);
233 }
234 
235 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236 {
237   PetscFunctionBegin;
238   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
239 
240   if (!blockcompressed) {
241     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
242   } else {
243     PetscCall(PetscFree(*ia));
244     PetscCall(PetscFree(*ja));
245   }
246   PetscFunctionReturn(PETSC_SUCCESS);
247 }
248 
249 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250 {
251   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253   PetscInt   *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
254 
255   PetscFunctionBegin;
256   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
257   nslim_row = a->inode.node_count;
258   n         = A->cmap->n;
259 
260   /* Create The column_inode for this matrix */
261   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
262 
263   /* allocate space for reformatted column_inode structure */
264   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
266 
267   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268     nsz = ns_col[i1 + 1] - ns_col[i1];
269     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
270   }
271   /* allocate space for column pointers */
272   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
273   *iia = ia;
274   PetscCall(PetscMalloc1(nslim_col + 1, &work));
275 
276   /* determine the number of columns in each row */
277   ia[0] = oshift;
278   for (i1 = 0; i1 < nslim_row; i1++) {
279     row = ns_row[i1];
280     j   = aj + ai[row] + ishift;
281     col = *j++ + ishift;
282     i2  = tvc[col];
283     nz  = ai[row + 1] - ai[row];
284     while (nz-- > 0) { /* off-diagonal elements */
285       /* ia[i1+1]++; */
286       ia[i2 + 1]++;
287       i2++;
288       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
289       if (nz > 0) i2 = tvc[col];
290     }
291   }
292 
293   /* shift ia[i] to point to next col */
294   for (i1 = 1; i1 < nslim_col + 1; i1++) {
295     col = ia[i1 - 1];
296     ia[i1] += col;
297     work[i1 - 1] = col - oshift;
298   }
299 
300   /* allocate space for column pointers */
301   nz = ia[nslim_col] + (!ishift);
302   PetscCall(PetscMalloc1(nz, &ja));
303   *jja = ja;
304 
305   /* loop over matrix putting into ja */
306   for (i1 = 0; i1 < nslim_row; i1++) {
307     row = ns_row[i1];
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row + 1] - ai[row];
312     while (nz-- > 0) {
313       /* ja[work[i1]++] = i2 + oshift; */
314       ja[work[i2]++] = i1 + oshift;
315       i2++;
316       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320   PetscCall(PetscFree(ns_col));
321   PetscCall(PetscFree(work));
322   PetscCall(PetscFree2(tns, tvc));
323   PetscFunctionReturn(PETSC_SUCCESS);
324 }
325 
326 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327 {
328   PetscFunctionBegin;
329   PetscCall(MatCreateColInode_Private(A, n, NULL));
330   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
331 
332   if (!blockcompressed) {
333     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
334   } else if (symmetric) {
335     /* Since the indices are symmetric it doesn't matter */
336     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
337   } else {
338     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
339   }
340   PetscFunctionReturn(PETSC_SUCCESS);
341 }
342 
343 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344 {
345   PetscFunctionBegin;
346   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
347   if (!blockcompressed) {
348     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
349   } else {
350     PetscCall(PetscFree(*ia));
351     PetscCall(PetscFree(*ja));
352   }
353   PetscFunctionReturn(PETSC_SUCCESS);
354 }
355 
356 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357 {
358   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
359   PetscScalar       *y;
360   const PetscScalar *x;
361   PetscInt           row, node_max, nonzerorow = 0;
362   PetscInt          *ns;
363 
364   PetscFunctionBegin;
365   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
366   node_max = a->inode.node_count;
367   ns       = a->inode.size_csr; /* Node Size array */
368   PetscCall(VecGetArrayRead(xx, &x));
369   PetscCall(VecGetArray(yy, &y));
370 
371   PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372   for (PetscInt i = 0; i < node_max; ++i) {
373     PetscInt         i1, i2, nsz, n, sz;
374     const MatScalar *v1, *v2, *v3, *v4, *v5;
375     PetscScalar      sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376     const PetscInt  *idx;
377 
378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380 #endif
381     row = ns[i];
382     nsz = ns[i + 1] - ns[i];
383     n   = a->i[row + 1] - a->i[row];
384     nonzerorow += (n > 0) * nsz;
385 
386     idx = &a->j[a->i[row]];
387     v1  = &a->a[a->i[row]];
388     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
389     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
390     sz = n;                                                                /* No of non zeros in this row */
391                                                                            /* Switch on the size of Node */
392     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
393     case 1:
394       sum1 = 0.;
395 
396       for (n = 0; n < sz - 1; n += 2) {
397         i1 = idx[0]; /* The instructions are ordered to */
398         i2 = idx[1]; /* make the compiler's job easy */
399         idx += 2;
400         tmp0 = x[i1];
401         tmp1 = x[i2];
402         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
403         v1 += 2;
404       }
405 
406       if (n == sz - 1) { /* Take care of the last nonzero  */
407         tmp0 = x[*idx++];
408         sum1 += *v1++ * tmp0;
409       }
410       y[row++] = sum1;
411       break;
412     case 2:
413       sum1 = 0.;
414       sum2 = 0.;
415       v2   = v1 + n;
416 
417       for (n = 0; n < sz - 1; n += 2) {
418         i1 = idx[0];
419         i2 = idx[1];
420         idx += 2;
421         tmp0 = x[i1];
422         tmp1 = x[i2];
423         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
424         v1 += 2;
425         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
426         v2 += 2;
427       }
428       if (n == sz - 1) {
429         tmp0 = x[*idx++];
430         sum1 += *v1++ * tmp0;
431         sum2 += *v2++ * tmp0;
432       }
433       y[row++] = sum1;
434       y[row++] = sum2;
435       v1       = v2; /* Since the next block to be processed starts there*/
436       idx += sz;
437       break;
438     case 3:
439       sum1 = 0.;
440       sum2 = 0.;
441       sum3 = 0.;
442       v2   = v1 + n;
443       v3   = v2 + n;
444 
445       for (n = 0; n < sz - 1; n += 2) {
446         i1 = idx[0];
447         i2 = idx[1];
448         idx += 2;
449         tmp0 = x[i1];
450         tmp1 = x[i2];
451         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
452         v1 += 2;
453         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
454         v2 += 2;
455         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
456         v3 += 2;
457       }
458       if (n == sz - 1) {
459         tmp0 = x[*idx++];
460         sum1 += *v1++ * tmp0;
461         sum2 += *v2++ * tmp0;
462         sum3 += *v3++ * tmp0;
463       }
464       y[row++] = sum1;
465       y[row++] = sum2;
466       y[row++] = sum3;
467       v1       = v3; /* Since the next block to be processed starts there*/
468       idx += 2 * sz;
469       break;
470     case 4:
471       sum1 = 0.;
472       sum2 = 0.;
473       sum3 = 0.;
474       sum4 = 0.;
475       v2   = v1 + n;
476       v3   = v2 + n;
477       v4   = v3 + n;
478 
479       for (n = 0; n < sz - 1; n += 2) {
480         i1 = idx[0];
481         i2 = idx[1];
482         idx += 2;
483         tmp0 = x[i1];
484         tmp1 = x[i2];
485         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
486         v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
488         v2 += 2;
489         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
490         v3 += 2;
491         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
492         v4 += 2;
493       }
494       if (n == sz - 1) {
495         tmp0 = x[*idx++];
496         sum1 += *v1++ * tmp0;
497         sum2 += *v2++ * tmp0;
498         sum3 += *v3++ * tmp0;
499         sum4 += *v4++ * tmp0;
500       }
501       y[row++] = sum1;
502       y[row++] = sum2;
503       y[row++] = sum3;
504       y[row++] = sum4;
505       v1       = v4; /* Since the next block to be processed starts there*/
506       idx += 3 * sz;
507       break;
508     case 5:
509       sum1 = 0.;
510       sum2 = 0.;
511       sum3 = 0.;
512       sum4 = 0.;
513       sum5 = 0.;
514       v2   = v1 + n;
515       v3   = v2 + n;
516       v4   = v3 + n;
517       v5   = v4 + n;
518 
519       for (n = 0; n < sz - 1; n += 2) {
520         i1 = idx[0];
521         i2 = idx[1];
522         idx += 2;
523         tmp0 = x[i1];
524         tmp1 = x[i2];
525         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
526         v1 += 2;
527         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
528         v2 += 2;
529         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
530         v3 += 2;
531         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
532         v4 += 2;
533         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
534         v5 += 2;
535       }
536       if (n == sz - 1) {
537         tmp0 = x[*idx++];
538         sum1 += *v1++ * tmp0;
539         sum2 += *v2++ * tmp0;
540         sum3 += *v3++ * tmp0;
541         sum4 += *v4++ * tmp0;
542         sum5 += *v5++ * tmp0;
543       }
544       y[row++] = sum1;
545       y[row++] = sum2;
546       y[row++] = sum3;
547       y[row++] = sum4;
548       y[row++] = sum5;
549       v1       = v5; /* Since the next block to be processed starts there */
550       idx += 4 * sz;
551       break;
552     default:
553       SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
554     }
555   }
556   PetscCall(VecRestoreArrayRead(xx, &x));
557   PetscCall(VecRestoreArray(yy, &y));
558   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
559   PetscFunctionReturn(PETSC_SUCCESS);
560 }
561 
562 /* Almost same code as the MatMult_SeqAIJ_Inode() */
563 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564 {
565   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
566   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
567   const MatScalar   *v1, *v2, *v3, *v4, *v5;
568   const PetscScalar *x;
569   PetscScalar       *y, *z, *zt;
570   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
571   const PetscInt    *idx, *ns, *ii;
572 
573   PetscFunctionBegin;
574   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
575   node_max = a->inode.node_count;
576   ns       = a->inode.size_csr; /* Node Size array */
577 
578   PetscCall(VecGetArrayRead(xx, &x));
579   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
580   zt = z;
581 
582   idx = a->j;
583   v1  = a->a;
584   ii  = a->i;
585 
586   for (i = 0; i < node_max; ++i) {
587     row = ns[i];
588     nsz = ns[i + 1] - ns[i];
589     n   = ii[1] - ii[0];
590     ii += nsz;
591     sz = n;        /* No of non zeros in this row */
592                    /* Switch on the size of Node */
593     switch (nsz) { /* Each loop in 'case' is unrolled */
594     case 1:
595       sum1 = *zt++;
596 
597       for (n = 0; n < sz - 1; n += 2) {
598         i1 = idx[0]; /* The instructions are ordered to */
599         i2 = idx[1]; /* make the compiler's job easy */
600         idx += 2;
601         tmp0 = x[i1];
602         tmp1 = x[i2];
603         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
604         v1 += 2;
605       }
606 
607       if (n == sz - 1) { /* Take care of the last nonzero  */
608         tmp0 = x[*idx++];
609         sum1 += *v1++ * tmp0;
610       }
611       y[row++] = sum1;
612       break;
613     case 2:
614       sum1 = *zt++;
615       sum2 = *zt++;
616       v2   = v1 + n;
617 
618       for (n = 0; n < sz - 1; n += 2) {
619         i1 = idx[0];
620         i2 = idx[1];
621         idx += 2;
622         tmp0 = x[i1];
623         tmp1 = x[i2];
624         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
625         v1 += 2;
626         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
627         v2 += 2;
628       }
629       if (n == sz - 1) {
630         tmp0 = x[*idx++];
631         sum1 += *v1++ * tmp0;
632         sum2 += *v2++ * tmp0;
633       }
634       y[row++] = sum1;
635       y[row++] = sum2;
636       v1       = v2; /* Since the next block to be processed starts there*/
637       idx += sz;
638       break;
639     case 3:
640       sum1 = *zt++;
641       sum2 = *zt++;
642       sum3 = *zt++;
643       v2   = v1 + n;
644       v3   = v2 + n;
645 
646       for (n = 0; n < sz - 1; n += 2) {
647         i1 = idx[0];
648         i2 = idx[1];
649         idx += 2;
650         tmp0 = x[i1];
651         tmp1 = x[i2];
652         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
653         v1 += 2;
654         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
655         v2 += 2;
656         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
657         v3 += 2;
658       }
659       if (n == sz - 1) {
660         tmp0 = x[*idx++];
661         sum1 += *v1++ * tmp0;
662         sum2 += *v2++ * tmp0;
663         sum3 += *v3++ * tmp0;
664       }
665       y[row++] = sum1;
666       y[row++] = sum2;
667       y[row++] = sum3;
668       v1       = v3; /* Since the next block to be processed starts there*/
669       idx += 2 * sz;
670       break;
671     case 4:
672       sum1 = *zt++;
673       sum2 = *zt++;
674       sum3 = *zt++;
675       sum4 = *zt++;
676       v2   = v1 + n;
677       v3   = v2 + n;
678       v4   = v3 + n;
679 
680       for (n = 0; n < sz - 1; n += 2) {
681         i1 = idx[0];
682         i2 = idx[1];
683         idx += 2;
684         tmp0 = x[i1];
685         tmp1 = x[i2];
686         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
687         v1 += 2;
688         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
689         v2 += 2;
690         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
691         v3 += 2;
692         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
693         v4 += 2;
694       }
695       if (n == sz - 1) {
696         tmp0 = x[*idx++];
697         sum1 += *v1++ * tmp0;
698         sum2 += *v2++ * tmp0;
699         sum3 += *v3++ * tmp0;
700         sum4 += *v4++ * tmp0;
701       }
702       y[row++] = sum1;
703       y[row++] = sum2;
704       y[row++] = sum3;
705       y[row++] = sum4;
706       v1       = v4; /* Since the next block to be processed starts there*/
707       idx += 3 * sz;
708       break;
709     case 5:
710       sum1 = *zt++;
711       sum2 = *zt++;
712       sum3 = *zt++;
713       sum4 = *zt++;
714       sum5 = *zt++;
715       v2   = v1 + n;
716       v3   = v2 + n;
717       v4   = v3 + n;
718       v5   = v4 + n;
719 
720       for (n = 0; n < sz - 1; n += 2) {
721         i1 = idx[0];
722         i2 = idx[1];
723         idx += 2;
724         tmp0 = x[i1];
725         tmp1 = x[i2];
726         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
727         v1 += 2;
728         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
729         v2 += 2;
730         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
731         v3 += 2;
732         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
733         v4 += 2;
734         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
735         v5 += 2;
736       }
737       if (n == sz - 1) {
738         tmp0 = x[*idx++];
739         sum1 += *v1++ * tmp0;
740         sum2 += *v2++ * tmp0;
741         sum3 += *v3++ * tmp0;
742         sum4 += *v4++ * tmp0;
743         sum5 += *v5++ * tmp0;
744       }
745       y[row++] = sum1;
746       y[row++] = sum2;
747       y[row++] = sum3;
748       y[row++] = sum4;
749       y[row++] = sum5;
750       v1       = v5; /* Since the next block to be processed starts there */
751       idx += 4 * sz;
752       break;
753     default:
754       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
755     }
756   }
757   PetscCall(VecRestoreArrayRead(xx, &x));
758   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
759   PetscCall(PetscLogFlops(2.0 * a->nz));
760   PetscFunctionReturn(PETSC_SUCCESS);
761 }
762 
763 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764 {
765   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
766   IS                 iscol = a->col, isrow = a->row;
767   const PetscInt    *r, *c, *rout, *cout;
768   PetscInt           i, j, n = A->rmap->n, nz;
769   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
770   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
772   PetscScalar        sum1, sum2, sum3, sum4, sum5;
773   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774   const PetscScalar *b;
775 
776   PetscFunctionBegin;
777   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
778   node_max = a->inode.node_count;
779   ns       = a->inode.size_csr; /* Node Size array */
780 
781   PetscCall(VecGetArrayRead(bb, &b));
782   PetscCall(VecGetArrayWrite(xx, &x));
783   tmp = a->solve_work;
784 
785   PetscCall(ISGetIndices(isrow, &rout));
786   r = rout;
787   PetscCall(ISGetIndices(iscol, &cout));
788   c = cout + (n - 1);
789 
790   /* forward solve the lower triangular */
791   tmps = tmp;
792   aa   = a_a;
793   aj   = a_j;
794   ad   = a->diag;
795 
796   for (i = 0, row = 0; i < node_max; ++i) {
797     row = ns[i];
798     nsz = ns[i + 1] - ns[i];
799     aii = ai[row];
800     v1  = aa + aii;
801     vi  = aj + aii;
802     nz  = ad[row] - aii;
803     if (i < node_max - 1) {
804       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
805       * but our indexing to determine its size could. */
806       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
807       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
808       PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
809       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
810     }
811 
812     switch (nsz) { /* Each loop in 'case' is unrolled */
813     case 1:
814       sum1 = b[*r++];
815       for (j = 0; j < nz - 1; j += 2) {
816         i0 = vi[0];
817         i1 = vi[1];
818         vi += 2;
819         tmp0 = tmps[i0];
820         tmp1 = tmps[i1];
821         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
822         v1 += 2;
823       }
824       if (j == nz - 1) {
825         tmp0 = tmps[*vi++];
826         sum1 -= *v1++ * tmp0;
827       }
828       tmp[row++] = sum1;
829       break;
830     case 2:
831       sum1 = b[*r++];
832       sum2 = b[*r++];
833       v2   = aa + ai[row + 1];
834 
835       for (j = 0; j < nz - 1; j += 2) {
836         i0 = vi[0];
837         i1 = vi[1];
838         vi += 2;
839         tmp0 = tmps[i0];
840         tmp1 = tmps[i1];
841         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
842         v1 += 2;
843         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
844         v2 += 2;
845       }
846       if (j == nz - 1) {
847         tmp0 = tmps[*vi++];
848         sum1 -= *v1++ * tmp0;
849         sum2 -= *v2++ * tmp0;
850       }
851       sum2 -= *v2++ * sum1;
852       tmp[row++] = sum1;
853       tmp[row++] = sum2;
854       break;
855     case 3:
856       sum1 = b[*r++];
857       sum2 = b[*r++];
858       sum3 = b[*r++];
859       v2   = aa + ai[row + 1];
860       v3   = aa + ai[row + 2];
861 
862       for (j = 0; j < nz - 1; j += 2) {
863         i0 = vi[0];
864         i1 = vi[1];
865         vi += 2;
866         tmp0 = tmps[i0];
867         tmp1 = tmps[i1];
868         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
869         v1 += 2;
870         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
871         v2 += 2;
872         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
873         v3 += 2;
874       }
875       if (j == nz - 1) {
876         tmp0 = tmps[*vi++];
877         sum1 -= *v1++ * tmp0;
878         sum2 -= *v2++ * tmp0;
879         sum3 -= *v3++ * tmp0;
880       }
881       sum2 -= *v2++ * sum1;
882       sum3 -= *v3++ * sum1;
883       sum3 -= *v3++ * sum2;
884 
885       tmp[row++] = sum1;
886       tmp[row++] = sum2;
887       tmp[row++] = sum3;
888       break;
889 
890     case 4:
891       sum1 = b[*r++];
892       sum2 = b[*r++];
893       sum3 = b[*r++];
894       sum4 = b[*r++];
895       v2   = aa + ai[row + 1];
896       v3   = aa + ai[row + 2];
897       v4   = aa + ai[row + 3];
898 
899       for (j = 0; j < nz - 1; j += 2) {
900         i0 = vi[0];
901         i1 = vi[1];
902         vi += 2;
903         tmp0 = tmps[i0];
904         tmp1 = tmps[i1];
905         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
906         v1 += 2;
907         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
908         v2 += 2;
909         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
910         v3 += 2;
911         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
912         v4 += 2;
913       }
914       if (j == nz - 1) {
915         tmp0 = tmps[*vi++];
916         sum1 -= *v1++ * tmp0;
917         sum2 -= *v2++ * tmp0;
918         sum3 -= *v3++ * tmp0;
919         sum4 -= *v4++ * tmp0;
920       }
921       sum2 -= *v2++ * sum1;
922       sum3 -= *v3++ * sum1;
923       sum4 -= *v4++ * sum1;
924       sum3 -= *v3++ * sum2;
925       sum4 -= *v4++ * sum2;
926       sum4 -= *v4++ * sum3;
927 
928       tmp[row++] = sum1;
929       tmp[row++] = sum2;
930       tmp[row++] = sum3;
931       tmp[row++] = sum4;
932       break;
933     case 5:
934       sum1 = b[*r++];
935       sum2 = b[*r++];
936       sum3 = b[*r++];
937       sum4 = b[*r++];
938       sum5 = b[*r++];
939       v2   = aa + ai[row + 1];
940       v3   = aa + ai[row + 2];
941       v4   = aa + ai[row + 3];
942       v5   = aa + ai[row + 4];
943 
944       for (j = 0; j < nz - 1; j += 2) {
945         i0 = vi[0];
946         i1 = vi[1];
947         vi += 2;
948         tmp0 = tmps[i0];
949         tmp1 = tmps[i1];
950         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
951         v1 += 2;
952         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
953         v2 += 2;
954         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
955         v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
957         v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
959         v5 += 2;
960       }
961       if (j == nz - 1) {
962         tmp0 = tmps[*vi++];
963         sum1 -= *v1++ * tmp0;
964         sum2 -= *v2++ * tmp0;
965         sum3 -= *v3++ * tmp0;
966         sum4 -= *v4++ * tmp0;
967         sum5 -= *v5++ * tmp0;
968       }
969 
970       sum2 -= *v2++ * sum1;
971       sum3 -= *v3++ * sum1;
972       sum4 -= *v4++ * sum1;
973       sum5 -= *v5++ * sum1;
974       sum3 -= *v3++ * sum2;
975       sum4 -= *v4++ * sum2;
976       sum5 -= *v5++ * sum2;
977       sum4 -= *v4++ * sum3;
978       sum5 -= *v5++ * sum3;
979       sum5 -= *v5++ * sum4;
980 
981       tmp[row++] = sum1;
982       tmp[row++] = sum2;
983       tmp[row++] = sum3;
984       tmp[row++] = sum4;
985       tmp[row++] = sum5;
986       break;
987     default:
988       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
989     }
990   }
991   /* backward solve the upper triangular */
992   for (i = node_max - 1; i >= 0; i--) {
993     row = ns[i + 1];
994     nsz = ns[i + 1] - ns[i];
995     aii = ai[row + 1] - 1;
996     v1  = aa + aii;
997     vi  = aj + aii;
998     nz  = aii - ad[row];
999     switch (nsz) { /* Each loop in 'case' is unrolled */
1000     case 1:
1001       sum1 = tmp[row];
1002 
1003       for (j = nz; j > 1; j -= 2) {
1004         vi -= 2;
1005         i0   = vi[2];
1006         i1   = vi[1];
1007         tmp0 = tmps[i0];
1008         tmp1 = tmps[i1];
1009         v1 -= 2;
1010         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1011       }
1012       if (j == 1) {
1013         tmp0 = tmps[*vi--];
1014         sum1 -= *v1-- * tmp0;
1015       }
1016       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1017       row--;
1018       break;
1019     case 2:
1020       sum1 = tmp[row];
1021       sum2 = tmp[row - 1];
1022       v2   = aa + ai[row] - 1;
1023       for (j = nz; j > 1; j -= 2) {
1024         vi -= 2;
1025         i0   = vi[2];
1026         i1   = vi[1];
1027         tmp0 = tmps[i0];
1028         tmp1 = tmps[i1];
1029         v1 -= 2;
1030         v2 -= 2;
1031         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1032         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1033       }
1034       if (j == 1) {
1035         tmp0 = tmps[*vi--];
1036         sum1 -= *v1-- * tmp0;
1037         sum2 -= *v2-- * tmp0;
1038       }
1039 
1040       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1041       row--;
1042       sum2 -= *v2-- * tmp0;
1043       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1044       row--;
1045       break;
1046     case 3:
1047       sum1 = tmp[row];
1048       sum2 = tmp[row - 1];
1049       sum3 = tmp[row - 2];
1050       v2   = aa + ai[row] - 1;
1051       v3   = aa + ai[row - 1] - 1;
1052       for (j = nz; j > 1; j -= 2) {
1053         vi -= 2;
1054         i0   = vi[2];
1055         i1   = vi[1];
1056         tmp0 = tmps[i0];
1057         tmp1 = tmps[i1];
1058         v1 -= 2;
1059         v2 -= 2;
1060         v3 -= 2;
1061         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064       }
1065       if (j == 1) {
1066         tmp0 = tmps[*vi--];
1067         sum1 -= *v1-- * tmp0;
1068         sum2 -= *v2-- * tmp0;
1069         sum3 -= *v3-- * tmp0;
1070       }
1071       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1072       row--;
1073       sum2 -= *v2-- * tmp0;
1074       sum3 -= *v3-- * tmp0;
1075       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1076       row--;
1077       sum3 -= *v3-- * tmp0;
1078       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1079       row--;
1080 
1081       break;
1082     case 4:
1083       sum1 = tmp[row];
1084       sum2 = tmp[row - 1];
1085       sum3 = tmp[row - 2];
1086       sum4 = tmp[row - 3];
1087       v2   = aa + ai[row] - 1;
1088       v3   = aa + ai[row - 1] - 1;
1089       v4   = aa + ai[row - 2] - 1;
1090 
1091       for (j = nz; j > 1; j -= 2) {
1092         vi -= 2;
1093         i0   = vi[2];
1094         i1   = vi[1];
1095         tmp0 = tmps[i0];
1096         tmp1 = tmps[i1];
1097         v1 -= 2;
1098         v2 -= 2;
1099         v3 -= 2;
1100         v4 -= 2;
1101         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1102         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1103         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1104         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1105       }
1106       if (j == 1) {
1107         tmp0 = tmps[*vi--];
1108         sum1 -= *v1-- * tmp0;
1109         sum2 -= *v2-- * tmp0;
1110         sum3 -= *v3-- * tmp0;
1111         sum4 -= *v4-- * tmp0;
1112       }
1113 
1114       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1115       row--;
1116       sum2 -= *v2-- * tmp0;
1117       sum3 -= *v3-- * tmp0;
1118       sum4 -= *v4-- * tmp0;
1119       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1120       row--;
1121       sum3 -= *v3-- * tmp0;
1122       sum4 -= *v4-- * tmp0;
1123       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1124       row--;
1125       sum4 -= *v4-- * tmp0;
1126       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1127       row--;
1128       break;
1129     case 5:
1130       sum1 = tmp[row];
1131       sum2 = tmp[row - 1];
1132       sum3 = tmp[row - 2];
1133       sum4 = tmp[row - 3];
1134       sum5 = tmp[row - 4];
1135       v2   = aa + ai[row] - 1;
1136       v3   = aa + ai[row - 1] - 1;
1137       v4   = aa + ai[row - 2] - 1;
1138       v5   = aa + ai[row - 3] - 1;
1139       for (j = nz; j > 1; j -= 2) {
1140         vi -= 2;
1141         i0   = vi[2];
1142         i1   = vi[1];
1143         tmp0 = tmps[i0];
1144         tmp1 = tmps[i1];
1145         v1 -= 2;
1146         v2 -= 2;
1147         v3 -= 2;
1148         v4 -= 2;
1149         v5 -= 2;
1150         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1151         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1152         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1153         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1154         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1155       }
1156       if (j == 1) {
1157         tmp0 = tmps[*vi--];
1158         sum1 -= *v1-- * tmp0;
1159         sum2 -= *v2-- * tmp0;
1160         sum3 -= *v3-- * tmp0;
1161         sum4 -= *v4-- * tmp0;
1162         sum5 -= *v5-- * tmp0;
1163       }
1164 
1165       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1166       row--;
1167       sum2 -= *v2-- * tmp0;
1168       sum3 -= *v3-- * tmp0;
1169       sum4 -= *v4-- * tmp0;
1170       sum5 -= *v5-- * tmp0;
1171       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1172       row--;
1173       sum3 -= *v3-- * tmp0;
1174       sum4 -= *v4-- * tmp0;
1175       sum5 -= *v5-- * tmp0;
1176       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1177       row--;
1178       sum4 -= *v4-- * tmp0;
1179       sum5 -= *v5-- * tmp0;
1180       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1181       row--;
1182       sum5 -= *v5-- * tmp0;
1183       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1184       row--;
1185       break;
1186     default:
1187       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1188     }
1189   }
1190   PetscCall(ISRestoreIndices(isrow, &rout));
1191   PetscCall(ISRestoreIndices(iscol, &cout));
1192   PetscCall(VecRestoreArrayRead(bb, &b));
1193   PetscCall(VecRestoreArrayWrite(xx, &x));
1194   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1195   PetscFunctionReturn(PETSC_SUCCESS);
1196 }
1197 
1198 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199 {
1200   Mat              C = B;
1201   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1202   IS               isrow = b->row, isicol = b->icol;
1203   const PetscInt  *r, *ic, *ics;
1204   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1205   PetscInt         i, j, k, nz, nzL, row, *pj;
1206   const PetscInt  *ajtmp, *bjtmp;
1207   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1208   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1209   FactorShiftCtx   sctx;
1210   const PetscInt  *ddiag;
1211   PetscReal        rs;
1212   MatScalar        d;
1213   PetscInt         inod, nodesz, node_max, col;
1214   const PetscInt  *ns;
1215   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1216 
1217   PetscFunctionBegin;
1218   /* MatPivotSetUp(): initialize shift context sctx */
1219   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1220 
1221   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1222     ddiag          = a->diag;
1223     sctx.shift_top = info->zeropivot;
1224     for (i = 0; i < n; i++) {
1225       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1226       d  = (aa)[ddiag[i]];
1227       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1228       v  = aa + ai[i];
1229       nz = ai[i + 1] - ai[i];
1230       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1231       if (rs > sctx.shift_top) sctx.shift_top = rs;
1232     }
1233     sctx.shift_top *= 1.1;
1234     sctx.nshift_max = 5;
1235     sctx.shift_lo   = 0.;
1236     sctx.shift_hi   = 1.;
1237   }
1238 
1239   PetscCall(ISGetIndices(isrow, &r));
1240   PetscCall(ISGetIndices(isicol, &ic));
1241 
1242   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1243   ics = ic;
1244 
1245   node_max = a->inode.node_count;
1246   ns       = a->inode.size_csr;
1247   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1248 
1249   /* If max inode size > 4, split it into two inodes.*/
1250   /* also map the inode sizes according to the ordering */
1251   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1252   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1253     nodesz = ns[i + 1] - ns[i];
1254     if (nodesz > 4) {
1255       tmp_vec1[j] = 4;
1256       ++j;
1257       tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
1258     } else {
1259       tmp_vec1[j] = nodesz;
1260     }
1261   }
1262   /* Use the correct node_max */
1263   node_max = j;
1264 
1265   /* Now reorder the inode info based on mat re-ordering info */
1266   /* First create a row -> inode_size_array_index map */
1267   PetscCall(PetscMalloc1(n + 1, &nsmap));
1268   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1269   tmp_vec2[0] = 0;
1270   for (i = 0, row = 0; i < node_max; i++) {
1271     nodesz = tmp_vec1[i];
1272     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1273   }
1274   /* Using nsmap, create a reordered ns structure */
1275   for (i = 0, j = 0; i < node_max; i++) {
1276     nodesz          = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1277     tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
1278     j += nodesz;
1279   }
1280   PetscCall(PetscFree(nsmap));
1281   PetscCall(PetscFree(tmp_vec1));
1282 
1283   /* Now use the correct ns */
1284   ns = tmp_vec2;
1285 
1286   do {
1287     sctx.newshift = PETSC_FALSE;
1288     /* Now loop over each block-row, and do the factorization */
1289     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1290       nodesz = ns[inod + 1] - ns[inod];
1291 
1292       switch (nodesz) {
1293       case 1:
1294         /* zero rtmp1 */
1295         /* L part */
1296         nz    = bi[i + 1] - bi[i];
1297         bjtmp = bj + bi[i];
1298         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1299 
1300         /* U part */
1301         nz    = bdiag[i] - bdiag[i + 1];
1302         bjtmp = bj + bdiag[i + 1] + 1;
1303         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1304 
1305         /* load in initial (unfactored row) */
1306         nz    = ai[r[i] + 1] - ai[r[i]];
1307         ajtmp = aj + ai[r[i]];
1308         v     = aa + ai[r[i]];
1309         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1310 
1311         /* ZeropivotApply() */
1312         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1313 
1314         /* elimination */
1315         bjtmp = bj + bi[i];
1316         row   = *bjtmp++;
1317         nzL   = bi[i + 1] - bi[i];
1318         for (k = 0; k < nzL; k++) {
1319           pc = rtmp1 + row;
1320           if (*pc != 0.0) {
1321             pv   = b->a + bdiag[row];
1322             mul1 = *pc * (*pv);
1323             *pc  = mul1;
1324             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1325             pv   = b->a + bdiag[row + 1] + 1;
1326             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1328             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1329           }
1330           row = *bjtmp++;
1331         }
1332 
1333         /* finished row so stick it into b->a */
1334         rs = 0.0;
1335         /* L part */
1336         pv = b->a + bi[i];
1337         pj = b->j + bi[i];
1338         nz = bi[i + 1] - bi[i];
1339         for (j = 0; j < nz; j++) {
1340           pv[j] = rtmp1[pj[j]];
1341           rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* U part */
1345         pv = b->a + bdiag[i + 1] + 1;
1346         pj = b->j + bdiag[i + 1] + 1;
1347         nz = bdiag[i] - bdiag[i + 1] - 1;
1348         for (j = 0; j < nz; j++) {
1349           pv[j] = rtmp1[pj[j]];
1350           rs += PetscAbsScalar(pv[j]);
1351         }
1352 
1353         /* Check zero pivot */
1354         sctx.rs = rs;
1355         sctx.pv = rtmp1[i];
1356         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1357         if (sctx.newshift) break;
1358 
1359         /* Mark diagonal and invert diagonal for simpler triangular solves */
1360         pv  = b->a + bdiag[i];
1361         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1362         break;
1363 
1364       case 2:
1365         /* zero rtmp1 and rtmp2 */
1366         /* L part */
1367         nz    = bi[i + 1] - bi[i];
1368         bjtmp = bj + bi[i];
1369         for (j = 0; j < nz; j++) {
1370           col        = bjtmp[j];
1371           rtmp1[col] = 0.0;
1372           rtmp2[col] = 0.0;
1373         }
1374 
1375         /* U part */
1376         nz    = bdiag[i] - bdiag[i + 1];
1377         bjtmp = bj + bdiag[i + 1] + 1;
1378         for (j = 0; j < nz; j++) {
1379           col        = bjtmp[j];
1380           rtmp1[col] = 0.0;
1381           rtmp2[col] = 0.0;
1382         }
1383 
1384         /* load in initial (unfactored row) */
1385         nz    = ai[r[i] + 1] - ai[r[i]];
1386         ajtmp = aj + ai[r[i]];
1387         v1    = aa + ai[r[i]];
1388         v2    = aa + ai[r[i + 1]];
1389         for (j = 0; j < nz; j++) {
1390           col        = ics[ajtmp[j]];
1391           rtmp1[col] = v1[j];
1392           rtmp2[col] = v2[j];
1393         }
1394         /* ZeropivotApply(): shift the diagonal of the matrix  */
1395         rtmp1[i] += sctx.shift_amount;
1396         rtmp2[i + 1] += sctx.shift_amount;
1397 
1398         /* elimination */
1399         bjtmp = bj + bi[i];
1400         row   = *bjtmp++; /* pivot row */
1401         nzL   = bi[i + 1] - bi[i];
1402         for (k = 0; k < nzL; k++) {
1403           pc1 = rtmp1 + row;
1404           pc2 = rtmp2 + row;
1405           if (*pc1 != 0.0 || *pc2 != 0.0) {
1406             pv   = b->a + bdiag[row];
1407             mul1 = *pc1 * (*pv);
1408             mul2 = *pc2 * (*pv);
1409             *pc1 = mul1;
1410             *pc2 = mul2;
1411 
1412             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1413             pv = b->a + bdiag[row + 1] + 1;
1414             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1415             for (j = 0; j < nz; j++) {
1416               col = pj[j];
1417               rtmp1[col] -= mul1 * pv[j];
1418               rtmp2[col] -= mul2 * pv[j];
1419             }
1420             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1421           }
1422           row = *bjtmp++;
1423         }
1424 
1425         /* finished row i; check zero pivot, then stick row i into b->a */
1426         rs = 0.0;
1427         /* L part */
1428         pc1 = b->a + bi[i];
1429         pj  = b->j + bi[i];
1430         nz  = bi[i + 1] - bi[i];
1431         for (j = 0; j < nz; j++) {
1432           col    = pj[j];
1433           pc1[j] = rtmp1[col];
1434           rs += PetscAbsScalar(pc1[j]);
1435         }
1436         /* U part */
1437         pc1 = b->a + bdiag[i + 1] + 1;
1438         pj  = b->j + bdiag[i + 1] + 1;
1439         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1440         for (j = 0; j < nz; j++) {
1441           col    = pj[j];
1442           pc1[j] = rtmp1[col];
1443           rs += PetscAbsScalar(pc1[j]);
1444         }
1445 
1446         sctx.rs = rs;
1447         sctx.pv = rtmp1[i];
1448         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1449         if (sctx.newshift) break;
1450         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1451         *pc1 = 1.0 / sctx.pv;
1452 
1453         /* Now take care of diagonal 2x2 block. */
1454         pc2 = rtmp2 + i;
1455         if (*pc2 != 0.0) {
1456           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1457           *pc2 = mul1;                        /* insert L entry */
1458           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1459           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460           for (j = 0; j < nz; j++) {
1461             col = pj[j];
1462             rtmp2[col] -= mul1 * rtmp1[col];
1463           }
1464           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465         }
1466 
1467         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468         rs = 0.0;
1469         /* L part */
1470         pc2 = b->a + bi[i + 1];
1471         pj  = b->j + bi[i + 1];
1472         nz  = bi[i + 2] - bi[i + 1];
1473         for (j = 0; j < nz; j++) {
1474           col    = pj[j];
1475           pc2[j] = rtmp2[col];
1476           rs += PetscAbsScalar(pc2[j]);
1477         }
1478         /* U part */
1479         pc2 = b->a + bdiag[i + 2] + 1;
1480         pj  = b->j + bdiag[i + 2] + 1;
1481         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482         for (j = 0; j < nz; j++) {
1483           col    = pj[j];
1484           pc2[j] = rtmp2[col];
1485           rs += PetscAbsScalar(pc2[j]);
1486         }
1487 
1488         sctx.rs = rs;
1489         sctx.pv = rtmp2[i + 1];
1490         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1491         if (sctx.newshift) break;
1492         pc2  = b->a + bdiag[i + 1];
1493         *pc2 = 1.0 / sctx.pv;
1494         break;
1495 
1496       case 3:
1497         /* zero rtmp */
1498         /* L part */
1499         nz    = bi[i + 1] - bi[i];
1500         bjtmp = bj + bi[i];
1501         for (j = 0; j < nz; j++) {
1502           col        = bjtmp[j];
1503           rtmp1[col] = 0.0;
1504           rtmp2[col] = 0.0;
1505           rtmp3[col] = 0.0;
1506         }
1507 
1508         /* U part */
1509         nz    = bdiag[i] - bdiag[i + 1];
1510         bjtmp = bj + bdiag[i + 1] + 1;
1511         for (j = 0; j < nz; j++) {
1512           col        = bjtmp[j];
1513           rtmp1[col] = 0.0;
1514           rtmp2[col] = 0.0;
1515           rtmp3[col] = 0.0;
1516         }
1517 
1518         /* load in initial (unfactored row) */
1519         nz    = ai[r[i] + 1] - ai[r[i]];
1520         ajtmp = aj + ai[r[i]];
1521         v1    = aa + ai[r[i]];
1522         v2    = aa + ai[r[i + 1]];
1523         v3    = aa + ai[r[i + 2]];
1524         for (j = 0; j < nz; j++) {
1525           col        = ics[ajtmp[j]];
1526           rtmp1[col] = v1[j];
1527           rtmp2[col] = v2[j];
1528           rtmp3[col] = v3[j];
1529         }
1530         /* ZeropivotApply(): shift the diagonal of the matrix  */
1531         rtmp1[i] += sctx.shift_amount;
1532         rtmp2[i + 1] += sctx.shift_amount;
1533         rtmp3[i + 2] += sctx.shift_amount;
1534 
1535         /* elimination */
1536         bjtmp = bj + bi[i];
1537         row   = *bjtmp++; /* pivot row */
1538         nzL   = bi[i + 1] - bi[i];
1539         for (k = 0; k < nzL; k++) {
1540           pc1 = rtmp1 + row;
1541           pc2 = rtmp2 + row;
1542           pc3 = rtmp3 + row;
1543           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1544             pv   = b->a + bdiag[row];
1545             mul1 = *pc1 * (*pv);
1546             mul2 = *pc2 * (*pv);
1547             mul3 = *pc3 * (*pv);
1548             *pc1 = mul1;
1549             *pc2 = mul2;
1550             *pc3 = mul3;
1551 
1552             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1553             pv = b->a + bdiag[row + 1] + 1;
1554             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1555             for (j = 0; j < nz; j++) {
1556               col = pj[j];
1557               rtmp1[col] -= mul1 * pv[j];
1558               rtmp2[col] -= mul2 * pv[j];
1559               rtmp3[col] -= mul3 * pv[j];
1560             }
1561             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1562           }
1563           row = *bjtmp++;
1564         }
1565 
1566         /* finished row i; check zero pivot, then stick row i into b->a */
1567         rs = 0.0;
1568         /* L part */
1569         pc1 = b->a + bi[i];
1570         pj  = b->j + bi[i];
1571         nz  = bi[i + 1] - bi[i];
1572         for (j = 0; j < nz; j++) {
1573           col    = pj[j];
1574           pc1[j] = rtmp1[col];
1575           rs += PetscAbsScalar(pc1[j]);
1576         }
1577         /* U part */
1578         pc1 = b->a + bdiag[i + 1] + 1;
1579         pj  = b->j + bdiag[i + 1] + 1;
1580         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581         for (j = 0; j < nz; j++) {
1582           col    = pj[j];
1583           pc1[j] = rtmp1[col];
1584           rs += PetscAbsScalar(pc1[j]);
1585         }
1586 
1587         sctx.rs = rs;
1588         sctx.pv = rtmp1[i];
1589         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1590         if (sctx.newshift) break;
1591         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1592         *pc1 = 1.0 / sctx.pv;
1593 
1594         /* Now take care of 1st column of diagonal 3x3 block. */
1595         pc2 = rtmp2 + i;
1596         pc3 = rtmp3 + i;
1597         if (*pc2 != 0.0 || *pc3 != 0.0) {
1598           mul2 = (*pc2) * (*pc1);
1599           *pc2 = mul2;
1600           mul3 = (*pc3) * (*pc1);
1601           *pc3 = mul3;
1602           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1603           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1604           for (j = 0; j < nz; j++) {
1605             col = pj[j];
1606             rtmp2[col] -= mul2 * rtmp1[col];
1607             rtmp3[col] -= mul3 * rtmp1[col];
1608           }
1609           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1610         }
1611 
1612         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613         rs = 0.0;
1614         /* L part */
1615         pc2 = b->a + bi[i + 1];
1616         pj  = b->j + bi[i + 1];
1617         nz  = bi[i + 2] - bi[i + 1];
1618         for (j = 0; j < nz; j++) {
1619           col    = pj[j];
1620           pc2[j] = rtmp2[col];
1621           rs += PetscAbsScalar(pc2[j]);
1622         }
1623         /* U part */
1624         pc2 = b->a + bdiag[i + 2] + 1;
1625         pj  = b->j + bdiag[i + 2] + 1;
1626         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627         for (j = 0; j < nz; j++) {
1628           col    = pj[j];
1629           pc2[j] = rtmp2[col];
1630           rs += PetscAbsScalar(pc2[j]);
1631         }
1632 
1633         sctx.rs = rs;
1634         sctx.pv = rtmp2[i + 1];
1635         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1636         if (sctx.newshift) break;
1637         pc2  = b->a + bdiag[i + 1];
1638         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1639 
1640         /* Now take care of 2nd column of diagonal 3x3 block. */
1641         pc3 = rtmp3 + i + 1;
1642         if (*pc3 != 0.0) {
1643           mul3 = (*pc3) * (*pc2);
1644           *pc3 = mul3;
1645           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1646           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1647           for (j = 0; j < nz; j++) {
1648             col = pj[j];
1649             rtmp3[col] -= mul3 * rtmp2[col];
1650           }
1651           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1652         }
1653 
1654         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1655         rs = 0.0;
1656         /* L part */
1657         pc3 = b->a + bi[i + 2];
1658         pj  = b->j + bi[i + 2];
1659         nz  = bi[i + 3] - bi[i + 2];
1660         for (j = 0; j < nz; j++) {
1661           col    = pj[j];
1662           pc3[j] = rtmp3[col];
1663           rs += PetscAbsScalar(pc3[j]);
1664         }
1665         /* U part */
1666         pc3 = b->a + bdiag[i + 3] + 1;
1667         pj  = b->j + bdiag[i + 3] + 1;
1668         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1669         for (j = 0; j < nz; j++) {
1670           col    = pj[j];
1671           pc3[j] = rtmp3[col];
1672           rs += PetscAbsScalar(pc3[j]);
1673         }
1674 
1675         sctx.rs = rs;
1676         sctx.pv = rtmp3[i + 2];
1677         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1678         if (sctx.newshift) break;
1679         pc3  = b->a + bdiag[i + 2];
1680         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1681         break;
1682       case 4:
1683         /* zero rtmp */
1684         /* L part */
1685         nz    = bi[i + 1] - bi[i];
1686         bjtmp = bj + bi[i];
1687         for (j = 0; j < nz; j++) {
1688           col        = bjtmp[j];
1689           rtmp1[col] = 0.0;
1690           rtmp2[col] = 0.0;
1691           rtmp3[col] = 0.0;
1692           rtmp4[col] = 0.0;
1693         }
1694 
1695         /* U part */
1696         nz    = bdiag[i] - bdiag[i + 1];
1697         bjtmp = bj + bdiag[i + 1] + 1;
1698         for (j = 0; j < nz; j++) {
1699           col        = bjtmp[j];
1700           rtmp1[col] = 0.0;
1701           rtmp2[col] = 0.0;
1702           rtmp3[col] = 0.0;
1703           rtmp4[col] = 0.0;
1704         }
1705 
1706         /* load in initial (unfactored row) */
1707         nz    = ai[r[i] + 1] - ai[r[i]];
1708         ajtmp = aj + ai[r[i]];
1709         v1    = aa + ai[r[i]];
1710         v2    = aa + ai[r[i + 1]];
1711         v3    = aa + ai[r[i + 2]];
1712         v4    = aa + ai[r[i + 3]];
1713         for (j = 0; j < nz; j++) {
1714           col        = ics[ajtmp[j]];
1715           rtmp1[col] = v1[j];
1716           rtmp2[col] = v2[j];
1717           rtmp3[col] = v3[j];
1718           rtmp4[col] = v4[j];
1719         }
1720         /* ZeropivotApply(): shift the diagonal of the matrix  */
1721         rtmp1[i] += sctx.shift_amount;
1722         rtmp2[i + 1] += sctx.shift_amount;
1723         rtmp3[i + 2] += sctx.shift_amount;
1724         rtmp4[i + 3] += sctx.shift_amount;
1725 
1726         /* elimination */
1727         bjtmp = bj + bi[i];
1728         row   = *bjtmp++; /* pivot row */
1729         nzL   = bi[i + 1] - bi[i];
1730         for (k = 0; k < nzL; k++) {
1731           pc1 = rtmp1 + row;
1732           pc2 = rtmp2 + row;
1733           pc3 = rtmp3 + row;
1734           pc4 = rtmp4 + row;
1735           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1736             pv   = b->a + bdiag[row];
1737             mul1 = *pc1 * (*pv);
1738             mul2 = *pc2 * (*pv);
1739             mul3 = *pc3 * (*pv);
1740             mul4 = *pc4 * (*pv);
1741             *pc1 = mul1;
1742             *pc2 = mul2;
1743             *pc3 = mul3;
1744             *pc4 = mul4;
1745 
1746             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1747             pv = b->a + bdiag[row + 1] + 1;
1748             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1749             for (j = 0; j < nz; j++) {
1750               col = pj[j];
1751               rtmp1[col] -= mul1 * pv[j];
1752               rtmp2[col] -= mul2 * pv[j];
1753               rtmp3[col] -= mul3 * pv[j];
1754               rtmp4[col] -= mul4 * pv[j];
1755             }
1756             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1757           }
1758           row = *bjtmp++;
1759         }
1760 
1761         /* finished row i; check zero pivot, then stick row i into b->a */
1762         rs = 0.0;
1763         /* L part */
1764         pc1 = b->a + bi[i];
1765         pj  = b->j + bi[i];
1766         nz  = bi[i + 1] - bi[i];
1767         for (j = 0; j < nz; j++) {
1768           col    = pj[j];
1769           pc1[j] = rtmp1[col];
1770           rs += PetscAbsScalar(pc1[j]);
1771         }
1772         /* U part */
1773         pc1 = b->a + bdiag[i + 1] + 1;
1774         pj  = b->j + bdiag[i + 1] + 1;
1775         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1776         for (j = 0; j < nz; j++) {
1777           col    = pj[j];
1778           pc1[j] = rtmp1[col];
1779           rs += PetscAbsScalar(pc1[j]);
1780         }
1781 
1782         sctx.rs = rs;
1783         sctx.pv = rtmp1[i];
1784         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1785         if (sctx.newshift) break;
1786         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1787         *pc1 = 1.0 / sctx.pv;
1788 
1789         /* Now take care of 1st column of diagonal 4x4 block. */
1790         pc2 = rtmp2 + i;
1791         pc3 = rtmp3 + i;
1792         pc4 = rtmp4 + i;
1793         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1794           mul2 = (*pc2) * (*pc1);
1795           *pc2 = mul2;
1796           mul3 = (*pc3) * (*pc1);
1797           *pc3 = mul3;
1798           mul4 = (*pc4) * (*pc1);
1799           *pc4 = mul4;
1800           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1801           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1802           for (j = 0; j < nz; j++) {
1803             col = pj[j];
1804             rtmp2[col] -= mul2 * rtmp1[col];
1805             rtmp3[col] -= mul3 * rtmp1[col];
1806             rtmp4[col] -= mul4 * rtmp1[col];
1807           }
1808           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1809         }
1810 
1811         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1812         rs = 0.0;
1813         /* L part */
1814         pc2 = b->a + bi[i + 1];
1815         pj  = b->j + bi[i + 1];
1816         nz  = bi[i + 2] - bi[i + 1];
1817         for (j = 0; j < nz; j++) {
1818           col    = pj[j];
1819           pc2[j] = rtmp2[col];
1820           rs += PetscAbsScalar(pc2[j]);
1821         }
1822         /* U part */
1823         pc2 = b->a + bdiag[i + 2] + 1;
1824         pj  = b->j + bdiag[i + 2] + 1;
1825         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1826         for (j = 0; j < nz; j++) {
1827           col    = pj[j];
1828           pc2[j] = rtmp2[col];
1829           rs += PetscAbsScalar(pc2[j]);
1830         }
1831 
1832         sctx.rs = rs;
1833         sctx.pv = rtmp2[i + 1];
1834         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1835         if (sctx.newshift) break;
1836         pc2  = b->a + bdiag[i + 1];
1837         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1838 
1839         /* Now take care of 2nd column of diagonal 4x4 block. */
1840         pc3 = rtmp3 + i + 1;
1841         pc4 = rtmp4 + i + 1;
1842         if (*pc3 != 0.0 || *pc4 != 0.0) {
1843           mul3 = (*pc3) * (*pc2);
1844           *pc3 = mul3;
1845           mul4 = (*pc4) * (*pc2);
1846           *pc4 = mul4;
1847           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1848           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1849           for (j = 0; j < nz; j++) {
1850             col = pj[j];
1851             rtmp3[col] -= mul3 * rtmp2[col];
1852             rtmp4[col] -= mul4 * rtmp2[col];
1853           }
1854           PetscCall(PetscLogFlops(4.0 * nz));
1855         }
1856 
1857         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1858         rs = 0.0;
1859         /* L part */
1860         pc3 = b->a + bi[i + 2];
1861         pj  = b->j + bi[i + 2];
1862         nz  = bi[i + 3] - bi[i + 2];
1863         for (j = 0; j < nz; j++) {
1864           col    = pj[j];
1865           pc3[j] = rtmp3[col];
1866           rs += PetscAbsScalar(pc3[j]);
1867         }
1868         /* U part */
1869         pc3 = b->a + bdiag[i + 3] + 1;
1870         pj  = b->j + bdiag[i + 3] + 1;
1871         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1872         for (j = 0; j < nz; j++) {
1873           col    = pj[j];
1874           pc3[j] = rtmp3[col];
1875           rs += PetscAbsScalar(pc3[j]);
1876         }
1877 
1878         sctx.rs = rs;
1879         sctx.pv = rtmp3[i + 2];
1880         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1881         if (sctx.newshift) break;
1882         pc3  = b->a + bdiag[i + 2];
1883         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1884 
1885         /* Now take care of 3rd column of diagonal 4x4 block. */
1886         pc4 = rtmp4 + i + 2;
1887         if (*pc4 != 0.0) {
1888           mul4 = (*pc4) * (*pc3);
1889           *pc4 = mul4;
1890           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1891           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1892           for (j = 0; j < nz; j++) {
1893             col = pj[j];
1894             rtmp4[col] -= mul4 * rtmp3[col];
1895           }
1896           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1897         }
1898 
1899         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1900         rs = 0.0;
1901         /* L part */
1902         pc4 = b->a + bi[i + 3];
1903         pj  = b->j + bi[i + 3];
1904         nz  = bi[i + 4] - bi[i + 3];
1905         for (j = 0; j < nz; j++) {
1906           col    = pj[j];
1907           pc4[j] = rtmp4[col];
1908           rs += PetscAbsScalar(pc4[j]);
1909         }
1910         /* U part */
1911         pc4 = b->a + bdiag[i + 4] + 1;
1912         pj  = b->j + bdiag[i + 4] + 1;
1913         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1914         for (j = 0; j < nz; j++) {
1915           col    = pj[j];
1916           pc4[j] = rtmp4[col];
1917           rs += PetscAbsScalar(pc4[j]);
1918         }
1919 
1920         sctx.rs = rs;
1921         sctx.pv = rtmp4[i + 3];
1922         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1923         if (sctx.newshift) break;
1924         pc4  = b->a + bdiag[i + 3];
1925         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1926         break;
1927 
1928       default:
1929         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1930       }
1931       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1932       i += nodesz;              /* Update the row */
1933     }
1934 
1935     /* MatPivotRefine() */
1936     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1937       /*
1938        * if no shift in this attempt & shifting & started shifting & can refine,
1939        * then try lower shift
1940        */
1941       sctx.shift_hi       = sctx.shift_fraction;
1942       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1943       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1944       sctx.newshift       = PETSC_TRUE;
1945       sctx.nshift++;
1946     }
1947   } while (sctx.newshift);
1948 
1949   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1950   PetscCall(PetscFree(tmp_vec2));
1951   PetscCall(ISRestoreIndices(isicol, &ic));
1952   PetscCall(ISRestoreIndices(isrow, &r));
1953 
1954   if (b->inode.size_csr) {
1955     C->ops->solve = MatSolve_SeqAIJ_Inode;
1956   } else {
1957     C->ops->solve = MatSolve_SeqAIJ;
1958   }
1959   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1960   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1961   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1962   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1963   C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1964   C->assembled              = PETSC_TRUE;
1965   C->preallocated           = PETSC_TRUE;
1966 
1967   PetscCall(PetscLogFlops(C->cmap->n));
1968 
1969   /* MatShiftView(A,info,&sctx) */
1970   if (sctx.nshift) {
1971     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1972       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1974       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1976       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1977     }
1978   }
1979   PetscFunctionReturn(PETSC_SUCCESS);
1980 }
1981 
1982 #if 0
1983 // unused
1984 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1985 {
1986   Mat              C = B;
1987   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1988   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1989   const PetscInt  *r, *ic, *c, *ics;
1990   PetscInt         n = A->rmap->n, *bi = b->i;
1991   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1992   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1993   PetscInt        *ai = a->i, *aj = a->j;
1994   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1995   PetscScalar      mul1, mul2, mul3, tmp;
1996   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1997   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1998   PetscReal        rs = 0.0;
1999   FactorShiftCtx   sctx;
2000 
2001   PetscFunctionBegin;
2002   sctx.shift_top      = 0;
2003   sctx.nshift_max     = 0;
2004   sctx.shift_lo       = 0;
2005   sctx.shift_hi       = 0;
2006   sctx.shift_fraction = 0;
2007 
2008   /* if both shift schemes are chosen by user, only use info->shiftpd */
2009   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2010     sctx.shift_top = 0;
2011     for (i = 0; i < n; i++) {
2012       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2013       rs    = 0.0;
2014       ajtmp = aj + ai[i];
2015       rtmp1 = aa + ai[i];
2016       nz    = ai[i + 1] - ai[i];
2017       for (j = 0; j < nz; j++) {
2018         if (*ajtmp != i) {
2019           rs += PetscAbsScalar(*rtmp1++);
2020         } else {
2021           rs -= PetscRealPart(*rtmp1++);
2022         }
2023         ajtmp++;
2024       }
2025       if (rs > sctx.shift_top) sctx.shift_top = rs;
2026     }
2027     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2028     sctx.shift_top *= 1.1;
2029     sctx.nshift_max = 5;
2030     sctx.shift_lo   = 0.;
2031     sctx.shift_hi   = 1.;
2032   }
2033   sctx.shift_amount = 0;
2034   sctx.nshift       = 0;
2035 
2036   PetscCall(ISGetIndices(isrow, &r));
2037   PetscCall(ISGetIndices(iscol, &c));
2038   PetscCall(ISGetIndices(isicol, &ic));
2039   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2040   ics = ic;
2041 
2042   node_max = a->inode.node_count;
2043   ns       = a->inode.size;
2044   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2045 
2046   /* If max inode size > 3, split it into two inodes.*/
2047   /* also map the inode sizes according to the ordering */
2048   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2049   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2050     if (ns[i] > 3) {
2051       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2052       ++j;
2053       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2054     } else {
2055       tmp_vec1[j] = ns[i];
2056     }
2057   }
2058   /* Use the correct node_max */
2059   node_max = j;
2060 
2061   /* Now reorder the inode info based on mat re-ordering info */
2062   /* First create a row -> inode_size_array_index map */
2063   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2064   for (i = 0, row = 0; i < node_max; i++) {
2065     nodesz = tmp_vec1[i];
2066     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2067   }
2068   /* Using nsmap, create a reordered ns structure */
2069   for (i = 0, j = 0; i < node_max; i++) {
2070     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2071     tmp_vec2[i] = nodesz;
2072     j += nodesz;
2073   }
2074   PetscCall(PetscFree2(nsmap, tmp_vec1));
2075   /* Now use the correct ns */
2076   ns = tmp_vec2;
2077 
2078   do {
2079     sctx.newshift = PETSC_FALSE;
2080     /* Now loop over each block-row, and do the factorization */
2081     for (i = 0, row = 0; i < node_max; i++) {
2082       nodesz = ns[i];
2083       nz     = bi[row + 1] - bi[row];
2084       bjtmp  = bj + bi[row];
2085 
2086       switch (nodesz) {
2087       case 1:
2088         for (j = 0; j < nz; j++) {
2089           idx         = bjtmp[j];
2090           rtmp11[idx] = 0.0;
2091         }
2092 
2093         /* load in initial (unfactored row) */
2094         idx    = r[row];
2095         nz_tmp = ai[idx + 1] - ai[idx];
2096         ajtmp  = aj + ai[idx];
2097         v1     = aa + ai[idx];
2098 
2099         for (j = 0; j < nz_tmp; j++) {
2100           idx         = ics[ajtmp[j]];
2101           rtmp11[idx] = v1[j];
2102         }
2103         rtmp11[ics[r[row]]] += sctx.shift_amount;
2104 
2105         prow = *bjtmp++;
2106         while (prow < row) {
2107           pc1 = rtmp11 + prow;
2108           if (*pc1 != 0.0) {
2109             pv     = ba + bd[prow];
2110             pj     = nbj + bd[prow];
2111             mul1   = *pc1 * *pv++;
2112             *pc1   = mul1;
2113             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2114             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2115             for (j = 0; j < nz_tmp; j++) {
2116               tmp = pv[j];
2117               idx = pj[j];
2118               rtmp11[idx] -= mul1 * tmp;
2119             }
2120           }
2121           prow = *bjtmp++;
2122         }
2123         pj  = bj + bi[row];
2124         pc1 = ba + bi[row];
2125 
2126         sctx.pv     = rtmp11[row];
2127         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2128         rs          = 0.0;
2129         for (j = 0; j < nz; j++) {
2130           idx    = pj[j];
2131           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2132           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2133         }
2134         sctx.rs = rs;
2135         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2136         if (sctx.newshift) goto endofwhile;
2137         break;
2138 
2139       case 2:
2140         for (j = 0; j < nz; j++) {
2141           idx         = bjtmp[j];
2142           rtmp11[idx] = 0.0;
2143           rtmp22[idx] = 0.0;
2144         }
2145 
2146         /* load in initial (unfactored row) */
2147         idx    = r[row];
2148         nz_tmp = ai[idx + 1] - ai[idx];
2149         ajtmp  = aj + ai[idx];
2150         v1     = aa + ai[idx];
2151         v2     = aa + ai[idx + 1];
2152         for (j = 0; j < nz_tmp; j++) {
2153           idx         = ics[ajtmp[j]];
2154           rtmp11[idx] = v1[j];
2155           rtmp22[idx] = v2[j];
2156         }
2157         rtmp11[ics[r[row]]] += sctx.shift_amount;
2158         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2159 
2160         prow = *bjtmp++;
2161         while (prow < row) {
2162           pc1 = rtmp11 + prow;
2163           pc2 = rtmp22 + prow;
2164           if (*pc1 != 0.0 || *pc2 != 0.0) {
2165             pv   = ba + bd[prow];
2166             pj   = nbj + bd[prow];
2167             mul1 = *pc1 * *pv;
2168             mul2 = *pc2 * *pv;
2169             ++pv;
2170             *pc1 = mul1;
2171             *pc2 = mul2;
2172 
2173             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2174             for (j = 0; j < nz_tmp; j++) {
2175               tmp = pv[j];
2176               idx = pj[j];
2177               rtmp11[idx] -= mul1 * tmp;
2178               rtmp22[idx] -= mul2 * tmp;
2179             }
2180             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2181           }
2182           prow = *bjtmp++;
2183         }
2184 
2185         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2186         pc1 = rtmp11 + prow;
2187         pc2 = rtmp22 + prow;
2188 
2189         sctx.pv = *pc1;
2190         pj      = bj + bi[prow];
2191         rs      = 0.0;
2192         for (j = 0; j < nz; j++) {
2193           idx = pj[j];
2194           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2195         }
2196         sctx.rs = rs;
2197         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2198         if (sctx.newshift) goto endofwhile;
2199 
2200         if (*pc2 != 0.0) {
2201           pj     = nbj + bd[prow];
2202           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2203           *pc2   = mul2;
2204           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2205           for (j = 0; j < nz_tmp; j++) {
2206             idx = pj[j];
2207             tmp = rtmp11[idx];
2208             rtmp22[idx] -= mul2 * tmp;
2209           }
2210           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2211         }
2212 
2213         pj  = bj + bi[row];
2214         pc1 = ba + bi[row];
2215         pc2 = ba + bi[row + 1];
2216 
2217         sctx.pv         = rtmp22[row + 1];
2218         rs              = 0.0;
2219         rtmp11[row]     = 1.0 / rtmp11[row];
2220         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2221         /* copy row entries from dense representation to sparse */
2222         for (j = 0; j < nz; j++) {
2223           idx    = pj[j];
2224           pc1[j] = rtmp11[idx];
2225           pc2[j] = rtmp22[idx];
2226           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2227         }
2228         sctx.rs = rs;
2229         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2230         if (sctx.newshift) goto endofwhile;
2231         break;
2232 
2233       case 3:
2234         for (j = 0; j < nz; j++) {
2235           idx         = bjtmp[j];
2236           rtmp11[idx] = 0.0;
2237           rtmp22[idx] = 0.0;
2238           rtmp33[idx] = 0.0;
2239         }
2240         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2241         idx    = r[row];
2242         nz_tmp = ai[idx + 1] - ai[idx];
2243         ajtmp  = aj + ai[idx];
2244         v1     = aa + ai[idx];
2245         v2     = aa + ai[idx + 1];
2246         v3     = aa + ai[idx + 2];
2247         for (j = 0; j < nz_tmp; j++) {
2248           idx         = ics[ajtmp[j]];
2249           rtmp11[idx] = v1[j];
2250           rtmp22[idx] = v2[j];
2251           rtmp33[idx] = v3[j];
2252         }
2253         rtmp11[ics[r[row]]] += sctx.shift_amount;
2254         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2255         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2256 
2257         /* loop over all pivot row blocks above this row block */
2258         prow = *bjtmp++;
2259         while (prow < row) {
2260           pc1 = rtmp11 + prow;
2261           pc2 = rtmp22 + prow;
2262           pc3 = rtmp33 + prow;
2263           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2264             pv   = ba + bd[prow];
2265             pj   = nbj + bd[prow];
2266             mul1 = *pc1 * *pv;
2267             mul2 = *pc2 * *pv;
2268             mul3 = *pc3 * *pv;
2269             ++pv;
2270             *pc1 = mul1;
2271             *pc2 = mul2;
2272             *pc3 = mul3;
2273 
2274             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2275             /* update this row based on pivot row */
2276             for (j = 0; j < nz_tmp; j++) {
2277               tmp = pv[j];
2278               idx = pj[j];
2279               rtmp11[idx] -= mul1 * tmp;
2280               rtmp22[idx] -= mul2 * tmp;
2281               rtmp33[idx] -= mul3 * tmp;
2282             }
2283             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2284           }
2285           prow = *bjtmp++;
2286         }
2287 
2288         /* Now take care of diagonal 3x3 block in this set of rows */
2289         /* note: prow = row here */
2290         pc1 = rtmp11 + prow;
2291         pc2 = rtmp22 + prow;
2292         pc3 = rtmp33 + prow;
2293 
2294         sctx.pv = *pc1;
2295         pj      = bj + bi[prow];
2296         rs      = 0.0;
2297         for (j = 0; j < nz; j++) {
2298           idx = pj[j];
2299           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2300         }
2301         sctx.rs = rs;
2302         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2303         if (sctx.newshift) goto endofwhile;
2304 
2305         if (*pc2 != 0.0 || *pc3 != 0.0) {
2306           mul2   = (*pc2) / (*pc1);
2307           mul3   = (*pc3) / (*pc1);
2308           *pc2   = mul2;
2309           *pc3   = mul3;
2310           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2311           pj     = nbj + bd[prow];
2312           for (j = 0; j < nz_tmp; j++) {
2313             idx = pj[j];
2314             tmp = rtmp11[idx];
2315             rtmp22[idx] -= mul2 * tmp;
2316             rtmp33[idx] -= mul3 * tmp;
2317           }
2318           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2319         }
2320         ++prow;
2321 
2322         pc2     = rtmp22 + prow;
2323         pc3     = rtmp33 + prow;
2324         sctx.pv = *pc2;
2325         pj      = bj + bi[prow];
2326         rs      = 0.0;
2327         for (j = 0; j < nz; j++) {
2328           idx = pj[j];
2329           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2330         }
2331         sctx.rs = rs;
2332         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2333         if (sctx.newshift) goto endofwhile;
2334 
2335         if (*pc3 != 0.0) {
2336           mul3   = (*pc3) / (*pc2);
2337           *pc3   = mul3;
2338           pj     = nbj + bd[prow];
2339           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2340           for (j = 0; j < nz_tmp; j++) {
2341             idx = pj[j];
2342             tmp = rtmp22[idx];
2343             rtmp33[idx] -= mul3 * tmp;
2344           }
2345           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2346         }
2347 
2348         pj  = bj + bi[row];
2349         pc1 = ba + bi[row];
2350         pc2 = ba + bi[row + 1];
2351         pc3 = ba + bi[row + 2];
2352 
2353         sctx.pv         = rtmp33[row + 2];
2354         rs              = 0.0;
2355         rtmp11[row]     = 1.0 / rtmp11[row];
2356         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2357         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2358         /* copy row entries from dense representation to sparse */
2359         for (j = 0; j < nz; j++) {
2360           idx    = pj[j];
2361           pc1[j] = rtmp11[idx];
2362           pc2[j] = rtmp22[idx];
2363           pc3[j] = rtmp33[idx];
2364           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2365         }
2366 
2367         sctx.rs = rs;
2368         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2369         if (sctx.newshift) goto endofwhile;
2370         break;
2371 
2372       default:
2373         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2374       }
2375       row += nodesz; /* Update the row */
2376     }
2377   endofwhile:;
2378   } while (sctx.newshift);
2379   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2380   PetscCall(PetscFree(tmp_vec2));
2381   PetscCall(ISRestoreIndices(isicol, &ic));
2382   PetscCall(ISRestoreIndices(isrow, &r));
2383   PetscCall(ISRestoreIndices(iscol, &c));
2384 
2385   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2386   /* do not set solve add, since MatSolve_Inode + Add is faster */
2387   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2388   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2389   C->assembled              = PETSC_TRUE;
2390   C->preallocated           = PETSC_TRUE;
2391   if (sctx.nshift) {
2392     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2393       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2394     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2395       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2396     }
2397   }
2398   PetscCall(PetscLogFlops(C->cmap->n));
2399   PetscCall(MatSeqAIJCheckInode(C));
2400   PetscFunctionReturn(PETSC_SUCCESS);
2401 }
2402 #endif
2403 
2404 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2405 {
2406   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2407   IS                 iscol = a->col, isrow = a->row;
2408   const PetscInt    *r, *c, *rout, *cout;
2409   PetscInt           i, j;
2410   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2411   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2412   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2413   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2414   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2415   const PetscScalar *b;
2416 
2417   PetscFunctionBegin;
2418   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2419   node_max = a->inode.node_count;
2420   ns       = a->inode.size_csr; /* Node Size array */
2421 
2422   PetscCall(VecGetArrayRead(bb, &b));
2423   PetscCall(VecGetArrayWrite(xx, &x));
2424   tmp = a->solve_work;
2425 
2426   PetscCall(ISGetIndices(isrow, &rout));
2427   r = rout;
2428   PetscCall(ISGetIndices(iscol, &cout));
2429   c = cout;
2430 
2431   /* forward solve the lower triangular */
2432   tmps = tmp;
2433   aa   = a_a;
2434   aj   = a_j;
2435   ad   = a->diag;
2436 
2437   for (i = 0; i < node_max; ++i) {
2438     row = ns[i];
2439     nsz = ns[i + 1] - ns[i];
2440     aii = ai[row];
2441     v1  = aa + aii;
2442     vi  = aj + aii;
2443     nz  = ai[row + 1] - ai[row];
2444 
2445     if (i < node_max - 1) {
2446       /* Prefetch the indices for the next block */
2447       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2448       /* Prefetch the data for the next block */
2449       PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2450     }
2451 
2452     switch (nsz) { /* Each loop in 'case' is unrolled */
2453     case 1:
2454       sum1 = b[r[row]];
2455       for (j = 0; j < nz - 1; j += 2) {
2456         i0   = vi[j];
2457         i1   = vi[j + 1];
2458         tmp0 = tmps[i0];
2459         tmp1 = tmps[i1];
2460         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2461       }
2462       if (j == nz - 1) {
2463         tmp0 = tmps[vi[j]];
2464         sum1 -= v1[j] * tmp0;
2465       }
2466       tmp[row++] = sum1;
2467       break;
2468     case 2:
2469       sum1 = b[r[row]];
2470       sum2 = b[r[row + 1]];
2471       v2   = aa + ai[row + 1];
2472 
2473       for (j = 0; j < nz - 1; j += 2) {
2474         i0   = vi[j];
2475         i1   = vi[j + 1];
2476         tmp0 = tmps[i0];
2477         tmp1 = tmps[i1];
2478         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2479         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2480       }
2481       if (j == nz - 1) {
2482         tmp0 = tmps[vi[j]];
2483         sum1 -= v1[j] * tmp0;
2484         sum2 -= v2[j] * tmp0;
2485       }
2486       sum2 -= v2[nz] * sum1;
2487       tmp[row++] = sum1;
2488       tmp[row++] = sum2;
2489       break;
2490     case 3:
2491       sum1 = b[r[row]];
2492       sum2 = b[r[row + 1]];
2493       sum3 = b[r[row + 2]];
2494       v2   = aa + ai[row + 1];
2495       v3   = aa + ai[row + 2];
2496 
2497       for (j = 0; j < nz - 1; j += 2) {
2498         i0   = vi[j];
2499         i1   = vi[j + 1];
2500         tmp0 = tmps[i0];
2501         tmp1 = tmps[i1];
2502         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2503         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2504         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2505       }
2506       if (j == nz - 1) {
2507         tmp0 = tmps[vi[j]];
2508         sum1 -= v1[j] * tmp0;
2509         sum2 -= v2[j] * tmp0;
2510         sum3 -= v3[j] * tmp0;
2511       }
2512       sum2 -= v2[nz] * sum1;
2513       sum3 -= v3[nz] * sum1;
2514       sum3 -= v3[nz + 1] * sum2;
2515       tmp[row++] = sum1;
2516       tmp[row++] = sum2;
2517       tmp[row++] = sum3;
2518       break;
2519 
2520     case 4:
2521       sum1 = b[r[row]];
2522       sum2 = b[r[row + 1]];
2523       sum3 = b[r[row + 2]];
2524       sum4 = b[r[row + 3]];
2525       v2   = aa + ai[row + 1];
2526       v3   = aa + ai[row + 2];
2527       v4   = aa + ai[row + 3];
2528 
2529       for (j = 0; j < nz - 1; j += 2) {
2530         i0   = vi[j];
2531         i1   = vi[j + 1];
2532         tmp0 = tmps[i0];
2533         tmp1 = tmps[i1];
2534         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2535         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2536         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2537         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2538       }
2539       if (j == nz - 1) {
2540         tmp0 = tmps[vi[j]];
2541         sum1 -= v1[j] * tmp0;
2542         sum2 -= v2[j] * tmp0;
2543         sum3 -= v3[j] * tmp0;
2544         sum4 -= v4[j] * tmp0;
2545       }
2546       sum2 -= v2[nz] * sum1;
2547       sum3 -= v3[nz] * sum1;
2548       sum4 -= v4[nz] * sum1;
2549       sum3 -= v3[nz + 1] * sum2;
2550       sum4 -= v4[nz + 1] * sum2;
2551       sum4 -= v4[nz + 2] * sum3;
2552 
2553       tmp[row++] = sum1;
2554       tmp[row++] = sum2;
2555       tmp[row++] = sum3;
2556       tmp[row++] = sum4;
2557       break;
2558     case 5:
2559       sum1 = b[r[row]];
2560       sum2 = b[r[row + 1]];
2561       sum3 = b[r[row + 2]];
2562       sum4 = b[r[row + 3]];
2563       sum5 = b[r[row + 4]];
2564       v2   = aa + ai[row + 1];
2565       v3   = aa + ai[row + 2];
2566       v4   = aa + ai[row + 3];
2567       v5   = aa + ai[row + 4];
2568 
2569       for (j = 0; j < nz - 1; j += 2) {
2570         i0   = vi[j];
2571         i1   = vi[j + 1];
2572         tmp0 = tmps[i0];
2573         tmp1 = tmps[i1];
2574         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2575         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2576         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2577         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2578         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2579       }
2580       if (j == nz - 1) {
2581         tmp0 = tmps[vi[j]];
2582         sum1 -= v1[j] * tmp0;
2583         sum2 -= v2[j] * tmp0;
2584         sum3 -= v3[j] * tmp0;
2585         sum4 -= v4[j] * tmp0;
2586         sum5 -= v5[j] * tmp0;
2587       }
2588 
2589       sum2 -= v2[nz] * sum1;
2590       sum3 -= v3[nz] * sum1;
2591       sum4 -= v4[nz] * sum1;
2592       sum5 -= v5[nz] * sum1;
2593       sum3 -= v3[nz + 1] * sum2;
2594       sum4 -= v4[nz + 1] * sum2;
2595       sum5 -= v5[nz + 1] * sum2;
2596       sum4 -= v4[nz + 2] * sum3;
2597       sum5 -= v5[nz + 2] * sum3;
2598       sum5 -= v5[nz + 3] * sum4;
2599 
2600       tmp[row++] = sum1;
2601       tmp[row++] = sum2;
2602       tmp[row++] = sum3;
2603       tmp[row++] = sum4;
2604       tmp[row++] = sum5;
2605       break;
2606     default:
2607       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2608     }
2609   }
2610   /* backward solve the upper triangular */
2611   for (i = node_max - 1; i >= 0; i--) {
2612     row = ns[i + 1] - 1;
2613     nsz = ns[i + 1] - ns[i];
2614     aii = ad[row + 1] + 1;
2615     v1  = aa + aii;
2616     vi  = aj + aii;
2617     nz  = ad[row] - ad[row + 1] - 1;
2618 
2619     if (i > 0) {
2620       /* Prefetch the indices for the next block */
2621       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2622       /* Prefetch the data for the next block */
2623       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2624     }
2625 
2626     switch (nsz) { /* Each loop in 'case' is unrolled */
2627     case 1:
2628       sum1 = tmp[row];
2629 
2630       for (j = 0; j < nz - 1; j += 2) {
2631         i0   = vi[j];
2632         i1   = vi[j + 1];
2633         tmp0 = tmps[i0];
2634         tmp1 = tmps[i1];
2635         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2636       }
2637       if (j == nz - 1) {
2638         tmp0 = tmps[vi[j]];
2639         sum1 -= v1[j] * tmp0;
2640       }
2641       x[c[row]] = tmp[row] = sum1 * v1[nz];
2642       row--;
2643       break;
2644     case 2:
2645       sum1 = tmp[row];
2646       sum2 = tmp[row - 1];
2647       v2   = aa + ad[row] + 1;
2648       for (j = 0; j < nz - 1; j += 2) {
2649         i0   = vi[j];
2650         i1   = vi[j + 1];
2651         tmp0 = tmps[i0];
2652         tmp1 = tmps[i1];
2653         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2654         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2655       }
2656       if (j == nz - 1) {
2657         tmp0 = tmps[vi[j]];
2658         sum1 -= v1[j] * tmp0;
2659         sum2 -= v2[j + 1] * tmp0;
2660       }
2661 
2662       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2663       row--;
2664       sum2 -= v2[0] * tmp0;
2665       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2666       row--;
2667       break;
2668     case 3:
2669       sum1 = tmp[row];
2670       sum2 = tmp[row - 1];
2671       sum3 = tmp[row - 2];
2672       v2   = aa + ad[row] + 1;
2673       v3   = aa + ad[row - 1] + 1;
2674       for (j = 0; j < nz - 1; j += 2) {
2675         i0   = vi[j];
2676         i1   = vi[j + 1];
2677         tmp0 = tmps[i0];
2678         tmp1 = tmps[i1];
2679         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2680         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2681         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2682       }
2683       if (j == nz - 1) {
2684         tmp0 = tmps[vi[j]];
2685         sum1 -= v1[j] * tmp0;
2686         sum2 -= v2[j + 1] * tmp0;
2687         sum3 -= v3[j + 2] * tmp0;
2688       }
2689       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2690       row--;
2691       sum2 -= v2[0] * tmp0;
2692       sum3 -= v3[1] * tmp0;
2693       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2694       row--;
2695       sum3 -= v3[0] * tmp0;
2696       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2697       row--;
2698 
2699       break;
2700     case 4:
2701       sum1 = tmp[row];
2702       sum2 = tmp[row - 1];
2703       sum3 = tmp[row - 2];
2704       sum4 = tmp[row - 3];
2705       v2   = aa + ad[row] + 1;
2706       v3   = aa + ad[row - 1] + 1;
2707       v4   = aa + ad[row - 2] + 1;
2708 
2709       for (j = 0; j < nz - 1; j += 2) {
2710         i0   = vi[j];
2711         i1   = vi[j + 1];
2712         tmp0 = tmps[i0];
2713         tmp1 = tmps[i1];
2714         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2715         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2716         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2717         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2718       }
2719       if (j == nz - 1) {
2720         tmp0 = tmps[vi[j]];
2721         sum1 -= v1[j] * tmp0;
2722         sum2 -= v2[j + 1] * tmp0;
2723         sum3 -= v3[j + 2] * tmp0;
2724         sum4 -= v4[j + 3] * tmp0;
2725       }
2726 
2727       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2728       row--;
2729       sum2 -= v2[0] * tmp0;
2730       sum3 -= v3[1] * tmp0;
2731       sum4 -= v4[2] * tmp0;
2732       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2733       row--;
2734       sum3 -= v3[0] * tmp0;
2735       sum4 -= v4[1] * tmp0;
2736       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2737       row--;
2738       sum4 -= v4[0] * tmp0;
2739       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2740       row--;
2741       break;
2742     case 5:
2743       sum1 = tmp[row];
2744       sum2 = tmp[row - 1];
2745       sum3 = tmp[row - 2];
2746       sum4 = tmp[row - 3];
2747       sum5 = tmp[row - 4];
2748       v2   = aa + ad[row] + 1;
2749       v3   = aa + ad[row - 1] + 1;
2750       v4   = aa + ad[row - 2] + 1;
2751       v5   = aa + ad[row - 3] + 1;
2752       for (j = 0; j < nz - 1; j += 2) {
2753         i0   = vi[j];
2754         i1   = vi[j + 1];
2755         tmp0 = tmps[i0];
2756         tmp1 = tmps[i1];
2757         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2758         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2759         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2760         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2761         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2762       }
2763       if (j == nz - 1) {
2764         tmp0 = tmps[vi[j]];
2765         sum1 -= v1[j] * tmp0;
2766         sum2 -= v2[j + 1] * tmp0;
2767         sum3 -= v3[j + 2] * tmp0;
2768         sum4 -= v4[j + 3] * tmp0;
2769         sum5 -= v5[j + 4] * tmp0;
2770       }
2771 
2772       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2773       row--;
2774       sum2 -= v2[0] * tmp0;
2775       sum3 -= v3[1] * tmp0;
2776       sum4 -= v4[2] * tmp0;
2777       sum5 -= v5[3] * tmp0;
2778       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2779       row--;
2780       sum3 -= v3[0] * tmp0;
2781       sum4 -= v4[1] * tmp0;
2782       sum5 -= v5[2] * tmp0;
2783       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2784       row--;
2785       sum4 -= v4[0] * tmp0;
2786       sum5 -= v5[1] * tmp0;
2787       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2788       row--;
2789       sum5 -= v5[0] * tmp0;
2790       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2791       row--;
2792       break;
2793     default:
2794       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2795     }
2796   }
2797   PetscCall(ISRestoreIndices(isrow, &rout));
2798   PetscCall(ISRestoreIndices(iscol, &cout));
2799   PetscCall(VecRestoreArrayRead(bb, &b));
2800   PetscCall(VecRestoreArrayWrite(xx, &x));
2801   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2802   PetscFunctionReturn(PETSC_SUCCESS);
2803 }
2804 
2805 /*
2806      Makes a longer coloring[] array and calls the usual code with that
2807 */
2808 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2809 {
2810   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2811   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
2812   PetscInt        *colorused, i;
2813   ISColoringValue *newcolor;
2814 
2815   PetscFunctionBegin;
2816   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2817   PetscCall(PetscMalloc1(n + 1, &newcolor));
2818   /* loop over inodes, marking a color for each column*/
2819   row = 0;
2820   for (i = 0; i < m; i++) {
2821     for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2822   }
2823 
2824   /* eliminate unneeded colors */
2825   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2826   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2827 
2828   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2829   ncolors = colorused[5 * ncolors - 1];
2830   for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2831   PetscCall(PetscFree(colorused));
2832   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2833   PetscCall(PetscFree(coloring));
2834   PetscFunctionReturn(PETSC_SUCCESS);
2835 }
2836 
2837 #include <petsc/private/kernels/blockinvert.h>
2838 
2839 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2840 {
2841   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2842   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2843   MatScalar         *ibdiag, *bdiag, work[25], *t;
2844   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2845   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2846   const PetscScalar *xb, *b;
2847   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2848   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz;
2849   PetscInt           sz, k, ipvt[5];
2850   PetscBool          allowzeropivot, zeropivotdetected;
2851   const PetscInt    *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i;
2852 
2853   PetscFunctionBegin;
2854   allowzeropivot = PetscNot(A->erroriffailure);
2855   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2856   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2857   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2858 
2859   if (!a->inode.ibdiagvalid) {
2860     if (!a->inode.ibdiag) {
2861       /* calculate space needed for diagonal blocks */
2862       for (i = 0; i < m; i++) {
2863         nodesz = sizes[i + 1] - sizes[i];
2864         cnt += nodesz * nodesz;
2865       }
2866       a->inode.bdiagsize = cnt;
2867 
2868       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2869     }
2870 
2871     /* copy over the diagonal blocks and invert them */
2872     ibdiag = a->inode.ibdiag;
2873     bdiag  = a->inode.bdiag;
2874     cnt    = 0;
2875     for (i = 0, row = 0; i < m; i++) {
2876       nodesz = sizes[i + 1] - sizes[i];
2877       for (j = 0; j < nodesz; j++) {
2878         for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
2879       }
2880       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
2881 
2882       switch (nodesz) {
2883       case 1:
2884         /* Create matrix data structure */
2885         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2886           PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2887           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2888           A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2889           A->factorerror_zeropivot_row   = row;
2890           PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2891         }
2892         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2893         break;
2894       case 2:
2895         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2896         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2897         break;
2898       case 3:
2899         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2900         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2901         break;
2902       case 4:
2903         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2904         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2905         break;
2906       case 5:
2907         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2908         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2909         break;
2910       default:
2911         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2912       }
2913       cnt += nodesz * nodesz;
2914       row += nodesz;
2915     }
2916     a->inode.ibdiagvalid = PETSC_TRUE;
2917   }
2918   ibdiag = a->inode.ibdiag;
2919   bdiag  = a->inode.bdiag;
2920   t      = a->inode.ssor_work;
2921 
2922   PetscCall(VecGetArray(xx, &x));
2923   PetscCall(VecGetArrayRead(bb, &b));
2924   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2925   if (flag & SOR_ZERO_INITIAL_GUESS) {
2926     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2927       for (i = 0, row = 0; i < m; i++) {
2928         sz  = diag[row] - ii[row];
2929         v1  = a->a + ii[row];
2930         idx = a->j + ii[row];
2931 
2932         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2933         nodesz = sizes[i + 1] - sizes[i];
2934         switch (nodesz) {
2935         case 1:
2936 
2937           sum1 = b[row];
2938           for (n = 0; n < sz - 1; n += 2) {
2939             i1 = idx[0];
2940             i2 = idx[1];
2941             idx += 2;
2942             tmp0 = x[i1];
2943             tmp1 = x[i2];
2944             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2945             v1 += 2;
2946           }
2947 
2948           if (n == sz - 1) {
2949             tmp0 = x[*idx];
2950             sum1 -= *v1 * tmp0;
2951           }
2952           t[row]   = sum1;
2953           x[row++] = sum1 * (*ibdiag++);
2954           break;
2955         case 2:
2956           v2   = a->a + ii[row + 1];
2957           sum1 = b[row];
2958           sum2 = b[row + 1];
2959           for (n = 0; n < sz - 1; n += 2) {
2960             i1 = idx[0];
2961             i2 = idx[1];
2962             idx += 2;
2963             tmp0 = x[i1];
2964             tmp1 = x[i2];
2965             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2966             v1 += 2;
2967             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2968             v2 += 2;
2969           }
2970 
2971           if (n == sz - 1) {
2972             tmp0 = x[*idx];
2973             sum1 -= v1[0] * tmp0;
2974             sum2 -= v2[0] * tmp0;
2975           }
2976           t[row]     = sum1;
2977           t[row + 1] = sum2;
2978           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2979           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2980           ibdiag += 4;
2981           break;
2982         case 3:
2983           v2   = a->a + ii[row + 1];
2984           v3   = a->a + ii[row + 2];
2985           sum1 = b[row];
2986           sum2 = b[row + 1];
2987           sum3 = b[row + 2];
2988           for (n = 0; n < sz - 1; n += 2) {
2989             i1 = idx[0];
2990             i2 = idx[1];
2991             idx += 2;
2992             tmp0 = x[i1];
2993             tmp1 = x[i2];
2994             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2995             v1 += 2;
2996             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2997             v2 += 2;
2998             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2999             v3 += 2;
3000           }
3001 
3002           if (n == sz - 1) {
3003             tmp0 = x[*idx];
3004             sum1 -= v1[0] * tmp0;
3005             sum2 -= v2[0] * tmp0;
3006             sum3 -= v3[0] * tmp0;
3007           }
3008           t[row]     = sum1;
3009           t[row + 1] = sum2;
3010           t[row + 2] = sum3;
3011           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3012           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3013           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3014           ibdiag += 9;
3015           break;
3016         case 4:
3017           v2   = a->a + ii[row + 1];
3018           v3   = a->a + ii[row + 2];
3019           v4   = a->a + ii[row + 3];
3020           sum1 = b[row];
3021           sum2 = b[row + 1];
3022           sum3 = b[row + 2];
3023           sum4 = b[row + 3];
3024           for (n = 0; n < sz - 1; n += 2) {
3025             i1 = idx[0];
3026             i2 = idx[1];
3027             idx += 2;
3028             tmp0 = x[i1];
3029             tmp1 = x[i2];
3030             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3031             v1 += 2;
3032             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3033             v2 += 2;
3034             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3035             v3 += 2;
3036             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3037             v4 += 2;
3038           }
3039 
3040           if (n == sz - 1) {
3041             tmp0 = x[*idx];
3042             sum1 -= v1[0] * tmp0;
3043             sum2 -= v2[0] * tmp0;
3044             sum3 -= v3[0] * tmp0;
3045             sum4 -= v4[0] * tmp0;
3046           }
3047           t[row]     = sum1;
3048           t[row + 1] = sum2;
3049           t[row + 2] = sum3;
3050           t[row + 3] = sum4;
3051           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3052           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3053           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3054           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3055           ibdiag += 16;
3056           break;
3057         case 5:
3058           v2   = a->a + ii[row + 1];
3059           v3   = a->a + ii[row + 2];
3060           v4   = a->a + ii[row + 3];
3061           v5   = a->a + ii[row + 4];
3062           sum1 = b[row];
3063           sum2 = b[row + 1];
3064           sum3 = b[row + 2];
3065           sum4 = b[row + 3];
3066           sum5 = b[row + 4];
3067           for (n = 0; n < sz - 1; n += 2) {
3068             i1 = idx[0];
3069             i2 = idx[1];
3070             idx += 2;
3071             tmp0 = x[i1];
3072             tmp1 = x[i2];
3073             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3074             v1 += 2;
3075             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3076             v2 += 2;
3077             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3078             v3 += 2;
3079             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3080             v4 += 2;
3081             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3082             v5 += 2;
3083           }
3084 
3085           if (n == sz - 1) {
3086             tmp0 = x[*idx];
3087             sum1 -= v1[0] * tmp0;
3088             sum2 -= v2[0] * tmp0;
3089             sum3 -= v3[0] * tmp0;
3090             sum4 -= v4[0] * tmp0;
3091             sum5 -= v5[0] * tmp0;
3092           }
3093           t[row]     = sum1;
3094           t[row + 1] = sum2;
3095           t[row + 2] = sum3;
3096           t[row + 3] = sum4;
3097           t[row + 4] = sum5;
3098           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3099           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3100           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3101           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3102           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3103           ibdiag += 25;
3104           break;
3105         default:
3106           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3107         }
3108       }
3109 
3110       xb = t;
3111       PetscCall(PetscLogFlops(a->nz));
3112     } else xb = b;
3113     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3114       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3115       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3116         nodesz = sizes[i + 1] - sizes[i];
3117         ibdiag -= nodesz * nodesz;
3118         sz  = ii[row + 1] - diag[row] - 1;
3119         v1  = a->a + diag[row] + 1;
3120         idx = a->j + diag[row] + 1;
3121 
3122         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3123         switch (nodesz) {
3124         case 1:
3125 
3126           sum1 = xb[row];
3127           for (n = 0; n < sz - 1; n += 2) {
3128             i1 = idx[0];
3129             i2 = idx[1];
3130             idx += 2;
3131             tmp0 = x[i1];
3132             tmp1 = x[i2];
3133             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3134             v1 += 2;
3135           }
3136 
3137           if (n == sz - 1) {
3138             tmp0 = x[*idx];
3139             sum1 -= *v1 * tmp0;
3140           }
3141           x[row--] = sum1 * (*ibdiag);
3142           break;
3143 
3144         case 2:
3145 
3146           sum1 = xb[row];
3147           sum2 = xb[row - 1];
3148           /* note that sum1 is associated with the second of the two rows */
3149           v2 = a->a + diag[row - 1] + 2;
3150           for (n = 0; n < sz - 1; n += 2) {
3151             i1 = idx[0];
3152             i2 = idx[1];
3153             idx += 2;
3154             tmp0 = x[i1];
3155             tmp1 = x[i2];
3156             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3157             v1 += 2;
3158             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3159             v2 += 2;
3160           }
3161 
3162           if (n == sz - 1) {
3163             tmp0 = x[*idx];
3164             sum1 -= *v1 * tmp0;
3165             sum2 -= *v2 * tmp0;
3166           }
3167           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3168           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3169           break;
3170         case 3:
3171 
3172           sum1 = xb[row];
3173           sum2 = xb[row - 1];
3174           sum3 = xb[row - 2];
3175           v2   = a->a + diag[row - 1] + 2;
3176           v3   = a->a + diag[row - 2] + 3;
3177           for (n = 0; n < sz - 1; n += 2) {
3178             i1 = idx[0];
3179             i2 = idx[1];
3180             idx += 2;
3181             tmp0 = x[i1];
3182             tmp1 = x[i2];
3183             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3184             v1 += 2;
3185             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3186             v2 += 2;
3187             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3188             v3 += 2;
3189           }
3190 
3191           if (n == sz - 1) {
3192             tmp0 = x[*idx];
3193             sum1 -= *v1 * tmp0;
3194             sum2 -= *v2 * tmp0;
3195             sum3 -= *v3 * tmp0;
3196           }
3197           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3198           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3199           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3200           break;
3201         case 4:
3202 
3203           sum1 = xb[row];
3204           sum2 = xb[row - 1];
3205           sum3 = xb[row - 2];
3206           sum4 = xb[row - 3];
3207           v2   = a->a + diag[row - 1] + 2;
3208           v3   = a->a + diag[row - 2] + 3;
3209           v4   = a->a + diag[row - 3] + 4;
3210           for (n = 0; n < sz - 1; n += 2) {
3211             i1 = idx[0];
3212             i2 = idx[1];
3213             idx += 2;
3214             tmp0 = x[i1];
3215             tmp1 = x[i2];
3216             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3217             v1 += 2;
3218             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3219             v2 += 2;
3220             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3221             v3 += 2;
3222             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3223             v4 += 2;
3224           }
3225 
3226           if (n == sz - 1) {
3227             tmp0 = x[*idx];
3228             sum1 -= *v1 * tmp0;
3229             sum2 -= *v2 * tmp0;
3230             sum3 -= *v3 * tmp0;
3231             sum4 -= *v4 * tmp0;
3232           }
3233           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3234           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3235           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3236           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3237           break;
3238         case 5:
3239 
3240           sum1 = xb[row];
3241           sum2 = xb[row - 1];
3242           sum3 = xb[row - 2];
3243           sum4 = xb[row - 3];
3244           sum5 = xb[row - 4];
3245           v2   = a->a + diag[row - 1] + 2;
3246           v3   = a->a + diag[row - 2] + 3;
3247           v4   = a->a + diag[row - 3] + 4;
3248           v5   = a->a + diag[row - 4] + 5;
3249           for (n = 0; n < sz - 1; n += 2) {
3250             i1 = idx[0];
3251             i2 = idx[1];
3252             idx += 2;
3253             tmp0 = x[i1];
3254             tmp1 = x[i2];
3255             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3256             v1 += 2;
3257             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3258             v2 += 2;
3259             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3260             v3 += 2;
3261             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3262             v4 += 2;
3263             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3264             v5 += 2;
3265           }
3266 
3267           if (n == sz - 1) {
3268             tmp0 = x[*idx];
3269             sum1 -= *v1 * tmp0;
3270             sum2 -= *v2 * tmp0;
3271             sum3 -= *v3 * tmp0;
3272             sum4 -= *v4 * tmp0;
3273             sum5 -= *v5 * tmp0;
3274           }
3275           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3276           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3277           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3278           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3279           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3280           break;
3281         default:
3282           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3283         }
3284       }
3285 
3286       PetscCall(PetscLogFlops(a->nz));
3287     }
3288     its--;
3289   }
3290   while (its--) {
3291     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3292       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
3293         nodesz = sizes[i + 1] - sizes[i];
3294         sz     = diag[row] - ii[row];
3295         v1     = a->a + ii[row];
3296         idx    = a->j + ii[row];
3297         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3298         switch (nodesz) {
3299         case 1:
3300           sum1 = b[row];
3301           for (n = 0; n < sz - 1; n += 2) {
3302             i1 = idx[0];
3303             i2 = idx[1];
3304             idx += 2;
3305             tmp0 = x[i1];
3306             tmp1 = x[i2];
3307             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3308             v1 += 2;
3309           }
3310           if (n == sz - 1) {
3311             tmp0 = x[*idx++];
3312             sum1 -= *v1 * tmp0;
3313             v1++;
3314           }
3315           t[row] = sum1;
3316           sz     = ii[row + 1] - diag[row] - 1;
3317           idx    = a->j + diag[row] + 1;
3318           v1 += 1;
3319           for (n = 0; n < sz - 1; n += 2) {
3320             i1 = idx[0];
3321             i2 = idx[1];
3322             idx += 2;
3323             tmp0 = x[i1];
3324             tmp1 = x[i2];
3325             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3326             v1 += 2;
3327           }
3328           if (n == sz - 1) {
3329             tmp0 = x[*idx++];
3330             sum1 -= *v1 * tmp0;
3331           }
3332           /* in MatSOR_SeqAIJ this line would be
3333            *
3334            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3335            *
3336            * but omega == 1, so this becomes
3337            *
3338            * x[row] = sum1*(*ibdiag++);
3339            *
3340            */
3341           x[row] = sum1 * (*ibdiag);
3342           break;
3343         case 2:
3344           v2   = a->a + ii[row + 1];
3345           sum1 = b[row];
3346           sum2 = b[row + 1];
3347           for (n = 0; n < sz - 1; n += 2) {
3348             i1 = idx[0];
3349             i2 = idx[1];
3350             idx += 2;
3351             tmp0 = x[i1];
3352             tmp1 = x[i2];
3353             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3354             v1 += 2;
3355             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3356             v2 += 2;
3357           }
3358           if (n == sz - 1) {
3359             tmp0 = x[*idx++];
3360             sum1 -= v1[0] * tmp0;
3361             sum2 -= v2[0] * tmp0;
3362             v1++;
3363             v2++;
3364           }
3365           t[row]     = sum1;
3366           t[row + 1] = sum2;
3367           sz         = ii[row + 1] - diag[row] - 2;
3368           idx        = a->j + diag[row] + 2;
3369           v1 += 2;
3370           v2 += 2;
3371           for (n = 0; n < sz - 1; n += 2) {
3372             i1 = idx[0];
3373             i2 = idx[1];
3374             idx += 2;
3375             tmp0 = x[i1];
3376             tmp1 = x[i2];
3377             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3378             v1 += 2;
3379             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3380             v2 += 2;
3381           }
3382           if (n == sz - 1) {
3383             tmp0 = x[*idx];
3384             sum1 -= v1[0] * tmp0;
3385             sum2 -= v2[0] * tmp0;
3386           }
3387           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3388           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3389           break;
3390         case 3:
3391           v2   = a->a + ii[row + 1];
3392           v3   = a->a + ii[row + 2];
3393           sum1 = b[row];
3394           sum2 = b[row + 1];
3395           sum3 = b[row + 2];
3396           for (n = 0; n < sz - 1; n += 2) {
3397             i1 = idx[0];
3398             i2 = idx[1];
3399             idx += 2;
3400             tmp0 = x[i1];
3401             tmp1 = x[i2];
3402             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3403             v1 += 2;
3404             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3405             v2 += 2;
3406             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3407             v3 += 2;
3408           }
3409           if (n == sz - 1) {
3410             tmp0 = x[*idx++];
3411             sum1 -= v1[0] * tmp0;
3412             sum2 -= v2[0] * tmp0;
3413             sum3 -= v3[0] * tmp0;
3414             v1++;
3415             v2++;
3416             v3++;
3417           }
3418           t[row]     = sum1;
3419           t[row + 1] = sum2;
3420           t[row + 2] = sum3;
3421           sz         = ii[row + 1] - diag[row] - 3;
3422           idx        = a->j + diag[row] + 3;
3423           v1 += 3;
3424           v2 += 3;
3425           v3 += 3;
3426           for (n = 0; n < sz - 1; n += 2) {
3427             i1 = idx[0];
3428             i2 = idx[1];
3429             idx += 2;
3430             tmp0 = x[i1];
3431             tmp1 = x[i2];
3432             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3433             v1 += 2;
3434             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3435             v2 += 2;
3436             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3437             v3 += 2;
3438           }
3439           if (n == sz - 1) {
3440             tmp0 = x[*idx];
3441             sum1 -= v1[0] * tmp0;
3442             sum2 -= v2[0] * tmp0;
3443             sum3 -= v3[0] * tmp0;
3444           }
3445           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3446           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3447           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3448           break;
3449         case 4:
3450           v2   = a->a + ii[row + 1];
3451           v3   = a->a + ii[row + 2];
3452           v4   = a->a + ii[row + 3];
3453           sum1 = b[row];
3454           sum2 = b[row + 1];
3455           sum3 = b[row + 2];
3456           sum4 = b[row + 3];
3457           for (n = 0; n < sz - 1; n += 2) {
3458             i1 = idx[0];
3459             i2 = idx[1];
3460             idx += 2;
3461             tmp0 = x[i1];
3462             tmp1 = x[i2];
3463             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3464             v1 += 2;
3465             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3466             v2 += 2;
3467             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3468             v3 += 2;
3469             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3470             v4 += 2;
3471           }
3472           if (n == sz - 1) {
3473             tmp0 = x[*idx++];
3474             sum1 -= v1[0] * tmp0;
3475             sum2 -= v2[0] * tmp0;
3476             sum3 -= v3[0] * tmp0;
3477             sum4 -= v4[0] * tmp0;
3478             v1++;
3479             v2++;
3480             v3++;
3481             v4++;
3482           }
3483           t[row]     = sum1;
3484           t[row + 1] = sum2;
3485           t[row + 2] = sum3;
3486           t[row + 3] = sum4;
3487           sz         = ii[row + 1] - diag[row] - 4;
3488           idx        = a->j + diag[row] + 4;
3489           v1 += 4;
3490           v2 += 4;
3491           v3 += 4;
3492           v4 += 4;
3493           for (n = 0; n < sz - 1; n += 2) {
3494             i1 = idx[0];
3495             i2 = idx[1];
3496             idx += 2;
3497             tmp0 = x[i1];
3498             tmp1 = x[i2];
3499             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3500             v1 += 2;
3501             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3502             v2 += 2;
3503             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3504             v3 += 2;
3505             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3506             v4 += 2;
3507           }
3508           if (n == sz - 1) {
3509             tmp0 = x[*idx];
3510             sum1 -= v1[0] * tmp0;
3511             sum2 -= v2[0] * tmp0;
3512             sum3 -= v3[0] * tmp0;
3513             sum4 -= v4[0] * tmp0;
3514           }
3515           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3516           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3517           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3518           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3519           break;
3520         case 5:
3521           v2   = a->a + ii[row + 1];
3522           v3   = a->a + ii[row + 2];
3523           v4   = a->a + ii[row + 3];
3524           v5   = a->a + ii[row + 4];
3525           sum1 = b[row];
3526           sum2 = b[row + 1];
3527           sum3 = b[row + 2];
3528           sum4 = b[row + 3];
3529           sum5 = b[row + 4];
3530           for (n = 0; n < sz - 1; n += 2) {
3531             i1 = idx[0];
3532             i2 = idx[1];
3533             idx += 2;
3534             tmp0 = x[i1];
3535             tmp1 = x[i2];
3536             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3537             v1 += 2;
3538             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3539             v2 += 2;
3540             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3541             v3 += 2;
3542             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3543             v4 += 2;
3544             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3545             v5 += 2;
3546           }
3547           if (n == sz - 1) {
3548             tmp0 = x[*idx++];
3549             sum1 -= v1[0] * tmp0;
3550             sum2 -= v2[0] * tmp0;
3551             sum3 -= v3[0] * tmp0;
3552             sum4 -= v4[0] * tmp0;
3553             sum5 -= v5[0] * tmp0;
3554             v1++;
3555             v2++;
3556             v3++;
3557             v4++;
3558             v5++;
3559           }
3560           t[row]     = sum1;
3561           t[row + 1] = sum2;
3562           t[row + 2] = sum3;
3563           t[row + 3] = sum4;
3564           t[row + 4] = sum5;
3565           sz         = ii[row + 1] - diag[row] - 5;
3566           idx        = a->j + diag[row] + 5;
3567           v1 += 5;
3568           v2 += 5;
3569           v3 += 5;
3570           v4 += 5;
3571           v5 += 5;
3572           for (n = 0; n < sz - 1; n += 2) {
3573             i1 = idx[0];
3574             i2 = idx[1];
3575             idx += 2;
3576             tmp0 = x[i1];
3577             tmp1 = x[i2];
3578             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3579             v1 += 2;
3580             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3581             v2 += 2;
3582             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3583             v3 += 2;
3584             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3585             v4 += 2;
3586             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3587             v5 += 2;
3588           }
3589           if (n == sz - 1) {
3590             tmp0 = x[*idx];
3591             sum1 -= v1[0] * tmp0;
3592             sum2 -= v2[0] * tmp0;
3593             sum3 -= v3[0] * tmp0;
3594             sum4 -= v4[0] * tmp0;
3595             sum5 -= v5[0] * tmp0;
3596           }
3597           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3598           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3599           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3600           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3601           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3602           break;
3603         default:
3604           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3605         }
3606       }
3607       xb = t;
3608       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3609     } else xb = b;
3610 
3611     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3612       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3613       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3614         nodesz = sizes[i + 1] - sizes[i];
3615         ibdiag -= nodesz * nodesz;
3616 
3617         /* set RHS */
3618         if (xb == b) {
3619           /* whole (old way) */
3620           sz  = ii[row + 1] - ii[row];
3621           idx = a->j + ii[row];
3622           switch (nodesz) {
3623           case 5:
3624             v5 = a->a + ii[row - 4]; /* fall through */
3625           case 4:
3626             v4 = a->a + ii[row - 3]; /* fall through */
3627           case 3:
3628             v3 = a->a + ii[row - 2]; /* fall through */
3629           case 2:
3630             v2 = a->a + ii[row - 1]; /* fall through */
3631           case 1:
3632             v1 = a->a + ii[row];
3633             break;
3634           default:
3635             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3636           }
3637         } else {
3638           /* upper, no diag */
3639           sz  = ii[row + 1] - diag[row] - 1;
3640           idx = a->j + diag[row] + 1;
3641           switch (nodesz) {
3642           case 5:
3643             v5 = a->a + diag[row - 4] + 5; /* fall through */
3644           case 4:
3645             v4 = a->a + diag[row - 3] + 4; /* fall through */
3646           case 3:
3647             v3 = a->a + diag[row - 2] + 3; /* fall through */
3648           case 2:
3649             v2 = a->a + diag[row - 1] + 2; /* fall through */
3650           case 1:
3651             v1 = a->a + diag[row] + 1;
3652           }
3653         }
3654         /* set sum */
3655         switch (nodesz) {
3656         case 5:
3657           sum5 = xb[row - 4]; /* fall through */
3658         case 4:
3659           sum4 = xb[row - 3]; /* fall through */
3660         case 3:
3661           sum3 = xb[row - 2]; /* fall through */
3662         case 2:
3663           sum2 = xb[row - 1]; /* fall through */
3664         case 1:
3665           /* note that sum1 is associated with the last row */
3666           sum1 = xb[row];
3667         }
3668         /* do sums */
3669         for (n = 0; n < sz - 1; n += 2) {
3670           i1 = idx[0];
3671           i2 = idx[1];
3672           idx += 2;
3673           tmp0 = x[i1];
3674           tmp1 = x[i2];
3675           switch (nodesz) {
3676           case 5:
3677             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3678             v5 += 2; /* fall through */
3679           case 4:
3680             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3681             v4 += 2; /* fall through */
3682           case 3:
3683             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3684             v3 += 2; /* fall through */
3685           case 2:
3686             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3687             v2 += 2; /* fall through */
3688           case 1:
3689             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3690             v1 += 2;
3691           }
3692         }
3693         /* ragged edge */
3694         if (n == sz - 1) {
3695           tmp0 = x[*idx];
3696           switch (nodesz) {
3697           case 5:
3698             sum5 -= *v5 * tmp0; /* fall through */
3699           case 4:
3700             sum4 -= *v4 * tmp0; /* fall through */
3701           case 3:
3702             sum3 -= *v3 * tmp0; /* fall through */
3703           case 2:
3704             sum2 -= *v2 * tmp0; /* fall through */
3705           case 1:
3706             sum1 -= *v1 * tmp0;
3707           }
3708         }
3709         /* update */
3710         if (xb == b) {
3711           /* whole (old way) w/ diag */
3712           switch (nodesz) {
3713           case 5:
3714             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3715             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3716             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3717             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3718             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3719             break;
3720           case 4:
3721             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3722             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3723             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3724             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3725             break;
3726           case 3:
3727             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3728             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3729             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3730             break;
3731           case 2:
3732             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3733             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3734             break;
3735           case 1:
3736             x[row--] += sum1 * (*ibdiag);
3737             break;
3738           }
3739         } else {
3740           /* no diag so set =  */
3741           switch (nodesz) {
3742           case 5:
3743             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3744             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3745             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3746             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3747             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3748             break;
3749           case 4:
3750             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3751             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3752             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3753             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3754             break;
3755           case 3:
3756             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3757             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3758             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3759             break;
3760           case 2:
3761             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3762             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3763             break;
3764           case 1:
3765             x[row--] = sum1 * (*ibdiag);
3766             break;
3767           }
3768         }
3769       }
3770       if (xb == b) {
3771         PetscCall(PetscLogFlops(2.0 * a->nz));
3772       } else {
3773         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3774       }
3775     }
3776   }
3777   if (flag & SOR_EISENSTAT) {
3778     /*
3779           Apply  (U + D)^-1  where D is now the block diagonal
3780     */
3781     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3782     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3783       nodesz = sizes[i + 1] - sizes[i];
3784       ibdiag -= nodesz * nodesz;
3785       sz  = ii[row + 1] - diag[row] - 1;
3786       v1  = a->a + diag[row] + 1;
3787       idx = a->j + diag[row] + 1;
3788       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3789       switch (nodesz) {
3790       case 1:
3791 
3792         sum1 = b[row];
3793         for (n = 0; n < sz - 1; n += 2) {
3794           i1 = idx[0];
3795           i2 = idx[1];
3796           idx += 2;
3797           tmp0 = x[i1];
3798           tmp1 = x[i2];
3799           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3800           v1 += 2;
3801         }
3802 
3803         if (n == sz - 1) {
3804           tmp0 = x[*idx];
3805           sum1 -= *v1 * tmp0;
3806         }
3807         x[row] = sum1 * (*ibdiag);
3808         row--;
3809         break;
3810 
3811       case 2:
3812 
3813         sum1 = b[row];
3814         sum2 = b[row - 1];
3815         /* note that sum1 is associated with the second of the two rows */
3816         v2 = a->a + diag[row - 1] + 2;
3817         for (n = 0; n < sz - 1; n += 2) {
3818           i1 = idx[0];
3819           i2 = idx[1];
3820           idx += 2;
3821           tmp0 = x[i1];
3822           tmp1 = x[i2];
3823           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3824           v1 += 2;
3825           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3826           v2 += 2;
3827         }
3828 
3829         if (n == sz - 1) {
3830           tmp0 = x[*idx];
3831           sum1 -= *v1 * tmp0;
3832           sum2 -= *v2 * tmp0;
3833         }
3834         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3835         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3836         row -= 2;
3837         break;
3838       case 3:
3839 
3840         sum1 = b[row];
3841         sum2 = b[row - 1];
3842         sum3 = b[row - 2];
3843         v2   = a->a + diag[row - 1] + 2;
3844         v3   = a->a + diag[row - 2] + 3;
3845         for (n = 0; n < sz - 1; n += 2) {
3846           i1 = idx[0];
3847           i2 = idx[1];
3848           idx += 2;
3849           tmp0 = x[i1];
3850           tmp1 = x[i2];
3851           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3852           v1 += 2;
3853           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3854           v2 += 2;
3855           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3856           v3 += 2;
3857         }
3858 
3859         if (n == sz - 1) {
3860           tmp0 = x[*idx];
3861           sum1 -= *v1 * tmp0;
3862           sum2 -= *v2 * tmp0;
3863           sum3 -= *v3 * tmp0;
3864         }
3865         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3866         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3867         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3868         row -= 3;
3869         break;
3870       case 4:
3871 
3872         sum1 = b[row];
3873         sum2 = b[row - 1];
3874         sum3 = b[row - 2];
3875         sum4 = b[row - 3];
3876         v2   = a->a + diag[row - 1] + 2;
3877         v3   = a->a + diag[row - 2] + 3;
3878         v4   = a->a + diag[row - 3] + 4;
3879         for (n = 0; n < sz - 1; n += 2) {
3880           i1 = idx[0];
3881           i2 = idx[1];
3882           idx += 2;
3883           tmp0 = x[i1];
3884           tmp1 = x[i2];
3885           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3886           v1 += 2;
3887           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3888           v2 += 2;
3889           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3890           v3 += 2;
3891           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3892           v4 += 2;
3893         }
3894 
3895         if (n == sz - 1) {
3896           tmp0 = x[*idx];
3897           sum1 -= *v1 * tmp0;
3898           sum2 -= *v2 * tmp0;
3899           sum3 -= *v3 * tmp0;
3900           sum4 -= *v4 * tmp0;
3901         }
3902         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3903         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3904         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3905         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3906         row -= 4;
3907         break;
3908       case 5:
3909 
3910         sum1 = b[row];
3911         sum2 = b[row - 1];
3912         sum3 = b[row - 2];
3913         sum4 = b[row - 3];
3914         sum5 = b[row - 4];
3915         v2   = a->a + diag[row - 1] + 2;
3916         v3   = a->a + diag[row - 2] + 3;
3917         v4   = a->a + diag[row - 3] + 4;
3918         v5   = a->a + diag[row - 4] + 5;
3919         for (n = 0; n < sz - 1; n += 2) {
3920           i1 = idx[0];
3921           i2 = idx[1];
3922           idx += 2;
3923           tmp0 = x[i1];
3924           tmp1 = x[i2];
3925           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3926           v1 += 2;
3927           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3928           v2 += 2;
3929           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3930           v3 += 2;
3931           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3932           v4 += 2;
3933           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3934           v5 += 2;
3935         }
3936 
3937         if (n == sz - 1) {
3938           tmp0 = x[*idx];
3939           sum1 -= *v1 * tmp0;
3940           sum2 -= *v2 * tmp0;
3941           sum3 -= *v3 * tmp0;
3942           sum4 -= *v4 * tmp0;
3943           sum5 -= *v5 * tmp0;
3944         }
3945         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3946         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3947         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3948         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3949         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3950         row -= 5;
3951         break;
3952       default:
3953         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3954       }
3955     }
3956     PetscCall(PetscLogFlops(a->nz));
3957 
3958     /*
3959            t = b - D x    where D is the block diagonal
3960     */
3961     cnt = 0;
3962     for (i = 0, row = 0; i < m; i++) {
3963       nodesz = sizes[i + 1] - sizes[i];
3964       switch (nodesz) {
3965       case 1:
3966         t[row] = b[row] - bdiag[cnt++] * x[row];
3967         row++;
3968         break;
3969       case 2:
3970         x1         = x[row];
3971         x2         = x[row + 1];
3972         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3973         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3974         t[row]     = b[row] - tmp1;
3975         t[row + 1] = b[row + 1] - tmp2;
3976         row += 2;
3977         cnt += 4;
3978         break;
3979       case 3:
3980         x1         = x[row];
3981         x2         = x[row + 1];
3982         x3         = x[row + 2];
3983         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3984         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3985         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3986         t[row]     = b[row] - tmp1;
3987         t[row + 1] = b[row + 1] - tmp2;
3988         t[row + 2] = b[row + 2] - tmp3;
3989         row += 3;
3990         cnt += 9;
3991         break;
3992       case 4:
3993         x1         = x[row];
3994         x2         = x[row + 1];
3995         x3         = x[row + 2];
3996         x4         = x[row + 3];
3997         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3998         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3999         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4000         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4001         t[row]     = b[row] - tmp1;
4002         t[row + 1] = b[row + 1] - tmp2;
4003         t[row + 2] = b[row + 2] - tmp3;
4004         t[row + 3] = b[row + 3] - tmp4;
4005         row += 4;
4006         cnt += 16;
4007         break;
4008       case 5:
4009         x1         = x[row];
4010         x2         = x[row + 1];
4011         x3         = x[row + 2];
4012         x4         = x[row + 3];
4013         x5         = x[row + 4];
4014         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4015         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4016         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4017         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4018         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4019         t[row]     = b[row] - tmp1;
4020         t[row + 1] = b[row + 1] - tmp2;
4021         t[row + 2] = b[row + 2] - tmp3;
4022         t[row + 3] = b[row + 3] - tmp4;
4023         t[row + 4] = b[row + 4] - tmp5;
4024         row += 5;
4025         cnt += 25;
4026         break;
4027       default:
4028         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4029       }
4030     }
4031     PetscCall(PetscLogFlops(m));
4032 
4033     /*
4034           Apply (L + D)^-1 where D is the block diagonal
4035     */
4036     for (i = 0, row = 0; i < m; i++) {
4037       nodesz = sizes[i + 1] - sizes[i];
4038       sz     = diag[row] - ii[row];
4039       v1     = a->a + ii[row];
4040       idx    = a->j + ii[row];
4041       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4042       switch (nodesz) {
4043       case 1:
4044 
4045         sum1 = t[row];
4046         for (n = 0; n < sz - 1; n += 2) {
4047           i1 = idx[0];
4048           i2 = idx[1];
4049           idx += 2;
4050           tmp0 = t[i1];
4051           tmp1 = t[i2];
4052           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4053           v1 += 2;
4054         }
4055 
4056         if (n == sz - 1) {
4057           tmp0 = t[*idx];
4058           sum1 -= *v1 * tmp0;
4059         }
4060         x[row] += t[row] = sum1 * (*ibdiag++);
4061         row++;
4062         break;
4063       case 2:
4064         v2   = a->a + ii[row + 1];
4065         sum1 = t[row];
4066         sum2 = t[row + 1];
4067         for (n = 0; n < sz - 1; n += 2) {
4068           i1 = idx[0];
4069           i2 = idx[1];
4070           idx += 2;
4071           tmp0 = t[i1];
4072           tmp1 = t[i2];
4073           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4074           v1 += 2;
4075           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4076           v2 += 2;
4077         }
4078 
4079         if (n == sz - 1) {
4080           tmp0 = t[*idx];
4081           sum1 -= v1[0] * tmp0;
4082           sum2 -= v2[0] * tmp0;
4083         }
4084         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4085         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4086         ibdiag += 4;
4087         row += 2;
4088         break;
4089       case 3:
4090         v2   = a->a + ii[row + 1];
4091         v3   = a->a + ii[row + 2];
4092         sum1 = t[row];
4093         sum2 = t[row + 1];
4094         sum3 = t[row + 2];
4095         for (n = 0; n < sz - 1; n += 2) {
4096           i1 = idx[0];
4097           i2 = idx[1];
4098           idx += 2;
4099           tmp0 = t[i1];
4100           tmp1 = t[i2];
4101           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4102           v1 += 2;
4103           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4104           v2 += 2;
4105           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4106           v3 += 2;
4107         }
4108 
4109         if (n == sz - 1) {
4110           tmp0 = t[*idx];
4111           sum1 -= v1[0] * tmp0;
4112           sum2 -= v2[0] * tmp0;
4113           sum3 -= v3[0] * tmp0;
4114         }
4115         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4116         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4117         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4118         ibdiag += 9;
4119         row += 3;
4120         break;
4121       case 4:
4122         v2   = a->a + ii[row + 1];
4123         v3   = a->a + ii[row + 2];
4124         v4   = a->a + ii[row + 3];
4125         sum1 = t[row];
4126         sum2 = t[row + 1];
4127         sum3 = t[row + 2];
4128         sum4 = t[row + 3];
4129         for (n = 0; n < sz - 1; n += 2) {
4130           i1 = idx[0];
4131           i2 = idx[1];
4132           idx += 2;
4133           tmp0 = t[i1];
4134           tmp1 = t[i2];
4135           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4136           v1 += 2;
4137           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4138           v2 += 2;
4139           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4140           v3 += 2;
4141           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4142           v4 += 2;
4143         }
4144 
4145         if (n == sz - 1) {
4146           tmp0 = t[*idx];
4147           sum1 -= v1[0] * tmp0;
4148           sum2 -= v2[0] * tmp0;
4149           sum3 -= v3[0] * tmp0;
4150           sum4 -= v4[0] * tmp0;
4151         }
4152         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4153         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4154         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4155         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4156         ibdiag += 16;
4157         row += 4;
4158         break;
4159       case 5:
4160         v2   = a->a + ii[row + 1];
4161         v3   = a->a + ii[row + 2];
4162         v4   = a->a + ii[row + 3];
4163         v5   = a->a + ii[row + 4];
4164         sum1 = t[row];
4165         sum2 = t[row + 1];
4166         sum3 = t[row + 2];
4167         sum4 = t[row + 3];
4168         sum5 = t[row + 4];
4169         for (n = 0; n < sz - 1; n += 2) {
4170           i1 = idx[0];
4171           i2 = idx[1];
4172           idx += 2;
4173           tmp0 = t[i1];
4174           tmp1 = t[i2];
4175           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4176           v1 += 2;
4177           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4178           v2 += 2;
4179           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4180           v3 += 2;
4181           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4182           v4 += 2;
4183           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4184           v5 += 2;
4185         }
4186 
4187         if (n == sz - 1) {
4188           tmp0 = t[*idx];
4189           sum1 -= v1[0] * tmp0;
4190           sum2 -= v2[0] * tmp0;
4191           sum3 -= v3[0] * tmp0;
4192           sum4 -= v4[0] * tmp0;
4193           sum5 -= v5[0] * tmp0;
4194         }
4195         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4196         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4197         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4198         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4199         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4200         ibdiag += 25;
4201         row += 5;
4202         break;
4203       default:
4204         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4205       }
4206     }
4207     PetscCall(PetscLogFlops(a->nz));
4208   }
4209   PetscCall(VecRestoreArray(xx, &x));
4210   PetscCall(VecRestoreArrayRead(bb, &b));
4211   PetscFunctionReturn(PETSC_SUCCESS);
4212 }
4213 
4214 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4215 {
4216   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4217   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4218   const MatScalar   *bdiag = a->inode.bdiag;
4219   const PetscScalar *b;
4220   PetscInt           m = a->inode.node_count, cnt = 0, i, row, nodesz;
4221   const PetscInt    *sizes = a->inode.size_csr;
4222 
4223   PetscFunctionBegin;
4224   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4225   PetscCall(VecGetArray(xx, &x));
4226   PetscCall(VecGetArrayRead(bb, &b));
4227   cnt = 0;
4228   for (i = 0, row = 0; i < m; i++) {
4229     nodesz = sizes[i + 1] - sizes[i];
4230     switch (nodesz) {
4231     case 1:
4232       x[row] = b[row] * bdiag[cnt++];
4233       row++;
4234       break;
4235     case 2:
4236       x1       = b[row];
4237       x2       = b[row + 1];
4238       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4239       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4240       x[row++] = tmp1;
4241       x[row++] = tmp2;
4242       cnt += 4;
4243       break;
4244     case 3:
4245       x1       = b[row];
4246       x2       = b[row + 1];
4247       x3       = b[row + 2];
4248       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4249       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4250       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4251       x[row++] = tmp1;
4252       x[row++] = tmp2;
4253       x[row++] = tmp3;
4254       cnt += 9;
4255       break;
4256     case 4:
4257       x1       = b[row];
4258       x2       = b[row + 1];
4259       x3       = b[row + 2];
4260       x4       = b[row + 3];
4261       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4262       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4263       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4264       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4265       x[row++] = tmp1;
4266       x[row++] = tmp2;
4267       x[row++] = tmp3;
4268       x[row++] = tmp4;
4269       cnt += 16;
4270       break;
4271     case 5:
4272       x1       = b[row];
4273       x2       = b[row + 1];
4274       x3       = b[row + 2];
4275       x4       = b[row + 3];
4276       x5       = b[row + 4];
4277       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4278       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4279       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4280       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4281       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4282       x[row++] = tmp1;
4283       x[row++] = tmp2;
4284       x[row++] = tmp3;
4285       x[row++] = tmp4;
4286       x[row++] = tmp5;
4287       cnt += 25;
4288       break;
4289     default:
4290       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
4291     }
4292   }
4293   PetscCall(PetscLogFlops(2.0 * cnt));
4294   PetscCall(VecRestoreArray(xx, &x));
4295   PetscCall(VecRestoreArrayRead(bb, &b));
4296   PetscFunctionReturn(PETSC_SUCCESS);
4297 }
4298 
4299 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4300 {
4301   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4302 
4303   PetscFunctionBegin;
4304   a->inode.node_count       = 0;
4305   a->inode.use              = PETSC_FALSE;
4306   a->inode.checked          = PETSC_FALSE;
4307   a->inode.mat_nonzerostate = -1;
4308   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4309   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4310   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4311   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4312   A->ops->coloringpatch     = NULL;
4313   A->ops->multdiagonalblock = NULL;
4314   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4315   PetscFunctionReturn(PETSC_SUCCESS);
4316 }
4317 
4318 /*
4319     samestructure indicates that the matrix has not changed its nonzero structure so we
4320     do not need to recompute the inodes
4321 */
4322 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4323 {
4324   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4325   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4326   PetscBool       flag;
4327   const PetscInt *idx, *idy, *ii;
4328 
4329   PetscFunctionBegin;
4330   if (!a->inode.use) {
4331     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4332     PetscCall(PetscFree(a->inode.size_csr));
4333     PetscFunctionReturn(PETSC_SUCCESS);
4334   }
4335   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4336 
4337   m = A->rmap->n;
4338   if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
4339   ns    = a->inode.size_csr;
4340   ns[0] = 0;
4341 
4342   i          = 0;
4343   node_count = 0;
4344   idx        = a->j;
4345   ii         = a->i;
4346   if (idx) {
4347     while (i < m) {            /* For each row */
4348       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4349       /* Limits the number of elements in a node to 'a->inode.limit' */
4350       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4351         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4352         if (nzy != nzx) break;
4353         idy += nzx; /* Same nonzero pattern */
4354         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4355         if (!flag) break;
4356       }
4357       ns[node_count + 1] = ns[node_count] + blk_size;
4358       node_count++;
4359       idx += blk_size * nzx;
4360       i = j;
4361     }
4362   }
4363   /* If not enough inodes found,, do not use inode version of the routines */
4364   if (!m || !idx || node_count > .8 * m) {
4365     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4366     PetscCall(PetscFree(a->inode.size_csr));
4367     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4368   } else {
4369     if (!A->factortype) {
4370       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4371       if (A->rmap->n == A->cmap->n) {
4372         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4373         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4374         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4375         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4376         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4377       }
4378     } else {
4379       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4380     }
4381     a->inode.node_count = node_count;
4382     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4383   }
4384   a->inode.checked          = PETSC_TRUE;
4385   a->inode.mat_nonzerostate = A->nonzerostate;
4386   PetscFunctionReturn(PETSC_SUCCESS);
4387 }
4388 
4389 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4390 {
4391   Mat         B = *C;
4392   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4393   PetscInt    m = A->rmap->n;
4394 
4395   PetscFunctionBegin;
4396   c->inode.use              = a->inode.use;
4397   c->inode.limit            = a->inode.limit;
4398   c->inode.max_limit        = a->inode.max_limit;
4399   c->inode.checked          = PETSC_FALSE;
4400   c->inode.size_csr         = NULL;
4401   c->inode.node_count       = 0;
4402   c->inode.ibdiagvalid      = PETSC_FALSE;
4403   c->inode.ibdiag           = NULL;
4404   c->inode.bdiag            = NULL;
4405   c->inode.mat_nonzerostate = -1;
4406   if (a->inode.use) {
4407     if (a->inode.checked && a->inode.size_csr) {
4408       PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4409       PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4410 
4411       c->inode.checked          = PETSC_TRUE;
4412       c->inode.node_count       = a->inode.node_count;
4413       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4414     }
4415     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4416     if (!B->factortype) {
4417       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4418       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4419       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4420       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4421       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4422       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4423     } else {
4424       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4425     }
4426   }
4427   PetscFunctionReturn(PETSC_SUCCESS);
4428 }
4429 
4430 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4431 {
4432   PetscInt        k;
4433   const PetscInt *vi;
4434 
4435   PetscFunctionBegin;
4436   vi = aj + ai[row];
4437   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4438   vi        = aj + adiag[row];
4439   cols[nzl] = vi[0];
4440   vi        = aj + adiag[row + 1] + 1;
4441   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4442   PetscFunctionReturn(PETSC_SUCCESS);
4443 }
4444 /*
4445    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4446    Modified from MatSeqAIJCheckInode().
4447 
4448    Input Parameters:
4449 .  Mat A - ILU or LU matrix factor
4450 
4451 */
4452 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4453 {
4454   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4455   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4456   PetscInt       *cols1, *cols2, *ns;
4457   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4458   PetscBool       flag;
4459 
4460   PetscFunctionBegin;
4461   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4462   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4463 
4464   m = A->rmap->n;
4465   if (a->inode.size_csr) ns = a->inode.size_csr;
4466   else PetscCall(PetscMalloc1(m + 1, &ns));
4467   ns[0] = 0;
4468 
4469   i          = 0;
4470   node_count = 0;
4471   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4472   while (i < m) {                       /* For each row */
4473     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4474     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4475     nzx  = nzl1 + nzu1 + 1;
4476     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4477 
4478     /* Limits the number of elements in a node to 'a->inode.limit' */
4479     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4480       nzl2 = ai[j + 1] - ai[j];
4481       nzu2 = adiag[j] - adiag[j + 1] - 1;
4482       nzy  = nzl2 + nzu2 + 1;
4483       if (nzy != nzx) break;
4484       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4485       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4486       if (!flag) break;
4487     }
4488     ns[node_count + 1] = ns[node_count] + blk_size;
4489     node_count++;
4490     i = j;
4491   }
4492   PetscCall(PetscFree2(cols1, cols2));
4493   /* If not enough inodes found,, do not use inode version of the routines */
4494   if (!m || node_count > .8 * m) {
4495     PetscCall(PetscFree(ns));
4496 
4497     a->inode.node_count = 0;
4498     a->inode.size_csr   = NULL;
4499     a->inode.use        = PETSC_FALSE;
4500 
4501     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4502   } else {
4503     A->ops->mult              = NULL;
4504     A->ops->sor               = NULL;
4505     A->ops->multadd           = NULL;
4506     A->ops->getrowij          = NULL;
4507     A->ops->restorerowij      = NULL;
4508     A->ops->getcolumnij       = NULL;
4509     A->ops->restorecolumnij   = NULL;
4510     A->ops->coloringpatch     = NULL;
4511     A->ops->multdiagonalblock = NULL;
4512     a->inode.node_count       = node_count;
4513     a->inode.size_csr         = ns;
4514     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4515   }
4516   a->inode.checked = PETSC_TRUE;
4517   PetscFunctionReturn(PETSC_SUCCESS);
4518 }
4519 
4520 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4521 {
4522   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4523 
4524   PetscFunctionBegin;
4525   a->inode.ibdiagvalid = PETSC_FALSE;
4526   PetscFunctionReturn(PETSC_SUCCESS);
4527 }
4528 
4529 /*
4530      This is really ugly. if inodes are used this replaces the
4531   permutations with ones that correspond to rows/cols of the matrix
4532   rather than inode blocks
4533 */
4534 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4535 {
4536   PetscFunctionBegin;
4537   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4538   PetscFunctionReturn(PETSC_SUCCESS);
4539 }
4540 
4541 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4542 {
4543   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4544   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4545   const PetscInt *ridx, *cidx;
4546   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
4547   PetscInt        nslim_col, *ns_col;
4548   IS              ris = *rperm, cis = *cperm;
4549 
4550   PetscFunctionBegin;
4551   if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS);       /* no inodes so return */
4552   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4553 
4554   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4555   PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4556   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4557 
4558   PetscCall(ISGetIndices(ris, &ridx));
4559   PetscCall(ISGetIndices(cis, &cidx));
4560 
4561   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4562   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
4563 
4564   /* Construct the permutations for rows*/
4565   for (i = 0, row = 0; i < nslim_row; ++i) {
4566     indx      = ridx[i];
4567     start_val = tns[indx];
4568     end_val   = tns[indx + 1];
4569     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4570   }
4571 
4572   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4573   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
4574 
4575   /* Construct permutations for columns */
4576   for (i = 0, col = 0; i < nslim_col; ++i) {
4577     indx      = cidx[i];
4578     start_val = tns[indx];
4579     end_val   = tns[indx + 1];
4580     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4581   }
4582 
4583   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4584   PetscCall(ISSetPermutation(*rperm));
4585   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4586   PetscCall(ISSetPermutation(*cperm));
4587 
4588   PetscCall(ISRestoreIndices(ris, &ridx));
4589   PetscCall(ISRestoreIndices(cis, &cidx));
4590 
4591   PetscCall(PetscFree(ns_col));
4592   PetscCall(PetscFree2(permr, permc));
4593   PetscCall(ISDestroy(&cis));
4594   PetscCall(ISDestroy(&ris));
4595   PetscCall(PetscFree(tns));
4596   PetscFunctionReturn(PETSC_SUCCESS);
4597 }
4598 
4599 /*@C
4600   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4601 
4602   Not Collective
4603 
4604   Input Parameter:
4605 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4606 
4607   Output Parameters:
4608 + node_count - no of inodes present in the matrix.
4609 . sizes      - an array of size `node_count`, with the sizes of each inode.
4610 - limit      - the max size used to generate the inodes.
4611 
4612   Level: advanced
4613 
4614   Note:
4615   It should be called after the matrix is assembled.
4616   The contents of the sizes[] array should not be changed.
4617   `NULL` may be passed for information not needed
4618 
4619 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4620 @*/
4621 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4622 {
4623   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4624 
4625   PetscFunctionBegin;
4626   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4627   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4628   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4629   PetscFunctionReturn(PETSC_SUCCESS);
4630 }
4631 
4632 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4633 {
4634   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4635 
4636   PetscFunctionBegin;
4637   if (node_count) *node_count = a->inode.node_count;
4638   if (sizes) *sizes = a->inode.size_csr;
4639   if (limit) *limit = a->inode.limit;
4640   PetscFunctionReturn(PETSC_SUCCESS);
4641 }
4642