xref: /petsc/src/mat/impls/aij/seq/inode.c (revision d5b43468fb8780a8feea140ccd6fa3e6a50411cc)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
9 {
10   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
11   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
12 
13   PetscFunctionBegin;
14   n = A->cmap->n;
15   m = A->rmap->n;
16   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
17   ns_row = a->inode.size;
18 
19   min_mn = (m < n) ? m : n;
20   if (!ns) {
21     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
22       ;
23     for (; count + 1 < n; count++, i++)
24       ;
25     if (count < n) i++;
26     *size = i;
27     PetscFunctionReturn(0);
28   }
29   PetscCall(PetscMalloc1(n + 1, &ns_col));
30 
31   /* Use the same row structure wherever feasible. */
32   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
33 
34   /* if m < n; pad up the remainder with inode_limit */
35   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
36   /* The last node is the odd ball. padd it up with the remaining rows; */
37   if (count < n) {
38     ns_col[i] = n - count;
39     i++;
40   } else if (count > n) {
41     /* Adjust for the over estimation */
42     ns_col[i - 1] += n - count;
43   }
44   *size = i;
45   *ns   = ns_col;
46   PetscFunctionReturn(0);
47 }
48 
49 /*
50       This builds symmetric version of nonzero structure,
51 */
52 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
53 {
54   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
55   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
56   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
57   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
58 
59   PetscFunctionBegin;
60   nslim_row = a->inode.node_count;
61   m         = A->rmap->n;
62   n         = A->cmap->n;
63   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
64   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
65 
66   /* Use the row_inode as column_inode */
67   nslim_col = nslim_row;
68   ns_col    = ns_row;
69 
70   /* allocate space for reformated inode structure */
71   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
72   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
73 
74   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
75     nsz = ns_col[i1];
76     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
77   }
78   /* allocate space for row pointers */
79   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
80   *iia = ia;
81   PetscCall(PetscMalloc1(nslim_row + 1, &work));
82 
83   /* determine the number of columns in each row */
84   ia[0] = oshift;
85   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
86     j    = aj + ai[row] + ishift;
87     jmax = aj + ai[row + 1] + ishift;
88     if (j == jmax) continue; /* empty row */
89     col = *j++ + ishift;
90     i2  = tvc[col];
91     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
92       ia[i1 + 1]++;
93       ia[i2 + 1]++;
94       i2++; /* Start col of next node */
95       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
96       i2 = tvc[col];
97     }
98     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
99   }
100 
101   /* shift ia[i] to point to next row */
102   for (i1 = 1; i1 < nslim_row + 1; i1++) {
103     row = ia[i1 - 1];
104     ia[i1] += row;
105     work[i1 - 1] = row - oshift;
106   }
107 
108   /* allocate space for column pointers */
109   nz = ia[nslim_row] + (!ishift);
110   PetscCall(PetscMalloc1(nz, &ja));
111   *jja = ja;
112 
113   /* loop over lower triangular part putting into ja */
114   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
115     j    = aj + ai[row] + ishift;
116     jmax = aj + ai[row + 1] + ishift;
117     if (j == jmax) continue; /* empty row */
118     col = *j++ + ishift;
119     i2  = tvc[col];
120     while (i2 < i1 && j < jmax) {
121       ja[work[i2]++] = i1 + oshift;
122       ja[work[i1]++] = i2 + oshift;
123       ++i2;
124       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
125       i2 = tvc[col];
126     }
127     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
128   }
129   PetscCall(PetscFree(work));
130   PetscCall(PetscFree2(tns, tvc));
131   PetscFunctionReturn(0);
132 }
133 
134 /*
135       This builds nonsymmetric version of nonzero structure,
136 */
137 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
138 {
139   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
140   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
141   PetscInt       *tns, *tvc, nsz, i1, i2;
142   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
143 
144   PetscFunctionBegin;
145   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
146   nslim_row = a->inode.node_count;
147   n         = A->cmap->n;
148 
149   /* Create The column_inode for this matrix */
150   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
151 
152   /* allocate space for reformated column_inode structure */
153   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
154   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
155 
156   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
157     nsz = ns_col[i1];
158     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
159   }
160   /* allocate space for row pointers */
161   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
162   *iia = ia;
163   PetscCall(PetscMalloc1(nslim_row + 1, &work));
164 
165   /* determine the number of columns in each row */
166   ia[0] = oshift;
167   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
168     j  = aj + ai[row] + ishift;
169     nz = ai[row + 1] - ai[row];
170     if (!nz) continue; /* empty row */
171     col = *j++ + ishift;
172     i2  = tvc[col];
173     while (nz-- > 0) { /* off-diagonal elements */
174       ia[i1 + 1]++;
175       i2++; /* Start col of next node */
176       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
177       if (nz > 0) i2 = tvc[col];
178     }
179   }
180 
181   /* shift ia[i] to point to next row */
182   for (i1 = 1; i1 < nslim_row + 1; i1++) {
183     row = ia[i1 - 1];
184     ia[i1] += row;
185     work[i1 - 1] = row - oshift;
186   }
187 
188   /* allocate space for column pointers */
189   nz = ia[nslim_row] + (!ishift);
190   PetscCall(PetscMalloc1(nz, &ja));
191   *jja = ja;
192 
193   /* loop over matrix putting into ja */
194   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
195     j  = aj + ai[row] + ishift;
196     nz = ai[row + 1] - ai[row];
197     if (!nz) continue; /* empty row */
198     col = *j++ + ishift;
199     i2  = tvc[col];
200     while (nz-- > 0) {
201       ja[work[i1]++] = i2 + oshift;
202       ++i2;
203       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
204       if (nz > 0) i2 = tvc[col];
205     }
206   }
207   PetscCall(PetscFree(ns_col));
208   PetscCall(PetscFree(work));
209   PetscCall(PetscFree2(tns, tvc));
210   PetscFunctionReturn(0);
211 }
212 
213 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
214 {
215   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
216 
217   PetscFunctionBegin;
218   if (n) *n = a->inode.node_count;
219   if (!ia) PetscFunctionReturn(0);
220   if (!blockcompressed) {
221     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
222   } else if (symmetric) {
223     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
224   } else {
225     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
226   }
227   PetscFunctionReturn(0);
228 }
229 
230 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
231 {
232   PetscFunctionBegin;
233   if (!ia) PetscFunctionReturn(0);
234 
235   if (!blockcompressed) {
236     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
237   } else {
238     PetscCall(PetscFree(*ia));
239     PetscCall(PetscFree(*ja));
240   }
241   PetscFunctionReturn(0);
242 }
243 
244 /* ----------------------------------------------------------- */
245 
246 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
247 {
248   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
249   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
250   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
251 
252   PetscFunctionBegin;
253   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
254   nslim_row = a->inode.node_count;
255   n         = A->cmap->n;
256 
257   /* Create The column_inode for this matrix */
258   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
259 
260   /* allocate space for reformated column_inode structure */
261   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
262   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
263 
264   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
265     nsz = ns_col[i1];
266     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
267   }
268   /* allocate space for column pointers */
269   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
270   *iia = ia;
271   PetscCall(PetscMalloc1(nslim_col + 1, &work));
272 
273   /* determine the number of columns in each row */
274   ia[0] = oshift;
275   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
276     j   = aj + ai[row] + ishift;
277     col = *j++ + ishift;
278     i2  = tvc[col];
279     nz  = ai[row + 1] - ai[row];
280     while (nz-- > 0) { /* off-diagonal elements */
281       /* ia[i1+1]++; */
282       ia[i2 + 1]++;
283       i2++;
284       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
285       if (nz > 0) i2 = tvc[col];
286     }
287   }
288 
289   /* shift ia[i] to point to next col */
290   for (i1 = 1; i1 < nslim_col + 1; i1++) {
291     col = ia[i1 - 1];
292     ia[i1] += col;
293     work[i1 - 1] = col - oshift;
294   }
295 
296   /* allocate space for column pointers */
297   nz = ia[nslim_col] + (!ishift);
298   PetscCall(PetscMalloc1(nz, &ja));
299   *jja = ja;
300 
301   /* loop over matrix putting into ja */
302   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
303     j   = aj + ai[row] + ishift;
304     col = *j++ + ishift;
305     i2  = tvc[col];
306     nz  = ai[row + 1] - ai[row];
307     while (nz-- > 0) {
308       /* ja[work[i1]++] = i2 + oshift; */
309       ja[work[i2]++] = i1 + oshift;
310       i2++;
311       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
312       if (nz > 0) i2 = tvc[col];
313     }
314   }
315   PetscCall(PetscFree(ns_col));
316   PetscCall(PetscFree(work));
317   PetscCall(PetscFree2(tns, tvc));
318   PetscFunctionReturn(0);
319 }
320 
321 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
322 {
323   PetscFunctionBegin;
324   PetscCall(MatCreateColInode_Private(A, n, NULL));
325   if (!ia) PetscFunctionReturn(0);
326 
327   if (!blockcompressed) {
328     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
329   } else if (symmetric) {
330     /* Since the indices are symmetric it doesn't matter */
331     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
332   } else {
333     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
334   }
335   PetscFunctionReturn(0);
336 }
337 
338 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
339 {
340   PetscFunctionBegin;
341   if (!ia) PetscFunctionReturn(0);
342   if (!blockcompressed) {
343     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
344   } else {
345     PetscCall(PetscFree(*ia));
346     PetscCall(PetscFree(*ja));
347   }
348   PetscFunctionReturn(0);
349 }
350 
351 /* ----------------------------------------------------------- */
352 
353 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
354 {
355   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
356   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
357   PetscScalar       *y;
358   const PetscScalar *x;
359   const MatScalar   *v1, *v2, *v3, *v4, *v5;
360   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
361   const PetscInt    *idx, *ns, *ii;
362 
363 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
364   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
365 #endif
366 
367   PetscFunctionBegin;
368   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
369   node_max = a->inode.node_count;
370   ns       = a->inode.size; /* Node Size array */
371   PetscCall(VecGetArrayRead(xx, &x));
372   PetscCall(VecGetArray(yy, &y));
373   idx = a->j;
374   v1  = a->a;
375   ii  = a->i;
376 
377   for (i = 0, row = 0; i < node_max; ++i) {
378     nsz = ns[i];
379     n   = ii[1] - ii[0];
380     nonzerorow += (n > 0) * nsz;
381     ii += nsz;
382     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
383     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
384     sz = n;                                                                /* No of non zeros in this row */
385                                                                            /* Switch on the size of Node */
386     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
387     case 1:
388       sum1 = 0.;
389 
390       for (n = 0; n < sz - 1; n += 2) {
391         i1 = idx[0]; /* The instructions are ordered to */
392         i2 = idx[1]; /* make the compiler's job easy */
393         idx += 2;
394         tmp0 = x[i1];
395         tmp1 = x[i2];
396         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
397         v1 += 2;
398       }
399 
400       if (n == sz - 1) { /* Take care of the last nonzero  */
401         tmp0 = x[*idx++];
402         sum1 += *v1++ * tmp0;
403       }
404       y[row++] = sum1;
405       break;
406     case 2:
407       sum1 = 0.;
408       sum2 = 0.;
409       v2   = v1 + n;
410 
411       for (n = 0; n < sz - 1; n += 2) {
412         i1 = idx[0];
413         i2 = idx[1];
414         idx += 2;
415         tmp0 = x[i1];
416         tmp1 = x[i2];
417         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
418         v1 += 2;
419         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
420         v2 += 2;
421       }
422       if (n == sz - 1) {
423         tmp0 = x[*idx++];
424         sum1 += *v1++ * tmp0;
425         sum2 += *v2++ * tmp0;
426       }
427       y[row++] = sum1;
428       y[row++] = sum2;
429       v1       = v2; /* Since the next block to be processed starts there*/
430       idx += sz;
431       break;
432     case 3:
433       sum1 = 0.;
434       sum2 = 0.;
435       sum3 = 0.;
436       v2   = v1 + n;
437       v3   = v2 + n;
438 
439       for (n = 0; n < sz - 1; n += 2) {
440         i1 = idx[0];
441         i2 = idx[1];
442         idx += 2;
443         tmp0 = x[i1];
444         tmp1 = x[i2];
445         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
446         v1 += 2;
447         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
448         v2 += 2;
449         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
450         v3 += 2;
451       }
452       if (n == sz - 1) {
453         tmp0 = x[*idx++];
454         sum1 += *v1++ * tmp0;
455         sum2 += *v2++ * tmp0;
456         sum3 += *v3++ * tmp0;
457       }
458       y[row++] = sum1;
459       y[row++] = sum2;
460       y[row++] = sum3;
461       v1       = v3; /* Since the next block to be processed starts there*/
462       idx += 2 * sz;
463       break;
464     case 4:
465       sum1 = 0.;
466       sum2 = 0.;
467       sum3 = 0.;
468       sum4 = 0.;
469       v2   = v1 + n;
470       v3   = v2 + n;
471       v4   = v3 + n;
472 
473       for (n = 0; n < sz - 1; n += 2) {
474         i1 = idx[0];
475         i2 = idx[1];
476         idx += 2;
477         tmp0 = x[i1];
478         tmp1 = x[i2];
479         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
480         v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
482         v2 += 2;
483         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
484         v3 += 2;
485         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
486         v4 += 2;
487       }
488       if (n == sz - 1) {
489         tmp0 = x[*idx++];
490         sum1 += *v1++ * tmp0;
491         sum2 += *v2++ * tmp0;
492         sum3 += *v3++ * tmp0;
493         sum4 += *v4++ * tmp0;
494       }
495       y[row++] = sum1;
496       y[row++] = sum2;
497       y[row++] = sum3;
498       y[row++] = sum4;
499       v1       = v4; /* Since the next block to be processed starts there*/
500       idx += 3 * sz;
501       break;
502     case 5:
503       sum1 = 0.;
504       sum2 = 0.;
505       sum3 = 0.;
506       sum4 = 0.;
507       sum5 = 0.;
508       v2   = v1 + n;
509       v3   = v2 + n;
510       v4   = v3 + n;
511       v5   = v4 + n;
512 
513       for (n = 0; n < sz - 1; n += 2) {
514         i1 = idx[0];
515         i2 = idx[1];
516         idx += 2;
517         tmp0 = x[i1];
518         tmp1 = x[i2];
519         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
520         v1 += 2;
521         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
522         v2 += 2;
523         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
524         v3 += 2;
525         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
526         v4 += 2;
527         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
528         v5 += 2;
529       }
530       if (n == sz - 1) {
531         tmp0 = x[*idx++];
532         sum1 += *v1++ * tmp0;
533         sum2 += *v2++ * tmp0;
534         sum3 += *v3++ * tmp0;
535         sum4 += *v4++ * tmp0;
536         sum5 += *v5++ * tmp0;
537       }
538       y[row++] = sum1;
539       y[row++] = sum2;
540       y[row++] = sum3;
541       y[row++] = sum4;
542       y[row++] = sum5;
543       v1       = v5; /* Since the next block to be processed starts there */
544       idx += 4 * sz;
545       break;
546     default:
547       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
548     }
549   }
550   PetscCall(VecRestoreArrayRead(xx, &x));
551   PetscCall(VecRestoreArray(yy, &y));
552   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
553   PetscFunctionReturn(0);
554 }
555 /* ----------------------------------------------------------- */
556 /* Almost same code as the MatMult_SeqAIJ_Inode() */
557 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
558 {
559   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
560   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
561   const MatScalar   *v1, *v2, *v3, *v4, *v5;
562   const PetscScalar *x;
563   PetscScalar       *y, *z, *zt;
564   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
565   const PetscInt    *idx, *ns, *ii;
566 
567   PetscFunctionBegin;
568   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
569   node_max = a->inode.node_count;
570   ns       = a->inode.size; /* Node Size array */
571 
572   PetscCall(VecGetArrayRead(xx, &x));
573   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
574   zt = z;
575 
576   idx = a->j;
577   v1  = a->a;
578   ii  = a->i;
579 
580   for (i = 0, row = 0; i < node_max; ++i) {
581     nsz = ns[i];
582     n   = ii[1] - ii[0];
583     ii += nsz;
584     sz = n;        /* No of non zeros in this row */
585                    /* Switch on the size of Node */
586     switch (nsz) { /* Each loop in 'case' is unrolled */
587     case 1:
588       sum1 = *zt++;
589 
590       for (n = 0; n < sz - 1; n += 2) {
591         i1 = idx[0]; /* The instructions are ordered to */
592         i2 = idx[1]; /* make the compiler's job easy */
593         idx += 2;
594         tmp0 = x[i1];
595         tmp1 = x[i2];
596         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
597         v1 += 2;
598       }
599 
600       if (n == sz - 1) { /* Take care of the last nonzero  */
601         tmp0 = x[*idx++];
602         sum1 += *v1++ * tmp0;
603       }
604       y[row++] = sum1;
605       break;
606     case 2:
607       sum1 = *zt++;
608       sum2 = *zt++;
609       v2   = v1 + n;
610 
611       for (n = 0; n < sz - 1; n += 2) {
612         i1 = idx[0];
613         i2 = idx[1];
614         idx += 2;
615         tmp0 = x[i1];
616         tmp1 = x[i2];
617         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
618         v1 += 2;
619         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
620         v2 += 2;
621       }
622       if (n == sz - 1) {
623         tmp0 = x[*idx++];
624         sum1 += *v1++ * tmp0;
625         sum2 += *v2++ * tmp0;
626       }
627       y[row++] = sum1;
628       y[row++] = sum2;
629       v1       = v2; /* Since the next block to be processed starts there*/
630       idx += sz;
631       break;
632     case 3:
633       sum1 = *zt++;
634       sum2 = *zt++;
635       sum3 = *zt++;
636       v2   = v1 + n;
637       v3   = v2 + n;
638 
639       for (n = 0; n < sz - 1; n += 2) {
640         i1 = idx[0];
641         i2 = idx[1];
642         idx += 2;
643         tmp0 = x[i1];
644         tmp1 = x[i2];
645         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
646         v1 += 2;
647         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
648         v2 += 2;
649         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
650         v3 += 2;
651       }
652       if (n == sz - 1) {
653         tmp0 = x[*idx++];
654         sum1 += *v1++ * tmp0;
655         sum2 += *v2++ * tmp0;
656         sum3 += *v3++ * tmp0;
657       }
658       y[row++] = sum1;
659       y[row++] = sum2;
660       y[row++] = sum3;
661       v1       = v3; /* Since the next block to be processed starts there*/
662       idx += 2 * sz;
663       break;
664     case 4:
665       sum1 = *zt++;
666       sum2 = *zt++;
667       sum3 = *zt++;
668       sum4 = *zt++;
669       v2   = v1 + n;
670       v3   = v2 + n;
671       v4   = v3 + n;
672 
673       for (n = 0; n < sz - 1; n += 2) {
674         i1 = idx[0];
675         i2 = idx[1];
676         idx += 2;
677         tmp0 = x[i1];
678         tmp1 = x[i2];
679         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
680         v1 += 2;
681         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
682         v2 += 2;
683         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
684         v3 += 2;
685         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
686         v4 += 2;
687       }
688       if (n == sz - 1) {
689         tmp0 = x[*idx++];
690         sum1 += *v1++ * tmp0;
691         sum2 += *v2++ * tmp0;
692         sum3 += *v3++ * tmp0;
693         sum4 += *v4++ * tmp0;
694       }
695       y[row++] = sum1;
696       y[row++] = sum2;
697       y[row++] = sum3;
698       y[row++] = sum4;
699       v1       = v4; /* Since the next block to be processed starts there*/
700       idx += 3 * sz;
701       break;
702     case 5:
703       sum1 = *zt++;
704       sum2 = *zt++;
705       sum3 = *zt++;
706       sum4 = *zt++;
707       sum5 = *zt++;
708       v2   = v1 + n;
709       v3   = v2 + n;
710       v4   = v3 + n;
711       v5   = v4 + n;
712 
713       for (n = 0; n < sz - 1; n += 2) {
714         i1 = idx[0];
715         i2 = idx[1];
716         idx += 2;
717         tmp0 = x[i1];
718         tmp1 = x[i2];
719         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
720         v1 += 2;
721         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
722         v2 += 2;
723         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
724         v3 += 2;
725         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
726         v4 += 2;
727         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
728         v5 += 2;
729       }
730       if (n == sz - 1) {
731         tmp0 = x[*idx++];
732         sum1 += *v1++ * tmp0;
733         sum2 += *v2++ * tmp0;
734         sum3 += *v3++ * tmp0;
735         sum4 += *v4++ * tmp0;
736         sum5 += *v5++ * tmp0;
737       }
738       y[row++] = sum1;
739       y[row++] = sum2;
740       y[row++] = sum3;
741       y[row++] = sum4;
742       y[row++] = sum5;
743       v1       = v5; /* Since the next block to be processed starts there */
744       idx += 4 * sz;
745       break;
746     default:
747       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
748     }
749   }
750   PetscCall(VecRestoreArrayRead(xx, &x));
751   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
752   PetscCall(PetscLogFlops(2.0 * a->nz));
753   PetscFunctionReturn(0);
754 }
755 
756 /* ----------------------------------------------------------- */
757 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
758 {
759   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
760   IS                 iscol = a->col, isrow = a->row;
761   const PetscInt    *r, *c, *rout, *cout;
762   PetscInt           i, j, n = A->rmap->n, nz;
763   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
764   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
765   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
766   PetscScalar        sum1, sum2, sum3, sum4, sum5;
767   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
768   const PetscScalar *b;
769 
770   PetscFunctionBegin;
771   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
772   node_max = a->inode.node_count;
773   ns       = a->inode.size; /* Node Size array */
774 
775   PetscCall(VecGetArrayRead(bb, &b));
776   PetscCall(VecGetArrayWrite(xx, &x));
777   tmp = a->solve_work;
778 
779   PetscCall(ISGetIndices(isrow, &rout));
780   r = rout;
781   PetscCall(ISGetIndices(iscol, &cout));
782   c = cout + (n - 1);
783 
784   /* forward solve the lower triangular */
785   tmps = tmp;
786   aa   = a_a;
787   aj   = a_j;
788   ad   = a->diag;
789 
790   for (i = 0, row = 0; i < node_max; ++i) {
791     nsz = ns[i];
792     aii = ai[row];
793     v1  = aa + aii;
794     vi  = aj + aii;
795     nz  = ad[row] - aii;
796     if (i < node_max - 1) {
797       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
798       * but our indexing to determine it's size could. */
799       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
800       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
801       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
802       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
803     }
804 
805     switch (nsz) { /* Each loop in 'case' is unrolled */
806     case 1:
807       sum1 = b[*r++];
808       for (j = 0; j < nz - 1; j += 2) {
809         i0 = vi[0];
810         i1 = vi[1];
811         vi += 2;
812         tmp0 = tmps[i0];
813         tmp1 = tmps[i1];
814         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
815         v1 += 2;
816       }
817       if (j == nz - 1) {
818         tmp0 = tmps[*vi++];
819         sum1 -= *v1++ * tmp0;
820       }
821       tmp[row++] = sum1;
822       break;
823     case 2:
824       sum1 = b[*r++];
825       sum2 = b[*r++];
826       v2   = aa + ai[row + 1];
827 
828       for (j = 0; j < nz - 1; j += 2) {
829         i0 = vi[0];
830         i1 = vi[1];
831         vi += 2;
832         tmp0 = tmps[i0];
833         tmp1 = tmps[i1];
834         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
835         v1 += 2;
836         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
837         v2 += 2;
838       }
839       if (j == nz - 1) {
840         tmp0 = tmps[*vi++];
841         sum1 -= *v1++ * tmp0;
842         sum2 -= *v2++ * tmp0;
843       }
844       sum2 -= *v2++ * sum1;
845       tmp[row++] = sum1;
846       tmp[row++] = sum2;
847       break;
848     case 3:
849       sum1 = b[*r++];
850       sum2 = b[*r++];
851       sum3 = b[*r++];
852       v2   = aa + ai[row + 1];
853       v3   = aa + ai[row + 2];
854 
855       for (j = 0; j < nz - 1; j += 2) {
856         i0 = vi[0];
857         i1 = vi[1];
858         vi += 2;
859         tmp0 = tmps[i0];
860         tmp1 = tmps[i1];
861         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
862         v1 += 2;
863         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
864         v2 += 2;
865         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
866         v3 += 2;
867       }
868       if (j == nz - 1) {
869         tmp0 = tmps[*vi++];
870         sum1 -= *v1++ * tmp0;
871         sum2 -= *v2++ * tmp0;
872         sum3 -= *v3++ * tmp0;
873       }
874       sum2 -= *v2++ * sum1;
875       sum3 -= *v3++ * sum1;
876       sum3 -= *v3++ * sum2;
877 
878       tmp[row++] = sum1;
879       tmp[row++] = sum2;
880       tmp[row++] = sum3;
881       break;
882 
883     case 4:
884       sum1 = b[*r++];
885       sum2 = b[*r++];
886       sum3 = b[*r++];
887       sum4 = b[*r++];
888       v2   = aa + ai[row + 1];
889       v3   = aa + ai[row + 2];
890       v4   = aa + ai[row + 3];
891 
892       for (j = 0; j < nz - 1; j += 2) {
893         i0 = vi[0];
894         i1 = vi[1];
895         vi += 2;
896         tmp0 = tmps[i0];
897         tmp1 = tmps[i1];
898         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
899         v1 += 2;
900         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
901         v2 += 2;
902         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
903         v3 += 2;
904         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
905         v4 += 2;
906       }
907       if (j == nz - 1) {
908         tmp0 = tmps[*vi++];
909         sum1 -= *v1++ * tmp0;
910         sum2 -= *v2++ * tmp0;
911         sum3 -= *v3++ * tmp0;
912         sum4 -= *v4++ * tmp0;
913       }
914       sum2 -= *v2++ * sum1;
915       sum3 -= *v3++ * sum1;
916       sum4 -= *v4++ * sum1;
917       sum3 -= *v3++ * sum2;
918       sum4 -= *v4++ * sum2;
919       sum4 -= *v4++ * sum3;
920 
921       tmp[row++] = sum1;
922       tmp[row++] = sum2;
923       tmp[row++] = sum3;
924       tmp[row++] = sum4;
925       break;
926     case 5:
927       sum1 = b[*r++];
928       sum2 = b[*r++];
929       sum3 = b[*r++];
930       sum4 = b[*r++];
931       sum5 = b[*r++];
932       v2   = aa + ai[row + 1];
933       v3   = aa + ai[row + 2];
934       v4   = aa + ai[row + 3];
935       v5   = aa + ai[row + 4];
936 
937       for (j = 0; j < nz - 1; j += 2) {
938         i0 = vi[0];
939         i1 = vi[1];
940         vi += 2;
941         tmp0 = tmps[i0];
942         tmp1 = tmps[i1];
943         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
944         v1 += 2;
945         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
946         v2 += 2;
947         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
948         v3 += 2;
949         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
950         v4 += 2;
951         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
952         v5 += 2;
953       }
954       if (j == nz - 1) {
955         tmp0 = tmps[*vi++];
956         sum1 -= *v1++ * tmp0;
957         sum2 -= *v2++ * tmp0;
958         sum3 -= *v3++ * tmp0;
959         sum4 -= *v4++ * tmp0;
960         sum5 -= *v5++ * tmp0;
961       }
962 
963       sum2 -= *v2++ * sum1;
964       sum3 -= *v3++ * sum1;
965       sum4 -= *v4++ * sum1;
966       sum5 -= *v5++ * sum1;
967       sum3 -= *v3++ * sum2;
968       sum4 -= *v4++ * sum2;
969       sum5 -= *v5++ * sum2;
970       sum4 -= *v4++ * sum3;
971       sum5 -= *v5++ * sum3;
972       sum5 -= *v5++ * sum4;
973 
974       tmp[row++] = sum1;
975       tmp[row++] = sum2;
976       tmp[row++] = sum3;
977       tmp[row++] = sum4;
978       tmp[row++] = sum5;
979       break;
980     default:
981       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
982     }
983   }
984   /* backward solve the upper triangular */
985   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
986     nsz = ns[i];
987     aii = ai[row + 1] - 1;
988     v1  = aa + aii;
989     vi  = aj + aii;
990     nz  = aii - ad[row];
991     switch (nsz) { /* Each loop in 'case' is unrolled */
992     case 1:
993       sum1 = tmp[row];
994 
995       for (j = nz; j > 1; j -= 2) {
996         vi -= 2;
997         i0   = vi[2];
998         i1   = vi[1];
999         tmp0 = tmps[i0];
1000         tmp1 = tmps[i1];
1001         v1 -= 2;
1002         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1003       }
1004       if (j == 1) {
1005         tmp0 = tmps[*vi--];
1006         sum1 -= *v1-- * tmp0;
1007       }
1008       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1009       row--;
1010       break;
1011     case 2:
1012       sum1 = tmp[row];
1013       sum2 = tmp[row - 1];
1014       v2   = aa + ai[row] - 1;
1015       for (j = nz; j > 1; j -= 2) {
1016         vi -= 2;
1017         i0   = vi[2];
1018         i1   = vi[1];
1019         tmp0 = tmps[i0];
1020         tmp1 = tmps[i1];
1021         v1 -= 2;
1022         v2 -= 2;
1023         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1024         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1025       }
1026       if (j == 1) {
1027         tmp0 = tmps[*vi--];
1028         sum1 -= *v1-- * tmp0;
1029         sum2 -= *v2-- * tmp0;
1030       }
1031 
1032       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1033       row--;
1034       sum2 -= *v2-- * tmp0;
1035       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1036       row--;
1037       break;
1038     case 3:
1039       sum1 = tmp[row];
1040       sum2 = tmp[row - 1];
1041       sum3 = tmp[row - 2];
1042       v2   = aa + ai[row] - 1;
1043       v3   = aa + ai[row - 1] - 1;
1044       for (j = nz; j > 1; j -= 2) {
1045         vi -= 2;
1046         i0   = vi[2];
1047         i1   = vi[1];
1048         tmp0 = tmps[i0];
1049         tmp1 = tmps[i1];
1050         v1 -= 2;
1051         v2 -= 2;
1052         v3 -= 2;
1053         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1054         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1055         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1056       }
1057       if (j == 1) {
1058         tmp0 = tmps[*vi--];
1059         sum1 -= *v1-- * tmp0;
1060         sum2 -= *v2-- * tmp0;
1061         sum3 -= *v3-- * tmp0;
1062       }
1063       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1064       row--;
1065       sum2 -= *v2-- * tmp0;
1066       sum3 -= *v3-- * tmp0;
1067       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1068       row--;
1069       sum3 -= *v3-- * tmp0;
1070       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1071       row--;
1072 
1073       break;
1074     case 4:
1075       sum1 = tmp[row];
1076       sum2 = tmp[row - 1];
1077       sum3 = tmp[row - 2];
1078       sum4 = tmp[row - 3];
1079       v2   = aa + ai[row] - 1;
1080       v3   = aa + ai[row - 1] - 1;
1081       v4   = aa + ai[row - 2] - 1;
1082 
1083       for (j = nz; j > 1; j -= 2) {
1084         vi -= 2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1 -= 2;
1090         v2 -= 2;
1091         v3 -= 2;
1092         v4 -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j == 1) {
1099         tmp0 = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1107       row--;
1108       sum2 -= *v2-- * tmp0;
1109       sum3 -= *v3-- * tmp0;
1110       sum4 -= *v4-- * tmp0;
1111       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1112       row--;
1113       sum3 -= *v3-- * tmp0;
1114       sum4 -= *v4-- * tmp0;
1115       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1116       row--;
1117       sum4 -= *v4-- * tmp0;
1118       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1119       row--;
1120       break;
1121     case 5:
1122       sum1 = tmp[row];
1123       sum2 = tmp[row - 1];
1124       sum3 = tmp[row - 2];
1125       sum4 = tmp[row - 3];
1126       sum5 = tmp[row - 4];
1127       v2   = aa + ai[row] - 1;
1128       v3   = aa + ai[row - 1] - 1;
1129       v4   = aa + ai[row - 2] - 1;
1130       v5   = aa + ai[row - 3] - 1;
1131       for (j = nz; j > 1; j -= 2) {
1132         vi -= 2;
1133         i0   = vi[2];
1134         i1   = vi[1];
1135         tmp0 = tmps[i0];
1136         tmp1 = tmps[i1];
1137         v1 -= 2;
1138         v2 -= 2;
1139         v3 -= 2;
1140         v4 -= 2;
1141         v5 -= 2;
1142         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1143         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1144         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1145         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1146         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1147       }
1148       if (j == 1) {
1149         tmp0 = tmps[*vi--];
1150         sum1 -= *v1-- * tmp0;
1151         sum2 -= *v2-- * tmp0;
1152         sum3 -= *v3-- * tmp0;
1153         sum4 -= *v4-- * tmp0;
1154         sum5 -= *v5-- * tmp0;
1155       }
1156 
1157       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1158       row--;
1159       sum2 -= *v2-- * tmp0;
1160       sum3 -= *v3-- * tmp0;
1161       sum4 -= *v4-- * tmp0;
1162       sum5 -= *v5-- * tmp0;
1163       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1164       row--;
1165       sum3 -= *v3-- * tmp0;
1166       sum4 -= *v4-- * tmp0;
1167       sum5 -= *v5-- * tmp0;
1168       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1169       row--;
1170       sum4 -= *v4-- * tmp0;
1171       sum5 -= *v5-- * tmp0;
1172       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1173       row--;
1174       sum5 -= *v5-- * tmp0;
1175       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1176       row--;
1177       break;
1178     default:
1179       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1180     }
1181   }
1182   PetscCall(ISRestoreIndices(isrow, &rout));
1183   PetscCall(ISRestoreIndices(iscol, &cout));
1184   PetscCall(VecRestoreArrayRead(bb, &b));
1185   PetscCall(VecRestoreArrayWrite(xx, &x));
1186   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1187   PetscFunctionReturn(0);
1188 }
1189 
1190 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1191 {
1192   Mat              C = B;
1193   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1194   IS               isrow = b->row, isicol = b->icol;
1195   const PetscInt  *r, *ic, *ics;
1196   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1197   PetscInt         i, j, k, nz, nzL, row, *pj;
1198   const PetscInt  *ajtmp, *bjtmp;
1199   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1200   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1201   FactorShiftCtx   sctx;
1202   const PetscInt  *ddiag;
1203   PetscReal        rs;
1204   MatScalar        d;
1205   PetscInt         inod, nodesz, node_max, col;
1206   const PetscInt  *ns;
1207   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1208 
1209   PetscFunctionBegin;
1210   /* MatPivotSetUp(): initialize shift context sctx */
1211   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1212 
1213   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1214     ddiag          = a->diag;
1215     sctx.shift_top = info->zeropivot;
1216     for (i = 0; i < n; i++) {
1217       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1218       d  = (aa)[ddiag[i]];
1219       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1220       v  = aa + ai[i];
1221       nz = ai[i + 1] - ai[i];
1222       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1223       if (rs > sctx.shift_top) sctx.shift_top = rs;
1224     }
1225     sctx.shift_top *= 1.1;
1226     sctx.nshift_max = 5;
1227     sctx.shift_lo   = 0.;
1228     sctx.shift_hi   = 1.;
1229   }
1230 
1231   PetscCall(ISGetIndices(isrow, &r));
1232   PetscCall(ISGetIndices(isicol, &ic));
1233 
1234   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1235   ics = ic;
1236 
1237   node_max = a->inode.node_count;
1238   ns       = a->inode.size;
1239   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1240 
1241   /* If max inode size > 4, split it into two inodes.*/
1242   /* also map the inode sizes according to the ordering */
1243   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1244   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1245     if (ns[i] > 4) {
1246       tmp_vec1[j] = 4;
1247       ++j;
1248       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1249     } else {
1250       tmp_vec1[j] = ns[i];
1251     }
1252   }
1253   /* Use the correct node_max */
1254   node_max = j;
1255 
1256   /* Now reorder the inode info based on mat re-ordering info */
1257   /* First create a row -> inode_size_array_index map */
1258   PetscCall(PetscMalloc1(n + 1, &nsmap));
1259   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1260   for (i = 0, row = 0; i < node_max; i++) {
1261     nodesz = tmp_vec1[i];
1262     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1263   }
1264   /* Using nsmap, create a reordered ns structure */
1265   for (i = 0, j = 0; i < node_max; i++) {
1266     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1267     tmp_vec2[i] = nodesz;
1268     j += nodesz;
1269   }
1270   PetscCall(PetscFree(nsmap));
1271   PetscCall(PetscFree(tmp_vec1));
1272 
1273   /* Now use the correct ns */
1274   ns = tmp_vec2;
1275 
1276   do {
1277     sctx.newshift = PETSC_FALSE;
1278     /* Now loop over each block-row, and do the factorization */
1279     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1280       nodesz = ns[inod];
1281 
1282       switch (nodesz) {
1283       case 1:
1284         /*----------*/
1285         /* zero rtmp1 */
1286         /* L part */
1287         nz    = bi[i + 1] - bi[i];
1288         bjtmp = bj + bi[i];
1289         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1290 
1291         /* U part */
1292         nz    = bdiag[i] - bdiag[i + 1];
1293         bjtmp = bj + bdiag[i + 1] + 1;
1294         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1295 
1296         /* load in initial (unfactored row) */
1297         nz    = ai[r[i] + 1] - ai[r[i]];
1298         ajtmp = aj + ai[r[i]];
1299         v     = aa + ai[r[i]];
1300         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1301 
1302         /* ZeropivotApply() */
1303         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1304 
1305         /* elimination */
1306         bjtmp = bj + bi[i];
1307         row   = *bjtmp++;
1308         nzL   = bi[i + 1] - bi[i];
1309         for (k = 0; k < nzL; k++) {
1310           pc = rtmp1 + row;
1311           if (*pc != 0.0) {
1312             pv   = b->a + bdiag[row];
1313             mul1 = *pc * (*pv);
1314             *pc  = mul1;
1315             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1316             pv   = b->a + bdiag[row + 1] + 1;
1317             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1318             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1319             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1320           }
1321           row = *bjtmp++;
1322         }
1323 
1324         /* finished row so stick it into b->a */
1325         rs = 0.0;
1326         /* L part */
1327         pv = b->a + bi[i];
1328         pj = b->j + bi[i];
1329         nz = bi[i + 1] - bi[i];
1330         for (j = 0; j < nz; j++) {
1331           pv[j] = rtmp1[pj[j]];
1332           rs += PetscAbsScalar(pv[j]);
1333         }
1334 
1335         /* U part */
1336         pv = b->a + bdiag[i + 1] + 1;
1337         pj = b->j + bdiag[i + 1] + 1;
1338         nz = bdiag[i] - bdiag[i + 1] - 1;
1339         for (j = 0; j < nz; j++) {
1340           pv[j] = rtmp1[pj[j]];
1341           rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* Check zero pivot */
1345         sctx.rs = rs;
1346         sctx.pv = rtmp1[i];
1347         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1348         if (sctx.newshift) break;
1349 
1350         /* Mark diagonal and invert diagonal for simpler triangular solves */
1351         pv  = b->a + bdiag[i];
1352         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1353         break;
1354 
1355       case 2:
1356         /*----------*/
1357         /* zero rtmp1 and rtmp2 */
1358         /* L part */
1359         nz    = bi[i + 1] - bi[i];
1360         bjtmp = bj + bi[i];
1361         for (j = 0; j < nz; j++) {
1362           col        = bjtmp[j];
1363           rtmp1[col] = 0.0;
1364           rtmp2[col] = 0.0;
1365         }
1366 
1367         /* U part */
1368         nz    = bdiag[i] - bdiag[i + 1];
1369         bjtmp = bj + bdiag[i + 1] + 1;
1370         for (j = 0; j < nz; j++) {
1371           col        = bjtmp[j];
1372           rtmp1[col] = 0.0;
1373           rtmp2[col] = 0.0;
1374         }
1375 
1376         /* load in initial (unfactored row) */
1377         nz    = ai[r[i] + 1] - ai[r[i]];
1378         ajtmp = aj + ai[r[i]];
1379         v1    = aa + ai[r[i]];
1380         v2    = aa + ai[r[i] + 1];
1381         for (j = 0; j < nz; j++) {
1382           col        = ics[ajtmp[j]];
1383           rtmp1[col] = v1[j];
1384           rtmp2[col] = v2[j];
1385         }
1386         /* ZeropivotApply(): shift the diagonal of the matrix  */
1387         rtmp1[i] += sctx.shift_amount;
1388         rtmp2[i + 1] += sctx.shift_amount;
1389 
1390         /* elimination */
1391         bjtmp = bj + bi[i];
1392         row   = *bjtmp++; /* pivot row */
1393         nzL   = bi[i + 1] - bi[i];
1394         for (k = 0; k < nzL; k++) {
1395           pc1 = rtmp1 + row;
1396           pc2 = rtmp2 + row;
1397           if (*pc1 != 0.0 || *pc2 != 0.0) {
1398             pv   = b->a + bdiag[row];
1399             mul1 = *pc1 * (*pv);
1400             mul2 = *pc2 * (*pv);
1401             *pc1 = mul1;
1402             *pc2 = mul2;
1403 
1404             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1405             pv = b->a + bdiag[row + 1] + 1;
1406             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1407             for (j = 0; j < nz; j++) {
1408               col = pj[j];
1409               rtmp1[col] -= mul1 * pv[j];
1410               rtmp2[col] -= mul2 * pv[j];
1411             }
1412             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1413           }
1414           row = *bjtmp++;
1415         }
1416 
1417         /* finished row i; check zero pivot, then stick row i into b->a */
1418         rs = 0.0;
1419         /* L part */
1420         pc1 = b->a + bi[i];
1421         pj  = b->j + bi[i];
1422         nz  = bi[i + 1] - bi[i];
1423         for (j = 0; j < nz; j++) {
1424           col    = pj[j];
1425           pc1[j] = rtmp1[col];
1426           rs += PetscAbsScalar(pc1[j]);
1427         }
1428         /* U part */
1429         pc1 = b->a + bdiag[i + 1] + 1;
1430         pj  = b->j + bdiag[i + 1] + 1;
1431         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1432         for (j = 0; j < nz; j++) {
1433           col    = pj[j];
1434           pc1[j] = rtmp1[col];
1435           rs += PetscAbsScalar(pc1[j]);
1436         }
1437 
1438         sctx.rs = rs;
1439         sctx.pv = rtmp1[i];
1440         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1441         if (sctx.newshift) break;
1442         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1443         *pc1 = 1.0 / sctx.pv;
1444 
1445         /* Now take care of diagonal 2x2 block. */
1446         pc2 = rtmp2 + i;
1447         if (*pc2 != 0.0) {
1448           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1449           *pc2 = mul1;                        /* insert L entry */
1450           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1451           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1452           for (j = 0; j < nz; j++) {
1453             col = pj[j];
1454             rtmp2[col] -= mul1 * rtmp1[col];
1455           }
1456           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1457         }
1458 
1459         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1460         rs = 0.0;
1461         /* L part */
1462         pc2 = b->a + bi[i + 1];
1463         pj  = b->j + bi[i + 1];
1464         nz  = bi[i + 2] - bi[i + 1];
1465         for (j = 0; j < nz; j++) {
1466           col    = pj[j];
1467           pc2[j] = rtmp2[col];
1468           rs += PetscAbsScalar(pc2[j]);
1469         }
1470         /* U part */
1471         pc2 = b->a + bdiag[i + 2] + 1;
1472         pj  = b->j + bdiag[i + 2] + 1;
1473         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1474         for (j = 0; j < nz; j++) {
1475           col    = pj[j];
1476           pc2[j] = rtmp2[col];
1477           rs += PetscAbsScalar(pc2[j]);
1478         }
1479 
1480         sctx.rs = rs;
1481         sctx.pv = rtmp2[i + 1];
1482         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1483         if (sctx.newshift) break;
1484         pc2  = b->a + bdiag[i + 1];
1485         *pc2 = 1.0 / sctx.pv;
1486         break;
1487 
1488       case 3:
1489         /*----------*/
1490         /* zero rtmp */
1491         /* L part */
1492         nz    = bi[i + 1] - bi[i];
1493         bjtmp = bj + bi[i];
1494         for (j = 0; j < nz; j++) {
1495           col        = bjtmp[j];
1496           rtmp1[col] = 0.0;
1497           rtmp2[col] = 0.0;
1498           rtmp3[col] = 0.0;
1499         }
1500 
1501         /* U part */
1502         nz    = bdiag[i] - bdiag[i + 1];
1503         bjtmp = bj + bdiag[i + 1] + 1;
1504         for (j = 0; j < nz; j++) {
1505           col        = bjtmp[j];
1506           rtmp1[col] = 0.0;
1507           rtmp2[col] = 0.0;
1508           rtmp3[col] = 0.0;
1509         }
1510 
1511         /* load in initial (unfactored row) */
1512         nz    = ai[r[i] + 1] - ai[r[i]];
1513         ajtmp = aj + ai[r[i]];
1514         v1    = aa + ai[r[i]];
1515         v2    = aa + ai[r[i] + 1];
1516         v3    = aa + ai[r[i] + 2];
1517         for (j = 0; j < nz; j++) {
1518           col        = ics[ajtmp[j]];
1519           rtmp1[col] = v1[j];
1520           rtmp2[col] = v2[j];
1521           rtmp3[col] = v3[j];
1522         }
1523         /* ZeropivotApply(): shift the diagonal of the matrix  */
1524         rtmp1[i] += sctx.shift_amount;
1525         rtmp2[i + 1] += sctx.shift_amount;
1526         rtmp3[i + 2] += sctx.shift_amount;
1527 
1528         /* elimination */
1529         bjtmp = bj + bi[i];
1530         row   = *bjtmp++; /* pivot row */
1531         nzL   = bi[i + 1] - bi[i];
1532         for (k = 0; k < nzL; k++) {
1533           pc1 = rtmp1 + row;
1534           pc2 = rtmp2 + row;
1535           pc3 = rtmp3 + row;
1536           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1537             pv   = b->a + bdiag[row];
1538             mul1 = *pc1 * (*pv);
1539             mul2 = *pc2 * (*pv);
1540             mul3 = *pc3 * (*pv);
1541             *pc1 = mul1;
1542             *pc2 = mul2;
1543             *pc3 = mul3;
1544 
1545             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1546             pv = b->a + bdiag[row + 1] + 1;
1547             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1548             for (j = 0; j < nz; j++) {
1549               col = pj[j];
1550               rtmp1[col] -= mul1 * pv[j];
1551               rtmp2[col] -= mul2 * pv[j];
1552               rtmp3[col] -= mul3 * pv[j];
1553             }
1554             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1555           }
1556           row = *bjtmp++;
1557         }
1558 
1559         /* finished row i; check zero pivot, then stick row i into b->a */
1560         rs = 0.0;
1561         /* L part */
1562         pc1 = b->a + bi[i];
1563         pj  = b->j + bi[i];
1564         nz  = bi[i + 1] - bi[i];
1565         for (j = 0; j < nz; j++) {
1566           col    = pj[j];
1567           pc1[j] = rtmp1[col];
1568           rs += PetscAbsScalar(pc1[j]);
1569         }
1570         /* U part */
1571         pc1 = b->a + bdiag[i + 1] + 1;
1572         pj  = b->j + bdiag[i + 1] + 1;
1573         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1574         for (j = 0; j < nz; j++) {
1575           col    = pj[j];
1576           pc1[j] = rtmp1[col];
1577           rs += PetscAbsScalar(pc1[j]);
1578         }
1579 
1580         sctx.rs = rs;
1581         sctx.pv = rtmp1[i];
1582         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1583         if (sctx.newshift) break;
1584         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1585         *pc1 = 1.0 / sctx.pv;
1586 
1587         /* Now take care of 1st column of diagonal 3x3 block. */
1588         pc2 = rtmp2 + i;
1589         pc3 = rtmp3 + i;
1590         if (*pc2 != 0.0 || *pc3 != 0.0) {
1591           mul2 = (*pc2) * (*pc1);
1592           *pc2 = mul2;
1593           mul3 = (*pc3) * (*pc1);
1594           *pc3 = mul3;
1595           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1596           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1597           for (j = 0; j < nz; j++) {
1598             col = pj[j];
1599             rtmp2[col] -= mul2 * rtmp1[col];
1600             rtmp3[col] -= mul3 * rtmp1[col];
1601           }
1602           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1603         }
1604 
1605         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1606         rs = 0.0;
1607         /* L part */
1608         pc2 = b->a + bi[i + 1];
1609         pj  = b->j + bi[i + 1];
1610         nz  = bi[i + 2] - bi[i + 1];
1611         for (j = 0; j < nz; j++) {
1612           col    = pj[j];
1613           pc2[j] = rtmp2[col];
1614           rs += PetscAbsScalar(pc2[j]);
1615         }
1616         /* U part */
1617         pc2 = b->a + bdiag[i + 2] + 1;
1618         pj  = b->j + bdiag[i + 2] + 1;
1619         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1620         for (j = 0; j < nz; j++) {
1621           col    = pj[j];
1622           pc2[j] = rtmp2[col];
1623           rs += PetscAbsScalar(pc2[j]);
1624         }
1625 
1626         sctx.rs = rs;
1627         sctx.pv = rtmp2[i + 1];
1628         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1629         if (sctx.newshift) break;
1630         pc2  = b->a + bdiag[i + 1];
1631         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1632 
1633         /* Now take care of 2nd column of diagonal 3x3 block. */
1634         pc3 = rtmp3 + i + 1;
1635         if (*pc3 != 0.0) {
1636           mul3 = (*pc3) * (*pc2);
1637           *pc3 = mul3;
1638           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1639           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1640           for (j = 0; j < nz; j++) {
1641             col = pj[j];
1642             rtmp3[col] -= mul3 * rtmp2[col];
1643           }
1644           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1645         }
1646 
1647         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1648         rs = 0.0;
1649         /* L part */
1650         pc3 = b->a + bi[i + 2];
1651         pj  = b->j + bi[i + 2];
1652         nz  = bi[i + 3] - bi[i + 2];
1653         for (j = 0; j < nz; j++) {
1654           col    = pj[j];
1655           pc3[j] = rtmp3[col];
1656           rs += PetscAbsScalar(pc3[j]);
1657         }
1658         /* U part */
1659         pc3 = b->a + bdiag[i + 3] + 1;
1660         pj  = b->j + bdiag[i + 3] + 1;
1661         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1662         for (j = 0; j < nz; j++) {
1663           col    = pj[j];
1664           pc3[j] = rtmp3[col];
1665           rs += PetscAbsScalar(pc3[j]);
1666         }
1667 
1668         sctx.rs = rs;
1669         sctx.pv = rtmp3[i + 2];
1670         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1671         if (sctx.newshift) break;
1672         pc3  = b->a + bdiag[i + 2];
1673         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1674         break;
1675       case 4:
1676         /*----------*/
1677         /* zero rtmp */
1678         /* L part */
1679         nz    = bi[i + 1] - bi[i];
1680         bjtmp = bj + bi[i];
1681         for (j = 0; j < nz; j++) {
1682           col        = bjtmp[j];
1683           rtmp1[col] = 0.0;
1684           rtmp2[col] = 0.0;
1685           rtmp3[col] = 0.0;
1686           rtmp4[col] = 0.0;
1687         }
1688 
1689         /* U part */
1690         nz    = bdiag[i] - bdiag[i + 1];
1691         bjtmp = bj + bdiag[i + 1] + 1;
1692         for (j = 0; j < nz; j++) {
1693           col        = bjtmp[j];
1694           rtmp1[col] = 0.0;
1695           rtmp2[col] = 0.0;
1696           rtmp3[col] = 0.0;
1697           rtmp4[col] = 0.0;
1698         }
1699 
1700         /* load in initial (unfactored row) */
1701         nz    = ai[r[i] + 1] - ai[r[i]];
1702         ajtmp = aj + ai[r[i]];
1703         v1    = aa + ai[r[i]];
1704         v2    = aa + ai[r[i] + 1];
1705         v3    = aa + ai[r[i] + 2];
1706         v4    = aa + ai[r[i] + 3];
1707         for (j = 0; j < nz; j++) {
1708           col        = ics[ajtmp[j]];
1709           rtmp1[col] = v1[j];
1710           rtmp2[col] = v2[j];
1711           rtmp3[col] = v3[j];
1712           rtmp4[col] = v4[j];
1713         }
1714         /* ZeropivotApply(): shift the diagonal of the matrix  */
1715         rtmp1[i] += sctx.shift_amount;
1716         rtmp2[i + 1] += sctx.shift_amount;
1717         rtmp3[i + 2] += sctx.shift_amount;
1718         rtmp4[i + 3] += sctx.shift_amount;
1719 
1720         /* elimination */
1721         bjtmp = bj + bi[i];
1722         row   = *bjtmp++; /* pivot row */
1723         nzL   = bi[i + 1] - bi[i];
1724         for (k = 0; k < nzL; k++) {
1725           pc1 = rtmp1 + row;
1726           pc2 = rtmp2 + row;
1727           pc3 = rtmp3 + row;
1728           pc4 = rtmp4 + row;
1729           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1730             pv   = b->a + bdiag[row];
1731             mul1 = *pc1 * (*pv);
1732             mul2 = *pc2 * (*pv);
1733             mul3 = *pc3 * (*pv);
1734             mul4 = *pc4 * (*pv);
1735             *pc1 = mul1;
1736             *pc2 = mul2;
1737             *pc3 = mul3;
1738             *pc4 = mul4;
1739 
1740             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1741             pv = b->a + bdiag[row + 1] + 1;
1742             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1743             for (j = 0; j < nz; j++) {
1744               col = pj[j];
1745               rtmp1[col] -= mul1 * pv[j];
1746               rtmp2[col] -= mul2 * pv[j];
1747               rtmp3[col] -= mul3 * pv[j];
1748               rtmp4[col] -= mul4 * pv[j];
1749             }
1750             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1751           }
1752           row = *bjtmp++;
1753         }
1754 
1755         /* finished row i; check zero pivot, then stick row i into b->a */
1756         rs = 0.0;
1757         /* L part */
1758         pc1 = b->a + bi[i];
1759         pj  = b->j + bi[i];
1760         nz  = bi[i + 1] - bi[i];
1761         for (j = 0; j < nz; j++) {
1762           col    = pj[j];
1763           pc1[j] = rtmp1[col];
1764           rs += PetscAbsScalar(pc1[j]);
1765         }
1766         /* U part */
1767         pc1 = b->a + bdiag[i + 1] + 1;
1768         pj  = b->j + bdiag[i + 1] + 1;
1769         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1770         for (j = 0; j < nz; j++) {
1771           col    = pj[j];
1772           pc1[j] = rtmp1[col];
1773           rs += PetscAbsScalar(pc1[j]);
1774         }
1775 
1776         sctx.rs = rs;
1777         sctx.pv = rtmp1[i];
1778         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1779         if (sctx.newshift) break;
1780         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1781         *pc1 = 1.0 / sctx.pv;
1782 
1783         /* Now take care of 1st column of diagonal 4x4 block. */
1784         pc2 = rtmp2 + i;
1785         pc3 = rtmp3 + i;
1786         pc4 = rtmp4 + i;
1787         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1788           mul2 = (*pc2) * (*pc1);
1789           *pc2 = mul2;
1790           mul3 = (*pc3) * (*pc1);
1791           *pc3 = mul3;
1792           mul4 = (*pc4) * (*pc1);
1793           *pc4 = mul4;
1794           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1795           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1796           for (j = 0; j < nz; j++) {
1797             col = pj[j];
1798             rtmp2[col] -= mul2 * rtmp1[col];
1799             rtmp3[col] -= mul3 * rtmp1[col];
1800             rtmp4[col] -= mul4 * rtmp1[col];
1801           }
1802           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1803         }
1804 
1805         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1806         rs = 0.0;
1807         /* L part */
1808         pc2 = b->a + bi[i + 1];
1809         pj  = b->j + bi[i + 1];
1810         nz  = bi[i + 2] - bi[i + 1];
1811         for (j = 0; j < nz; j++) {
1812           col    = pj[j];
1813           pc2[j] = rtmp2[col];
1814           rs += PetscAbsScalar(pc2[j]);
1815         }
1816         /* U part */
1817         pc2 = b->a + bdiag[i + 2] + 1;
1818         pj  = b->j + bdiag[i + 2] + 1;
1819         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1820         for (j = 0; j < nz; j++) {
1821           col    = pj[j];
1822           pc2[j] = rtmp2[col];
1823           rs += PetscAbsScalar(pc2[j]);
1824         }
1825 
1826         sctx.rs = rs;
1827         sctx.pv = rtmp2[i + 1];
1828         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1829         if (sctx.newshift) break;
1830         pc2  = b->a + bdiag[i + 1];
1831         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1832 
1833         /* Now take care of 2nd column of diagonal 4x4 block. */
1834         pc3 = rtmp3 + i + 1;
1835         pc4 = rtmp4 + i + 1;
1836         if (*pc3 != 0.0 || *pc4 != 0.0) {
1837           mul3 = (*pc3) * (*pc2);
1838           *pc3 = mul3;
1839           mul4 = (*pc4) * (*pc2);
1840           *pc4 = mul4;
1841           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1842           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1843           for (j = 0; j < nz; j++) {
1844             col = pj[j];
1845             rtmp3[col] -= mul3 * rtmp2[col];
1846             rtmp4[col] -= mul4 * rtmp2[col];
1847           }
1848           PetscCall(PetscLogFlops(4.0 * nz));
1849         }
1850 
1851         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1852         rs = 0.0;
1853         /* L part */
1854         pc3 = b->a + bi[i + 2];
1855         pj  = b->j + bi[i + 2];
1856         nz  = bi[i + 3] - bi[i + 2];
1857         for (j = 0; j < nz; j++) {
1858           col    = pj[j];
1859           pc3[j] = rtmp3[col];
1860           rs += PetscAbsScalar(pc3[j]);
1861         }
1862         /* U part */
1863         pc3 = b->a + bdiag[i + 3] + 1;
1864         pj  = b->j + bdiag[i + 3] + 1;
1865         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1866         for (j = 0; j < nz; j++) {
1867           col    = pj[j];
1868           pc3[j] = rtmp3[col];
1869           rs += PetscAbsScalar(pc3[j]);
1870         }
1871 
1872         sctx.rs = rs;
1873         sctx.pv = rtmp3[i + 2];
1874         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1875         if (sctx.newshift) break;
1876         pc3  = b->a + bdiag[i + 2];
1877         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1878 
1879         /* Now take care of 3rd column of diagonal 4x4 block. */
1880         pc4 = rtmp4 + i + 2;
1881         if (*pc4 != 0.0) {
1882           mul4 = (*pc4) * (*pc3);
1883           *pc4 = mul4;
1884           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1885           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1886           for (j = 0; j < nz; j++) {
1887             col = pj[j];
1888             rtmp4[col] -= mul4 * rtmp3[col];
1889           }
1890           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1891         }
1892 
1893         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1894         rs = 0.0;
1895         /* L part */
1896         pc4 = b->a + bi[i + 3];
1897         pj  = b->j + bi[i + 3];
1898         nz  = bi[i + 4] - bi[i + 3];
1899         for (j = 0; j < nz; j++) {
1900           col    = pj[j];
1901           pc4[j] = rtmp4[col];
1902           rs += PetscAbsScalar(pc4[j]);
1903         }
1904         /* U part */
1905         pc4 = b->a + bdiag[i + 4] + 1;
1906         pj  = b->j + bdiag[i + 4] + 1;
1907         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1908         for (j = 0; j < nz; j++) {
1909           col    = pj[j];
1910           pc4[j] = rtmp4[col];
1911           rs += PetscAbsScalar(pc4[j]);
1912         }
1913 
1914         sctx.rs = rs;
1915         sctx.pv = rtmp4[i + 3];
1916         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1917         if (sctx.newshift) break;
1918         pc4  = b->a + bdiag[i + 3];
1919         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1920         break;
1921 
1922       default:
1923         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1924       }
1925       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1926       i += nodesz;              /* Update the row */
1927     }
1928 
1929     /* MatPivotRefine() */
1930     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1931       /*
1932        * if no shift in this attempt & shifting & started shifting & can refine,
1933        * then try lower shift
1934        */
1935       sctx.shift_hi       = sctx.shift_fraction;
1936       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1937       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1938       sctx.newshift       = PETSC_TRUE;
1939       sctx.nshift++;
1940     }
1941   } while (sctx.newshift);
1942 
1943   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1944   PetscCall(PetscFree(tmp_vec2));
1945   PetscCall(ISRestoreIndices(isicol, &ic));
1946   PetscCall(ISRestoreIndices(isrow, &r));
1947 
1948   if (b->inode.size) {
1949     C->ops->solve = MatSolve_SeqAIJ_Inode;
1950   } else {
1951     C->ops->solve = MatSolve_SeqAIJ;
1952   }
1953   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1954   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1955   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1956   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1957   C->assembled              = PETSC_TRUE;
1958   C->preallocated           = PETSC_TRUE;
1959 
1960   PetscCall(PetscLogFlops(C->cmap->n));
1961 
1962   /* MatShiftView(A,info,&sctx) */
1963   if (sctx.nshift) {
1964     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1965       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1966     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1967       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1968     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1969       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1970     }
1971   }
1972   PetscFunctionReturn(0);
1973 }
1974 
1975 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1976 {
1977   Mat              C = B;
1978   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1979   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1980   const PetscInt  *r, *ic, *c, *ics;
1981   PetscInt         n = A->rmap->n, *bi = b->i;
1982   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1983   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1984   PetscInt        *ai = a->i, *aj = a->j;
1985   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1986   PetscScalar      mul1, mul2, mul3, tmp;
1987   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1988   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1989   PetscReal        rs = 0.0;
1990   FactorShiftCtx   sctx;
1991 
1992   PetscFunctionBegin;
1993   sctx.shift_top      = 0;
1994   sctx.nshift_max     = 0;
1995   sctx.shift_lo       = 0;
1996   sctx.shift_hi       = 0;
1997   sctx.shift_fraction = 0;
1998 
1999   /* if both shift schemes are chosen by user, only use info->shiftpd */
2000   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2001     sctx.shift_top = 0;
2002     for (i = 0; i < n; i++) {
2003       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2004       rs    = 0.0;
2005       ajtmp = aj + ai[i];
2006       rtmp1 = aa + ai[i];
2007       nz    = ai[i + 1] - ai[i];
2008       for (j = 0; j < nz; j++) {
2009         if (*ajtmp != i) {
2010           rs += PetscAbsScalar(*rtmp1++);
2011         } else {
2012           rs -= PetscRealPart(*rtmp1++);
2013         }
2014         ajtmp++;
2015       }
2016       if (rs > sctx.shift_top) sctx.shift_top = rs;
2017     }
2018     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2019     sctx.shift_top *= 1.1;
2020     sctx.nshift_max = 5;
2021     sctx.shift_lo   = 0.;
2022     sctx.shift_hi   = 1.;
2023   }
2024   sctx.shift_amount = 0;
2025   sctx.nshift       = 0;
2026 
2027   PetscCall(ISGetIndices(isrow, &r));
2028   PetscCall(ISGetIndices(iscol, &c));
2029   PetscCall(ISGetIndices(isicol, &ic));
2030   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2031   ics = ic;
2032 
2033   node_max = a->inode.node_count;
2034   ns       = a->inode.size;
2035   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2036 
2037   /* If max inode size > 3, split it into two inodes.*/
2038   /* also map the inode sizes according to the ordering */
2039   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2040   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2041     if (ns[i] > 3) {
2042       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2043       ++j;
2044       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2045     } else {
2046       tmp_vec1[j] = ns[i];
2047     }
2048   }
2049   /* Use the correct node_max */
2050   node_max = j;
2051 
2052   /* Now reorder the inode info based on mat re-ordering info */
2053   /* First create a row -> inode_size_array_index map */
2054   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2055   for (i = 0, row = 0; i < node_max; i++) {
2056     nodesz = tmp_vec1[i];
2057     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2058   }
2059   /* Using nsmap, create a reordered ns structure */
2060   for (i = 0, j = 0; i < node_max; i++) {
2061     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2062     tmp_vec2[i] = nodesz;
2063     j += nodesz;
2064   }
2065   PetscCall(PetscFree2(nsmap, tmp_vec1));
2066   /* Now use the correct ns */
2067   ns = tmp_vec2;
2068 
2069   do {
2070     sctx.newshift = PETSC_FALSE;
2071     /* Now loop over each block-row, and do the factorization */
2072     for (i = 0, row = 0; i < node_max; i++) {
2073       nodesz = ns[i];
2074       nz     = bi[row + 1] - bi[row];
2075       bjtmp  = bj + bi[row];
2076 
2077       switch (nodesz) {
2078       case 1:
2079         for (j = 0; j < nz; j++) {
2080           idx         = bjtmp[j];
2081           rtmp11[idx] = 0.0;
2082         }
2083 
2084         /* load in initial (unfactored row) */
2085         idx    = r[row];
2086         nz_tmp = ai[idx + 1] - ai[idx];
2087         ajtmp  = aj + ai[idx];
2088         v1     = aa + ai[idx];
2089 
2090         for (j = 0; j < nz_tmp; j++) {
2091           idx         = ics[ajtmp[j]];
2092           rtmp11[idx] = v1[j];
2093         }
2094         rtmp11[ics[r[row]]] += sctx.shift_amount;
2095 
2096         prow = *bjtmp++;
2097         while (prow < row) {
2098           pc1 = rtmp11 + prow;
2099           if (*pc1 != 0.0) {
2100             pv     = ba + bd[prow];
2101             pj     = nbj + bd[prow];
2102             mul1   = *pc1 * *pv++;
2103             *pc1   = mul1;
2104             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2105             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2106             for (j = 0; j < nz_tmp; j++) {
2107               tmp = pv[j];
2108               idx = pj[j];
2109               rtmp11[idx] -= mul1 * tmp;
2110             }
2111           }
2112           prow = *bjtmp++;
2113         }
2114         pj  = bj + bi[row];
2115         pc1 = ba + bi[row];
2116 
2117         sctx.pv     = rtmp11[row];
2118         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2119         rs          = 0.0;
2120         for (j = 0; j < nz; j++) {
2121           idx    = pj[j];
2122           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2123           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2124         }
2125         sctx.rs = rs;
2126         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2127         if (sctx.newshift) goto endofwhile;
2128         break;
2129 
2130       case 2:
2131         for (j = 0; j < nz; j++) {
2132           idx         = bjtmp[j];
2133           rtmp11[idx] = 0.0;
2134           rtmp22[idx] = 0.0;
2135         }
2136 
2137         /* load in initial (unfactored row) */
2138         idx    = r[row];
2139         nz_tmp = ai[idx + 1] - ai[idx];
2140         ajtmp  = aj + ai[idx];
2141         v1     = aa + ai[idx];
2142         v2     = aa + ai[idx + 1];
2143         for (j = 0; j < nz_tmp; j++) {
2144           idx         = ics[ajtmp[j]];
2145           rtmp11[idx] = v1[j];
2146           rtmp22[idx] = v2[j];
2147         }
2148         rtmp11[ics[r[row]]] += sctx.shift_amount;
2149         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2150 
2151         prow = *bjtmp++;
2152         while (prow < row) {
2153           pc1 = rtmp11 + prow;
2154           pc2 = rtmp22 + prow;
2155           if (*pc1 != 0.0 || *pc2 != 0.0) {
2156             pv   = ba + bd[prow];
2157             pj   = nbj + bd[prow];
2158             mul1 = *pc1 * *pv;
2159             mul2 = *pc2 * *pv;
2160             ++pv;
2161             *pc1 = mul1;
2162             *pc2 = mul2;
2163 
2164             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2165             for (j = 0; j < nz_tmp; j++) {
2166               tmp = pv[j];
2167               idx = pj[j];
2168               rtmp11[idx] -= mul1 * tmp;
2169               rtmp22[idx] -= mul2 * tmp;
2170             }
2171             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2172           }
2173           prow = *bjtmp++;
2174         }
2175 
2176         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2177         pc1 = rtmp11 + prow;
2178         pc2 = rtmp22 + prow;
2179 
2180         sctx.pv = *pc1;
2181         pj      = bj + bi[prow];
2182         rs      = 0.0;
2183         for (j = 0; j < nz; j++) {
2184           idx = pj[j];
2185           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2186         }
2187         sctx.rs = rs;
2188         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2189         if (sctx.newshift) goto endofwhile;
2190 
2191         if (*pc2 != 0.0) {
2192           pj     = nbj + bd[prow];
2193           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2194           *pc2   = mul2;
2195           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2196           for (j = 0; j < nz_tmp; j++) {
2197             idx = pj[j];
2198             tmp = rtmp11[idx];
2199             rtmp22[idx] -= mul2 * tmp;
2200           }
2201           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2202         }
2203 
2204         pj  = bj + bi[row];
2205         pc1 = ba + bi[row];
2206         pc2 = ba + bi[row + 1];
2207 
2208         sctx.pv         = rtmp22[row + 1];
2209         rs              = 0.0;
2210         rtmp11[row]     = 1.0 / rtmp11[row];
2211         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2212         /* copy row entries from dense representation to sparse */
2213         for (j = 0; j < nz; j++) {
2214           idx    = pj[j];
2215           pc1[j] = rtmp11[idx];
2216           pc2[j] = rtmp22[idx];
2217           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2218         }
2219         sctx.rs = rs;
2220         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2221         if (sctx.newshift) goto endofwhile;
2222         break;
2223 
2224       case 3:
2225         for (j = 0; j < nz; j++) {
2226           idx         = bjtmp[j];
2227           rtmp11[idx] = 0.0;
2228           rtmp22[idx] = 0.0;
2229           rtmp33[idx] = 0.0;
2230         }
2231         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2232         idx    = r[row];
2233         nz_tmp = ai[idx + 1] - ai[idx];
2234         ajtmp  = aj + ai[idx];
2235         v1     = aa + ai[idx];
2236         v2     = aa + ai[idx + 1];
2237         v3     = aa + ai[idx + 2];
2238         for (j = 0; j < nz_tmp; j++) {
2239           idx         = ics[ajtmp[j]];
2240           rtmp11[idx] = v1[j];
2241           rtmp22[idx] = v2[j];
2242           rtmp33[idx] = v3[j];
2243         }
2244         rtmp11[ics[r[row]]] += sctx.shift_amount;
2245         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2246         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2247 
2248         /* loop over all pivot row blocks above this row block */
2249         prow = *bjtmp++;
2250         while (prow < row) {
2251           pc1 = rtmp11 + prow;
2252           pc2 = rtmp22 + prow;
2253           pc3 = rtmp33 + prow;
2254           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2255             pv   = ba + bd[prow];
2256             pj   = nbj + bd[prow];
2257             mul1 = *pc1 * *pv;
2258             mul2 = *pc2 * *pv;
2259             mul3 = *pc3 * *pv;
2260             ++pv;
2261             *pc1 = mul1;
2262             *pc2 = mul2;
2263             *pc3 = mul3;
2264 
2265             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2266             /* update this row based on pivot row */
2267             for (j = 0; j < nz_tmp; j++) {
2268               tmp = pv[j];
2269               idx = pj[j];
2270               rtmp11[idx] -= mul1 * tmp;
2271               rtmp22[idx] -= mul2 * tmp;
2272               rtmp33[idx] -= mul3 * tmp;
2273             }
2274             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2275           }
2276           prow = *bjtmp++;
2277         }
2278 
2279         /* Now take care of diagonal 3x3 block in this set of rows */
2280         /* note: prow = row here */
2281         pc1 = rtmp11 + prow;
2282         pc2 = rtmp22 + prow;
2283         pc3 = rtmp33 + prow;
2284 
2285         sctx.pv = *pc1;
2286         pj      = bj + bi[prow];
2287         rs      = 0.0;
2288         for (j = 0; j < nz; j++) {
2289           idx = pj[j];
2290           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2291         }
2292         sctx.rs = rs;
2293         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2294         if (sctx.newshift) goto endofwhile;
2295 
2296         if (*pc2 != 0.0 || *pc3 != 0.0) {
2297           mul2   = (*pc2) / (*pc1);
2298           mul3   = (*pc3) / (*pc1);
2299           *pc2   = mul2;
2300           *pc3   = mul3;
2301           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2302           pj     = nbj + bd[prow];
2303           for (j = 0; j < nz_tmp; j++) {
2304             idx = pj[j];
2305             tmp = rtmp11[idx];
2306             rtmp22[idx] -= mul2 * tmp;
2307             rtmp33[idx] -= mul3 * tmp;
2308           }
2309           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2310         }
2311         ++prow;
2312 
2313         pc2     = rtmp22 + prow;
2314         pc3     = rtmp33 + prow;
2315         sctx.pv = *pc2;
2316         pj      = bj + bi[prow];
2317         rs      = 0.0;
2318         for (j = 0; j < nz; j++) {
2319           idx = pj[j];
2320           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2321         }
2322         sctx.rs = rs;
2323         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2324         if (sctx.newshift) goto endofwhile;
2325 
2326         if (*pc3 != 0.0) {
2327           mul3   = (*pc3) / (*pc2);
2328           *pc3   = mul3;
2329           pj     = nbj + bd[prow];
2330           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2331           for (j = 0; j < nz_tmp; j++) {
2332             idx = pj[j];
2333             tmp = rtmp22[idx];
2334             rtmp33[idx] -= mul3 * tmp;
2335           }
2336           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2337         }
2338 
2339         pj  = bj + bi[row];
2340         pc1 = ba + bi[row];
2341         pc2 = ba + bi[row + 1];
2342         pc3 = ba + bi[row + 2];
2343 
2344         sctx.pv         = rtmp33[row + 2];
2345         rs              = 0.0;
2346         rtmp11[row]     = 1.0 / rtmp11[row];
2347         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2348         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2349         /* copy row entries from dense representation to sparse */
2350         for (j = 0; j < nz; j++) {
2351           idx    = pj[j];
2352           pc1[j] = rtmp11[idx];
2353           pc2[j] = rtmp22[idx];
2354           pc3[j] = rtmp33[idx];
2355           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2356         }
2357 
2358         sctx.rs = rs;
2359         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2360         if (sctx.newshift) goto endofwhile;
2361         break;
2362 
2363       default:
2364         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2365       }
2366       row += nodesz; /* Update the row */
2367     }
2368   endofwhile:;
2369   } while (sctx.newshift);
2370   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2371   PetscCall(PetscFree(tmp_vec2));
2372   PetscCall(ISRestoreIndices(isicol, &ic));
2373   PetscCall(ISRestoreIndices(isrow, &r));
2374   PetscCall(ISRestoreIndices(iscol, &c));
2375 
2376   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2377   /* do not set solve add, since MatSolve_Inode + Add is faster */
2378   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2379   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2380   C->assembled              = PETSC_TRUE;
2381   C->preallocated           = PETSC_TRUE;
2382   if (sctx.nshift) {
2383     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2384       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2385     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2386       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2387     }
2388   }
2389   PetscCall(PetscLogFlops(C->cmap->n));
2390   PetscCall(MatSeqAIJCheckInode(C));
2391   PetscFunctionReturn(0);
2392 }
2393 
2394 /* ----------------------------------------------------------- */
2395 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2396 {
2397   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2398   IS                 iscol = a->col, isrow = a->row;
2399   const PetscInt    *r, *c, *rout, *cout;
2400   PetscInt           i, j, n = A->rmap->n;
2401   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2402   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2403   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2404   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2405   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2406   const PetscScalar *b;
2407 
2408   PetscFunctionBegin;
2409   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2410   node_max = a->inode.node_count;
2411   ns       = a->inode.size; /* Node Size array */
2412 
2413   PetscCall(VecGetArrayRead(bb, &b));
2414   PetscCall(VecGetArrayWrite(xx, &x));
2415   tmp = a->solve_work;
2416 
2417   PetscCall(ISGetIndices(isrow, &rout));
2418   r = rout;
2419   PetscCall(ISGetIndices(iscol, &cout));
2420   c = cout;
2421 
2422   /* forward solve the lower triangular */
2423   tmps = tmp;
2424   aa   = a_a;
2425   aj   = a_j;
2426   ad   = a->diag;
2427 
2428   for (i = 0, row = 0; i < node_max; ++i) {
2429     nsz = ns[i];
2430     aii = ai[row];
2431     v1  = aa + aii;
2432     vi  = aj + aii;
2433     nz  = ai[row + 1] - ai[row];
2434 
2435     if (i < node_max - 1) {
2436       /* Prefetch the indices for the next block */
2437       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2438       /* Prefetch the data for the next block */
2439       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2440     }
2441 
2442     switch (nsz) { /* Each loop in 'case' is unrolled */
2443     case 1:
2444       sum1 = b[r[row]];
2445       for (j = 0; j < nz - 1; j += 2) {
2446         i0   = vi[j];
2447         i1   = vi[j + 1];
2448         tmp0 = tmps[i0];
2449         tmp1 = tmps[i1];
2450         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2451       }
2452       if (j == nz - 1) {
2453         tmp0 = tmps[vi[j]];
2454         sum1 -= v1[j] * tmp0;
2455       }
2456       tmp[row++] = sum1;
2457       break;
2458     case 2:
2459       sum1 = b[r[row]];
2460       sum2 = b[r[row + 1]];
2461       v2   = aa + ai[row + 1];
2462 
2463       for (j = 0; j < nz - 1; j += 2) {
2464         i0   = vi[j];
2465         i1   = vi[j + 1];
2466         tmp0 = tmps[i0];
2467         tmp1 = tmps[i1];
2468         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2469         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2470       }
2471       if (j == nz - 1) {
2472         tmp0 = tmps[vi[j]];
2473         sum1 -= v1[j] * tmp0;
2474         sum2 -= v2[j] * tmp0;
2475       }
2476       sum2 -= v2[nz] * sum1;
2477       tmp[row++] = sum1;
2478       tmp[row++] = sum2;
2479       break;
2480     case 3:
2481       sum1 = b[r[row]];
2482       sum2 = b[r[row + 1]];
2483       sum3 = b[r[row + 2]];
2484       v2   = aa + ai[row + 1];
2485       v3   = aa + ai[row + 2];
2486 
2487       for (j = 0; j < nz - 1; j += 2) {
2488         i0   = vi[j];
2489         i1   = vi[j + 1];
2490         tmp0 = tmps[i0];
2491         tmp1 = tmps[i1];
2492         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2493         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2494         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2495       }
2496       if (j == nz - 1) {
2497         tmp0 = tmps[vi[j]];
2498         sum1 -= v1[j] * tmp0;
2499         sum2 -= v2[j] * tmp0;
2500         sum3 -= v3[j] * tmp0;
2501       }
2502       sum2 -= v2[nz] * sum1;
2503       sum3 -= v3[nz] * sum1;
2504       sum3 -= v3[nz + 1] * sum2;
2505       tmp[row++] = sum1;
2506       tmp[row++] = sum2;
2507       tmp[row++] = sum3;
2508       break;
2509 
2510     case 4:
2511       sum1 = b[r[row]];
2512       sum2 = b[r[row + 1]];
2513       sum3 = b[r[row + 2]];
2514       sum4 = b[r[row + 3]];
2515       v2   = aa + ai[row + 1];
2516       v3   = aa + ai[row + 2];
2517       v4   = aa + ai[row + 3];
2518 
2519       for (j = 0; j < nz - 1; j += 2) {
2520         i0   = vi[j];
2521         i1   = vi[j + 1];
2522         tmp0 = tmps[i0];
2523         tmp1 = tmps[i1];
2524         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2525         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2526         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2527         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2528       }
2529       if (j == nz - 1) {
2530         tmp0 = tmps[vi[j]];
2531         sum1 -= v1[j] * tmp0;
2532         sum2 -= v2[j] * tmp0;
2533         sum3 -= v3[j] * tmp0;
2534         sum4 -= v4[j] * tmp0;
2535       }
2536       sum2 -= v2[nz] * sum1;
2537       sum3 -= v3[nz] * sum1;
2538       sum4 -= v4[nz] * sum1;
2539       sum3 -= v3[nz + 1] * sum2;
2540       sum4 -= v4[nz + 1] * sum2;
2541       sum4 -= v4[nz + 2] * sum3;
2542 
2543       tmp[row++] = sum1;
2544       tmp[row++] = sum2;
2545       tmp[row++] = sum3;
2546       tmp[row++] = sum4;
2547       break;
2548     case 5:
2549       sum1 = b[r[row]];
2550       sum2 = b[r[row + 1]];
2551       sum3 = b[r[row + 2]];
2552       sum4 = b[r[row + 3]];
2553       sum5 = b[r[row + 4]];
2554       v2   = aa + ai[row + 1];
2555       v3   = aa + ai[row + 2];
2556       v4   = aa + ai[row + 3];
2557       v5   = aa + ai[row + 4];
2558 
2559       for (j = 0; j < nz - 1; j += 2) {
2560         i0   = vi[j];
2561         i1   = vi[j + 1];
2562         tmp0 = tmps[i0];
2563         tmp1 = tmps[i1];
2564         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2565         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2566         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2567         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2568         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2569       }
2570       if (j == nz - 1) {
2571         tmp0 = tmps[vi[j]];
2572         sum1 -= v1[j] * tmp0;
2573         sum2 -= v2[j] * tmp0;
2574         sum3 -= v3[j] * tmp0;
2575         sum4 -= v4[j] * tmp0;
2576         sum5 -= v5[j] * tmp0;
2577       }
2578 
2579       sum2 -= v2[nz] * sum1;
2580       sum3 -= v3[nz] * sum1;
2581       sum4 -= v4[nz] * sum1;
2582       sum5 -= v5[nz] * sum1;
2583       sum3 -= v3[nz + 1] * sum2;
2584       sum4 -= v4[nz + 1] * sum2;
2585       sum5 -= v5[nz + 1] * sum2;
2586       sum4 -= v4[nz + 2] * sum3;
2587       sum5 -= v5[nz + 2] * sum3;
2588       sum5 -= v5[nz + 3] * sum4;
2589 
2590       tmp[row++] = sum1;
2591       tmp[row++] = sum2;
2592       tmp[row++] = sum3;
2593       tmp[row++] = sum4;
2594       tmp[row++] = sum5;
2595       break;
2596     default:
2597       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2598     }
2599   }
2600   /* backward solve the upper triangular */
2601   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2602     nsz = ns[i];
2603     aii = ad[row + 1] + 1;
2604     v1  = aa + aii;
2605     vi  = aj + aii;
2606     nz  = ad[row] - ad[row + 1] - 1;
2607 
2608     if (i > 0) {
2609       /* Prefetch the indices for the next block */
2610       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2611       /* Prefetch the data for the next block */
2612       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2613     }
2614 
2615     switch (nsz) { /* Each loop in 'case' is unrolled */
2616     case 1:
2617       sum1 = tmp[row];
2618 
2619       for (j = 0; j < nz - 1; j += 2) {
2620         i0   = vi[j];
2621         i1   = vi[j + 1];
2622         tmp0 = tmps[i0];
2623         tmp1 = tmps[i1];
2624         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2625       }
2626       if (j == nz - 1) {
2627         tmp0 = tmps[vi[j]];
2628         sum1 -= v1[j] * tmp0;
2629       }
2630       x[c[row]] = tmp[row] = sum1 * v1[nz];
2631       row--;
2632       break;
2633     case 2:
2634       sum1 = tmp[row];
2635       sum2 = tmp[row - 1];
2636       v2   = aa + ad[row] + 1;
2637       for (j = 0; j < nz - 1; j += 2) {
2638         i0   = vi[j];
2639         i1   = vi[j + 1];
2640         tmp0 = tmps[i0];
2641         tmp1 = tmps[i1];
2642         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2643         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2644       }
2645       if (j == nz - 1) {
2646         tmp0 = tmps[vi[j]];
2647         sum1 -= v1[j] * tmp0;
2648         sum2 -= v2[j + 1] * tmp0;
2649       }
2650 
2651       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2652       row--;
2653       sum2 -= v2[0] * tmp0;
2654       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2655       row--;
2656       break;
2657     case 3:
2658       sum1 = tmp[row];
2659       sum2 = tmp[row - 1];
2660       sum3 = tmp[row - 2];
2661       v2   = aa + ad[row] + 1;
2662       v3   = aa + ad[row - 1] + 1;
2663       for (j = 0; j < nz - 1; j += 2) {
2664         i0   = vi[j];
2665         i1   = vi[j + 1];
2666         tmp0 = tmps[i0];
2667         tmp1 = tmps[i1];
2668         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2669         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2670         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2671       }
2672       if (j == nz - 1) {
2673         tmp0 = tmps[vi[j]];
2674         sum1 -= v1[j] * tmp0;
2675         sum2 -= v2[j + 1] * tmp0;
2676         sum3 -= v3[j + 2] * tmp0;
2677       }
2678       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2679       row--;
2680       sum2 -= v2[0] * tmp0;
2681       sum3 -= v3[1] * tmp0;
2682       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2683       row--;
2684       sum3 -= v3[0] * tmp0;
2685       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2686       row--;
2687 
2688       break;
2689     case 4:
2690       sum1 = tmp[row];
2691       sum2 = tmp[row - 1];
2692       sum3 = tmp[row - 2];
2693       sum4 = tmp[row - 3];
2694       v2   = aa + ad[row] + 1;
2695       v3   = aa + ad[row - 1] + 1;
2696       v4   = aa + ad[row - 2] + 1;
2697 
2698       for (j = 0; j < nz - 1; j += 2) {
2699         i0   = vi[j];
2700         i1   = vi[j + 1];
2701         tmp0 = tmps[i0];
2702         tmp1 = tmps[i1];
2703         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2704         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2705         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2706         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2707       }
2708       if (j == nz - 1) {
2709         tmp0 = tmps[vi[j]];
2710         sum1 -= v1[j] * tmp0;
2711         sum2 -= v2[j + 1] * tmp0;
2712         sum3 -= v3[j + 2] * tmp0;
2713         sum4 -= v4[j + 3] * tmp0;
2714       }
2715 
2716       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2717       row--;
2718       sum2 -= v2[0] * tmp0;
2719       sum3 -= v3[1] * tmp0;
2720       sum4 -= v4[2] * tmp0;
2721       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2722       row--;
2723       sum3 -= v3[0] * tmp0;
2724       sum4 -= v4[1] * tmp0;
2725       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2726       row--;
2727       sum4 -= v4[0] * tmp0;
2728       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2729       row--;
2730       break;
2731     case 5:
2732       sum1 = tmp[row];
2733       sum2 = tmp[row - 1];
2734       sum3 = tmp[row - 2];
2735       sum4 = tmp[row - 3];
2736       sum5 = tmp[row - 4];
2737       v2   = aa + ad[row] + 1;
2738       v3   = aa + ad[row - 1] + 1;
2739       v4   = aa + ad[row - 2] + 1;
2740       v5   = aa + ad[row - 3] + 1;
2741       for (j = 0; j < nz - 1; j += 2) {
2742         i0   = vi[j];
2743         i1   = vi[j + 1];
2744         tmp0 = tmps[i0];
2745         tmp1 = tmps[i1];
2746         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2747         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2748         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2749         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2750         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2751       }
2752       if (j == nz - 1) {
2753         tmp0 = tmps[vi[j]];
2754         sum1 -= v1[j] * tmp0;
2755         sum2 -= v2[j + 1] * tmp0;
2756         sum3 -= v3[j + 2] * tmp0;
2757         sum4 -= v4[j + 3] * tmp0;
2758         sum5 -= v5[j + 4] * tmp0;
2759       }
2760 
2761       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2762       row--;
2763       sum2 -= v2[0] * tmp0;
2764       sum3 -= v3[1] * tmp0;
2765       sum4 -= v4[2] * tmp0;
2766       sum5 -= v5[3] * tmp0;
2767       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2768       row--;
2769       sum3 -= v3[0] * tmp0;
2770       sum4 -= v4[1] * tmp0;
2771       sum5 -= v5[2] * tmp0;
2772       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2773       row--;
2774       sum4 -= v4[0] * tmp0;
2775       sum5 -= v5[1] * tmp0;
2776       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2777       row--;
2778       sum5 -= v5[0] * tmp0;
2779       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2780       row--;
2781       break;
2782     default:
2783       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2784     }
2785   }
2786   PetscCall(ISRestoreIndices(isrow, &rout));
2787   PetscCall(ISRestoreIndices(iscol, &cout));
2788   PetscCall(VecRestoreArrayRead(bb, &b));
2789   PetscCall(VecRestoreArrayWrite(xx, &x));
2790   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2791   PetscFunctionReturn(0);
2792 }
2793 
2794 /*
2795      Makes a longer coloring[] array and calls the usual code with that
2796 */
2797 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2798 {
2799   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2800   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2801   PetscInt        *colorused, i;
2802   ISColoringValue *newcolor;
2803 
2804   PetscFunctionBegin;
2805   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2806   PetscCall(PetscMalloc1(n + 1, &newcolor));
2807   /* loop over inodes, marking a color for each column*/
2808   row = 0;
2809   for (i = 0; i < m; i++) {
2810     for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2811   }
2812 
2813   /* eliminate unneeded colors */
2814   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2815   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2816 
2817   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2818   ncolors = colorused[5 * ncolors - 1];
2819   for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2820   PetscCall(PetscFree(colorused));
2821   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2822   PetscCall(PetscFree(coloring));
2823   PetscFunctionReturn(0);
2824 }
2825 
2826 #include <petsc/private/kernels/blockinvert.h>
2827 
2828 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2829 {
2830   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2831   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2832   MatScalar         *ibdiag, *bdiag, work[25], *t;
2833   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2834   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2835   const PetscScalar *xb, *b;
2836   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2837   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2838   PetscInt           sz, k, ipvt[5];
2839   PetscBool          allowzeropivot, zeropivotdetected;
2840   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
2841 
2842   PetscFunctionBegin;
2843   allowzeropivot = PetscNot(A->erroriffailure);
2844   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2845   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2846   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2847 
2848   if (!a->inode.ibdiagvalid) {
2849     if (!a->inode.ibdiag) {
2850       /* calculate space needed for diagonal blocks */
2851       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2852       a->inode.bdiagsize = cnt;
2853 
2854       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2855     }
2856 
2857     /* copy over the diagonal blocks and invert them */
2858     ibdiag = a->inode.ibdiag;
2859     bdiag  = a->inode.bdiag;
2860     cnt    = 0;
2861     for (i = 0, row = 0; i < m; i++) {
2862       for (j = 0; j < sizes[i]; j++) {
2863         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2864       }
2865       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
2866 
2867       switch (sizes[i]) {
2868       case 1:
2869         /* Create matrix data structure */
2870         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2871           if (allowzeropivot) {
2872             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2873             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2874             A->factorerror_zeropivot_row   = row;
2875             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2876           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2877         }
2878         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2879         break;
2880       case 2:
2881         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2882         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2883         break;
2884       case 3:
2885         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2886         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2887         break;
2888       case 4:
2889         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2890         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2891         break;
2892       case 5:
2893         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2894         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2895         break;
2896       default:
2897         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2898       }
2899       cnt += sizes[i] * sizes[i];
2900       row += sizes[i];
2901     }
2902     a->inode.ibdiagvalid = PETSC_TRUE;
2903   }
2904   ibdiag = a->inode.ibdiag;
2905   bdiag  = a->inode.bdiag;
2906   t      = a->inode.ssor_work;
2907 
2908   PetscCall(VecGetArray(xx, &x));
2909   PetscCall(VecGetArrayRead(bb, &b));
2910   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2911   if (flag & SOR_ZERO_INITIAL_GUESS) {
2912     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2913       for (i = 0, row = 0; i < m; i++) {
2914         sz  = diag[row] - ii[row];
2915         v1  = a->a + ii[row];
2916         idx = a->j + ii[row];
2917 
2918         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2919         switch (sizes[i]) {
2920         case 1:
2921 
2922           sum1 = b[row];
2923           for (n = 0; n < sz - 1; n += 2) {
2924             i1 = idx[0];
2925             i2 = idx[1];
2926             idx += 2;
2927             tmp0 = x[i1];
2928             tmp1 = x[i2];
2929             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2930             v1 += 2;
2931           }
2932 
2933           if (n == sz - 1) {
2934             tmp0 = x[*idx];
2935             sum1 -= *v1 * tmp0;
2936           }
2937           t[row]   = sum1;
2938           x[row++] = sum1 * (*ibdiag++);
2939           break;
2940         case 2:
2941           v2   = a->a + ii[row + 1];
2942           sum1 = b[row];
2943           sum2 = b[row + 1];
2944           for (n = 0; n < sz - 1; n += 2) {
2945             i1 = idx[0];
2946             i2 = idx[1];
2947             idx += 2;
2948             tmp0 = x[i1];
2949             tmp1 = x[i2];
2950             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2951             v1 += 2;
2952             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2953             v2 += 2;
2954           }
2955 
2956           if (n == sz - 1) {
2957             tmp0 = x[*idx];
2958             sum1 -= v1[0] * tmp0;
2959             sum2 -= v2[0] * tmp0;
2960           }
2961           t[row]     = sum1;
2962           t[row + 1] = sum2;
2963           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2964           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2965           ibdiag += 4;
2966           break;
2967         case 3:
2968           v2   = a->a + ii[row + 1];
2969           v3   = a->a + ii[row + 2];
2970           sum1 = b[row];
2971           sum2 = b[row + 1];
2972           sum3 = b[row + 2];
2973           for (n = 0; n < sz - 1; n += 2) {
2974             i1 = idx[0];
2975             i2 = idx[1];
2976             idx += 2;
2977             tmp0 = x[i1];
2978             tmp1 = x[i2];
2979             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2980             v1 += 2;
2981             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2982             v2 += 2;
2983             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2984             v3 += 2;
2985           }
2986 
2987           if (n == sz - 1) {
2988             tmp0 = x[*idx];
2989             sum1 -= v1[0] * tmp0;
2990             sum2 -= v2[0] * tmp0;
2991             sum3 -= v3[0] * tmp0;
2992           }
2993           t[row]     = sum1;
2994           t[row + 1] = sum2;
2995           t[row + 2] = sum3;
2996           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2997           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2998           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2999           ibdiag += 9;
3000           break;
3001         case 4:
3002           v2   = a->a + ii[row + 1];
3003           v3   = a->a + ii[row + 2];
3004           v4   = a->a + ii[row + 3];
3005           sum1 = b[row];
3006           sum2 = b[row + 1];
3007           sum3 = b[row + 2];
3008           sum4 = b[row + 3];
3009           for (n = 0; n < sz - 1; n += 2) {
3010             i1 = idx[0];
3011             i2 = idx[1];
3012             idx += 2;
3013             tmp0 = x[i1];
3014             tmp1 = x[i2];
3015             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3016             v1 += 2;
3017             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3018             v2 += 2;
3019             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3020             v3 += 2;
3021             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3022             v4 += 2;
3023           }
3024 
3025           if (n == sz - 1) {
3026             tmp0 = x[*idx];
3027             sum1 -= v1[0] * tmp0;
3028             sum2 -= v2[0] * tmp0;
3029             sum3 -= v3[0] * tmp0;
3030             sum4 -= v4[0] * tmp0;
3031           }
3032           t[row]     = sum1;
3033           t[row + 1] = sum2;
3034           t[row + 2] = sum3;
3035           t[row + 3] = sum4;
3036           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3037           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3038           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3039           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3040           ibdiag += 16;
3041           break;
3042         case 5:
3043           v2   = a->a + ii[row + 1];
3044           v3   = a->a + ii[row + 2];
3045           v4   = a->a + ii[row + 3];
3046           v5   = a->a + ii[row + 4];
3047           sum1 = b[row];
3048           sum2 = b[row + 1];
3049           sum3 = b[row + 2];
3050           sum4 = b[row + 3];
3051           sum5 = b[row + 4];
3052           for (n = 0; n < sz - 1; n += 2) {
3053             i1 = idx[0];
3054             i2 = idx[1];
3055             idx += 2;
3056             tmp0 = x[i1];
3057             tmp1 = x[i2];
3058             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3059             v1 += 2;
3060             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3061             v2 += 2;
3062             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3063             v3 += 2;
3064             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3065             v4 += 2;
3066             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3067             v5 += 2;
3068           }
3069 
3070           if (n == sz - 1) {
3071             tmp0 = x[*idx];
3072             sum1 -= v1[0] * tmp0;
3073             sum2 -= v2[0] * tmp0;
3074             sum3 -= v3[0] * tmp0;
3075             sum4 -= v4[0] * tmp0;
3076             sum5 -= v5[0] * tmp0;
3077           }
3078           t[row]     = sum1;
3079           t[row + 1] = sum2;
3080           t[row + 2] = sum3;
3081           t[row + 3] = sum4;
3082           t[row + 4] = sum5;
3083           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3084           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3085           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3086           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3087           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3088           ibdiag += 25;
3089           break;
3090         default:
3091           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3092         }
3093       }
3094 
3095       xb = t;
3096       PetscCall(PetscLogFlops(a->nz));
3097     } else xb = b;
3098     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3099       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3100       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3101         ibdiag -= sizes[i] * sizes[i];
3102         sz  = ii[row + 1] - diag[row] - 1;
3103         v1  = a->a + diag[row] + 1;
3104         idx = a->j + diag[row] + 1;
3105 
3106         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3107         switch (sizes[i]) {
3108         case 1:
3109 
3110           sum1 = xb[row];
3111           for (n = 0; n < sz - 1; n += 2) {
3112             i1 = idx[0];
3113             i2 = idx[1];
3114             idx += 2;
3115             tmp0 = x[i1];
3116             tmp1 = x[i2];
3117             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3118             v1 += 2;
3119           }
3120 
3121           if (n == sz - 1) {
3122             tmp0 = x[*idx];
3123             sum1 -= *v1 * tmp0;
3124           }
3125           x[row--] = sum1 * (*ibdiag);
3126           break;
3127 
3128         case 2:
3129 
3130           sum1 = xb[row];
3131           sum2 = xb[row - 1];
3132           /* note that sum1 is associated with the second of the two rows */
3133           v2 = a->a + diag[row - 1] + 2;
3134           for (n = 0; n < sz - 1; n += 2) {
3135             i1 = idx[0];
3136             i2 = idx[1];
3137             idx += 2;
3138             tmp0 = x[i1];
3139             tmp1 = x[i2];
3140             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3141             v1 += 2;
3142             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3143             v2 += 2;
3144           }
3145 
3146           if (n == sz - 1) {
3147             tmp0 = x[*idx];
3148             sum1 -= *v1 * tmp0;
3149             sum2 -= *v2 * tmp0;
3150           }
3151           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3152           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3153           break;
3154         case 3:
3155 
3156           sum1 = xb[row];
3157           sum2 = xb[row - 1];
3158           sum3 = xb[row - 2];
3159           v2   = a->a + diag[row - 1] + 2;
3160           v3   = a->a + diag[row - 2] + 3;
3161           for (n = 0; n < sz - 1; n += 2) {
3162             i1 = idx[0];
3163             i2 = idx[1];
3164             idx += 2;
3165             tmp0 = x[i1];
3166             tmp1 = x[i2];
3167             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3168             v1 += 2;
3169             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3170             v2 += 2;
3171             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3172             v3 += 2;
3173           }
3174 
3175           if (n == sz - 1) {
3176             tmp0 = x[*idx];
3177             sum1 -= *v1 * tmp0;
3178             sum2 -= *v2 * tmp0;
3179             sum3 -= *v3 * tmp0;
3180           }
3181           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3182           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3183           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3184           break;
3185         case 4:
3186 
3187           sum1 = xb[row];
3188           sum2 = xb[row - 1];
3189           sum3 = xb[row - 2];
3190           sum4 = xb[row - 3];
3191           v2   = a->a + diag[row - 1] + 2;
3192           v3   = a->a + diag[row - 2] + 3;
3193           v4   = a->a + diag[row - 3] + 4;
3194           for (n = 0; n < sz - 1; n += 2) {
3195             i1 = idx[0];
3196             i2 = idx[1];
3197             idx += 2;
3198             tmp0 = x[i1];
3199             tmp1 = x[i2];
3200             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3201             v1 += 2;
3202             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3203             v2 += 2;
3204             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3205             v3 += 2;
3206             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3207             v4 += 2;
3208           }
3209 
3210           if (n == sz - 1) {
3211             tmp0 = x[*idx];
3212             sum1 -= *v1 * tmp0;
3213             sum2 -= *v2 * tmp0;
3214             sum3 -= *v3 * tmp0;
3215             sum4 -= *v4 * tmp0;
3216           }
3217           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3218           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3219           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3220           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3221           break;
3222         case 5:
3223 
3224           sum1 = xb[row];
3225           sum2 = xb[row - 1];
3226           sum3 = xb[row - 2];
3227           sum4 = xb[row - 3];
3228           sum5 = xb[row - 4];
3229           v2   = a->a + diag[row - 1] + 2;
3230           v3   = a->a + diag[row - 2] + 3;
3231           v4   = a->a + diag[row - 3] + 4;
3232           v5   = a->a + diag[row - 4] + 5;
3233           for (n = 0; n < sz - 1; n += 2) {
3234             i1 = idx[0];
3235             i2 = idx[1];
3236             idx += 2;
3237             tmp0 = x[i1];
3238             tmp1 = x[i2];
3239             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3240             v1 += 2;
3241             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3242             v2 += 2;
3243             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3244             v3 += 2;
3245             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3246             v4 += 2;
3247             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3248             v5 += 2;
3249           }
3250 
3251           if (n == sz - 1) {
3252             tmp0 = x[*idx];
3253             sum1 -= *v1 * tmp0;
3254             sum2 -= *v2 * tmp0;
3255             sum3 -= *v3 * tmp0;
3256             sum4 -= *v4 * tmp0;
3257             sum5 -= *v5 * tmp0;
3258           }
3259           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3260           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3261           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3262           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3263           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3264           break;
3265         default:
3266           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3267         }
3268       }
3269 
3270       PetscCall(PetscLogFlops(a->nz));
3271     }
3272     its--;
3273   }
3274   while (its--) {
3275     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3276       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3277         sz  = diag[row] - ii[row];
3278         v1  = a->a + ii[row];
3279         idx = a->j + ii[row];
3280         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3281         switch (sizes[i]) {
3282         case 1:
3283           sum1 = b[row];
3284           for (n = 0; n < sz - 1; n += 2) {
3285             i1 = idx[0];
3286             i2 = idx[1];
3287             idx += 2;
3288             tmp0 = x[i1];
3289             tmp1 = x[i2];
3290             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3291             v1 += 2;
3292           }
3293           if (n == sz - 1) {
3294             tmp0 = x[*idx++];
3295             sum1 -= *v1 * tmp0;
3296             v1++;
3297           }
3298           t[row] = sum1;
3299           sz     = ii[row + 1] - diag[row] - 1;
3300           idx    = a->j + diag[row] + 1;
3301           v1 += 1;
3302           for (n = 0; n < sz - 1; n += 2) {
3303             i1 = idx[0];
3304             i2 = idx[1];
3305             idx += 2;
3306             tmp0 = x[i1];
3307             tmp1 = x[i2];
3308             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3309             v1 += 2;
3310           }
3311           if (n == sz - 1) {
3312             tmp0 = x[*idx++];
3313             sum1 -= *v1 * tmp0;
3314           }
3315           /* in MatSOR_SeqAIJ this line would be
3316            *
3317            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3318            *
3319            * but omega == 1, so this becomes
3320            *
3321            * x[row] = sum1*(*ibdiag++);
3322            *
3323            */
3324           x[row] = sum1 * (*ibdiag);
3325           break;
3326         case 2:
3327           v2   = a->a + ii[row + 1];
3328           sum1 = b[row];
3329           sum2 = b[row + 1];
3330           for (n = 0; n < sz - 1; n += 2) {
3331             i1 = idx[0];
3332             i2 = idx[1];
3333             idx += 2;
3334             tmp0 = x[i1];
3335             tmp1 = x[i2];
3336             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3337             v1 += 2;
3338             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3339             v2 += 2;
3340           }
3341           if (n == sz - 1) {
3342             tmp0 = x[*idx++];
3343             sum1 -= v1[0] * tmp0;
3344             sum2 -= v2[0] * tmp0;
3345             v1++;
3346             v2++;
3347           }
3348           t[row]     = sum1;
3349           t[row + 1] = sum2;
3350           sz         = ii[row + 1] - diag[row] - 2;
3351           idx        = a->j + diag[row] + 2;
3352           v1 += 2;
3353           v2 += 2;
3354           for (n = 0; n < sz - 1; n += 2) {
3355             i1 = idx[0];
3356             i2 = idx[1];
3357             idx += 2;
3358             tmp0 = x[i1];
3359             tmp1 = x[i2];
3360             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3361             v1 += 2;
3362             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3363             v2 += 2;
3364           }
3365           if (n == sz - 1) {
3366             tmp0 = x[*idx];
3367             sum1 -= v1[0] * tmp0;
3368             sum2 -= v2[0] * tmp0;
3369           }
3370           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3371           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3372           break;
3373         case 3:
3374           v2   = a->a + ii[row + 1];
3375           v3   = a->a + ii[row + 2];
3376           sum1 = b[row];
3377           sum2 = b[row + 1];
3378           sum3 = b[row + 2];
3379           for (n = 0; n < sz - 1; n += 2) {
3380             i1 = idx[0];
3381             i2 = idx[1];
3382             idx += 2;
3383             tmp0 = x[i1];
3384             tmp1 = x[i2];
3385             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3386             v1 += 2;
3387             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3388             v2 += 2;
3389             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3390             v3 += 2;
3391           }
3392           if (n == sz - 1) {
3393             tmp0 = x[*idx++];
3394             sum1 -= v1[0] * tmp0;
3395             sum2 -= v2[0] * tmp0;
3396             sum3 -= v3[0] * tmp0;
3397             v1++;
3398             v2++;
3399             v3++;
3400           }
3401           t[row]     = sum1;
3402           t[row + 1] = sum2;
3403           t[row + 2] = sum3;
3404           sz         = ii[row + 1] - diag[row] - 3;
3405           idx        = a->j + diag[row] + 3;
3406           v1 += 3;
3407           v2 += 3;
3408           v3 += 3;
3409           for (n = 0; n < sz - 1; n += 2) {
3410             i1 = idx[0];
3411             i2 = idx[1];
3412             idx += 2;
3413             tmp0 = x[i1];
3414             tmp1 = x[i2];
3415             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3416             v1 += 2;
3417             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3418             v2 += 2;
3419             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3420             v3 += 2;
3421           }
3422           if (n == sz - 1) {
3423             tmp0 = x[*idx];
3424             sum1 -= v1[0] * tmp0;
3425             sum2 -= v2[0] * tmp0;
3426             sum3 -= v3[0] * tmp0;
3427           }
3428           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3429           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3430           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3431           break;
3432         case 4:
3433           v2   = a->a + ii[row + 1];
3434           v3   = a->a + ii[row + 2];
3435           v4   = a->a + ii[row + 3];
3436           sum1 = b[row];
3437           sum2 = b[row + 1];
3438           sum3 = b[row + 2];
3439           sum4 = b[row + 3];
3440           for (n = 0; n < sz - 1; n += 2) {
3441             i1 = idx[0];
3442             i2 = idx[1];
3443             idx += 2;
3444             tmp0 = x[i1];
3445             tmp1 = x[i2];
3446             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3447             v1 += 2;
3448             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3449             v2 += 2;
3450             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3451             v3 += 2;
3452             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3453             v4 += 2;
3454           }
3455           if (n == sz - 1) {
3456             tmp0 = x[*idx++];
3457             sum1 -= v1[0] * tmp0;
3458             sum2 -= v2[0] * tmp0;
3459             sum3 -= v3[0] * tmp0;
3460             sum4 -= v4[0] * tmp0;
3461             v1++;
3462             v2++;
3463             v3++;
3464             v4++;
3465           }
3466           t[row]     = sum1;
3467           t[row + 1] = sum2;
3468           t[row + 2] = sum3;
3469           t[row + 3] = sum4;
3470           sz         = ii[row + 1] - diag[row] - 4;
3471           idx        = a->j + diag[row] + 4;
3472           v1 += 4;
3473           v2 += 4;
3474           v3 += 4;
3475           v4 += 4;
3476           for (n = 0; n < sz - 1; n += 2) {
3477             i1 = idx[0];
3478             i2 = idx[1];
3479             idx += 2;
3480             tmp0 = x[i1];
3481             tmp1 = x[i2];
3482             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3483             v1 += 2;
3484             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3485             v2 += 2;
3486             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3487             v3 += 2;
3488             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3489             v4 += 2;
3490           }
3491           if (n == sz - 1) {
3492             tmp0 = x[*idx];
3493             sum1 -= v1[0] * tmp0;
3494             sum2 -= v2[0] * tmp0;
3495             sum3 -= v3[0] * tmp0;
3496             sum4 -= v4[0] * tmp0;
3497           }
3498           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3499           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3500           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3501           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3502           break;
3503         case 5:
3504           v2   = a->a + ii[row + 1];
3505           v3   = a->a + ii[row + 2];
3506           v4   = a->a + ii[row + 3];
3507           v5   = a->a + ii[row + 4];
3508           sum1 = b[row];
3509           sum2 = b[row + 1];
3510           sum3 = b[row + 2];
3511           sum4 = b[row + 3];
3512           sum5 = b[row + 4];
3513           for (n = 0; n < sz - 1; n += 2) {
3514             i1 = idx[0];
3515             i2 = idx[1];
3516             idx += 2;
3517             tmp0 = x[i1];
3518             tmp1 = x[i2];
3519             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3520             v1 += 2;
3521             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3522             v2 += 2;
3523             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3524             v3 += 2;
3525             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3526             v4 += 2;
3527             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3528             v5 += 2;
3529           }
3530           if (n == sz - 1) {
3531             tmp0 = x[*idx++];
3532             sum1 -= v1[0] * tmp0;
3533             sum2 -= v2[0] * tmp0;
3534             sum3 -= v3[0] * tmp0;
3535             sum4 -= v4[0] * tmp0;
3536             sum5 -= v5[0] * tmp0;
3537             v1++;
3538             v2++;
3539             v3++;
3540             v4++;
3541             v5++;
3542           }
3543           t[row]     = sum1;
3544           t[row + 1] = sum2;
3545           t[row + 2] = sum3;
3546           t[row + 3] = sum4;
3547           t[row + 4] = sum5;
3548           sz         = ii[row + 1] - diag[row] - 5;
3549           idx        = a->j + diag[row] + 5;
3550           v1 += 5;
3551           v2 += 5;
3552           v3 += 5;
3553           v4 += 5;
3554           v5 += 5;
3555           for (n = 0; n < sz - 1; n += 2) {
3556             i1 = idx[0];
3557             i2 = idx[1];
3558             idx += 2;
3559             tmp0 = x[i1];
3560             tmp1 = x[i2];
3561             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3562             v1 += 2;
3563             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3564             v2 += 2;
3565             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3566             v3 += 2;
3567             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3568             v4 += 2;
3569             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3570             v5 += 2;
3571           }
3572           if (n == sz - 1) {
3573             tmp0 = x[*idx];
3574             sum1 -= v1[0] * tmp0;
3575             sum2 -= v2[0] * tmp0;
3576             sum3 -= v3[0] * tmp0;
3577             sum4 -= v4[0] * tmp0;
3578             sum5 -= v5[0] * tmp0;
3579           }
3580           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3581           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3582           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3583           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3584           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3585           break;
3586         default:
3587           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3588         }
3589       }
3590       xb = t;
3591       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3592     } else xb = b;
3593 
3594     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3595       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3596       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3597         ibdiag -= sizes[i] * sizes[i];
3598 
3599         /* set RHS */
3600         if (xb == b) {
3601           /* whole (old way) */
3602           sz  = ii[row + 1] - ii[row];
3603           idx = a->j + ii[row];
3604           switch (sizes[i]) {
3605           case 5:
3606             v5 = a->a + ii[row - 4]; /* fall through */
3607           case 4:
3608             v4 = a->a + ii[row - 3]; /* fall through */
3609           case 3:
3610             v3 = a->a + ii[row - 2]; /* fall through */
3611           case 2:
3612             v2 = a->a + ii[row - 1]; /* fall through */
3613           case 1:
3614             v1 = a->a + ii[row];
3615             break;
3616           default:
3617             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3618           }
3619         } else {
3620           /* upper, no diag */
3621           sz  = ii[row + 1] - diag[row] - 1;
3622           idx = a->j + diag[row] + 1;
3623           switch (sizes[i]) {
3624           case 5:
3625             v5 = a->a + diag[row - 4] + 5; /* fall through */
3626           case 4:
3627             v4 = a->a + diag[row - 3] + 4; /* fall through */
3628           case 3:
3629             v3 = a->a + diag[row - 2] + 3; /* fall through */
3630           case 2:
3631             v2 = a->a + diag[row - 1] + 2; /* fall through */
3632           case 1:
3633             v1 = a->a + diag[row] + 1;
3634           }
3635         }
3636         /* set sum */
3637         switch (sizes[i]) {
3638         case 5:
3639           sum5 = xb[row - 4]; /* fall through */
3640         case 4:
3641           sum4 = xb[row - 3]; /* fall through */
3642         case 3:
3643           sum3 = xb[row - 2]; /* fall through */
3644         case 2:
3645           sum2 = xb[row - 1]; /* fall through */
3646         case 1:
3647           /* note that sum1 is associated with the last row */
3648           sum1 = xb[row];
3649         }
3650         /* do sums */
3651         for (n = 0; n < sz - 1; n += 2) {
3652           i1 = idx[0];
3653           i2 = idx[1];
3654           idx += 2;
3655           tmp0 = x[i1];
3656           tmp1 = x[i2];
3657           switch (sizes[i]) {
3658           case 5:
3659             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3660             v5 += 2; /* fall through */
3661           case 4:
3662             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3663             v4 += 2; /* fall through */
3664           case 3:
3665             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3666             v3 += 2; /* fall through */
3667           case 2:
3668             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3669             v2 += 2; /* fall through */
3670           case 1:
3671             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3672             v1 += 2;
3673           }
3674         }
3675         /* ragged edge */
3676         if (n == sz - 1) {
3677           tmp0 = x[*idx];
3678           switch (sizes[i]) {
3679           case 5:
3680             sum5 -= *v5 * tmp0; /* fall through */
3681           case 4:
3682             sum4 -= *v4 * tmp0; /* fall through */
3683           case 3:
3684             sum3 -= *v3 * tmp0; /* fall through */
3685           case 2:
3686             sum2 -= *v2 * tmp0; /* fall through */
3687           case 1:
3688             sum1 -= *v1 * tmp0;
3689           }
3690         }
3691         /* update */
3692         if (xb == b) {
3693           /* whole (old way) w/ diag */
3694           switch (sizes[i]) {
3695           case 5:
3696             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3697             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3698             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3699             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3700             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3701             break;
3702           case 4:
3703             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3704             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3705             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3706             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3707             break;
3708           case 3:
3709             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3710             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3711             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3712             break;
3713           case 2:
3714             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3715             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3716             break;
3717           case 1:
3718             x[row--] += sum1 * (*ibdiag);
3719             break;
3720           }
3721         } else {
3722           /* no diag so set =  */
3723           switch (sizes[i]) {
3724           case 5:
3725             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3726             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3727             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3728             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3729             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3730             break;
3731           case 4:
3732             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3733             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3734             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3735             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3736             break;
3737           case 3:
3738             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3739             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3740             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3741             break;
3742           case 2:
3743             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3744             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3745             break;
3746           case 1:
3747             x[row--] = sum1 * (*ibdiag);
3748             break;
3749           }
3750         }
3751       }
3752       if (xb == b) {
3753         PetscCall(PetscLogFlops(2.0 * a->nz));
3754       } else {
3755         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3756       }
3757     }
3758   }
3759   if (flag & SOR_EISENSTAT) {
3760     /*
3761           Apply  (U + D)^-1  where D is now the block diagonal
3762     */
3763     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3764     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3765       ibdiag -= sizes[i] * sizes[i];
3766       sz  = ii[row + 1] - diag[row] - 1;
3767       v1  = a->a + diag[row] + 1;
3768       idx = a->j + diag[row] + 1;
3769       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3770       switch (sizes[i]) {
3771       case 1:
3772 
3773         sum1 = b[row];
3774         for (n = 0; n < sz - 1; n += 2) {
3775           i1 = idx[0];
3776           i2 = idx[1];
3777           idx += 2;
3778           tmp0 = x[i1];
3779           tmp1 = x[i2];
3780           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3781           v1 += 2;
3782         }
3783 
3784         if (n == sz - 1) {
3785           tmp0 = x[*idx];
3786           sum1 -= *v1 * tmp0;
3787         }
3788         x[row] = sum1 * (*ibdiag);
3789         row--;
3790         break;
3791 
3792       case 2:
3793 
3794         sum1 = b[row];
3795         sum2 = b[row - 1];
3796         /* note that sum1 is associated with the second of the two rows */
3797         v2 = a->a + diag[row - 1] + 2;
3798         for (n = 0; n < sz - 1; n += 2) {
3799           i1 = idx[0];
3800           i2 = idx[1];
3801           idx += 2;
3802           tmp0 = x[i1];
3803           tmp1 = x[i2];
3804           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3805           v1 += 2;
3806           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3807           v2 += 2;
3808         }
3809 
3810         if (n == sz - 1) {
3811           tmp0 = x[*idx];
3812           sum1 -= *v1 * tmp0;
3813           sum2 -= *v2 * tmp0;
3814         }
3815         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3816         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3817         row -= 2;
3818         break;
3819       case 3:
3820 
3821         sum1 = b[row];
3822         sum2 = b[row - 1];
3823         sum3 = b[row - 2];
3824         v2   = a->a + diag[row - 1] + 2;
3825         v3   = a->a + diag[row - 2] + 3;
3826         for (n = 0; n < sz - 1; n += 2) {
3827           i1 = idx[0];
3828           i2 = idx[1];
3829           idx += 2;
3830           tmp0 = x[i1];
3831           tmp1 = x[i2];
3832           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3833           v1 += 2;
3834           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3835           v2 += 2;
3836           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3837           v3 += 2;
3838         }
3839 
3840         if (n == sz - 1) {
3841           tmp0 = x[*idx];
3842           sum1 -= *v1 * tmp0;
3843           sum2 -= *v2 * tmp0;
3844           sum3 -= *v3 * tmp0;
3845         }
3846         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3847         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3848         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3849         row -= 3;
3850         break;
3851       case 4:
3852 
3853         sum1 = b[row];
3854         sum2 = b[row - 1];
3855         sum3 = b[row - 2];
3856         sum4 = b[row - 3];
3857         v2   = a->a + diag[row - 1] + 2;
3858         v3   = a->a + diag[row - 2] + 3;
3859         v4   = a->a + diag[row - 3] + 4;
3860         for (n = 0; n < sz - 1; n += 2) {
3861           i1 = idx[0];
3862           i2 = idx[1];
3863           idx += 2;
3864           tmp0 = x[i1];
3865           tmp1 = x[i2];
3866           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3867           v1 += 2;
3868           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3869           v2 += 2;
3870           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3871           v3 += 2;
3872           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3873           v4 += 2;
3874         }
3875 
3876         if (n == sz - 1) {
3877           tmp0 = x[*idx];
3878           sum1 -= *v1 * tmp0;
3879           sum2 -= *v2 * tmp0;
3880           sum3 -= *v3 * tmp0;
3881           sum4 -= *v4 * tmp0;
3882         }
3883         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3884         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3885         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3886         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3887         row -= 4;
3888         break;
3889       case 5:
3890 
3891         sum1 = b[row];
3892         sum2 = b[row - 1];
3893         sum3 = b[row - 2];
3894         sum4 = b[row - 3];
3895         sum5 = b[row - 4];
3896         v2   = a->a + diag[row - 1] + 2;
3897         v3   = a->a + diag[row - 2] + 3;
3898         v4   = a->a + diag[row - 3] + 4;
3899         v5   = a->a + diag[row - 4] + 5;
3900         for (n = 0; n < sz - 1; n += 2) {
3901           i1 = idx[0];
3902           i2 = idx[1];
3903           idx += 2;
3904           tmp0 = x[i1];
3905           tmp1 = x[i2];
3906           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3907           v1 += 2;
3908           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3909           v2 += 2;
3910           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3911           v3 += 2;
3912           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3913           v4 += 2;
3914           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3915           v5 += 2;
3916         }
3917 
3918         if (n == sz - 1) {
3919           tmp0 = x[*idx];
3920           sum1 -= *v1 * tmp0;
3921           sum2 -= *v2 * tmp0;
3922           sum3 -= *v3 * tmp0;
3923           sum4 -= *v4 * tmp0;
3924           sum5 -= *v5 * tmp0;
3925         }
3926         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3927         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3928         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3929         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3930         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3931         row -= 5;
3932         break;
3933       default:
3934         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3935       }
3936     }
3937     PetscCall(PetscLogFlops(a->nz));
3938 
3939     /*
3940            t = b - D x    where D is the block diagonal
3941     */
3942     cnt = 0;
3943     for (i = 0, row = 0; i < m; i++) {
3944       switch (sizes[i]) {
3945       case 1:
3946         t[row] = b[row] - bdiag[cnt++] * x[row];
3947         row++;
3948         break;
3949       case 2:
3950         x1         = x[row];
3951         x2         = x[row + 1];
3952         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3953         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3954         t[row]     = b[row] - tmp1;
3955         t[row + 1] = b[row + 1] - tmp2;
3956         row += 2;
3957         cnt += 4;
3958         break;
3959       case 3:
3960         x1         = x[row];
3961         x2         = x[row + 1];
3962         x3         = x[row + 2];
3963         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3964         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3965         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3966         t[row]     = b[row] - tmp1;
3967         t[row + 1] = b[row + 1] - tmp2;
3968         t[row + 2] = b[row + 2] - tmp3;
3969         row += 3;
3970         cnt += 9;
3971         break;
3972       case 4:
3973         x1         = x[row];
3974         x2         = x[row + 1];
3975         x3         = x[row + 2];
3976         x4         = x[row + 3];
3977         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3978         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3979         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3980         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3981         t[row]     = b[row] - tmp1;
3982         t[row + 1] = b[row + 1] - tmp2;
3983         t[row + 2] = b[row + 2] - tmp3;
3984         t[row + 3] = b[row + 3] - tmp4;
3985         row += 4;
3986         cnt += 16;
3987         break;
3988       case 5:
3989         x1         = x[row];
3990         x2         = x[row + 1];
3991         x3         = x[row + 2];
3992         x4         = x[row + 3];
3993         x5         = x[row + 4];
3994         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3995         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3996         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3997         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3998         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3999         t[row]     = b[row] - tmp1;
4000         t[row + 1] = b[row + 1] - tmp2;
4001         t[row + 2] = b[row + 2] - tmp3;
4002         t[row + 3] = b[row + 3] - tmp4;
4003         t[row + 4] = b[row + 4] - tmp5;
4004         row += 5;
4005         cnt += 25;
4006         break;
4007       default:
4008         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4009       }
4010     }
4011     PetscCall(PetscLogFlops(m));
4012 
4013     /*
4014           Apply (L + D)^-1 where D is the block diagonal
4015     */
4016     for (i = 0, row = 0; i < m; i++) {
4017       sz  = diag[row] - ii[row];
4018       v1  = a->a + ii[row];
4019       idx = a->j + ii[row];
4020       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4021       switch (sizes[i]) {
4022       case 1:
4023 
4024         sum1 = t[row];
4025         for (n = 0; n < sz - 1; n += 2) {
4026           i1 = idx[0];
4027           i2 = idx[1];
4028           idx += 2;
4029           tmp0 = t[i1];
4030           tmp1 = t[i2];
4031           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4032           v1 += 2;
4033         }
4034 
4035         if (n == sz - 1) {
4036           tmp0 = t[*idx];
4037           sum1 -= *v1 * tmp0;
4038         }
4039         x[row] += t[row] = sum1 * (*ibdiag++);
4040         row++;
4041         break;
4042       case 2:
4043         v2   = a->a + ii[row + 1];
4044         sum1 = t[row];
4045         sum2 = t[row + 1];
4046         for (n = 0; n < sz - 1; n += 2) {
4047           i1 = idx[0];
4048           i2 = idx[1];
4049           idx += 2;
4050           tmp0 = t[i1];
4051           tmp1 = t[i2];
4052           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4053           v1 += 2;
4054           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4055           v2 += 2;
4056         }
4057 
4058         if (n == sz - 1) {
4059           tmp0 = t[*idx];
4060           sum1 -= v1[0] * tmp0;
4061           sum2 -= v2[0] * tmp0;
4062         }
4063         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4064         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4065         ibdiag += 4;
4066         row += 2;
4067         break;
4068       case 3:
4069         v2   = a->a + ii[row + 1];
4070         v3   = a->a + ii[row + 2];
4071         sum1 = t[row];
4072         sum2 = t[row + 1];
4073         sum3 = t[row + 2];
4074         for (n = 0; n < sz - 1; n += 2) {
4075           i1 = idx[0];
4076           i2 = idx[1];
4077           idx += 2;
4078           tmp0 = t[i1];
4079           tmp1 = t[i2];
4080           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4081           v1 += 2;
4082           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4083           v2 += 2;
4084           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4085           v3 += 2;
4086         }
4087 
4088         if (n == sz - 1) {
4089           tmp0 = t[*idx];
4090           sum1 -= v1[0] * tmp0;
4091           sum2 -= v2[0] * tmp0;
4092           sum3 -= v3[0] * tmp0;
4093         }
4094         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4095         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4096         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4097         ibdiag += 9;
4098         row += 3;
4099         break;
4100       case 4:
4101         v2   = a->a + ii[row + 1];
4102         v3   = a->a + ii[row + 2];
4103         v4   = a->a + ii[row + 3];
4104         sum1 = t[row];
4105         sum2 = t[row + 1];
4106         sum3 = t[row + 2];
4107         sum4 = t[row + 3];
4108         for (n = 0; n < sz - 1; n += 2) {
4109           i1 = idx[0];
4110           i2 = idx[1];
4111           idx += 2;
4112           tmp0 = t[i1];
4113           tmp1 = t[i2];
4114           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4115           v1 += 2;
4116           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4117           v2 += 2;
4118           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4119           v3 += 2;
4120           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4121           v4 += 2;
4122         }
4123 
4124         if (n == sz - 1) {
4125           tmp0 = t[*idx];
4126           sum1 -= v1[0] * tmp0;
4127           sum2 -= v2[0] * tmp0;
4128           sum3 -= v3[0] * tmp0;
4129           sum4 -= v4[0] * tmp0;
4130         }
4131         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4132         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4133         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4134         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4135         ibdiag += 16;
4136         row += 4;
4137         break;
4138       case 5:
4139         v2   = a->a + ii[row + 1];
4140         v3   = a->a + ii[row + 2];
4141         v4   = a->a + ii[row + 3];
4142         v5   = a->a + ii[row + 4];
4143         sum1 = t[row];
4144         sum2 = t[row + 1];
4145         sum3 = t[row + 2];
4146         sum4 = t[row + 3];
4147         sum5 = t[row + 4];
4148         for (n = 0; n < sz - 1; n += 2) {
4149           i1 = idx[0];
4150           i2 = idx[1];
4151           idx += 2;
4152           tmp0 = t[i1];
4153           tmp1 = t[i2];
4154           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4155           v1 += 2;
4156           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4157           v2 += 2;
4158           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4159           v3 += 2;
4160           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4161           v4 += 2;
4162           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4163           v5 += 2;
4164         }
4165 
4166         if (n == sz - 1) {
4167           tmp0 = t[*idx];
4168           sum1 -= v1[0] * tmp0;
4169           sum2 -= v2[0] * tmp0;
4170           sum3 -= v3[0] * tmp0;
4171           sum4 -= v4[0] * tmp0;
4172           sum5 -= v5[0] * tmp0;
4173         }
4174         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4175         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4176         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4177         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4178         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4179         ibdiag += 25;
4180         row += 5;
4181         break;
4182       default:
4183         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4184       }
4185     }
4186     PetscCall(PetscLogFlops(a->nz));
4187   }
4188   PetscCall(VecRestoreArray(xx, &x));
4189   PetscCall(VecRestoreArrayRead(bb, &b));
4190   PetscFunctionReturn(0);
4191 }
4192 
4193 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4194 {
4195   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4196   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4197   const MatScalar   *bdiag = a->inode.bdiag;
4198   const PetscScalar *b;
4199   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
4200   const PetscInt    *sizes = a->inode.size;
4201 
4202   PetscFunctionBegin;
4203   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4204   PetscCall(VecGetArray(xx, &x));
4205   PetscCall(VecGetArrayRead(bb, &b));
4206   cnt = 0;
4207   for (i = 0, row = 0; i < m; i++) {
4208     switch (sizes[i]) {
4209     case 1:
4210       x[row] = b[row] * bdiag[cnt++];
4211       row++;
4212       break;
4213     case 2:
4214       x1       = b[row];
4215       x2       = b[row + 1];
4216       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4217       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4218       x[row++] = tmp1;
4219       x[row++] = tmp2;
4220       cnt += 4;
4221       break;
4222     case 3:
4223       x1       = b[row];
4224       x2       = b[row + 1];
4225       x3       = b[row + 2];
4226       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4227       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4228       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4229       x[row++] = tmp1;
4230       x[row++] = tmp2;
4231       x[row++] = tmp3;
4232       cnt += 9;
4233       break;
4234     case 4:
4235       x1       = b[row];
4236       x2       = b[row + 1];
4237       x3       = b[row + 2];
4238       x4       = b[row + 3];
4239       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4240       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4241       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4242       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4243       x[row++] = tmp1;
4244       x[row++] = tmp2;
4245       x[row++] = tmp3;
4246       x[row++] = tmp4;
4247       cnt += 16;
4248       break;
4249     case 5:
4250       x1       = b[row];
4251       x2       = b[row + 1];
4252       x3       = b[row + 2];
4253       x4       = b[row + 3];
4254       x5       = b[row + 4];
4255       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4256       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4257       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4258       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4259       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4260       x[row++] = tmp1;
4261       x[row++] = tmp2;
4262       x[row++] = tmp3;
4263       x[row++] = tmp4;
4264       x[row++] = tmp5;
4265       cnt += 25;
4266       break;
4267     default:
4268       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4269     }
4270   }
4271   PetscCall(PetscLogFlops(2.0 * cnt));
4272   PetscCall(VecRestoreArray(xx, &x));
4273   PetscCall(VecRestoreArrayRead(bb, &b));
4274   PetscFunctionReturn(0);
4275 }
4276 
4277 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4278 {
4279   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4280 
4281   PetscFunctionBegin;
4282   a->inode.node_count       = 0;
4283   a->inode.use              = PETSC_FALSE;
4284   a->inode.checked          = PETSC_FALSE;
4285   a->inode.mat_nonzerostate = -1;
4286   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4287   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4288   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4289   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4290   A->ops->coloringpatch     = NULL;
4291   A->ops->multdiagonalblock = NULL;
4292   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4293   PetscFunctionReturn(0);
4294 }
4295 
4296 /*
4297     samestructure indicates that the matrix has not changed its nonzero structure so we
4298     do not need to recompute the inodes
4299 */
4300 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4301 {
4302   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4303   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4304   PetscBool       flag;
4305   const PetscInt *idx, *idy, *ii;
4306 
4307   PetscFunctionBegin;
4308   if (!a->inode.use) {
4309     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4310     PetscCall(PetscFree(a->inode.size));
4311     PetscFunctionReturn(0);
4312   }
4313   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4314 
4315   m = A->rmap->n;
4316   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4317   ns = a->inode.size;
4318 
4319   i          = 0;
4320   node_count = 0;
4321   idx        = a->j;
4322   ii         = a->i;
4323   while (i < m) {            /* For each row */
4324     nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4325     /* Limits the number of elements in a node to 'a->inode.limit' */
4326     for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4327       nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4328       if (nzy != nzx) break;
4329       idy += nzx; /* Same nonzero pattern */
4330       PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4331       if (!flag) break;
4332     }
4333     ns[node_count++] = blk_size;
4334     idx += blk_size * nzx;
4335     i = j;
4336   }
4337 
4338   /* If not enough inodes found,, do not use inode version of the routines */
4339   if (!m || node_count > .8 * m) {
4340     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4341     PetscCall(PetscFree(a->inode.size));
4342     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4343   } else {
4344     if (!A->factortype) {
4345       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4346       if (A->rmap->n == A->cmap->n) {
4347         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4348         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4349         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4350         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4351         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4352       }
4353     } else {
4354       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4355     }
4356     a->inode.node_count = node_count;
4357     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4358   }
4359   a->inode.checked          = PETSC_TRUE;
4360   a->inode.mat_nonzerostate = A->nonzerostate;
4361   PetscFunctionReturn(0);
4362 }
4363 
4364 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4365 {
4366   Mat         B = *C;
4367   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4368   PetscInt    m = A->rmap->n;
4369 
4370   PetscFunctionBegin;
4371   c->inode.use              = a->inode.use;
4372   c->inode.limit            = a->inode.limit;
4373   c->inode.max_limit        = a->inode.max_limit;
4374   c->inode.checked          = PETSC_FALSE;
4375   c->inode.size             = NULL;
4376   c->inode.node_count       = 0;
4377   c->inode.ibdiagvalid      = PETSC_FALSE;
4378   c->inode.ibdiag           = NULL;
4379   c->inode.bdiag            = NULL;
4380   c->inode.mat_nonzerostate = -1;
4381   if (a->inode.use) {
4382     if (a->inode.checked && a->inode.size) {
4383       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4384       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4385 
4386       c->inode.checked          = PETSC_TRUE;
4387       c->inode.node_count       = a->inode.node_count;
4388       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4389     }
4390     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4391     if (!B->factortype) {
4392       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4393       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4394       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4395       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4396       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4397       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4398     } else {
4399       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4400     }
4401   }
4402   PetscFunctionReturn(0);
4403 }
4404 
4405 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4406 {
4407   PetscInt        k;
4408   const PetscInt *vi;
4409 
4410   PetscFunctionBegin;
4411   vi = aj + ai[row];
4412   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4413   vi        = aj + adiag[row];
4414   cols[nzl] = vi[0];
4415   vi        = aj + adiag[row + 1] + 1;
4416   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4417   PetscFunctionReturn(0);
4418 }
4419 /*
4420    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4421    Modified from MatSeqAIJCheckInode().
4422 
4423    Input Parameters:
4424 .  Mat A - ILU or LU matrix factor
4425 
4426 */
4427 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4428 {
4429   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4430   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4431   PetscInt       *cols1, *cols2, *ns;
4432   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4433   PetscBool       flag;
4434 
4435   PetscFunctionBegin;
4436   if (!a->inode.use) PetscFunctionReturn(0);
4437   if (a->inode.checked) PetscFunctionReturn(0);
4438 
4439   m = A->rmap->n;
4440   if (a->inode.size) ns = a->inode.size;
4441   else PetscCall(PetscMalloc1(m + 1, &ns));
4442 
4443   i          = 0;
4444   node_count = 0;
4445   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4446   while (i < m) {                       /* For each row */
4447     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4448     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4449     nzx  = nzl1 + nzu1 + 1;
4450     MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i);
4451 
4452     /* Limits the number of elements in a node to 'a->inode.limit' */
4453     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4454       nzl2 = ai[j + 1] - ai[j];
4455       nzu2 = adiag[j] - adiag[j + 1] - 1;
4456       nzy  = nzl2 + nzu2 + 1;
4457       if (nzy != nzx) break;
4458       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4459       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4460       if (!flag) break;
4461     }
4462     ns[node_count++] = blk_size;
4463     i                = j;
4464   }
4465   PetscCall(PetscFree2(cols1, cols2));
4466   /* If not enough inodes found,, do not use inode version of the routines */
4467   if (!m || node_count > .8 * m) {
4468     PetscCall(PetscFree(ns));
4469 
4470     a->inode.node_count = 0;
4471     a->inode.size       = NULL;
4472     a->inode.use        = PETSC_FALSE;
4473 
4474     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4475   } else {
4476     A->ops->mult              = NULL;
4477     A->ops->sor               = NULL;
4478     A->ops->multadd           = NULL;
4479     A->ops->getrowij          = NULL;
4480     A->ops->restorerowij      = NULL;
4481     A->ops->getcolumnij       = NULL;
4482     A->ops->restorecolumnij   = NULL;
4483     A->ops->coloringpatch     = NULL;
4484     A->ops->multdiagonalblock = NULL;
4485     a->inode.node_count       = node_count;
4486     a->inode.size             = ns;
4487 
4488     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4489   }
4490   a->inode.checked = PETSC_TRUE;
4491   PetscFunctionReturn(0);
4492 }
4493 
4494 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4495 {
4496   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4497 
4498   PetscFunctionBegin;
4499   a->inode.ibdiagvalid = PETSC_FALSE;
4500   PetscFunctionReturn(0);
4501 }
4502 
4503 /*
4504      This is really ugly. if inodes are used this replaces the
4505   permutations with ones that correspond to rows/cols of the matrix
4506   rather then inode blocks
4507 */
4508 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4509 {
4510   PetscFunctionBegin;
4511   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4512   PetscFunctionReturn(0);
4513 }
4514 
4515 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4516 {
4517   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4518   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4519   const PetscInt *ridx, *cidx;
4520   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4521   PetscInt        nslim_col, *ns_col;
4522   IS              ris = *rperm, cis = *cperm;
4523 
4524   PetscFunctionBegin;
4525   if (!a->inode.size) PetscFunctionReturn(0);           /* no inodes so return */
4526   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4527 
4528   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4529   PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
4530   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4531 
4532   PetscCall(ISGetIndices(ris, &ridx));
4533   PetscCall(ISGetIndices(cis, &cidx));
4534 
4535   /* Form the inode structure for the rows of permuted matric using inv perm*/
4536   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
4537 
4538   /* Construct the permutations for rows*/
4539   for (i = 0, row = 0; i < nslim_row; ++i) {
4540     indx      = ridx[i];
4541     start_val = tns[indx];
4542     end_val   = tns[indx + 1];
4543     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4544   }
4545 
4546   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4547   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
4548 
4549   /* Construct permutations for columns */
4550   for (i = 0, col = 0; i < nslim_col; ++i) {
4551     indx      = cidx[i];
4552     start_val = tns[indx];
4553     end_val   = tns[indx + 1];
4554     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4555   }
4556 
4557   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4558   PetscCall(ISSetPermutation(*rperm));
4559   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4560   PetscCall(ISSetPermutation(*cperm));
4561 
4562   PetscCall(ISRestoreIndices(ris, &ridx));
4563   PetscCall(ISRestoreIndices(cis, &cidx));
4564 
4565   PetscCall(PetscFree(ns_col));
4566   PetscCall(PetscFree2(permr, permc));
4567   PetscCall(ISDestroy(&cis));
4568   PetscCall(ISDestroy(&ris));
4569   PetscCall(PetscFree(tns));
4570   PetscFunctionReturn(0);
4571 }
4572 
4573 /*@C
4574    MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4575 
4576    Not Collective
4577 
4578    Input Parameter:
4579 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4580 
4581    Output Parameters:
4582 +  node_count - no of inodes present in the matrix.
4583 .  sizes      - an array of size node_count,with sizes of each inode.
4584 -  limit      - the max size used to generate the inodes.
4585 
4586    Level: advanced
4587 
4588    Note:
4589     This routine returns some internal storage information
4590    of the matrix, it is intended to be used by advanced users.
4591    It should be called after the matrix is assembled.
4592    The contents of the sizes[] array should not be changed.
4593    NULL may be passed for information not requested.
4594 
4595 .seealso: `MatGetInfo()`
4596 @*/
4597 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4598 {
4599   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4600 
4601   PetscFunctionBegin;
4602   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4603   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4604   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4605   PetscFunctionReturn(0);
4606 }
4607 
4608 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4609 {
4610   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4611 
4612   PetscFunctionBegin;
4613   if (node_count) *node_count = a->inode.node_count;
4614   if (sizes) *sizes = a->inode.size;
4615   if (limit) *limit = a->inode.limit;
4616   PetscFunctionReturn(0);
4617 }
4618