xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 66af8762ec03dbef0e079729eb2a1734a35ed7ff)
1 /*
2   This file provides high performance routines for the Inode format (compressed sparse row)
3   by taking advantage of rows with identical nonzero structure (I-nodes).
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #if defined(PETSC_HAVE_XMMINTRIN_H)
7   #include <xmmintrin.h>
8 #endif
9 
10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11 {
12   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
14 
15   PetscFunctionBegin;
16   n = A->cmap->n;
17   m = A->rmap->n;
18   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
24       ;
25     for (; count + 1 < n; count++, i++)
26       ;
27     if (count < n) i++;
28     *size = i;
29     PetscFunctionReturn(PETSC_SUCCESS);
30   }
31   PetscCall(PetscMalloc1(n + 1, &ns_col));
32 
33   /* Use the same row structure wherever feasible. */
34   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
35 
36   /* if m < n; pad up the remainder with inode_limit */
37   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
38   /* The last node is the odd ball. pad it up with the remaining rows; */
39   if (count < n) {
40     ns_col[i] = n - count;
41     i++;
42   } else if (count > n) {
43     /* Adjust for the over estimation */
44     ns_col[i - 1] += n - count;
45   }
46   *size = i;
47   *ns   = ns_col;
48   PetscFunctionReturn(PETSC_SUCCESS);
49 }
50 
51 /*
52       This builds symmetric version of nonzero structure,
53 */
54 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
55 {
56   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
57   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
58   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
59   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
60 
61   PetscFunctionBegin;
62   nslim_row = a->inode.node_count;
63   m         = A->rmap->n;
64   n         = A->cmap->n;
65   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
66   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
67 
68   /* Use the row_inode as column_inode */
69   nslim_col = nslim_row;
70   ns_col    = ns_row;
71 
72   /* allocate space for reformatted inode structure */
73   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
74   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
75 
76   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
77     nsz = ns_col[i1];
78     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
79   }
80   /* allocate space for row pointers */
81   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
82   *iia = ia;
83   PetscCall(PetscMalloc1(nslim_row + 1, &work));
84 
85   /* determine the number of columns in each row */
86   ia[0] = oshift;
87   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
88     j    = aj + ai[row] + ishift;
89     jmax = aj + ai[row + 1] + ishift;
90     if (j == jmax) continue; /* empty row */
91     col = *j++ + ishift;
92     i2  = tvc[col];
93     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94       ia[i1 + 1]++;
95       ia[i2 + 1]++;
96       i2++; /* Start col of next node */
97       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98       i2 = tvc[col];
99     }
100     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101   }
102 
103   /* shift ia[i] to point to next row */
104   for (i1 = 1; i1 < nslim_row + 1; i1++) {
105     row = ia[i1 - 1];
106     ia[i1] += row;
107     work[i1 - 1] = row - oshift;
108   }
109 
110   /* allocate space for column pointers */
111   nz = ia[nslim_row] + (!ishift);
112   PetscCall(PetscMalloc1(nz, &ja));
113   *jja = ja;
114 
115   /* loop over lower triangular part putting into ja */
116   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
117     j    = aj + ai[row] + ishift;
118     jmax = aj + ai[row + 1] + ishift;
119     if (j == jmax) continue; /* empty row */
120     col = *j++ + ishift;
121     i2  = tvc[col];
122     while (i2 < i1 && j < jmax) {
123       ja[work[i2]++] = i1 + oshift;
124       ja[work[i1]++] = i2 + oshift;
125       ++i2;
126       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
127       i2 = tvc[col];
128     }
129     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
130   }
131   PetscCall(PetscFree(work));
132   PetscCall(PetscFree2(tns, tvc));
133   PetscFunctionReturn(PETSC_SUCCESS);
134 }
135 
136 /*
137       This builds nonsymmetric version of nonzero structure,
138 */
139 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
140 {
141   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
142   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
143   PetscInt       *tns, *tvc, nsz, i1, i2;
144   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
145 
146   PetscFunctionBegin;
147   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
148   nslim_row = a->inode.node_count;
149   n         = A->cmap->n;
150 
151   /* Create The column_inode for this matrix */
152   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
153 
154   /* allocate space for reformatted column_inode structure */
155   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
156   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
157 
158   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
159     nsz = ns_col[i1];
160     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
161   }
162   /* allocate space for row pointers */
163   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
164   *iia = ia;
165   PetscCall(PetscMalloc1(nslim_row + 1, &work));
166 
167   /* determine the number of columns in each row */
168   ia[0] = oshift;
169   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
170     j  = aj + ai[row] + ishift;
171     nz = ai[row + 1] - ai[row];
172     if (!nz) continue; /* empty row */
173     col = *j++ + ishift;
174     i2  = tvc[col];
175     while (nz-- > 0) { /* off-diagonal elements */
176       ia[i1 + 1]++;
177       i2++; /* Start col of next node */
178       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
179       if (nz > 0) i2 = tvc[col];
180     }
181   }
182 
183   /* shift ia[i] to point to next row */
184   for (i1 = 1; i1 < nslim_row + 1; i1++) {
185     row = ia[i1 - 1];
186     ia[i1] += row;
187     work[i1 - 1] = row - oshift;
188   }
189 
190   /* allocate space for column pointers */
191   nz = ia[nslim_row] + (!ishift);
192   PetscCall(PetscMalloc1(nz, &ja));
193   *jja = ja;
194 
195   /* loop over matrix putting into ja */
196   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
197     j  = aj + ai[row] + ishift;
198     nz = ai[row + 1] - ai[row];
199     if (!nz) continue; /* empty row */
200     col = *j++ + ishift;
201     i2  = tvc[col];
202     while (nz-- > 0) {
203       ja[work[i1]++] = i2 + oshift;
204       ++i2;
205       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
206       if (nz > 0) i2 = tvc[col];
207     }
208   }
209   PetscCall(PetscFree(ns_col));
210   PetscCall(PetscFree(work));
211   PetscCall(PetscFree2(tns, tvc));
212   PetscFunctionReturn(PETSC_SUCCESS);
213 }
214 
215 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
216 {
217   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
218 
219   PetscFunctionBegin;
220   if (n) *n = a->inode.node_count;
221   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
222   if (!blockcompressed) {
223     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
224   } else if (symmetric) {
225     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
226   } else {
227     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
228   }
229   PetscFunctionReturn(PETSC_SUCCESS);
230 }
231 
232 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
233 {
234   PetscFunctionBegin;
235   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
236 
237   if (!blockcompressed) {
238     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
239   } else {
240     PetscCall(PetscFree(*ia));
241     PetscCall(PetscFree(*ja));
242   }
243   PetscFunctionReturn(PETSC_SUCCESS);
244 }
245 
246 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
247 {
248   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
249   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
250   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
251 
252   PetscFunctionBegin;
253   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
254   nslim_row = a->inode.node_count;
255   n         = A->cmap->n;
256 
257   /* Create The column_inode for this matrix */
258   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
259 
260   /* allocate space for reformatted column_inode structure */
261   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
262   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
263 
264   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
265     nsz = ns_col[i1];
266     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
267   }
268   /* allocate space for column pointers */
269   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
270   *iia = ia;
271   PetscCall(PetscMalloc1(nslim_col + 1, &work));
272 
273   /* determine the number of columns in each row */
274   ia[0] = oshift;
275   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
276     j   = aj + ai[row] + ishift;
277     col = *j++ + ishift;
278     i2  = tvc[col];
279     nz  = ai[row + 1] - ai[row];
280     while (nz-- > 0) { /* off-diagonal elements */
281       /* ia[i1+1]++; */
282       ia[i2 + 1]++;
283       i2++;
284       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
285       if (nz > 0) i2 = tvc[col];
286     }
287   }
288 
289   /* shift ia[i] to point to next col */
290   for (i1 = 1; i1 < nslim_col + 1; i1++) {
291     col = ia[i1 - 1];
292     ia[i1] += col;
293     work[i1 - 1] = col - oshift;
294   }
295 
296   /* allocate space for column pointers */
297   nz = ia[nslim_col] + (!ishift);
298   PetscCall(PetscMalloc1(nz, &ja));
299   *jja = ja;
300 
301   /* loop over matrix putting into ja */
302   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
303     j   = aj + ai[row] + ishift;
304     col = *j++ + ishift;
305     i2  = tvc[col];
306     nz  = ai[row + 1] - ai[row];
307     while (nz-- > 0) {
308       /* ja[work[i1]++] = i2 + oshift; */
309       ja[work[i2]++] = i1 + oshift;
310       i2++;
311       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
312       if (nz > 0) i2 = tvc[col];
313     }
314   }
315   PetscCall(PetscFree(ns_col));
316   PetscCall(PetscFree(work));
317   PetscCall(PetscFree2(tns, tvc));
318   PetscFunctionReturn(PETSC_SUCCESS);
319 }
320 
321 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
322 {
323   PetscFunctionBegin;
324   PetscCall(MatCreateColInode_Private(A, n, NULL));
325   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
326 
327   if (!blockcompressed) {
328     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
329   } else if (symmetric) {
330     /* Since the indices are symmetric it doesn't matter */
331     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
332   } else {
333     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
334   }
335   PetscFunctionReturn(PETSC_SUCCESS);
336 }
337 
338 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
339 {
340   PetscFunctionBegin;
341   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
342   if (!blockcompressed) {
343     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
344   } else {
345     PetscCall(PetscFree(*ia));
346     PetscCall(PetscFree(*ja));
347   }
348   PetscFunctionReturn(PETSC_SUCCESS);
349 }
350 
351 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
352 {
353   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
354   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
355   PetscScalar       *y;
356   const PetscScalar *x;
357   const MatScalar   *v1, *v2, *v3, *v4, *v5;
358   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
359   const PetscInt    *idx, *ns, *ii;
360 
361 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
362   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
363 #endif
364 
365   PetscFunctionBegin;
366   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
367   node_max = a->inode.node_count;
368   ns       = a->inode.size; /* Node Size array */
369   PetscCall(VecGetArrayRead(xx, &x));
370   PetscCall(VecGetArray(yy, &y));
371   idx = a->j;
372   v1  = a->a;
373   ii  = a->i;
374 
375   for (i = 0, row = 0; i < node_max; ++i) {
376     nsz = ns[i];
377     n   = ii[1] - ii[0];
378     nonzerorow += (n > 0) * nsz;
379     ii += nsz;
380     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
381     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
382     sz = n;                                                                /* No of non zeros in this row */
383                                                                            /* Switch on the size of Node */
384     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
385     case 1:
386       sum1 = 0.;
387 
388       for (n = 0; n < sz - 1; n += 2) {
389         i1 = idx[0]; /* The instructions are ordered to */
390         i2 = idx[1]; /* make the compiler's job easy */
391         idx += 2;
392         tmp0 = x[i1];
393         tmp1 = x[i2];
394         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
395         v1 += 2;
396       }
397 
398       if (n == sz - 1) { /* Take care of the last nonzero  */
399         tmp0 = x[*idx++];
400         sum1 += *v1++ * tmp0;
401       }
402       y[row++] = sum1;
403       break;
404     case 2:
405       sum1 = 0.;
406       sum2 = 0.;
407       v2   = v1 + n;
408 
409       for (n = 0; n < sz - 1; n += 2) {
410         i1 = idx[0];
411         i2 = idx[1];
412         idx += 2;
413         tmp0 = x[i1];
414         tmp1 = x[i2];
415         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
416         v1 += 2;
417         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
418         v2 += 2;
419       }
420       if (n == sz - 1) {
421         tmp0 = x[*idx++];
422         sum1 += *v1++ * tmp0;
423         sum2 += *v2++ * tmp0;
424       }
425       y[row++] = sum1;
426       y[row++] = sum2;
427       v1       = v2; /* Since the next block to be processed starts there*/
428       idx += sz;
429       break;
430     case 3:
431       sum1 = 0.;
432       sum2 = 0.;
433       sum3 = 0.;
434       v2   = v1 + n;
435       v3   = v2 + n;
436 
437       for (n = 0; n < sz - 1; n += 2) {
438         i1 = idx[0];
439         i2 = idx[1];
440         idx += 2;
441         tmp0 = x[i1];
442         tmp1 = x[i2];
443         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
444         v1 += 2;
445         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
446         v2 += 2;
447         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
448         v3 += 2;
449       }
450       if (n == sz - 1) {
451         tmp0 = x[*idx++];
452         sum1 += *v1++ * tmp0;
453         sum2 += *v2++ * tmp0;
454         sum3 += *v3++ * tmp0;
455       }
456       y[row++] = sum1;
457       y[row++] = sum2;
458       y[row++] = sum3;
459       v1       = v3; /* Since the next block to be processed starts there*/
460       idx += 2 * sz;
461       break;
462     case 4:
463       sum1 = 0.;
464       sum2 = 0.;
465       sum3 = 0.;
466       sum4 = 0.;
467       v2   = v1 + n;
468       v3   = v2 + n;
469       v4   = v3 + n;
470 
471       for (n = 0; n < sz - 1; n += 2) {
472         i1 = idx[0];
473         i2 = idx[1];
474         idx += 2;
475         tmp0 = x[i1];
476         tmp1 = x[i2];
477         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
478         v1 += 2;
479         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
480         v2 += 2;
481         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
482         v3 += 2;
483         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
484         v4 += 2;
485       }
486       if (n == sz - 1) {
487         tmp0 = x[*idx++];
488         sum1 += *v1++ * tmp0;
489         sum2 += *v2++ * tmp0;
490         sum3 += *v3++ * tmp0;
491         sum4 += *v4++ * tmp0;
492       }
493       y[row++] = sum1;
494       y[row++] = sum2;
495       y[row++] = sum3;
496       y[row++] = sum4;
497       v1       = v4; /* Since the next block to be processed starts there*/
498       idx += 3 * sz;
499       break;
500     case 5:
501       sum1 = 0.;
502       sum2 = 0.;
503       sum3 = 0.;
504       sum4 = 0.;
505       sum5 = 0.;
506       v2   = v1 + n;
507       v3   = v2 + n;
508       v4   = v3 + n;
509       v5   = v4 + n;
510 
511       for (n = 0; n < sz - 1; n += 2) {
512         i1 = idx[0];
513         i2 = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
518         v1 += 2;
519         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
520         v2 += 2;
521         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
522         v3 += 2;
523         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
524         v4 += 2;
525         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
526         v5 += 2;
527       }
528       if (n == sz - 1) {
529         tmp0 = x[*idx++];
530         sum1 += *v1++ * tmp0;
531         sum2 += *v2++ * tmp0;
532         sum3 += *v3++ * tmp0;
533         sum4 += *v4++ * tmp0;
534         sum5 += *v5++ * tmp0;
535       }
536       y[row++] = sum1;
537       y[row++] = sum2;
538       y[row++] = sum3;
539       y[row++] = sum4;
540       y[row++] = sum5;
541       v1       = v5; /* Since the next block to be processed starts there */
542       idx += 4 * sz;
543       break;
544     default:
545       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
546     }
547   }
548   PetscCall(VecRestoreArrayRead(xx, &x));
549   PetscCall(VecRestoreArray(yy, &y));
550   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
551   PetscFunctionReturn(PETSC_SUCCESS);
552 }
553 
554 /* Almost same code as the MatMult_SeqAIJ_Inode() */
555 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
556 {
557   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
558   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
559   const MatScalar   *v1, *v2, *v3, *v4, *v5;
560   const PetscScalar *x;
561   PetscScalar       *y, *z, *zt;
562   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
563   const PetscInt    *idx, *ns, *ii;
564 
565   PetscFunctionBegin;
566   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
567   node_max = a->inode.node_count;
568   ns       = a->inode.size; /* Node Size array */
569 
570   PetscCall(VecGetArrayRead(xx, &x));
571   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
572   zt = z;
573 
574   idx = a->j;
575   v1  = a->a;
576   ii  = a->i;
577 
578   for (i = 0, row = 0; i < node_max; ++i) {
579     nsz = ns[i];
580     n   = ii[1] - ii[0];
581     ii += nsz;
582     sz = n;        /* No of non zeros in this row */
583                    /* Switch on the size of Node */
584     switch (nsz) { /* Each loop in 'case' is unrolled */
585     case 1:
586       sum1 = *zt++;
587 
588       for (n = 0; n < sz - 1; n += 2) {
589         i1 = idx[0]; /* The instructions are ordered to */
590         i2 = idx[1]; /* make the compiler's job easy */
591         idx += 2;
592         tmp0 = x[i1];
593         tmp1 = x[i2];
594         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
595         v1 += 2;
596       }
597 
598       if (n == sz - 1) { /* Take care of the last nonzero  */
599         tmp0 = x[*idx++];
600         sum1 += *v1++ * tmp0;
601       }
602       y[row++] = sum1;
603       break;
604     case 2:
605       sum1 = *zt++;
606       sum2 = *zt++;
607       v2   = v1 + n;
608 
609       for (n = 0; n < sz - 1; n += 2) {
610         i1 = idx[0];
611         i2 = idx[1];
612         idx += 2;
613         tmp0 = x[i1];
614         tmp1 = x[i2];
615         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
616         v1 += 2;
617         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
618         v2 += 2;
619       }
620       if (n == sz - 1) {
621         tmp0 = x[*idx++];
622         sum1 += *v1++ * tmp0;
623         sum2 += *v2++ * tmp0;
624       }
625       y[row++] = sum1;
626       y[row++] = sum2;
627       v1       = v2; /* Since the next block to be processed starts there*/
628       idx += sz;
629       break;
630     case 3:
631       sum1 = *zt++;
632       sum2 = *zt++;
633       sum3 = *zt++;
634       v2   = v1 + n;
635       v3   = v2 + n;
636 
637       for (n = 0; n < sz - 1; n += 2) {
638         i1 = idx[0];
639         i2 = idx[1];
640         idx += 2;
641         tmp0 = x[i1];
642         tmp1 = x[i2];
643         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
644         v1 += 2;
645         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
646         v2 += 2;
647         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
648         v3 += 2;
649       }
650       if (n == sz - 1) {
651         tmp0 = x[*idx++];
652         sum1 += *v1++ * tmp0;
653         sum2 += *v2++ * tmp0;
654         sum3 += *v3++ * tmp0;
655       }
656       y[row++] = sum1;
657       y[row++] = sum2;
658       y[row++] = sum3;
659       v1       = v3; /* Since the next block to be processed starts there*/
660       idx += 2 * sz;
661       break;
662     case 4:
663       sum1 = *zt++;
664       sum2 = *zt++;
665       sum3 = *zt++;
666       sum4 = *zt++;
667       v2   = v1 + n;
668       v3   = v2 + n;
669       v4   = v3 + n;
670 
671       for (n = 0; n < sz - 1; n += 2) {
672         i1 = idx[0];
673         i2 = idx[1];
674         idx += 2;
675         tmp0 = x[i1];
676         tmp1 = x[i2];
677         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
678         v1 += 2;
679         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
680         v2 += 2;
681         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
682         v3 += 2;
683         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
684         v4 += 2;
685       }
686       if (n == sz - 1) {
687         tmp0 = x[*idx++];
688         sum1 += *v1++ * tmp0;
689         sum2 += *v2++ * tmp0;
690         sum3 += *v3++ * tmp0;
691         sum4 += *v4++ * tmp0;
692       }
693       y[row++] = sum1;
694       y[row++] = sum2;
695       y[row++] = sum3;
696       y[row++] = sum4;
697       v1       = v4; /* Since the next block to be processed starts there*/
698       idx += 3 * sz;
699       break;
700     case 5:
701       sum1 = *zt++;
702       sum2 = *zt++;
703       sum3 = *zt++;
704       sum4 = *zt++;
705       sum5 = *zt++;
706       v2   = v1 + n;
707       v3   = v2 + n;
708       v4   = v3 + n;
709       v5   = v4 + n;
710 
711       for (n = 0; n < sz - 1; n += 2) {
712         i1 = idx[0];
713         i2 = idx[1];
714         idx += 2;
715         tmp0 = x[i1];
716         tmp1 = x[i2];
717         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
718         v1 += 2;
719         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
720         v2 += 2;
721         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
722         v3 += 2;
723         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
724         v4 += 2;
725         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
726         v5 += 2;
727       }
728       if (n == sz - 1) {
729         tmp0 = x[*idx++];
730         sum1 += *v1++ * tmp0;
731         sum2 += *v2++ * tmp0;
732         sum3 += *v3++ * tmp0;
733         sum4 += *v4++ * tmp0;
734         sum5 += *v5++ * tmp0;
735       }
736       y[row++] = sum1;
737       y[row++] = sum2;
738       y[row++] = sum3;
739       y[row++] = sum4;
740       y[row++] = sum5;
741       v1       = v5; /* Since the next block to be processed starts there */
742       idx += 4 * sz;
743       break;
744     default:
745       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
746     }
747   }
748   PetscCall(VecRestoreArrayRead(xx, &x));
749   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
750   PetscCall(PetscLogFlops(2.0 * a->nz));
751   PetscFunctionReturn(PETSC_SUCCESS);
752 }
753 
754 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
755 {
756   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
757   IS                 iscol = a->col, isrow = a->row;
758   const PetscInt    *r, *c, *rout, *cout;
759   PetscInt           i, j, n = A->rmap->n, nz;
760   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
761   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
762   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
763   PetscScalar        sum1, sum2, sum3, sum4, sum5;
764   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
765   const PetscScalar *b;
766 
767   PetscFunctionBegin;
768   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
769   node_max = a->inode.node_count;
770   ns       = a->inode.size; /* Node Size array */
771 
772   PetscCall(VecGetArrayRead(bb, &b));
773   PetscCall(VecGetArrayWrite(xx, &x));
774   tmp = a->solve_work;
775 
776   PetscCall(ISGetIndices(isrow, &rout));
777   r = rout;
778   PetscCall(ISGetIndices(iscol, &cout));
779   c = cout + (n - 1);
780 
781   /* forward solve the lower triangular */
782   tmps = tmp;
783   aa   = a_a;
784   aj   = a_j;
785   ad   = a->diag;
786 
787   for (i = 0, row = 0; i < node_max; ++i) {
788     nsz = ns[i];
789     aii = ai[row];
790     v1  = aa + aii;
791     vi  = aj + aii;
792     nz  = ad[row] - aii;
793     if (i < node_max - 1) {
794       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
795       * but our indexing to determine its size could. */
796       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
797       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
798       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
799       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
800     }
801 
802     switch (nsz) { /* Each loop in 'case' is unrolled */
803     case 1:
804       sum1 = b[*r++];
805       for (j = 0; j < nz - 1; j += 2) {
806         i0 = vi[0];
807         i1 = vi[1];
808         vi += 2;
809         tmp0 = tmps[i0];
810         tmp1 = tmps[i1];
811         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
812         v1 += 2;
813       }
814       if (j == nz - 1) {
815         tmp0 = tmps[*vi++];
816         sum1 -= *v1++ * tmp0;
817       }
818       tmp[row++] = sum1;
819       break;
820     case 2:
821       sum1 = b[*r++];
822       sum2 = b[*r++];
823       v2   = aa + ai[row + 1];
824 
825       for (j = 0; j < nz - 1; j += 2) {
826         i0 = vi[0];
827         i1 = vi[1];
828         vi += 2;
829         tmp0 = tmps[i0];
830         tmp1 = tmps[i1];
831         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
832         v1 += 2;
833         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
834         v2 += 2;
835       }
836       if (j == nz - 1) {
837         tmp0 = tmps[*vi++];
838         sum1 -= *v1++ * tmp0;
839         sum2 -= *v2++ * tmp0;
840       }
841       sum2 -= *v2++ * sum1;
842       tmp[row++] = sum1;
843       tmp[row++] = sum2;
844       break;
845     case 3:
846       sum1 = b[*r++];
847       sum2 = b[*r++];
848       sum3 = b[*r++];
849       v2   = aa + ai[row + 1];
850       v3   = aa + ai[row + 2];
851 
852       for (j = 0; j < nz - 1; j += 2) {
853         i0 = vi[0];
854         i1 = vi[1];
855         vi += 2;
856         tmp0 = tmps[i0];
857         tmp1 = tmps[i1];
858         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
859         v1 += 2;
860         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
861         v2 += 2;
862         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
863         v3 += 2;
864       }
865       if (j == nz - 1) {
866         tmp0 = tmps[*vi++];
867         sum1 -= *v1++ * tmp0;
868         sum2 -= *v2++ * tmp0;
869         sum3 -= *v3++ * tmp0;
870       }
871       sum2 -= *v2++ * sum1;
872       sum3 -= *v3++ * sum1;
873       sum3 -= *v3++ * sum2;
874 
875       tmp[row++] = sum1;
876       tmp[row++] = sum2;
877       tmp[row++] = sum3;
878       break;
879 
880     case 4:
881       sum1 = b[*r++];
882       sum2 = b[*r++];
883       sum3 = b[*r++];
884       sum4 = b[*r++];
885       v2   = aa + ai[row + 1];
886       v3   = aa + ai[row + 2];
887       v4   = aa + ai[row + 3];
888 
889       for (j = 0; j < nz - 1; j += 2) {
890         i0 = vi[0];
891         i1 = vi[1];
892         vi += 2;
893         tmp0 = tmps[i0];
894         tmp1 = tmps[i1];
895         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
896         v1 += 2;
897         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
898         v2 += 2;
899         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
900         v3 += 2;
901         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
902         v4 += 2;
903       }
904       if (j == nz - 1) {
905         tmp0 = tmps[*vi++];
906         sum1 -= *v1++ * tmp0;
907         sum2 -= *v2++ * tmp0;
908         sum3 -= *v3++ * tmp0;
909         sum4 -= *v4++ * tmp0;
910       }
911       sum2 -= *v2++ * sum1;
912       sum3 -= *v3++ * sum1;
913       sum4 -= *v4++ * sum1;
914       sum3 -= *v3++ * sum2;
915       sum4 -= *v4++ * sum2;
916       sum4 -= *v4++ * sum3;
917 
918       tmp[row++] = sum1;
919       tmp[row++] = sum2;
920       tmp[row++] = sum3;
921       tmp[row++] = sum4;
922       break;
923     case 5:
924       sum1 = b[*r++];
925       sum2 = b[*r++];
926       sum3 = b[*r++];
927       sum4 = b[*r++];
928       sum5 = b[*r++];
929       v2   = aa + ai[row + 1];
930       v3   = aa + ai[row + 2];
931       v4   = aa + ai[row + 3];
932       v5   = aa + ai[row + 4];
933 
934       for (j = 0; j < nz - 1; j += 2) {
935         i0 = vi[0];
936         i1 = vi[1];
937         vi += 2;
938         tmp0 = tmps[i0];
939         tmp1 = tmps[i1];
940         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
941         v1 += 2;
942         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
943         v2 += 2;
944         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
945         v3 += 2;
946         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
947         v4 += 2;
948         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
949         v5 += 2;
950       }
951       if (j == nz - 1) {
952         tmp0 = tmps[*vi++];
953         sum1 -= *v1++ * tmp0;
954         sum2 -= *v2++ * tmp0;
955         sum3 -= *v3++ * tmp0;
956         sum4 -= *v4++ * tmp0;
957         sum5 -= *v5++ * tmp0;
958       }
959 
960       sum2 -= *v2++ * sum1;
961       sum3 -= *v3++ * sum1;
962       sum4 -= *v4++ * sum1;
963       sum5 -= *v5++ * sum1;
964       sum3 -= *v3++ * sum2;
965       sum4 -= *v4++ * sum2;
966       sum5 -= *v5++ * sum2;
967       sum4 -= *v4++ * sum3;
968       sum5 -= *v5++ * sum3;
969       sum5 -= *v5++ * sum4;
970 
971       tmp[row++] = sum1;
972       tmp[row++] = sum2;
973       tmp[row++] = sum3;
974       tmp[row++] = sum4;
975       tmp[row++] = sum5;
976       break;
977     default:
978       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
979     }
980   }
981   /* backward solve the upper triangular */
982   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
983     nsz = ns[i];
984     aii = ai[row + 1] - 1;
985     v1  = aa + aii;
986     vi  = aj + aii;
987     nz  = aii - ad[row];
988     switch (nsz) { /* Each loop in 'case' is unrolled */
989     case 1:
990       sum1 = tmp[row];
991 
992       for (j = nz; j > 1; j -= 2) {
993         vi -= 2;
994         i0   = vi[2];
995         i1   = vi[1];
996         tmp0 = tmps[i0];
997         tmp1 = tmps[i1];
998         v1 -= 2;
999         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000       }
1001       if (j == 1) {
1002         tmp0 = tmps[*vi--];
1003         sum1 -= *v1-- * tmp0;
1004       }
1005       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1006       row--;
1007       break;
1008     case 2:
1009       sum1 = tmp[row];
1010       sum2 = tmp[row - 1];
1011       v2   = aa + ai[row] - 1;
1012       for (j = nz; j > 1; j -= 2) {
1013         vi -= 2;
1014         i0   = vi[2];
1015         i1   = vi[1];
1016         tmp0 = tmps[i0];
1017         tmp1 = tmps[i1];
1018         v1 -= 2;
1019         v2 -= 2;
1020         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1021         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1022       }
1023       if (j == 1) {
1024         tmp0 = tmps[*vi--];
1025         sum1 -= *v1-- * tmp0;
1026         sum2 -= *v2-- * tmp0;
1027       }
1028 
1029       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1030       row--;
1031       sum2 -= *v2-- * tmp0;
1032       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1033       row--;
1034       break;
1035     case 3:
1036       sum1 = tmp[row];
1037       sum2 = tmp[row - 1];
1038       sum3 = tmp[row - 2];
1039       v2   = aa + ai[row] - 1;
1040       v3   = aa + ai[row - 1] - 1;
1041       for (j = nz; j > 1; j -= 2) {
1042         vi -= 2;
1043         i0   = vi[2];
1044         i1   = vi[1];
1045         tmp0 = tmps[i0];
1046         tmp1 = tmps[i1];
1047         v1 -= 2;
1048         v2 -= 2;
1049         v3 -= 2;
1050         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1051         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1052         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1053       }
1054       if (j == 1) {
1055         tmp0 = tmps[*vi--];
1056         sum1 -= *v1-- * tmp0;
1057         sum2 -= *v2-- * tmp0;
1058         sum3 -= *v3-- * tmp0;
1059       }
1060       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1061       row--;
1062       sum2 -= *v2-- * tmp0;
1063       sum3 -= *v3-- * tmp0;
1064       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1065       row--;
1066       sum3 -= *v3-- * tmp0;
1067       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1068       row--;
1069 
1070       break;
1071     case 4:
1072       sum1 = tmp[row];
1073       sum2 = tmp[row - 1];
1074       sum3 = tmp[row - 2];
1075       sum4 = tmp[row - 3];
1076       v2   = aa + ai[row] - 1;
1077       v3   = aa + ai[row - 1] - 1;
1078       v4   = aa + ai[row - 2] - 1;
1079 
1080       for (j = nz; j > 1; j -= 2) {
1081         vi -= 2;
1082         i0   = vi[2];
1083         i1   = vi[1];
1084         tmp0 = tmps[i0];
1085         tmp1 = tmps[i1];
1086         v1 -= 2;
1087         v2 -= 2;
1088         v3 -= 2;
1089         v4 -= 2;
1090         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1091         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1092         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1093         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1094       }
1095       if (j == 1) {
1096         tmp0 = tmps[*vi--];
1097         sum1 -= *v1-- * tmp0;
1098         sum2 -= *v2-- * tmp0;
1099         sum3 -= *v3-- * tmp0;
1100         sum4 -= *v4-- * tmp0;
1101       }
1102 
1103       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1104       row--;
1105       sum2 -= *v2-- * tmp0;
1106       sum3 -= *v3-- * tmp0;
1107       sum4 -= *v4-- * tmp0;
1108       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1109       row--;
1110       sum3 -= *v3-- * tmp0;
1111       sum4 -= *v4-- * tmp0;
1112       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1113       row--;
1114       sum4 -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1116       row--;
1117       break;
1118     case 5:
1119       sum1 = tmp[row];
1120       sum2 = tmp[row - 1];
1121       sum3 = tmp[row - 2];
1122       sum4 = tmp[row - 3];
1123       sum5 = tmp[row - 4];
1124       v2   = aa + ai[row] - 1;
1125       v3   = aa + ai[row - 1] - 1;
1126       v4   = aa + ai[row - 2] - 1;
1127       v5   = aa + ai[row - 3] - 1;
1128       for (j = nz; j > 1; j -= 2) {
1129         vi -= 2;
1130         i0   = vi[2];
1131         i1   = vi[1];
1132         tmp0 = tmps[i0];
1133         tmp1 = tmps[i1];
1134         v1 -= 2;
1135         v2 -= 2;
1136         v3 -= 2;
1137         v4 -= 2;
1138         v5 -= 2;
1139         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1140         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1141         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1142         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1143         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1144       }
1145       if (j == 1) {
1146         tmp0 = tmps[*vi--];
1147         sum1 -= *v1-- * tmp0;
1148         sum2 -= *v2-- * tmp0;
1149         sum3 -= *v3-- * tmp0;
1150         sum4 -= *v4-- * tmp0;
1151         sum5 -= *v5-- * tmp0;
1152       }
1153 
1154       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1155       row--;
1156       sum2 -= *v2-- * tmp0;
1157       sum3 -= *v3-- * tmp0;
1158       sum4 -= *v4-- * tmp0;
1159       sum5 -= *v5-- * tmp0;
1160       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1161       row--;
1162       sum3 -= *v3-- * tmp0;
1163       sum4 -= *v4-- * tmp0;
1164       sum5 -= *v5-- * tmp0;
1165       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1166       row--;
1167       sum4 -= *v4-- * tmp0;
1168       sum5 -= *v5-- * tmp0;
1169       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1170       row--;
1171       sum5 -= *v5-- * tmp0;
1172       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1173       row--;
1174       break;
1175     default:
1176       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1177     }
1178   }
1179   PetscCall(ISRestoreIndices(isrow, &rout));
1180   PetscCall(ISRestoreIndices(iscol, &cout));
1181   PetscCall(VecRestoreArrayRead(bb, &b));
1182   PetscCall(VecRestoreArrayWrite(xx, &x));
1183   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1184   PetscFunctionReturn(PETSC_SUCCESS);
1185 }
1186 
1187 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1188 {
1189   Mat              C = B;
1190   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1191   IS               isrow = b->row, isicol = b->icol;
1192   const PetscInt  *r, *ic, *ics;
1193   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1194   PetscInt         i, j, k, nz, nzL, row, *pj;
1195   const PetscInt  *ajtmp, *bjtmp;
1196   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1197   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1198   FactorShiftCtx   sctx;
1199   const PetscInt  *ddiag;
1200   PetscReal        rs;
1201   MatScalar        d;
1202   PetscInt         inod, nodesz, node_max, col;
1203   const PetscInt  *ns;
1204   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1205 
1206   PetscFunctionBegin;
1207   /* MatPivotSetUp(): initialize shift context sctx */
1208   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1209 
1210   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1211     ddiag          = a->diag;
1212     sctx.shift_top = info->zeropivot;
1213     for (i = 0; i < n; i++) {
1214       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1215       d  = (aa)[ddiag[i]];
1216       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1217       v  = aa + ai[i];
1218       nz = ai[i + 1] - ai[i];
1219       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1220       if (rs > sctx.shift_top) sctx.shift_top = rs;
1221     }
1222     sctx.shift_top *= 1.1;
1223     sctx.nshift_max = 5;
1224     sctx.shift_lo   = 0.;
1225     sctx.shift_hi   = 1.;
1226   }
1227 
1228   PetscCall(ISGetIndices(isrow, &r));
1229   PetscCall(ISGetIndices(isicol, &ic));
1230 
1231   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1232   ics = ic;
1233 
1234   node_max = a->inode.node_count;
1235   ns       = a->inode.size;
1236   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1237 
1238   /* If max inode size > 4, split it into two inodes.*/
1239   /* also map the inode sizes according to the ordering */
1240   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1241   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1242     if (ns[i] > 4) {
1243       tmp_vec1[j] = 4;
1244       ++j;
1245       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1246     } else {
1247       tmp_vec1[j] = ns[i];
1248     }
1249   }
1250   /* Use the correct node_max */
1251   node_max = j;
1252 
1253   /* Now reorder the inode info based on mat re-ordering info */
1254   /* First create a row -> inode_size_array_index map */
1255   PetscCall(PetscMalloc1(n + 1, &nsmap));
1256   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1257   for (i = 0, row = 0; i < node_max; i++) {
1258     nodesz = tmp_vec1[i];
1259     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1260   }
1261   /* Using nsmap, create a reordered ns structure */
1262   for (i = 0, j = 0; i < node_max; i++) {
1263     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1264     tmp_vec2[i] = nodesz;
1265     j += nodesz;
1266   }
1267   PetscCall(PetscFree(nsmap));
1268   PetscCall(PetscFree(tmp_vec1));
1269 
1270   /* Now use the correct ns */
1271   ns = tmp_vec2;
1272 
1273   do {
1274     sctx.newshift = PETSC_FALSE;
1275     /* Now loop over each block-row, and do the factorization */
1276     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1277       nodesz = ns[inod];
1278 
1279       switch (nodesz) {
1280       case 1:
1281         /* zero rtmp1 */
1282         /* L part */
1283         nz    = bi[i + 1] - bi[i];
1284         bjtmp = bj + bi[i];
1285         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1286 
1287         /* U part */
1288         nz    = bdiag[i] - bdiag[i + 1];
1289         bjtmp = bj + bdiag[i + 1] + 1;
1290         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1291 
1292         /* load in initial (unfactored row) */
1293         nz    = ai[r[i] + 1] - ai[r[i]];
1294         ajtmp = aj + ai[r[i]];
1295         v     = aa + ai[r[i]];
1296         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1297 
1298         /* ZeropivotApply() */
1299         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1300 
1301         /* elimination */
1302         bjtmp = bj + bi[i];
1303         row   = *bjtmp++;
1304         nzL   = bi[i + 1] - bi[i];
1305         for (k = 0; k < nzL; k++) {
1306           pc = rtmp1 + row;
1307           if (*pc != 0.0) {
1308             pv   = b->a + bdiag[row];
1309             mul1 = *pc * (*pv);
1310             *pc  = mul1;
1311             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1312             pv   = b->a + bdiag[row + 1] + 1;
1313             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1314             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1315             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1316           }
1317           row = *bjtmp++;
1318         }
1319 
1320         /* finished row so stick it into b->a */
1321         rs = 0.0;
1322         /* L part */
1323         pv = b->a + bi[i];
1324         pj = b->j + bi[i];
1325         nz = bi[i + 1] - bi[i];
1326         for (j = 0; j < nz; j++) {
1327           pv[j] = rtmp1[pj[j]];
1328           rs += PetscAbsScalar(pv[j]);
1329         }
1330 
1331         /* U part */
1332         pv = b->a + bdiag[i + 1] + 1;
1333         pj = b->j + bdiag[i + 1] + 1;
1334         nz = bdiag[i] - bdiag[i + 1] - 1;
1335         for (j = 0; j < nz; j++) {
1336           pv[j] = rtmp1[pj[j]];
1337           rs += PetscAbsScalar(pv[j]);
1338         }
1339 
1340         /* Check zero pivot */
1341         sctx.rs = rs;
1342         sctx.pv = rtmp1[i];
1343         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1344         if (sctx.newshift) break;
1345 
1346         /* Mark diagonal and invert diagonal for simpler triangular solves */
1347         pv  = b->a + bdiag[i];
1348         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1349         break;
1350 
1351       case 2:
1352         /* zero rtmp1 and rtmp2 */
1353         /* L part */
1354         nz    = bi[i + 1] - bi[i];
1355         bjtmp = bj + bi[i];
1356         for (j = 0; j < nz; j++) {
1357           col        = bjtmp[j];
1358           rtmp1[col] = 0.0;
1359           rtmp2[col] = 0.0;
1360         }
1361 
1362         /* U part */
1363         nz    = bdiag[i] - bdiag[i + 1];
1364         bjtmp = bj + bdiag[i + 1] + 1;
1365         for (j = 0; j < nz; j++) {
1366           col        = bjtmp[j];
1367           rtmp1[col] = 0.0;
1368           rtmp2[col] = 0.0;
1369         }
1370 
1371         /* load in initial (unfactored row) */
1372         nz    = ai[r[i] + 1] - ai[r[i]];
1373         ajtmp = aj + ai[r[i]];
1374         v1    = aa + ai[r[i]];
1375         v2    = aa + ai[r[i] + 1];
1376         for (j = 0; j < nz; j++) {
1377           col        = ics[ajtmp[j]];
1378           rtmp1[col] = v1[j];
1379           rtmp2[col] = v2[j];
1380         }
1381         /* ZeropivotApply(): shift the diagonal of the matrix  */
1382         rtmp1[i] += sctx.shift_amount;
1383         rtmp2[i + 1] += sctx.shift_amount;
1384 
1385         /* elimination */
1386         bjtmp = bj + bi[i];
1387         row   = *bjtmp++; /* pivot row */
1388         nzL   = bi[i + 1] - bi[i];
1389         for (k = 0; k < nzL; k++) {
1390           pc1 = rtmp1 + row;
1391           pc2 = rtmp2 + row;
1392           if (*pc1 != 0.0 || *pc2 != 0.0) {
1393             pv   = b->a + bdiag[row];
1394             mul1 = *pc1 * (*pv);
1395             mul2 = *pc2 * (*pv);
1396             *pc1 = mul1;
1397             *pc2 = mul2;
1398 
1399             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1400             pv = b->a + bdiag[row + 1] + 1;
1401             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1402             for (j = 0; j < nz; j++) {
1403               col = pj[j];
1404               rtmp1[col] -= mul1 * pv[j];
1405               rtmp2[col] -= mul2 * pv[j];
1406             }
1407             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1408           }
1409           row = *bjtmp++;
1410         }
1411 
1412         /* finished row i; check zero pivot, then stick row i into b->a */
1413         rs = 0.0;
1414         /* L part */
1415         pc1 = b->a + bi[i];
1416         pj  = b->j + bi[i];
1417         nz  = bi[i + 1] - bi[i];
1418         for (j = 0; j < nz; j++) {
1419           col    = pj[j];
1420           pc1[j] = rtmp1[col];
1421           rs += PetscAbsScalar(pc1[j]);
1422         }
1423         /* U part */
1424         pc1 = b->a + bdiag[i + 1] + 1;
1425         pj  = b->j + bdiag[i + 1] + 1;
1426         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1427         for (j = 0; j < nz; j++) {
1428           col    = pj[j];
1429           pc1[j] = rtmp1[col];
1430           rs += PetscAbsScalar(pc1[j]);
1431         }
1432 
1433         sctx.rs = rs;
1434         sctx.pv = rtmp1[i];
1435         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1436         if (sctx.newshift) break;
1437         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1438         *pc1 = 1.0 / sctx.pv;
1439 
1440         /* Now take care of diagonal 2x2 block. */
1441         pc2 = rtmp2 + i;
1442         if (*pc2 != 0.0) {
1443           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1444           *pc2 = mul1;                        /* insert L entry */
1445           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1446           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1447           for (j = 0; j < nz; j++) {
1448             col = pj[j];
1449             rtmp2[col] -= mul1 * rtmp1[col];
1450           }
1451           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1452         }
1453 
1454         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1455         rs = 0.0;
1456         /* L part */
1457         pc2 = b->a + bi[i + 1];
1458         pj  = b->j + bi[i + 1];
1459         nz  = bi[i + 2] - bi[i + 1];
1460         for (j = 0; j < nz; j++) {
1461           col    = pj[j];
1462           pc2[j] = rtmp2[col];
1463           rs += PetscAbsScalar(pc2[j]);
1464         }
1465         /* U part */
1466         pc2 = b->a + bdiag[i + 2] + 1;
1467         pj  = b->j + bdiag[i + 2] + 1;
1468         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1469         for (j = 0; j < nz; j++) {
1470           col    = pj[j];
1471           pc2[j] = rtmp2[col];
1472           rs += PetscAbsScalar(pc2[j]);
1473         }
1474 
1475         sctx.rs = rs;
1476         sctx.pv = rtmp2[i + 1];
1477         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1478         if (sctx.newshift) break;
1479         pc2  = b->a + bdiag[i + 1];
1480         *pc2 = 1.0 / sctx.pv;
1481         break;
1482 
1483       case 3:
1484         /* zero rtmp */
1485         /* L part */
1486         nz    = bi[i + 1] - bi[i];
1487         bjtmp = bj + bi[i];
1488         for (j = 0; j < nz; j++) {
1489           col        = bjtmp[j];
1490           rtmp1[col] = 0.0;
1491           rtmp2[col] = 0.0;
1492           rtmp3[col] = 0.0;
1493         }
1494 
1495         /* U part */
1496         nz    = bdiag[i] - bdiag[i + 1];
1497         bjtmp = bj + bdiag[i + 1] + 1;
1498         for (j = 0; j < nz; j++) {
1499           col        = bjtmp[j];
1500           rtmp1[col] = 0.0;
1501           rtmp2[col] = 0.0;
1502           rtmp3[col] = 0.0;
1503         }
1504 
1505         /* load in initial (unfactored row) */
1506         nz    = ai[r[i] + 1] - ai[r[i]];
1507         ajtmp = aj + ai[r[i]];
1508         v1    = aa + ai[r[i]];
1509         v2    = aa + ai[r[i] + 1];
1510         v3    = aa + ai[r[i] + 2];
1511         for (j = 0; j < nz; j++) {
1512           col        = ics[ajtmp[j]];
1513           rtmp1[col] = v1[j];
1514           rtmp2[col] = v2[j];
1515           rtmp3[col] = v3[j];
1516         }
1517         /* ZeropivotApply(): shift the diagonal of the matrix  */
1518         rtmp1[i] += sctx.shift_amount;
1519         rtmp2[i + 1] += sctx.shift_amount;
1520         rtmp3[i + 2] += sctx.shift_amount;
1521 
1522         /* elimination */
1523         bjtmp = bj + bi[i];
1524         row   = *bjtmp++; /* pivot row */
1525         nzL   = bi[i + 1] - bi[i];
1526         for (k = 0; k < nzL; k++) {
1527           pc1 = rtmp1 + row;
1528           pc2 = rtmp2 + row;
1529           pc3 = rtmp3 + row;
1530           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1531             pv   = b->a + bdiag[row];
1532             mul1 = *pc1 * (*pv);
1533             mul2 = *pc2 * (*pv);
1534             mul3 = *pc3 * (*pv);
1535             *pc1 = mul1;
1536             *pc2 = mul2;
1537             *pc3 = mul3;
1538 
1539             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1540             pv = b->a + bdiag[row + 1] + 1;
1541             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1542             for (j = 0; j < nz; j++) {
1543               col = pj[j];
1544               rtmp1[col] -= mul1 * pv[j];
1545               rtmp2[col] -= mul2 * pv[j];
1546               rtmp3[col] -= mul3 * pv[j];
1547             }
1548             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1549           }
1550           row = *bjtmp++;
1551         }
1552 
1553         /* finished row i; check zero pivot, then stick row i into b->a */
1554         rs = 0.0;
1555         /* L part */
1556         pc1 = b->a + bi[i];
1557         pj  = b->j + bi[i];
1558         nz  = bi[i + 1] - bi[i];
1559         for (j = 0; j < nz; j++) {
1560           col    = pj[j];
1561           pc1[j] = rtmp1[col];
1562           rs += PetscAbsScalar(pc1[j]);
1563         }
1564         /* U part */
1565         pc1 = b->a + bdiag[i + 1] + 1;
1566         pj  = b->j + bdiag[i + 1] + 1;
1567         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1568         for (j = 0; j < nz; j++) {
1569           col    = pj[j];
1570           pc1[j] = rtmp1[col];
1571           rs += PetscAbsScalar(pc1[j]);
1572         }
1573 
1574         sctx.rs = rs;
1575         sctx.pv = rtmp1[i];
1576         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1577         if (sctx.newshift) break;
1578         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1579         *pc1 = 1.0 / sctx.pv;
1580 
1581         /* Now take care of 1st column of diagonal 3x3 block. */
1582         pc2 = rtmp2 + i;
1583         pc3 = rtmp3 + i;
1584         if (*pc2 != 0.0 || *pc3 != 0.0) {
1585           mul2 = (*pc2) * (*pc1);
1586           *pc2 = mul2;
1587           mul3 = (*pc3) * (*pc1);
1588           *pc3 = mul3;
1589           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1590           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1591           for (j = 0; j < nz; j++) {
1592             col = pj[j];
1593             rtmp2[col] -= mul2 * rtmp1[col];
1594             rtmp3[col] -= mul3 * rtmp1[col];
1595           }
1596           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1597         }
1598 
1599         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1600         rs = 0.0;
1601         /* L part */
1602         pc2 = b->a + bi[i + 1];
1603         pj  = b->j + bi[i + 1];
1604         nz  = bi[i + 2] - bi[i + 1];
1605         for (j = 0; j < nz; j++) {
1606           col    = pj[j];
1607           pc2[j] = rtmp2[col];
1608           rs += PetscAbsScalar(pc2[j]);
1609         }
1610         /* U part */
1611         pc2 = b->a + bdiag[i + 2] + 1;
1612         pj  = b->j + bdiag[i + 2] + 1;
1613         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1614         for (j = 0; j < nz; j++) {
1615           col    = pj[j];
1616           pc2[j] = rtmp2[col];
1617           rs += PetscAbsScalar(pc2[j]);
1618         }
1619 
1620         sctx.rs = rs;
1621         sctx.pv = rtmp2[i + 1];
1622         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1623         if (sctx.newshift) break;
1624         pc2  = b->a + bdiag[i + 1];
1625         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1626 
1627         /* Now take care of 2nd column of diagonal 3x3 block. */
1628         pc3 = rtmp3 + i + 1;
1629         if (*pc3 != 0.0) {
1630           mul3 = (*pc3) * (*pc2);
1631           *pc3 = mul3;
1632           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1633           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1634           for (j = 0; j < nz; j++) {
1635             col = pj[j];
1636             rtmp3[col] -= mul3 * rtmp2[col];
1637           }
1638           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1639         }
1640 
1641         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1642         rs = 0.0;
1643         /* L part */
1644         pc3 = b->a + bi[i + 2];
1645         pj  = b->j + bi[i + 2];
1646         nz  = bi[i + 3] - bi[i + 2];
1647         for (j = 0; j < nz; j++) {
1648           col    = pj[j];
1649           pc3[j] = rtmp3[col];
1650           rs += PetscAbsScalar(pc3[j]);
1651         }
1652         /* U part */
1653         pc3 = b->a + bdiag[i + 3] + 1;
1654         pj  = b->j + bdiag[i + 3] + 1;
1655         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1656         for (j = 0; j < nz; j++) {
1657           col    = pj[j];
1658           pc3[j] = rtmp3[col];
1659           rs += PetscAbsScalar(pc3[j]);
1660         }
1661 
1662         sctx.rs = rs;
1663         sctx.pv = rtmp3[i + 2];
1664         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1665         if (sctx.newshift) break;
1666         pc3  = b->a + bdiag[i + 2];
1667         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1668         break;
1669       case 4:
1670         /* zero rtmp */
1671         /* L part */
1672         nz    = bi[i + 1] - bi[i];
1673         bjtmp = bj + bi[i];
1674         for (j = 0; j < nz; j++) {
1675           col        = bjtmp[j];
1676           rtmp1[col] = 0.0;
1677           rtmp2[col] = 0.0;
1678           rtmp3[col] = 0.0;
1679           rtmp4[col] = 0.0;
1680         }
1681 
1682         /* U part */
1683         nz    = bdiag[i] - bdiag[i + 1];
1684         bjtmp = bj + bdiag[i + 1] + 1;
1685         for (j = 0; j < nz; j++) {
1686           col        = bjtmp[j];
1687           rtmp1[col] = 0.0;
1688           rtmp2[col] = 0.0;
1689           rtmp3[col] = 0.0;
1690           rtmp4[col] = 0.0;
1691         }
1692 
1693         /* load in initial (unfactored row) */
1694         nz    = ai[r[i] + 1] - ai[r[i]];
1695         ajtmp = aj + ai[r[i]];
1696         v1    = aa + ai[r[i]];
1697         v2    = aa + ai[r[i] + 1];
1698         v3    = aa + ai[r[i] + 2];
1699         v4    = aa + ai[r[i] + 3];
1700         for (j = 0; j < nz; j++) {
1701           col        = ics[ajtmp[j]];
1702           rtmp1[col] = v1[j];
1703           rtmp2[col] = v2[j];
1704           rtmp3[col] = v3[j];
1705           rtmp4[col] = v4[j];
1706         }
1707         /* ZeropivotApply(): shift the diagonal of the matrix  */
1708         rtmp1[i] += sctx.shift_amount;
1709         rtmp2[i + 1] += sctx.shift_amount;
1710         rtmp3[i + 2] += sctx.shift_amount;
1711         rtmp4[i + 3] += sctx.shift_amount;
1712 
1713         /* elimination */
1714         bjtmp = bj + bi[i];
1715         row   = *bjtmp++; /* pivot row */
1716         nzL   = bi[i + 1] - bi[i];
1717         for (k = 0; k < nzL; k++) {
1718           pc1 = rtmp1 + row;
1719           pc2 = rtmp2 + row;
1720           pc3 = rtmp3 + row;
1721           pc4 = rtmp4 + row;
1722           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1723             pv   = b->a + bdiag[row];
1724             mul1 = *pc1 * (*pv);
1725             mul2 = *pc2 * (*pv);
1726             mul3 = *pc3 * (*pv);
1727             mul4 = *pc4 * (*pv);
1728             *pc1 = mul1;
1729             *pc2 = mul2;
1730             *pc3 = mul3;
1731             *pc4 = mul4;
1732 
1733             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1734             pv = b->a + bdiag[row + 1] + 1;
1735             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1736             for (j = 0; j < nz; j++) {
1737               col = pj[j];
1738               rtmp1[col] -= mul1 * pv[j];
1739               rtmp2[col] -= mul2 * pv[j];
1740               rtmp3[col] -= mul3 * pv[j];
1741               rtmp4[col] -= mul4 * pv[j];
1742             }
1743             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1744           }
1745           row = *bjtmp++;
1746         }
1747 
1748         /* finished row i; check zero pivot, then stick row i into b->a */
1749         rs = 0.0;
1750         /* L part */
1751         pc1 = b->a + bi[i];
1752         pj  = b->j + bi[i];
1753         nz  = bi[i + 1] - bi[i];
1754         for (j = 0; j < nz; j++) {
1755           col    = pj[j];
1756           pc1[j] = rtmp1[col];
1757           rs += PetscAbsScalar(pc1[j]);
1758         }
1759         /* U part */
1760         pc1 = b->a + bdiag[i + 1] + 1;
1761         pj  = b->j + bdiag[i + 1] + 1;
1762         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1763         for (j = 0; j < nz; j++) {
1764           col    = pj[j];
1765           pc1[j] = rtmp1[col];
1766           rs += PetscAbsScalar(pc1[j]);
1767         }
1768 
1769         sctx.rs = rs;
1770         sctx.pv = rtmp1[i];
1771         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1772         if (sctx.newshift) break;
1773         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1774         *pc1 = 1.0 / sctx.pv;
1775 
1776         /* Now take care of 1st column of diagonal 4x4 block. */
1777         pc2 = rtmp2 + i;
1778         pc3 = rtmp3 + i;
1779         pc4 = rtmp4 + i;
1780         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1781           mul2 = (*pc2) * (*pc1);
1782           *pc2 = mul2;
1783           mul3 = (*pc3) * (*pc1);
1784           *pc3 = mul3;
1785           mul4 = (*pc4) * (*pc1);
1786           *pc4 = mul4;
1787           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1788           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1789           for (j = 0; j < nz; j++) {
1790             col = pj[j];
1791             rtmp2[col] -= mul2 * rtmp1[col];
1792             rtmp3[col] -= mul3 * rtmp1[col];
1793             rtmp4[col] -= mul4 * rtmp1[col];
1794           }
1795           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1796         }
1797 
1798         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1799         rs = 0.0;
1800         /* L part */
1801         pc2 = b->a + bi[i + 1];
1802         pj  = b->j + bi[i + 1];
1803         nz  = bi[i + 2] - bi[i + 1];
1804         for (j = 0; j < nz; j++) {
1805           col    = pj[j];
1806           pc2[j] = rtmp2[col];
1807           rs += PetscAbsScalar(pc2[j]);
1808         }
1809         /* U part */
1810         pc2 = b->a + bdiag[i + 2] + 1;
1811         pj  = b->j + bdiag[i + 2] + 1;
1812         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1813         for (j = 0; j < nz; j++) {
1814           col    = pj[j];
1815           pc2[j] = rtmp2[col];
1816           rs += PetscAbsScalar(pc2[j]);
1817         }
1818 
1819         sctx.rs = rs;
1820         sctx.pv = rtmp2[i + 1];
1821         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1822         if (sctx.newshift) break;
1823         pc2  = b->a + bdiag[i + 1];
1824         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1825 
1826         /* Now take care of 2nd column of diagonal 4x4 block. */
1827         pc3 = rtmp3 + i + 1;
1828         pc4 = rtmp4 + i + 1;
1829         if (*pc3 != 0.0 || *pc4 != 0.0) {
1830           mul3 = (*pc3) * (*pc2);
1831           *pc3 = mul3;
1832           mul4 = (*pc4) * (*pc2);
1833           *pc4 = mul4;
1834           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1835           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1836           for (j = 0; j < nz; j++) {
1837             col = pj[j];
1838             rtmp3[col] -= mul3 * rtmp2[col];
1839             rtmp4[col] -= mul4 * rtmp2[col];
1840           }
1841           PetscCall(PetscLogFlops(4.0 * nz));
1842         }
1843 
1844         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1845         rs = 0.0;
1846         /* L part */
1847         pc3 = b->a + bi[i + 2];
1848         pj  = b->j + bi[i + 2];
1849         nz  = bi[i + 3] - bi[i + 2];
1850         for (j = 0; j < nz; j++) {
1851           col    = pj[j];
1852           pc3[j] = rtmp3[col];
1853           rs += PetscAbsScalar(pc3[j]);
1854         }
1855         /* U part */
1856         pc3 = b->a + bdiag[i + 3] + 1;
1857         pj  = b->j + bdiag[i + 3] + 1;
1858         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1859         for (j = 0; j < nz; j++) {
1860           col    = pj[j];
1861           pc3[j] = rtmp3[col];
1862           rs += PetscAbsScalar(pc3[j]);
1863         }
1864 
1865         sctx.rs = rs;
1866         sctx.pv = rtmp3[i + 2];
1867         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1868         if (sctx.newshift) break;
1869         pc3  = b->a + bdiag[i + 2];
1870         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1871 
1872         /* Now take care of 3rd column of diagonal 4x4 block. */
1873         pc4 = rtmp4 + i + 2;
1874         if (*pc4 != 0.0) {
1875           mul4 = (*pc4) * (*pc3);
1876           *pc4 = mul4;
1877           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1878           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1879           for (j = 0; j < nz; j++) {
1880             col = pj[j];
1881             rtmp4[col] -= mul4 * rtmp3[col];
1882           }
1883           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1884         }
1885 
1886         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1887         rs = 0.0;
1888         /* L part */
1889         pc4 = b->a + bi[i + 3];
1890         pj  = b->j + bi[i + 3];
1891         nz  = bi[i + 4] - bi[i + 3];
1892         for (j = 0; j < nz; j++) {
1893           col    = pj[j];
1894           pc4[j] = rtmp4[col];
1895           rs += PetscAbsScalar(pc4[j]);
1896         }
1897         /* U part */
1898         pc4 = b->a + bdiag[i + 4] + 1;
1899         pj  = b->j + bdiag[i + 4] + 1;
1900         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1901         for (j = 0; j < nz; j++) {
1902           col    = pj[j];
1903           pc4[j] = rtmp4[col];
1904           rs += PetscAbsScalar(pc4[j]);
1905         }
1906 
1907         sctx.rs = rs;
1908         sctx.pv = rtmp4[i + 3];
1909         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1910         if (sctx.newshift) break;
1911         pc4  = b->a + bdiag[i + 3];
1912         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1913         break;
1914 
1915       default:
1916         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1917       }
1918       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1919       i += nodesz;              /* Update the row */
1920     }
1921 
1922     /* MatPivotRefine() */
1923     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1924       /*
1925        * if no shift in this attempt & shifting & started shifting & can refine,
1926        * then try lower shift
1927        */
1928       sctx.shift_hi       = sctx.shift_fraction;
1929       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1930       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1931       sctx.newshift       = PETSC_TRUE;
1932       sctx.nshift++;
1933     }
1934   } while (sctx.newshift);
1935 
1936   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1937   PetscCall(PetscFree(tmp_vec2));
1938   PetscCall(ISRestoreIndices(isicol, &ic));
1939   PetscCall(ISRestoreIndices(isrow, &r));
1940 
1941   if (b->inode.size) {
1942     C->ops->solve = MatSolve_SeqAIJ_Inode;
1943   } else {
1944     C->ops->solve = MatSolve_SeqAIJ;
1945   }
1946   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1947   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1948   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1949   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1950   C->assembled              = PETSC_TRUE;
1951   C->preallocated           = PETSC_TRUE;
1952 
1953   PetscCall(PetscLogFlops(C->cmap->n));
1954 
1955   /* MatShiftView(A,info,&sctx) */
1956   if (sctx.nshift) {
1957     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1958       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1959     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1960       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1961     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1962       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1963     }
1964   }
1965   PetscFunctionReturn(PETSC_SUCCESS);
1966 }
1967 
1968 #if 0
1969 // unused
1970 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1971 {
1972   Mat              C = B;
1973   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1974   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1975   const PetscInt  *r, *ic, *c, *ics;
1976   PetscInt         n = A->rmap->n, *bi = b->i;
1977   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1978   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1979   PetscInt        *ai = a->i, *aj = a->j;
1980   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1981   PetscScalar      mul1, mul2, mul3, tmp;
1982   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1983   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1984   PetscReal        rs = 0.0;
1985   FactorShiftCtx   sctx;
1986 
1987   PetscFunctionBegin;
1988   sctx.shift_top      = 0;
1989   sctx.nshift_max     = 0;
1990   sctx.shift_lo       = 0;
1991   sctx.shift_hi       = 0;
1992   sctx.shift_fraction = 0;
1993 
1994   /* if both shift schemes are chosen by user, only use info->shiftpd */
1995   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1996     sctx.shift_top = 0;
1997     for (i = 0; i < n; i++) {
1998       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1999       rs    = 0.0;
2000       ajtmp = aj + ai[i];
2001       rtmp1 = aa + ai[i];
2002       nz    = ai[i + 1] - ai[i];
2003       for (j = 0; j < nz; j++) {
2004         if (*ajtmp != i) {
2005           rs += PetscAbsScalar(*rtmp1++);
2006         } else {
2007           rs -= PetscRealPart(*rtmp1++);
2008         }
2009         ajtmp++;
2010       }
2011       if (rs > sctx.shift_top) sctx.shift_top = rs;
2012     }
2013     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2014     sctx.shift_top *= 1.1;
2015     sctx.nshift_max = 5;
2016     sctx.shift_lo   = 0.;
2017     sctx.shift_hi   = 1.;
2018   }
2019   sctx.shift_amount = 0;
2020   sctx.nshift       = 0;
2021 
2022   PetscCall(ISGetIndices(isrow, &r));
2023   PetscCall(ISGetIndices(iscol, &c));
2024   PetscCall(ISGetIndices(isicol, &ic));
2025   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2026   ics = ic;
2027 
2028   node_max = a->inode.node_count;
2029   ns       = a->inode.size;
2030   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2031 
2032   /* If max inode size > 3, split it into two inodes.*/
2033   /* also map the inode sizes according to the ordering */
2034   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2035   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2036     if (ns[i] > 3) {
2037       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2038       ++j;
2039       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2040     } else {
2041       tmp_vec1[j] = ns[i];
2042     }
2043   }
2044   /* Use the correct node_max */
2045   node_max = j;
2046 
2047   /* Now reorder the inode info based on mat re-ordering info */
2048   /* First create a row -> inode_size_array_index map */
2049   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2050   for (i = 0, row = 0; i < node_max; i++) {
2051     nodesz = tmp_vec1[i];
2052     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2053   }
2054   /* Using nsmap, create a reordered ns structure */
2055   for (i = 0, j = 0; i < node_max; i++) {
2056     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2057     tmp_vec2[i] = nodesz;
2058     j += nodesz;
2059   }
2060   PetscCall(PetscFree2(nsmap, tmp_vec1));
2061   /* Now use the correct ns */
2062   ns = tmp_vec2;
2063 
2064   do {
2065     sctx.newshift = PETSC_FALSE;
2066     /* Now loop over each block-row, and do the factorization */
2067     for (i = 0, row = 0; i < node_max; i++) {
2068       nodesz = ns[i];
2069       nz     = bi[row + 1] - bi[row];
2070       bjtmp  = bj + bi[row];
2071 
2072       switch (nodesz) {
2073       case 1:
2074         for (j = 0; j < nz; j++) {
2075           idx         = bjtmp[j];
2076           rtmp11[idx] = 0.0;
2077         }
2078 
2079         /* load in initial (unfactored row) */
2080         idx    = r[row];
2081         nz_tmp = ai[idx + 1] - ai[idx];
2082         ajtmp  = aj + ai[idx];
2083         v1     = aa + ai[idx];
2084 
2085         for (j = 0; j < nz_tmp; j++) {
2086           idx         = ics[ajtmp[j]];
2087           rtmp11[idx] = v1[j];
2088         }
2089         rtmp11[ics[r[row]]] += sctx.shift_amount;
2090 
2091         prow = *bjtmp++;
2092         while (prow < row) {
2093           pc1 = rtmp11 + prow;
2094           if (*pc1 != 0.0) {
2095             pv     = ba + bd[prow];
2096             pj     = nbj + bd[prow];
2097             mul1   = *pc1 * *pv++;
2098             *pc1   = mul1;
2099             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2100             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2101             for (j = 0; j < nz_tmp; j++) {
2102               tmp = pv[j];
2103               idx = pj[j];
2104               rtmp11[idx] -= mul1 * tmp;
2105             }
2106           }
2107           prow = *bjtmp++;
2108         }
2109         pj  = bj + bi[row];
2110         pc1 = ba + bi[row];
2111 
2112         sctx.pv     = rtmp11[row];
2113         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2114         rs          = 0.0;
2115         for (j = 0; j < nz; j++) {
2116           idx    = pj[j];
2117           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2118           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2119         }
2120         sctx.rs = rs;
2121         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2122         if (sctx.newshift) goto endofwhile;
2123         break;
2124 
2125       case 2:
2126         for (j = 0; j < nz; j++) {
2127           idx         = bjtmp[j];
2128           rtmp11[idx] = 0.0;
2129           rtmp22[idx] = 0.0;
2130         }
2131 
2132         /* load in initial (unfactored row) */
2133         idx    = r[row];
2134         nz_tmp = ai[idx + 1] - ai[idx];
2135         ajtmp  = aj + ai[idx];
2136         v1     = aa + ai[idx];
2137         v2     = aa + ai[idx + 1];
2138         for (j = 0; j < nz_tmp; j++) {
2139           idx         = ics[ajtmp[j]];
2140           rtmp11[idx] = v1[j];
2141           rtmp22[idx] = v2[j];
2142         }
2143         rtmp11[ics[r[row]]] += sctx.shift_amount;
2144         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2145 
2146         prow = *bjtmp++;
2147         while (prow < row) {
2148           pc1 = rtmp11 + prow;
2149           pc2 = rtmp22 + prow;
2150           if (*pc1 != 0.0 || *pc2 != 0.0) {
2151             pv   = ba + bd[prow];
2152             pj   = nbj + bd[prow];
2153             mul1 = *pc1 * *pv;
2154             mul2 = *pc2 * *pv;
2155             ++pv;
2156             *pc1 = mul1;
2157             *pc2 = mul2;
2158 
2159             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2160             for (j = 0; j < nz_tmp; j++) {
2161               tmp = pv[j];
2162               idx = pj[j];
2163               rtmp11[idx] -= mul1 * tmp;
2164               rtmp22[idx] -= mul2 * tmp;
2165             }
2166             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2167           }
2168           prow = *bjtmp++;
2169         }
2170 
2171         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2172         pc1 = rtmp11 + prow;
2173         pc2 = rtmp22 + prow;
2174 
2175         sctx.pv = *pc1;
2176         pj      = bj + bi[prow];
2177         rs      = 0.0;
2178         for (j = 0; j < nz; j++) {
2179           idx = pj[j];
2180           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2181         }
2182         sctx.rs = rs;
2183         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2184         if (sctx.newshift) goto endofwhile;
2185 
2186         if (*pc2 != 0.0) {
2187           pj     = nbj + bd[prow];
2188           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2189           *pc2   = mul2;
2190           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2191           for (j = 0; j < nz_tmp; j++) {
2192             idx = pj[j];
2193             tmp = rtmp11[idx];
2194             rtmp22[idx] -= mul2 * tmp;
2195           }
2196           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2197         }
2198 
2199         pj  = bj + bi[row];
2200         pc1 = ba + bi[row];
2201         pc2 = ba + bi[row + 1];
2202 
2203         sctx.pv         = rtmp22[row + 1];
2204         rs              = 0.0;
2205         rtmp11[row]     = 1.0 / rtmp11[row];
2206         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2207         /* copy row entries from dense representation to sparse */
2208         for (j = 0; j < nz; j++) {
2209           idx    = pj[j];
2210           pc1[j] = rtmp11[idx];
2211           pc2[j] = rtmp22[idx];
2212           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2213         }
2214         sctx.rs = rs;
2215         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2216         if (sctx.newshift) goto endofwhile;
2217         break;
2218 
2219       case 3:
2220         for (j = 0; j < nz; j++) {
2221           idx         = bjtmp[j];
2222           rtmp11[idx] = 0.0;
2223           rtmp22[idx] = 0.0;
2224           rtmp33[idx] = 0.0;
2225         }
2226         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2227         idx    = r[row];
2228         nz_tmp = ai[idx + 1] - ai[idx];
2229         ajtmp  = aj + ai[idx];
2230         v1     = aa + ai[idx];
2231         v2     = aa + ai[idx + 1];
2232         v3     = aa + ai[idx + 2];
2233         for (j = 0; j < nz_tmp; j++) {
2234           idx         = ics[ajtmp[j]];
2235           rtmp11[idx] = v1[j];
2236           rtmp22[idx] = v2[j];
2237           rtmp33[idx] = v3[j];
2238         }
2239         rtmp11[ics[r[row]]] += sctx.shift_amount;
2240         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2241         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2242 
2243         /* loop over all pivot row blocks above this row block */
2244         prow = *bjtmp++;
2245         while (prow < row) {
2246           pc1 = rtmp11 + prow;
2247           pc2 = rtmp22 + prow;
2248           pc3 = rtmp33 + prow;
2249           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2250             pv   = ba + bd[prow];
2251             pj   = nbj + bd[prow];
2252             mul1 = *pc1 * *pv;
2253             mul2 = *pc2 * *pv;
2254             mul3 = *pc3 * *pv;
2255             ++pv;
2256             *pc1 = mul1;
2257             *pc2 = mul2;
2258             *pc3 = mul3;
2259 
2260             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2261             /* update this row based on pivot row */
2262             for (j = 0; j < nz_tmp; j++) {
2263               tmp = pv[j];
2264               idx = pj[j];
2265               rtmp11[idx] -= mul1 * tmp;
2266               rtmp22[idx] -= mul2 * tmp;
2267               rtmp33[idx] -= mul3 * tmp;
2268             }
2269             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2270           }
2271           prow = *bjtmp++;
2272         }
2273 
2274         /* Now take care of diagonal 3x3 block in this set of rows */
2275         /* note: prow = row here */
2276         pc1 = rtmp11 + prow;
2277         pc2 = rtmp22 + prow;
2278         pc3 = rtmp33 + prow;
2279 
2280         sctx.pv = *pc1;
2281         pj      = bj + bi[prow];
2282         rs      = 0.0;
2283         for (j = 0; j < nz; j++) {
2284           idx = pj[j];
2285           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2286         }
2287         sctx.rs = rs;
2288         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2289         if (sctx.newshift) goto endofwhile;
2290 
2291         if (*pc2 != 0.0 || *pc3 != 0.0) {
2292           mul2   = (*pc2) / (*pc1);
2293           mul3   = (*pc3) / (*pc1);
2294           *pc2   = mul2;
2295           *pc3   = mul3;
2296           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2297           pj     = nbj + bd[prow];
2298           for (j = 0; j < nz_tmp; j++) {
2299             idx = pj[j];
2300             tmp = rtmp11[idx];
2301             rtmp22[idx] -= mul2 * tmp;
2302             rtmp33[idx] -= mul3 * tmp;
2303           }
2304           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2305         }
2306         ++prow;
2307 
2308         pc2     = rtmp22 + prow;
2309         pc3     = rtmp33 + prow;
2310         sctx.pv = *pc2;
2311         pj      = bj + bi[prow];
2312         rs      = 0.0;
2313         for (j = 0; j < nz; j++) {
2314           idx = pj[j];
2315           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2316         }
2317         sctx.rs = rs;
2318         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2319         if (sctx.newshift) goto endofwhile;
2320 
2321         if (*pc3 != 0.0) {
2322           mul3   = (*pc3) / (*pc2);
2323           *pc3   = mul3;
2324           pj     = nbj + bd[prow];
2325           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2326           for (j = 0; j < nz_tmp; j++) {
2327             idx = pj[j];
2328             tmp = rtmp22[idx];
2329             rtmp33[idx] -= mul3 * tmp;
2330           }
2331           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2332         }
2333 
2334         pj  = bj + bi[row];
2335         pc1 = ba + bi[row];
2336         pc2 = ba + bi[row + 1];
2337         pc3 = ba + bi[row + 2];
2338 
2339         sctx.pv         = rtmp33[row + 2];
2340         rs              = 0.0;
2341         rtmp11[row]     = 1.0 / rtmp11[row];
2342         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2343         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2344         /* copy row entries from dense representation to sparse */
2345         for (j = 0; j < nz; j++) {
2346           idx    = pj[j];
2347           pc1[j] = rtmp11[idx];
2348           pc2[j] = rtmp22[idx];
2349           pc3[j] = rtmp33[idx];
2350           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2351         }
2352 
2353         sctx.rs = rs;
2354         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2355         if (sctx.newshift) goto endofwhile;
2356         break;
2357 
2358       default:
2359         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2360       }
2361       row += nodesz; /* Update the row */
2362     }
2363   endofwhile:;
2364   } while (sctx.newshift);
2365   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2366   PetscCall(PetscFree(tmp_vec2));
2367   PetscCall(ISRestoreIndices(isicol, &ic));
2368   PetscCall(ISRestoreIndices(isrow, &r));
2369   PetscCall(ISRestoreIndices(iscol, &c));
2370 
2371   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2372   /* do not set solve add, since MatSolve_Inode + Add is faster */
2373   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2374   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2375   C->assembled              = PETSC_TRUE;
2376   C->preallocated           = PETSC_TRUE;
2377   if (sctx.nshift) {
2378     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2379       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2380     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2381       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2382     }
2383   }
2384   PetscCall(PetscLogFlops(C->cmap->n));
2385   PetscCall(MatSeqAIJCheckInode(C));
2386   PetscFunctionReturn(PETSC_SUCCESS);
2387 }
2388 #endif
2389 
2390 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2391 {
2392   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2393   IS                 iscol = a->col, isrow = a->row;
2394   const PetscInt    *r, *c, *rout, *cout;
2395   PetscInt           i, j, n = A->rmap->n;
2396   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2397   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2398   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2399   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2400   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2401   const PetscScalar *b;
2402 
2403   PetscFunctionBegin;
2404   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2405   node_max = a->inode.node_count;
2406   ns       = a->inode.size; /* Node Size array */
2407 
2408   PetscCall(VecGetArrayRead(bb, &b));
2409   PetscCall(VecGetArrayWrite(xx, &x));
2410   tmp = a->solve_work;
2411 
2412   PetscCall(ISGetIndices(isrow, &rout));
2413   r = rout;
2414   PetscCall(ISGetIndices(iscol, &cout));
2415   c = cout;
2416 
2417   /* forward solve the lower triangular */
2418   tmps = tmp;
2419   aa   = a_a;
2420   aj   = a_j;
2421   ad   = a->diag;
2422 
2423   for (i = 0, row = 0; i < node_max; ++i) {
2424     nsz = ns[i];
2425     aii = ai[row];
2426     v1  = aa + aii;
2427     vi  = aj + aii;
2428     nz  = ai[row + 1] - ai[row];
2429 
2430     if (i < node_max - 1) {
2431       /* Prefetch the indices for the next block */
2432       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2433       /* Prefetch the data for the next block */
2434       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2435     }
2436 
2437     switch (nsz) { /* Each loop in 'case' is unrolled */
2438     case 1:
2439       sum1 = b[r[row]];
2440       for (j = 0; j < nz - 1; j += 2) {
2441         i0   = vi[j];
2442         i1   = vi[j + 1];
2443         tmp0 = tmps[i0];
2444         tmp1 = tmps[i1];
2445         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2446       }
2447       if (j == nz - 1) {
2448         tmp0 = tmps[vi[j]];
2449         sum1 -= v1[j] * tmp0;
2450       }
2451       tmp[row++] = sum1;
2452       break;
2453     case 2:
2454       sum1 = b[r[row]];
2455       sum2 = b[r[row + 1]];
2456       v2   = aa + ai[row + 1];
2457 
2458       for (j = 0; j < nz - 1; j += 2) {
2459         i0   = vi[j];
2460         i1   = vi[j + 1];
2461         tmp0 = tmps[i0];
2462         tmp1 = tmps[i1];
2463         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2464         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2465       }
2466       if (j == nz - 1) {
2467         tmp0 = tmps[vi[j]];
2468         sum1 -= v1[j] * tmp0;
2469         sum2 -= v2[j] * tmp0;
2470       }
2471       sum2 -= v2[nz] * sum1;
2472       tmp[row++] = sum1;
2473       tmp[row++] = sum2;
2474       break;
2475     case 3:
2476       sum1 = b[r[row]];
2477       sum2 = b[r[row + 1]];
2478       sum3 = b[r[row + 2]];
2479       v2   = aa + ai[row + 1];
2480       v3   = aa + ai[row + 2];
2481 
2482       for (j = 0; j < nz - 1; j += 2) {
2483         i0   = vi[j];
2484         i1   = vi[j + 1];
2485         tmp0 = tmps[i0];
2486         tmp1 = tmps[i1];
2487         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2488         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2489         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2490       }
2491       if (j == nz - 1) {
2492         tmp0 = tmps[vi[j]];
2493         sum1 -= v1[j] * tmp0;
2494         sum2 -= v2[j] * tmp0;
2495         sum3 -= v3[j] * tmp0;
2496       }
2497       sum2 -= v2[nz] * sum1;
2498       sum3 -= v3[nz] * sum1;
2499       sum3 -= v3[nz + 1] * sum2;
2500       tmp[row++] = sum1;
2501       tmp[row++] = sum2;
2502       tmp[row++] = sum3;
2503       break;
2504 
2505     case 4:
2506       sum1 = b[r[row]];
2507       sum2 = b[r[row + 1]];
2508       sum3 = b[r[row + 2]];
2509       sum4 = b[r[row + 3]];
2510       v2   = aa + ai[row + 1];
2511       v3   = aa + ai[row + 2];
2512       v4   = aa + ai[row + 3];
2513 
2514       for (j = 0; j < nz - 1; j += 2) {
2515         i0   = vi[j];
2516         i1   = vi[j + 1];
2517         tmp0 = tmps[i0];
2518         tmp1 = tmps[i1];
2519         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2520         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2521         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2522         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2523       }
2524       if (j == nz - 1) {
2525         tmp0 = tmps[vi[j]];
2526         sum1 -= v1[j] * tmp0;
2527         sum2 -= v2[j] * tmp0;
2528         sum3 -= v3[j] * tmp0;
2529         sum4 -= v4[j] * tmp0;
2530       }
2531       sum2 -= v2[nz] * sum1;
2532       sum3 -= v3[nz] * sum1;
2533       sum4 -= v4[nz] * sum1;
2534       sum3 -= v3[nz + 1] * sum2;
2535       sum4 -= v4[nz + 1] * sum2;
2536       sum4 -= v4[nz + 2] * sum3;
2537 
2538       tmp[row++] = sum1;
2539       tmp[row++] = sum2;
2540       tmp[row++] = sum3;
2541       tmp[row++] = sum4;
2542       break;
2543     case 5:
2544       sum1 = b[r[row]];
2545       sum2 = b[r[row + 1]];
2546       sum3 = b[r[row + 2]];
2547       sum4 = b[r[row + 3]];
2548       sum5 = b[r[row + 4]];
2549       v2   = aa + ai[row + 1];
2550       v3   = aa + ai[row + 2];
2551       v4   = aa + ai[row + 3];
2552       v5   = aa + ai[row + 4];
2553 
2554       for (j = 0; j < nz - 1; j += 2) {
2555         i0   = vi[j];
2556         i1   = vi[j + 1];
2557         tmp0 = tmps[i0];
2558         tmp1 = tmps[i1];
2559         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2560         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2561         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2562         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2563         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2564       }
2565       if (j == nz - 1) {
2566         tmp0 = tmps[vi[j]];
2567         sum1 -= v1[j] * tmp0;
2568         sum2 -= v2[j] * tmp0;
2569         sum3 -= v3[j] * tmp0;
2570         sum4 -= v4[j] * tmp0;
2571         sum5 -= v5[j] * tmp0;
2572       }
2573 
2574       sum2 -= v2[nz] * sum1;
2575       sum3 -= v3[nz] * sum1;
2576       sum4 -= v4[nz] * sum1;
2577       sum5 -= v5[nz] * sum1;
2578       sum3 -= v3[nz + 1] * sum2;
2579       sum4 -= v4[nz + 1] * sum2;
2580       sum5 -= v5[nz + 1] * sum2;
2581       sum4 -= v4[nz + 2] * sum3;
2582       sum5 -= v5[nz + 2] * sum3;
2583       sum5 -= v5[nz + 3] * sum4;
2584 
2585       tmp[row++] = sum1;
2586       tmp[row++] = sum2;
2587       tmp[row++] = sum3;
2588       tmp[row++] = sum4;
2589       tmp[row++] = sum5;
2590       break;
2591     default:
2592       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2593     }
2594   }
2595   /* backward solve the upper triangular */
2596   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2597     nsz = ns[i];
2598     aii = ad[row + 1] + 1;
2599     v1  = aa + aii;
2600     vi  = aj + aii;
2601     nz  = ad[row] - ad[row + 1] - 1;
2602 
2603     if (i > 0) {
2604       /* Prefetch the indices for the next block */
2605       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2606       /* Prefetch the data for the next block */
2607       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2608     }
2609 
2610     switch (nsz) { /* Each loop in 'case' is unrolled */
2611     case 1:
2612       sum1 = tmp[row];
2613 
2614       for (j = 0; j < nz - 1; j += 2) {
2615         i0   = vi[j];
2616         i1   = vi[j + 1];
2617         tmp0 = tmps[i0];
2618         tmp1 = tmps[i1];
2619         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2620       }
2621       if (j == nz - 1) {
2622         tmp0 = tmps[vi[j]];
2623         sum1 -= v1[j] * tmp0;
2624       }
2625       x[c[row]] = tmp[row] = sum1 * v1[nz];
2626       row--;
2627       break;
2628     case 2:
2629       sum1 = tmp[row];
2630       sum2 = tmp[row - 1];
2631       v2   = aa + ad[row] + 1;
2632       for (j = 0; j < nz - 1; j += 2) {
2633         i0   = vi[j];
2634         i1   = vi[j + 1];
2635         tmp0 = tmps[i0];
2636         tmp1 = tmps[i1];
2637         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2638         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2639       }
2640       if (j == nz - 1) {
2641         tmp0 = tmps[vi[j]];
2642         sum1 -= v1[j] * tmp0;
2643         sum2 -= v2[j + 1] * tmp0;
2644       }
2645 
2646       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2647       row--;
2648       sum2 -= v2[0] * tmp0;
2649       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2650       row--;
2651       break;
2652     case 3:
2653       sum1 = tmp[row];
2654       sum2 = tmp[row - 1];
2655       sum3 = tmp[row - 2];
2656       v2   = aa + ad[row] + 1;
2657       v3   = aa + ad[row - 1] + 1;
2658       for (j = 0; j < nz - 1; j += 2) {
2659         i0   = vi[j];
2660         i1   = vi[j + 1];
2661         tmp0 = tmps[i0];
2662         tmp1 = tmps[i1];
2663         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2664         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2665         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2666       }
2667       if (j == nz - 1) {
2668         tmp0 = tmps[vi[j]];
2669         sum1 -= v1[j] * tmp0;
2670         sum2 -= v2[j + 1] * tmp0;
2671         sum3 -= v3[j + 2] * tmp0;
2672       }
2673       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2674       row--;
2675       sum2 -= v2[0] * tmp0;
2676       sum3 -= v3[1] * tmp0;
2677       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2678       row--;
2679       sum3 -= v3[0] * tmp0;
2680       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2681       row--;
2682 
2683       break;
2684     case 4:
2685       sum1 = tmp[row];
2686       sum2 = tmp[row - 1];
2687       sum3 = tmp[row - 2];
2688       sum4 = tmp[row - 3];
2689       v2   = aa + ad[row] + 1;
2690       v3   = aa + ad[row - 1] + 1;
2691       v4   = aa + ad[row - 2] + 1;
2692 
2693       for (j = 0; j < nz - 1; j += 2) {
2694         i0   = vi[j];
2695         i1   = vi[j + 1];
2696         tmp0 = tmps[i0];
2697         tmp1 = tmps[i1];
2698         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2699         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2700         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2701         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2702       }
2703       if (j == nz - 1) {
2704         tmp0 = tmps[vi[j]];
2705         sum1 -= v1[j] * tmp0;
2706         sum2 -= v2[j + 1] * tmp0;
2707         sum3 -= v3[j + 2] * tmp0;
2708         sum4 -= v4[j + 3] * tmp0;
2709       }
2710 
2711       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2712       row--;
2713       sum2 -= v2[0] * tmp0;
2714       sum3 -= v3[1] * tmp0;
2715       sum4 -= v4[2] * tmp0;
2716       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2717       row--;
2718       sum3 -= v3[0] * tmp0;
2719       sum4 -= v4[1] * tmp0;
2720       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2721       row--;
2722       sum4 -= v4[0] * tmp0;
2723       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2724       row--;
2725       break;
2726     case 5:
2727       sum1 = tmp[row];
2728       sum2 = tmp[row - 1];
2729       sum3 = tmp[row - 2];
2730       sum4 = tmp[row - 3];
2731       sum5 = tmp[row - 4];
2732       v2   = aa + ad[row] + 1;
2733       v3   = aa + ad[row - 1] + 1;
2734       v4   = aa + ad[row - 2] + 1;
2735       v5   = aa + ad[row - 3] + 1;
2736       for (j = 0; j < nz - 1; j += 2) {
2737         i0   = vi[j];
2738         i1   = vi[j + 1];
2739         tmp0 = tmps[i0];
2740         tmp1 = tmps[i1];
2741         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2742         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2743         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2744         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2745         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2746       }
2747       if (j == nz - 1) {
2748         tmp0 = tmps[vi[j]];
2749         sum1 -= v1[j] * tmp0;
2750         sum2 -= v2[j + 1] * tmp0;
2751         sum3 -= v3[j + 2] * tmp0;
2752         sum4 -= v4[j + 3] * tmp0;
2753         sum5 -= v5[j + 4] * tmp0;
2754       }
2755 
2756       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2757       row--;
2758       sum2 -= v2[0] * tmp0;
2759       sum3 -= v3[1] * tmp0;
2760       sum4 -= v4[2] * tmp0;
2761       sum5 -= v5[3] * tmp0;
2762       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2763       row--;
2764       sum3 -= v3[0] * tmp0;
2765       sum4 -= v4[1] * tmp0;
2766       sum5 -= v5[2] * tmp0;
2767       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2768       row--;
2769       sum4 -= v4[0] * tmp0;
2770       sum5 -= v5[1] * tmp0;
2771       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2772       row--;
2773       sum5 -= v5[0] * tmp0;
2774       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2775       row--;
2776       break;
2777     default:
2778       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2779     }
2780   }
2781   PetscCall(ISRestoreIndices(isrow, &rout));
2782   PetscCall(ISRestoreIndices(iscol, &cout));
2783   PetscCall(VecRestoreArrayRead(bb, &b));
2784   PetscCall(VecRestoreArrayWrite(xx, &x));
2785   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2786   PetscFunctionReturn(PETSC_SUCCESS);
2787 }
2788 
2789 /*
2790      Makes a longer coloring[] array and calls the usual code with that
2791 */
2792 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2793 {
2794   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2795   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2796   PetscInt        *colorused, i;
2797   ISColoringValue *newcolor;
2798 
2799   PetscFunctionBegin;
2800   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2801   PetscCall(PetscMalloc1(n + 1, &newcolor));
2802   /* loop over inodes, marking a color for each column*/
2803   row = 0;
2804   for (i = 0; i < m; i++) {
2805     for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2806   }
2807 
2808   /* eliminate unneeded colors */
2809   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2810   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2811 
2812   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2813   ncolors = colorused[5 * ncolors - 1];
2814   for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2815   PetscCall(PetscFree(colorused));
2816   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2817   PetscCall(PetscFree(coloring));
2818   PetscFunctionReturn(PETSC_SUCCESS);
2819 }
2820 
2821 #include <petsc/private/kernels/blockinvert.h>
2822 
2823 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2824 {
2825   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2826   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2827   MatScalar         *ibdiag, *bdiag, work[25], *t;
2828   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2829   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2830   const PetscScalar *xb, *b;
2831   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2832   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2833   PetscInt           sz, k, ipvt[5];
2834   PetscBool          allowzeropivot, zeropivotdetected;
2835   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
2836 
2837   PetscFunctionBegin;
2838   allowzeropivot = PetscNot(A->erroriffailure);
2839   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2840   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2841   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2842 
2843   if (!a->inode.ibdiagvalid) {
2844     if (!a->inode.ibdiag) {
2845       /* calculate space needed for diagonal blocks */
2846       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2847       a->inode.bdiagsize = cnt;
2848 
2849       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2850     }
2851 
2852     /* copy over the diagonal blocks and invert them */
2853     ibdiag = a->inode.ibdiag;
2854     bdiag  = a->inode.bdiag;
2855     cnt    = 0;
2856     for (i = 0, row = 0; i < m; i++) {
2857       for (j = 0; j < sizes[i]; j++) {
2858         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2859       }
2860       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
2861 
2862       switch (sizes[i]) {
2863       case 1:
2864         /* Create matrix data structure */
2865         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2866           if (allowzeropivot) {
2867             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2868             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2869             A->factorerror_zeropivot_row   = row;
2870             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2871           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2872         }
2873         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2874         break;
2875       case 2:
2876         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2877         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2878         break;
2879       case 3:
2880         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2881         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2882         break;
2883       case 4:
2884         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2885         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2886         break;
2887       case 5:
2888         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2889         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2890         break;
2891       default:
2892         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2893       }
2894       cnt += sizes[i] * sizes[i];
2895       row += sizes[i];
2896     }
2897     a->inode.ibdiagvalid = PETSC_TRUE;
2898   }
2899   ibdiag = a->inode.ibdiag;
2900   bdiag  = a->inode.bdiag;
2901   t      = a->inode.ssor_work;
2902 
2903   PetscCall(VecGetArray(xx, &x));
2904   PetscCall(VecGetArrayRead(bb, &b));
2905   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2906   if (flag & SOR_ZERO_INITIAL_GUESS) {
2907     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2908       for (i = 0, row = 0; i < m; i++) {
2909         sz  = diag[row] - ii[row];
2910         v1  = a->a + ii[row];
2911         idx = a->j + ii[row];
2912 
2913         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2914         switch (sizes[i]) {
2915         case 1:
2916 
2917           sum1 = b[row];
2918           for (n = 0; n < sz - 1; n += 2) {
2919             i1 = idx[0];
2920             i2 = idx[1];
2921             idx += 2;
2922             tmp0 = x[i1];
2923             tmp1 = x[i2];
2924             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2925             v1 += 2;
2926           }
2927 
2928           if (n == sz - 1) {
2929             tmp0 = x[*idx];
2930             sum1 -= *v1 * tmp0;
2931           }
2932           t[row]   = sum1;
2933           x[row++] = sum1 * (*ibdiag++);
2934           break;
2935         case 2:
2936           v2   = a->a + ii[row + 1];
2937           sum1 = b[row];
2938           sum2 = b[row + 1];
2939           for (n = 0; n < sz - 1; n += 2) {
2940             i1 = idx[0];
2941             i2 = idx[1];
2942             idx += 2;
2943             tmp0 = x[i1];
2944             tmp1 = x[i2];
2945             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2946             v1 += 2;
2947             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2948             v2 += 2;
2949           }
2950 
2951           if (n == sz - 1) {
2952             tmp0 = x[*idx];
2953             sum1 -= v1[0] * tmp0;
2954             sum2 -= v2[0] * tmp0;
2955           }
2956           t[row]     = sum1;
2957           t[row + 1] = sum2;
2958           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2959           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2960           ibdiag += 4;
2961           break;
2962         case 3:
2963           v2   = a->a + ii[row + 1];
2964           v3   = a->a + ii[row + 2];
2965           sum1 = b[row];
2966           sum2 = b[row + 1];
2967           sum3 = b[row + 2];
2968           for (n = 0; n < sz - 1; n += 2) {
2969             i1 = idx[0];
2970             i2 = idx[1];
2971             idx += 2;
2972             tmp0 = x[i1];
2973             tmp1 = x[i2];
2974             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2975             v1 += 2;
2976             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2977             v2 += 2;
2978             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2979             v3 += 2;
2980           }
2981 
2982           if (n == sz - 1) {
2983             tmp0 = x[*idx];
2984             sum1 -= v1[0] * tmp0;
2985             sum2 -= v2[0] * tmp0;
2986             sum3 -= v3[0] * tmp0;
2987           }
2988           t[row]     = sum1;
2989           t[row + 1] = sum2;
2990           t[row + 2] = sum3;
2991           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2992           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2993           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2994           ibdiag += 9;
2995           break;
2996         case 4:
2997           v2   = a->a + ii[row + 1];
2998           v3   = a->a + ii[row + 2];
2999           v4   = a->a + ii[row + 3];
3000           sum1 = b[row];
3001           sum2 = b[row + 1];
3002           sum3 = b[row + 2];
3003           sum4 = b[row + 3];
3004           for (n = 0; n < sz - 1; n += 2) {
3005             i1 = idx[0];
3006             i2 = idx[1];
3007             idx += 2;
3008             tmp0 = x[i1];
3009             tmp1 = x[i2];
3010             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3011             v1 += 2;
3012             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3013             v2 += 2;
3014             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3015             v3 += 2;
3016             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3017             v4 += 2;
3018           }
3019 
3020           if (n == sz - 1) {
3021             tmp0 = x[*idx];
3022             sum1 -= v1[0] * tmp0;
3023             sum2 -= v2[0] * tmp0;
3024             sum3 -= v3[0] * tmp0;
3025             sum4 -= v4[0] * tmp0;
3026           }
3027           t[row]     = sum1;
3028           t[row + 1] = sum2;
3029           t[row + 2] = sum3;
3030           t[row + 3] = sum4;
3031           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3032           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3033           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3034           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3035           ibdiag += 16;
3036           break;
3037         case 5:
3038           v2   = a->a + ii[row + 1];
3039           v3   = a->a + ii[row + 2];
3040           v4   = a->a + ii[row + 3];
3041           v5   = a->a + ii[row + 4];
3042           sum1 = b[row];
3043           sum2 = b[row + 1];
3044           sum3 = b[row + 2];
3045           sum4 = b[row + 3];
3046           sum5 = b[row + 4];
3047           for (n = 0; n < sz - 1; n += 2) {
3048             i1 = idx[0];
3049             i2 = idx[1];
3050             idx += 2;
3051             tmp0 = x[i1];
3052             tmp1 = x[i2];
3053             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3054             v1 += 2;
3055             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3056             v2 += 2;
3057             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3058             v3 += 2;
3059             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3060             v4 += 2;
3061             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3062             v5 += 2;
3063           }
3064 
3065           if (n == sz - 1) {
3066             tmp0 = x[*idx];
3067             sum1 -= v1[0] * tmp0;
3068             sum2 -= v2[0] * tmp0;
3069             sum3 -= v3[0] * tmp0;
3070             sum4 -= v4[0] * tmp0;
3071             sum5 -= v5[0] * tmp0;
3072           }
3073           t[row]     = sum1;
3074           t[row + 1] = sum2;
3075           t[row + 2] = sum3;
3076           t[row + 3] = sum4;
3077           t[row + 4] = sum5;
3078           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3079           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3080           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3081           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3082           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3083           ibdiag += 25;
3084           break;
3085         default:
3086           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3087         }
3088       }
3089 
3090       xb = t;
3091       PetscCall(PetscLogFlops(a->nz));
3092     } else xb = b;
3093     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3094       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3095       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3096         ibdiag -= sizes[i] * sizes[i];
3097         sz  = ii[row + 1] - diag[row] - 1;
3098         v1  = a->a + diag[row] + 1;
3099         idx = a->j + diag[row] + 1;
3100 
3101         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3102         switch (sizes[i]) {
3103         case 1:
3104 
3105           sum1 = xb[row];
3106           for (n = 0; n < sz - 1; n += 2) {
3107             i1 = idx[0];
3108             i2 = idx[1];
3109             idx += 2;
3110             tmp0 = x[i1];
3111             tmp1 = x[i2];
3112             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3113             v1 += 2;
3114           }
3115 
3116           if (n == sz - 1) {
3117             tmp0 = x[*idx];
3118             sum1 -= *v1 * tmp0;
3119           }
3120           x[row--] = sum1 * (*ibdiag);
3121           break;
3122 
3123         case 2:
3124 
3125           sum1 = xb[row];
3126           sum2 = xb[row - 1];
3127           /* note that sum1 is associated with the second of the two rows */
3128           v2 = a->a + diag[row - 1] + 2;
3129           for (n = 0; n < sz - 1; n += 2) {
3130             i1 = idx[0];
3131             i2 = idx[1];
3132             idx += 2;
3133             tmp0 = x[i1];
3134             tmp1 = x[i2];
3135             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3136             v1 += 2;
3137             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3138             v2 += 2;
3139           }
3140 
3141           if (n == sz - 1) {
3142             tmp0 = x[*idx];
3143             sum1 -= *v1 * tmp0;
3144             sum2 -= *v2 * tmp0;
3145           }
3146           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3147           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3148           break;
3149         case 3:
3150 
3151           sum1 = xb[row];
3152           sum2 = xb[row - 1];
3153           sum3 = xb[row - 2];
3154           v2   = a->a + diag[row - 1] + 2;
3155           v3   = a->a + diag[row - 2] + 3;
3156           for (n = 0; n < sz - 1; n += 2) {
3157             i1 = idx[0];
3158             i2 = idx[1];
3159             idx += 2;
3160             tmp0 = x[i1];
3161             tmp1 = x[i2];
3162             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3163             v1 += 2;
3164             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3165             v2 += 2;
3166             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3167             v3 += 2;
3168           }
3169 
3170           if (n == sz - 1) {
3171             tmp0 = x[*idx];
3172             sum1 -= *v1 * tmp0;
3173             sum2 -= *v2 * tmp0;
3174             sum3 -= *v3 * tmp0;
3175           }
3176           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3177           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3178           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3179           break;
3180         case 4:
3181 
3182           sum1 = xb[row];
3183           sum2 = xb[row - 1];
3184           sum3 = xb[row - 2];
3185           sum4 = xb[row - 3];
3186           v2   = a->a + diag[row - 1] + 2;
3187           v3   = a->a + diag[row - 2] + 3;
3188           v4   = a->a + diag[row - 3] + 4;
3189           for (n = 0; n < sz - 1; n += 2) {
3190             i1 = idx[0];
3191             i2 = idx[1];
3192             idx += 2;
3193             tmp0 = x[i1];
3194             tmp1 = x[i2];
3195             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3196             v1 += 2;
3197             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3198             v2 += 2;
3199             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3200             v3 += 2;
3201             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3202             v4 += 2;
3203           }
3204 
3205           if (n == sz - 1) {
3206             tmp0 = x[*idx];
3207             sum1 -= *v1 * tmp0;
3208             sum2 -= *v2 * tmp0;
3209             sum3 -= *v3 * tmp0;
3210             sum4 -= *v4 * tmp0;
3211           }
3212           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3213           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3214           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3215           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3216           break;
3217         case 5:
3218 
3219           sum1 = xb[row];
3220           sum2 = xb[row - 1];
3221           sum3 = xb[row - 2];
3222           sum4 = xb[row - 3];
3223           sum5 = xb[row - 4];
3224           v2   = a->a + diag[row - 1] + 2;
3225           v3   = a->a + diag[row - 2] + 3;
3226           v4   = a->a + diag[row - 3] + 4;
3227           v5   = a->a + diag[row - 4] + 5;
3228           for (n = 0; n < sz - 1; n += 2) {
3229             i1 = idx[0];
3230             i2 = idx[1];
3231             idx += 2;
3232             tmp0 = x[i1];
3233             tmp1 = x[i2];
3234             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3235             v1 += 2;
3236             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3237             v2 += 2;
3238             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3239             v3 += 2;
3240             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3241             v4 += 2;
3242             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3243             v5 += 2;
3244           }
3245 
3246           if (n == sz - 1) {
3247             tmp0 = x[*idx];
3248             sum1 -= *v1 * tmp0;
3249             sum2 -= *v2 * tmp0;
3250             sum3 -= *v3 * tmp0;
3251             sum4 -= *v4 * tmp0;
3252             sum5 -= *v5 * tmp0;
3253           }
3254           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3255           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3256           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3257           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3258           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3259           break;
3260         default:
3261           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3262         }
3263       }
3264 
3265       PetscCall(PetscLogFlops(a->nz));
3266     }
3267     its--;
3268   }
3269   while (its--) {
3270     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3271       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3272         sz  = diag[row] - ii[row];
3273         v1  = a->a + ii[row];
3274         idx = a->j + ii[row];
3275         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3276         switch (sizes[i]) {
3277         case 1:
3278           sum1 = b[row];
3279           for (n = 0; n < sz - 1; n += 2) {
3280             i1 = idx[0];
3281             i2 = idx[1];
3282             idx += 2;
3283             tmp0 = x[i1];
3284             tmp1 = x[i2];
3285             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3286             v1 += 2;
3287           }
3288           if (n == sz - 1) {
3289             tmp0 = x[*idx++];
3290             sum1 -= *v1 * tmp0;
3291             v1++;
3292           }
3293           t[row] = sum1;
3294           sz     = ii[row + 1] - diag[row] - 1;
3295           idx    = a->j + diag[row] + 1;
3296           v1 += 1;
3297           for (n = 0; n < sz - 1; n += 2) {
3298             i1 = idx[0];
3299             i2 = idx[1];
3300             idx += 2;
3301             tmp0 = x[i1];
3302             tmp1 = x[i2];
3303             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3304             v1 += 2;
3305           }
3306           if (n == sz - 1) {
3307             tmp0 = x[*idx++];
3308             sum1 -= *v1 * tmp0;
3309           }
3310           /* in MatSOR_SeqAIJ this line would be
3311            *
3312            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3313            *
3314            * but omega == 1, so this becomes
3315            *
3316            * x[row] = sum1*(*ibdiag++);
3317            *
3318            */
3319           x[row] = sum1 * (*ibdiag);
3320           break;
3321         case 2:
3322           v2   = a->a + ii[row + 1];
3323           sum1 = b[row];
3324           sum2 = b[row + 1];
3325           for (n = 0; n < sz - 1; n += 2) {
3326             i1 = idx[0];
3327             i2 = idx[1];
3328             idx += 2;
3329             tmp0 = x[i1];
3330             tmp1 = x[i2];
3331             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3332             v1 += 2;
3333             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3334             v2 += 2;
3335           }
3336           if (n == sz - 1) {
3337             tmp0 = x[*idx++];
3338             sum1 -= v1[0] * tmp0;
3339             sum2 -= v2[0] * tmp0;
3340             v1++;
3341             v2++;
3342           }
3343           t[row]     = sum1;
3344           t[row + 1] = sum2;
3345           sz         = ii[row + 1] - diag[row] - 2;
3346           idx        = a->j + diag[row] + 2;
3347           v1 += 2;
3348           v2 += 2;
3349           for (n = 0; n < sz - 1; n += 2) {
3350             i1 = idx[0];
3351             i2 = idx[1];
3352             idx += 2;
3353             tmp0 = x[i1];
3354             tmp1 = x[i2];
3355             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3356             v1 += 2;
3357             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3358             v2 += 2;
3359           }
3360           if (n == sz - 1) {
3361             tmp0 = x[*idx];
3362             sum1 -= v1[0] * tmp0;
3363             sum2 -= v2[0] * tmp0;
3364           }
3365           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3366           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3367           break;
3368         case 3:
3369           v2   = a->a + ii[row + 1];
3370           v3   = a->a + ii[row + 2];
3371           sum1 = b[row];
3372           sum2 = b[row + 1];
3373           sum3 = b[row + 2];
3374           for (n = 0; n < sz - 1; n += 2) {
3375             i1 = idx[0];
3376             i2 = idx[1];
3377             idx += 2;
3378             tmp0 = x[i1];
3379             tmp1 = x[i2];
3380             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3381             v1 += 2;
3382             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3383             v2 += 2;
3384             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3385             v3 += 2;
3386           }
3387           if (n == sz - 1) {
3388             tmp0 = x[*idx++];
3389             sum1 -= v1[0] * tmp0;
3390             sum2 -= v2[0] * tmp0;
3391             sum3 -= v3[0] * tmp0;
3392             v1++;
3393             v2++;
3394             v3++;
3395           }
3396           t[row]     = sum1;
3397           t[row + 1] = sum2;
3398           t[row + 2] = sum3;
3399           sz         = ii[row + 1] - diag[row] - 3;
3400           idx        = a->j + diag[row] + 3;
3401           v1 += 3;
3402           v2 += 3;
3403           v3 += 3;
3404           for (n = 0; n < sz - 1; n += 2) {
3405             i1 = idx[0];
3406             i2 = idx[1];
3407             idx += 2;
3408             tmp0 = x[i1];
3409             tmp1 = x[i2];
3410             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3411             v1 += 2;
3412             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3413             v2 += 2;
3414             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3415             v3 += 2;
3416           }
3417           if (n == sz - 1) {
3418             tmp0 = x[*idx];
3419             sum1 -= v1[0] * tmp0;
3420             sum2 -= v2[0] * tmp0;
3421             sum3 -= v3[0] * tmp0;
3422           }
3423           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3424           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3425           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3426           break;
3427         case 4:
3428           v2   = a->a + ii[row + 1];
3429           v3   = a->a + ii[row + 2];
3430           v4   = a->a + ii[row + 3];
3431           sum1 = b[row];
3432           sum2 = b[row + 1];
3433           sum3 = b[row + 2];
3434           sum4 = b[row + 3];
3435           for (n = 0; n < sz - 1; n += 2) {
3436             i1 = idx[0];
3437             i2 = idx[1];
3438             idx += 2;
3439             tmp0 = x[i1];
3440             tmp1 = x[i2];
3441             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3442             v1 += 2;
3443             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3444             v2 += 2;
3445             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3446             v3 += 2;
3447             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3448             v4 += 2;
3449           }
3450           if (n == sz - 1) {
3451             tmp0 = x[*idx++];
3452             sum1 -= v1[0] * tmp0;
3453             sum2 -= v2[0] * tmp0;
3454             sum3 -= v3[0] * tmp0;
3455             sum4 -= v4[0] * tmp0;
3456             v1++;
3457             v2++;
3458             v3++;
3459             v4++;
3460           }
3461           t[row]     = sum1;
3462           t[row + 1] = sum2;
3463           t[row + 2] = sum3;
3464           t[row + 3] = sum4;
3465           sz         = ii[row + 1] - diag[row] - 4;
3466           idx        = a->j + diag[row] + 4;
3467           v1 += 4;
3468           v2 += 4;
3469           v3 += 4;
3470           v4 += 4;
3471           for (n = 0; n < sz - 1; n += 2) {
3472             i1 = idx[0];
3473             i2 = idx[1];
3474             idx += 2;
3475             tmp0 = x[i1];
3476             tmp1 = x[i2];
3477             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3478             v1 += 2;
3479             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3480             v2 += 2;
3481             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3482             v3 += 2;
3483             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3484             v4 += 2;
3485           }
3486           if (n == sz - 1) {
3487             tmp0 = x[*idx];
3488             sum1 -= v1[0] * tmp0;
3489             sum2 -= v2[0] * tmp0;
3490             sum3 -= v3[0] * tmp0;
3491             sum4 -= v4[0] * tmp0;
3492           }
3493           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3494           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3495           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3496           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3497           break;
3498         case 5:
3499           v2   = a->a + ii[row + 1];
3500           v3   = a->a + ii[row + 2];
3501           v4   = a->a + ii[row + 3];
3502           v5   = a->a + ii[row + 4];
3503           sum1 = b[row];
3504           sum2 = b[row + 1];
3505           sum3 = b[row + 2];
3506           sum4 = b[row + 3];
3507           sum5 = b[row + 4];
3508           for (n = 0; n < sz - 1; n += 2) {
3509             i1 = idx[0];
3510             i2 = idx[1];
3511             idx += 2;
3512             tmp0 = x[i1];
3513             tmp1 = x[i2];
3514             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3515             v1 += 2;
3516             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3517             v2 += 2;
3518             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3519             v3 += 2;
3520             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3521             v4 += 2;
3522             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3523             v5 += 2;
3524           }
3525           if (n == sz - 1) {
3526             tmp0 = x[*idx++];
3527             sum1 -= v1[0] * tmp0;
3528             sum2 -= v2[0] * tmp0;
3529             sum3 -= v3[0] * tmp0;
3530             sum4 -= v4[0] * tmp0;
3531             sum5 -= v5[0] * tmp0;
3532             v1++;
3533             v2++;
3534             v3++;
3535             v4++;
3536             v5++;
3537           }
3538           t[row]     = sum1;
3539           t[row + 1] = sum2;
3540           t[row + 2] = sum3;
3541           t[row + 3] = sum4;
3542           t[row + 4] = sum5;
3543           sz         = ii[row + 1] - diag[row] - 5;
3544           idx        = a->j + diag[row] + 5;
3545           v1 += 5;
3546           v2 += 5;
3547           v3 += 5;
3548           v4 += 5;
3549           v5 += 5;
3550           for (n = 0; n < sz - 1; n += 2) {
3551             i1 = idx[0];
3552             i2 = idx[1];
3553             idx += 2;
3554             tmp0 = x[i1];
3555             tmp1 = x[i2];
3556             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3557             v1 += 2;
3558             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3559             v2 += 2;
3560             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3561             v3 += 2;
3562             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3563             v4 += 2;
3564             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3565             v5 += 2;
3566           }
3567           if (n == sz - 1) {
3568             tmp0 = x[*idx];
3569             sum1 -= v1[0] * tmp0;
3570             sum2 -= v2[0] * tmp0;
3571             sum3 -= v3[0] * tmp0;
3572             sum4 -= v4[0] * tmp0;
3573             sum5 -= v5[0] * tmp0;
3574           }
3575           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3576           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3577           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3578           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3579           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3580           break;
3581         default:
3582           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3583         }
3584       }
3585       xb = t;
3586       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3587     } else xb = b;
3588 
3589     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3590       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3591       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3592         ibdiag -= sizes[i] * sizes[i];
3593 
3594         /* set RHS */
3595         if (xb == b) {
3596           /* whole (old way) */
3597           sz  = ii[row + 1] - ii[row];
3598           idx = a->j + ii[row];
3599           switch (sizes[i]) {
3600           case 5:
3601             v5 = a->a + ii[row - 4]; /* fall through */
3602           case 4:
3603             v4 = a->a + ii[row - 3]; /* fall through */
3604           case 3:
3605             v3 = a->a + ii[row - 2]; /* fall through */
3606           case 2:
3607             v2 = a->a + ii[row - 1]; /* fall through */
3608           case 1:
3609             v1 = a->a + ii[row];
3610             break;
3611           default:
3612             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3613           }
3614         } else {
3615           /* upper, no diag */
3616           sz  = ii[row + 1] - diag[row] - 1;
3617           idx = a->j + diag[row] + 1;
3618           switch (sizes[i]) {
3619           case 5:
3620             v5 = a->a + diag[row - 4] + 5; /* fall through */
3621           case 4:
3622             v4 = a->a + diag[row - 3] + 4; /* fall through */
3623           case 3:
3624             v3 = a->a + diag[row - 2] + 3; /* fall through */
3625           case 2:
3626             v2 = a->a + diag[row - 1] + 2; /* fall through */
3627           case 1:
3628             v1 = a->a + diag[row] + 1;
3629           }
3630         }
3631         /* set sum */
3632         switch (sizes[i]) {
3633         case 5:
3634           sum5 = xb[row - 4]; /* fall through */
3635         case 4:
3636           sum4 = xb[row - 3]; /* fall through */
3637         case 3:
3638           sum3 = xb[row - 2]; /* fall through */
3639         case 2:
3640           sum2 = xb[row - 1]; /* fall through */
3641         case 1:
3642           /* note that sum1 is associated with the last row */
3643           sum1 = xb[row];
3644         }
3645         /* do sums */
3646         for (n = 0; n < sz - 1; n += 2) {
3647           i1 = idx[0];
3648           i2 = idx[1];
3649           idx += 2;
3650           tmp0 = x[i1];
3651           tmp1 = x[i2];
3652           switch (sizes[i]) {
3653           case 5:
3654             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3655             v5 += 2; /* fall through */
3656           case 4:
3657             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3658             v4 += 2; /* fall through */
3659           case 3:
3660             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3661             v3 += 2; /* fall through */
3662           case 2:
3663             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3664             v2 += 2; /* fall through */
3665           case 1:
3666             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3667             v1 += 2;
3668           }
3669         }
3670         /* ragged edge */
3671         if (n == sz - 1) {
3672           tmp0 = x[*idx];
3673           switch (sizes[i]) {
3674           case 5:
3675             sum5 -= *v5 * tmp0; /* fall through */
3676           case 4:
3677             sum4 -= *v4 * tmp0; /* fall through */
3678           case 3:
3679             sum3 -= *v3 * tmp0; /* fall through */
3680           case 2:
3681             sum2 -= *v2 * tmp0; /* fall through */
3682           case 1:
3683             sum1 -= *v1 * tmp0;
3684           }
3685         }
3686         /* update */
3687         if (xb == b) {
3688           /* whole (old way) w/ diag */
3689           switch (sizes[i]) {
3690           case 5:
3691             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3692             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3693             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3694             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3695             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3696             break;
3697           case 4:
3698             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3699             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3700             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3701             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3702             break;
3703           case 3:
3704             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3705             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3706             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3707             break;
3708           case 2:
3709             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3710             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3711             break;
3712           case 1:
3713             x[row--] += sum1 * (*ibdiag);
3714             break;
3715           }
3716         } else {
3717           /* no diag so set =  */
3718           switch (sizes[i]) {
3719           case 5:
3720             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3721             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3722             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3723             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3724             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3725             break;
3726           case 4:
3727             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3728             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3729             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3730             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3731             break;
3732           case 3:
3733             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3734             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3735             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3736             break;
3737           case 2:
3738             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3739             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3740             break;
3741           case 1:
3742             x[row--] = sum1 * (*ibdiag);
3743             break;
3744           }
3745         }
3746       }
3747       if (xb == b) {
3748         PetscCall(PetscLogFlops(2.0 * a->nz));
3749       } else {
3750         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3751       }
3752     }
3753   }
3754   if (flag & SOR_EISENSTAT) {
3755     /*
3756           Apply  (U + D)^-1  where D is now the block diagonal
3757     */
3758     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3759     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3760       ibdiag -= sizes[i] * sizes[i];
3761       sz  = ii[row + 1] - diag[row] - 1;
3762       v1  = a->a + diag[row] + 1;
3763       idx = a->j + diag[row] + 1;
3764       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3765       switch (sizes[i]) {
3766       case 1:
3767 
3768         sum1 = b[row];
3769         for (n = 0; n < sz - 1; n += 2) {
3770           i1 = idx[0];
3771           i2 = idx[1];
3772           idx += 2;
3773           tmp0 = x[i1];
3774           tmp1 = x[i2];
3775           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3776           v1 += 2;
3777         }
3778 
3779         if (n == sz - 1) {
3780           tmp0 = x[*idx];
3781           sum1 -= *v1 * tmp0;
3782         }
3783         x[row] = sum1 * (*ibdiag);
3784         row--;
3785         break;
3786 
3787       case 2:
3788 
3789         sum1 = b[row];
3790         sum2 = b[row - 1];
3791         /* note that sum1 is associated with the second of the two rows */
3792         v2 = a->a + diag[row - 1] + 2;
3793         for (n = 0; n < sz - 1; n += 2) {
3794           i1 = idx[0];
3795           i2 = idx[1];
3796           idx += 2;
3797           tmp0 = x[i1];
3798           tmp1 = x[i2];
3799           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3800           v1 += 2;
3801           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3802           v2 += 2;
3803         }
3804 
3805         if (n == sz - 1) {
3806           tmp0 = x[*idx];
3807           sum1 -= *v1 * tmp0;
3808           sum2 -= *v2 * tmp0;
3809         }
3810         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3811         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3812         row -= 2;
3813         break;
3814       case 3:
3815 
3816         sum1 = b[row];
3817         sum2 = b[row - 1];
3818         sum3 = b[row - 2];
3819         v2   = a->a + diag[row - 1] + 2;
3820         v3   = a->a + diag[row - 2] + 3;
3821         for (n = 0; n < sz - 1; n += 2) {
3822           i1 = idx[0];
3823           i2 = idx[1];
3824           idx += 2;
3825           tmp0 = x[i1];
3826           tmp1 = x[i2];
3827           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3828           v1 += 2;
3829           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3830           v2 += 2;
3831           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3832           v3 += 2;
3833         }
3834 
3835         if (n == sz - 1) {
3836           tmp0 = x[*idx];
3837           sum1 -= *v1 * tmp0;
3838           sum2 -= *v2 * tmp0;
3839           sum3 -= *v3 * tmp0;
3840         }
3841         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3842         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3843         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3844         row -= 3;
3845         break;
3846       case 4:
3847 
3848         sum1 = b[row];
3849         sum2 = b[row - 1];
3850         sum3 = b[row - 2];
3851         sum4 = b[row - 3];
3852         v2   = a->a + diag[row - 1] + 2;
3853         v3   = a->a + diag[row - 2] + 3;
3854         v4   = a->a + diag[row - 3] + 4;
3855         for (n = 0; n < sz - 1; n += 2) {
3856           i1 = idx[0];
3857           i2 = idx[1];
3858           idx += 2;
3859           tmp0 = x[i1];
3860           tmp1 = x[i2];
3861           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3862           v1 += 2;
3863           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3864           v2 += 2;
3865           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3866           v3 += 2;
3867           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3868           v4 += 2;
3869         }
3870 
3871         if (n == sz - 1) {
3872           tmp0 = x[*idx];
3873           sum1 -= *v1 * tmp0;
3874           sum2 -= *v2 * tmp0;
3875           sum3 -= *v3 * tmp0;
3876           sum4 -= *v4 * tmp0;
3877         }
3878         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3879         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3880         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3881         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3882         row -= 4;
3883         break;
3884       case 5:
3885 
3886         sum1 = b[row];
3887         sum2 = b[row - 1];
3888         sum3 = b[row - 2];
3889         sum4 = b[row - 3];
3890         sum5 = b[row - 4];
3891         v2   = a->a + diag[row - 1] + 2;
3892         v3   = a->a + diag[row - 2] + 3;
3893         v4   = a->a + diag[row - 3] + 4;
3894         v5   = a->a + diag[row - 4] + 5;
3895         for (n = 0; n < sz - 1; n += 2) {
3896           i1 = idx[0];
3897           i2 = idx[1];
3898           idx += 2;
3899           tmp0 = x[i1];
3900           tmp1 = x[i2];
3901           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3902           v1 += 2;
3903           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3904           v2 += 2;
3905           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3906           v3 += 2;
3907           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3908           v4 += 2;
3909           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3910           v5 += 2;
3911         }
3912 
3913         if (n == sz - 1) {
3914           tmp0 = x[*idx];
3915           sum1 -= *v1 * tmp0;
3916           sum2 -= *v2 * tmp0;
3917           sum3 -= *v3 * tmp0;
3918           sum4 -= *v4 * tmp0;
3919           sum5 -= *v5 * tmp0;
3920         }
3921         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3922         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3923         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3924         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3925         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3926         row -= 5;
3927         break;
3928       default:
3929         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3930       }
3931     }
3932     PetscCall(PetscLogFlops(a->nz));
3933 
3934     /*
3935            t = b - D x    where D is the block diagonal
3936     */
3937     cnt = 0;
3938     for (i = 0, row = 0; i < m; i++) {
3939       switch (sizes[i]) {
3940       case 1:
3941         t[row] = b[row] - bdiag[cnt++] * x[row];
3942         row++;
3943         break;
3944       case 2:
3945         x1         = x[row];
3946         x2         = x[row + 1];
3947         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3948         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3949         t[row]     = b[row] - tmp1;
3950         t[row + 1] = b[row + 1] - tmp2;
3951         row += 2;
3952         cnt += 4;
3953         break;
3954       case 3:
3955         x1         = x[row];
3956         x2         = x[row + 1];
3957         x3         = x[row + 2];
3958         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3959         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3960         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3961         t[row]     = b[row] - tmp1;
3962         t[row + 1] = b[row + 1] - tmp2;
3963         t[row + 2] = b[row + 2] - tmp3;
3964         row += 3;
3965         cnt += 9;
3966         break;
3967       case 4:
3968         x1         = x[row];
3969         x2         = x[row + 1];
3970         x3         = x[row + 2];
3971         x4         = x[row + 3];
3972         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3973         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3974         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3975         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3976         t[row]     = b[row] - tmp1;
3977         t[row + 1] = b[row + 1] - tmp2;
3978         t[row + 2] = b[row + 2] - tmp3;
3979         t[row + 3] = b[row + 3] - tmp4;
3980         row += 4;
3981         cnt += 16;
3982         break;
3983       case 5:
3984         x1         = x[row];
3985         x2         = x[row + 1];
3986         x3         = x[row + 2];
3987         x4         = x[row + 3];
3988         x5         = x[row + 4];
3989         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3990         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3991         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3992         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3993         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3994         t[row]     = b[row] - tmp1;
3995         t[row + 1] = b[row + 1] - tmp2;
3996         t[row + 2] = b[row + 2] - tmp3;
3997         t[row + 3] = b[row + 3] - tmp4;
3998         t[row + 4] = b[row + 4] - tmp5;
3999         row += 5;
4000         cnt += 25;
4001         break;
4002       default:
4003         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4004       }
4005     }
4006     PetscCall(PetscLogFlops(m));
4007 
4008     /*
4009           Apply (L + D)^-1 where D is the block diagonal
4010     */
4011     for (i = 0, row = 0; i < m; i++) {
4012       sz  = diag[row] - ii[row];
4013       v1  = a->a + ii[row];
4014       idx = a->j + ii[row];
4015       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4016       switch (sizes[i]) {
4017       case 1:
4018 
4019         sum1 = t[row];
4020         for (n = 0; n < sz - 1; n += 2) {
4021           i1 = idx[0];
4022           i2 = idx[1];
4023           idx += 2;
4024           tmp0 = t[i1];
4025           tmp1 = t[i2];
4026           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4027           v1 += 2;
4028         }
4029 
4030         if (n == sz - 1) {
4031           tmp0 = t[*idx];
4032           sum1 -= *v1 * tmp0;
4033         }
4034         x[row] += t[row] = sum1 * (*ibdiag++);
4035         row++;
4036         break;
4037       case 2:
4038         v2   = a->a + ii[row + 1];
4039         sum1 = t[row];
4040         sum2 = t[row + 1];
4041         for (n = 0; n < sz - 1; n += 2) {
4042           i1 = idx[0];
4043           i2 = idx[1];
4044           idx += 2;
4045           tmp0 = t[i1];
4046           tmp1 = t[i2];
4047           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4048           v1 += 2;
4049           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4050           v2 += 2;
4051         }
4052 
4053         if (n == sz - 1) {
4054           tmp0 = t[*idx];
4055           sum1 -= v1[0] * tmp0;
4056           sum2 -= v2[0] * tmp0;
4057         }
4058         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4059         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4060         ibdiag += 4;
4061         row += 2;
4062         break;
4063       case 3:
4064         v2   = a->a + ii[row + 1];
4065         v3   = a->a + ii[row + 2];
4066         sum1 = t[row];
4067         sum2 = t[row + 1];
4068         sum3 = t[row + 2];
4069         for (n = 0; n < sz - 1; n += 2) {
4070           i1 = idx[0];
4071           i2 = idx[1];
4072           idx += 2;
4073           tmp0 = t[i1];
4074           tmp1 = t[i2];
4075           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4076           v1 += 2;
4077           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4078           v2 += 2;
4079           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4080           v3 += 2;
4081         }
4082 
4083         if (n == sz - 1) {
4084           tmp0 = t[*idx];
4085           sum1 -= v1[0] * tmp0;
4086           sum2 -= v2[0] * tmp0;
4087           sum3 -= v3[0] * tmp0;
4088         }
4089         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4090         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4091         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4092         ibdiag += 9;
4093         row += 3;
4094         break;
4095       case 4:
4096         v2   = a->a + ii[row + 1];
4097         v3   = a->a + ii[row + 2];
4098         v4   = a->a + ii[row + 3];
4099         sum1 = t[row];
4100         sum2 = t[row + 1];
4101         sum3 = t[row + 2];
4102         sum4 = t[row + 3];
4103         for (n = 0; n < sz - 1; n += 2) {
4104           i1 = idx[0];
4105           i2 = idx[1];
4106           idx += 2;
4107           tmp0 = t[i1];
4108           tmp1 = t[i2];
4109           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4110           v1 += 2;
4111           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4112           v2 += 2;
4113           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4114           v3 += 2;
4115           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4116           v4 += 2;
4117         }
4118 
4119         if (n == sz - 1) {
4120           tmp0 = t[*idx];
4121           sum1 -= v1[0] * tmp0;
4122           sum2 -= v2[0] * tmp0;
4123           sum3 -= v3[0] * tmp0;
4124           sum4 -= v4[0] * tmp0;
4125         }
4126         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4127         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4128         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4129         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4130         ibdiag += 16;
4131         row += 4;
4132         break;
4133       case 5:
4134         v2   = a->a + ii[row + 1];
4135         v3   = a->a + ii[row + 2];
4136         v4   = a->a + ii[row + 3];
4137         v5   = a->a + ii[row + 4];
4138         sum1 = t[row];
4139         sum2 = t[row + 1];
4140         sum3 = t[row + 2];
4141         sum4 = t[row + 3];
4142         sum5 = t[row + 4];
4143         for (n = 0; n < sz - 1; n += 2) {
4144           i1 = idx[0];
4145           i2 = idx[1];
4146           idx += 2;
4147           tmp0 = t[i1];
4148           tmp1 = t[i2];
4149           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4150           v1 += 2;
4151           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4152           v2 += 2;
4153           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4154           v3 += 2;
4155           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4156           v4 += 2;
4157           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4158           v5 += 2;
4159         }
4160 
4161         if (n == sz - 1) {
4162           tmp0 = t[*idx];
4163           sum1 -= v1[0] * tmp0;
4164           sum2 -= v2[0] * tmp0;
4165           sum3 -= v3[0] * tmp0;
4166           sum4 -= v4[0] * tmp0;
4167           sum5 -= v5[0] * tmp0;
4168         }
4169         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4170         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4171         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4172         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4173         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4174         ibdiag += 25;
4175         row += 5;
4176         break;
4177       default:
4178         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4179       }
4180     }
4181     PetscCall(PetscLogFlops(a->nz));
4182   }
4183   PetscCall(VecRestoreArray(xx, &x));
4184   PetscCall(VecRestoreArrayRead(bb, &b));
4185   PetscFunctionReturn(PETSC_SUCCESS);
4186 }
4187 
4188 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4189 {
4190   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4191   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4192   const MatScalar   *bdiag = a->inode.bdiag;
4193   const PetscScalar *b;
4194   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
4195   const PetscInt    *sizes = a->inode.size;
4196 
4197   PetscFunctionBegin;
4198   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4199   PetscCall(VecGetArray(xx, &x));
4200   PetscCall(VecGetArrayRead(bb, &b));
4201   cnt = 0;
4202   for (i = 0, row = 0; i < m; i++) {
4203     switch (sizes[i]) {
4204     case 1:
4205       x[row] = b[row] * bdiag[cnt++];
4206       row++;
4207       break;
4208     case 2:
4209       x1       = b[row];
4210       x2       = b[row + 1];
4211       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4212       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4213       x[row++] = tmp1;
4214       x[row++] = tmp2;
4215       cnt += 4;
4216       break;
4217     case 3:
4218       x1       = b[row];
4219       x2       = b[row + 1];
4220       x3       = b[row + 2];
4221       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4222       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4223       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4224       x[row++] = tmp1;
4225       x[row++] = tmp2;
4226       x[row++] = tmp3;
4227       cnt += 9;
4228       break;
4229     case 4:
4230       x1       = b[row];
4231       x2       = b[row + 1];
4232       x3       = b[row + 2];
4233       x4       = b[row + 3];
4234       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4235       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4236       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4237       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4238       x[row++] = tmp1;
4239       x[row++] = tmp2;
4240       x[row++] = tmp3;
4241       x[row++] = tmp4;
4242       cnt += 16;
4243       break;
4244     case 5:
4245       x1       = b[row];
4246       x2       = b[row + 1];
4247       x3       = b[row + 2];
4248       x4       = b[row + 3];
4249       x5       = b[row + 4];
4250       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4251       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4252       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4253       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4254       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4255       x[row++] = tmp1;
4256       x[row++] = tmp2;
4257       x[row++] = tmp3;
4258       x[row++] = tmp4;
4259       x[row++] = tmp5;
4260       cnt += 25;
4261       break;
4262     default:
4263       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4264     }
4265   }
4266   PetscCall(PetscLogFlops(2.0 * cnt));
4267   PetscCall(VecRestoreArray(xx, &x));
4268   PetscCall(VecRestoreArrayRead(bb, &b));
4269   PetscFunctionReturn(PETSC_SUCCESS);
4270 }
4271 
4272 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4273 {
4274   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4275 
4276   PetscFunctionBegin;
4277   a->inode.node_count       = 0;
4278   a->inode.use              = PETSC_FALSE;
4279   a->inode.checked          = PETSC_FALSE;
4280   a->inode.mat_nonzerostate = -1;
4281   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4282   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4283   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4284   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4285   A->ops->coloringpatch     = NULL;
4286   A->ops->multdiagonalblock = NULL;
4287   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4288   PetscFunctionReturn(PETSC_SUCCESS);
4289 }
4290 
4291 /*
4292     samestructure indicates that the matrix has not changed its nonzero structure so we
4293     do not need to recompute the inodes
4294 */
4295 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4296 {
4297   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4298   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4299   PetscBool       flag;
4300   const PetscInt *idx, *idy, *ii;
4301 
4302   PetscFunctionBegin;
4303   if (!a->inode.use) {
4304     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4305     PetscCall(PetscFree(a->inode.size));
4306     PetscFunctionReturn(PETSC_SUCCESS);
4307   }
4308   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4309 
4310   m = A->rmap->n;
4311   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4312   ns = a->inode.size;
4313 
4314   i          = 0;
4315   node_count = 0;
4316   idx        = a->j;
4317   ii         = a->i;
4318   if (idx) {
4319     while (i < m) {            /* For each row */
4320       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4321       /* Limits the number of elements in a node to 'a->inode.limit' */
4322       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4323         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4324         if (nzy != nzx) break;
4325         idy += nzx; /* Same nonzero pattern */
4326         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4327         if (!flag) break;
4328       }
4329       ns[node_count++] = blk_size;
4330       idx += blk_size * nzx;
4331       i = j;
4332     }
4333   }
4334   /* If not enough inodes found,, do not use inode version of the routines */
4335   if (!m || !idx || node_count > .8 * m) {
4336     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4337     PetscCall(PetscFree(a->inode.size));
4338     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4339   } else {
4340     if (!A->factortype) {
4341       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4342       if (A->rmap->n == A->cmap->n) {
4343         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4344         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4345         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4346         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4347         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4348       }
4349     } else {
4350       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4351     }
4352     a->inode.node_count = node_count;
4353     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4354   }
4355   a->inode.checked          = PETSC_TRUE;
4356   a->inode.mat_nonzerostate = A->nonzerostate;
4357   PetscFunctionReturn(PETSC_SUCCESS);
4358 }
4359 
4360 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4361 {
4362   Mat         B = *C;
4363   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4364   PetscInt    m = A->rmap->n;
4365 
4366   PetscFunctionBegin;
4367   c->inode.use              = a->inode.use;
4368   c->inode.limit            = a->inode.limit;
4369   c->inode.max_limit        = a->inode.max_limit;
4370   c->inode.checked          = PETSC_FALSE;
4371   c->inode.size             = NULL;
4372   c->inode.node_count       = 0;
4373   c->inode.ibdiagvalid      = PETSC_FALSE;
4374   c->inode.ibdiag           = NULL;
4375   c->inode.bdiag            = NULL;
4376   c->inode.mat_nonzerostate = -1;
4377   if (a->inode.use) {
4378     if (a->inode.checked && a->inode.size) {
4379       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4380       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4381 
4382       c->inode.checked          = PETSC_TRUE;
4383       c->inode.node_count       = a->inode.node_count;
4384       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4385     }
4386     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4387     if (!B->factortype) {
4388       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4389       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4390       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4391       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4392       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4393       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4394     } else {
4395       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4396     }
4397   }
4398   PetscFunctionReturn(PETSC_SUCCESS);
4399 }
4400 
4401 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4402 {
4403   PetscInt        k;
4404   const PetscInt *vi;
4405 
4406   PetscFunctionBegin;
4407   vi = aj + ai[row];
4408   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4409   vi        = aj + adiag[row];
4410   cols[nzl] = vi[0];
4411   vi        = aj + adiag[row + 1] + 1;
4412   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4413   PetscFunctionReturn(PETSC_SUCCESS);
4414 }
4415 /*
4416    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4417    Modified from MatSeqAIJCheckInode().
4418 
4419    Input Parameters:
4420 .  Mat A - ILU or LU matrix factor
4421 
4422 */
4423 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4424 {
4425   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4426   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4427   PetscInt       *cols1, *cols2, *ns;
4428   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4429   PetscBool       flag;
4430 
4431   PetscFunctionBegin;
4432   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4433   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4434 
4435   m = A->rmap->n;
4436   if (a->inode.size) ns = a->inode.size;
4437   else PetscCall(PetscMalloc1(m + 1, &ns));
4438 
4439   i          = 0;
4440   node_count = 0;
4441   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4442   while (i < m) {                       /* For each row */
4443     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4444     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4445     nzx  = nzl1 + nzu1 + 1;
4446     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4447 
4448     /* Limits the number of elements in a node to 'a->inode.limit' */
4449     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4450       nzl2 = ai[j + 1] - ai[j];
4451       nzu2 = adiag[j] - adiag[j + 1] - 1;
4452       nzy  = nzl2 + nzu2 + 1;
4453       if (nzy != nzx) break;
4454       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4455       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4456       if (!flag) break;
4457     }
4458     ns[node_count++] = blk_size;
4459     i                = j;
4460   }
4461   PetscCall(PetscFree2(cols1, cols2));
4462   /* If not enough inodes found,, do not use inode version of the routines */
4463   if (!m || node_count > .8 * m) {
4464     PetscCall(PetscFree(ns));
4465 
4466     a->inode.node_count = 0;
4467     a->inode.size       = NULL;
4468     a->inode.use        = PETSC_FALSE;
4469 
4470     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4471   } else {
4472     A->ops->mult              = NULL;
4473     A->ops->sor               = NULL;
4474     A->ops->multadd           = NULL;
4475     A->ops->getrowij          = NULL;
4476     A->ops->restorerowij      = NULL;
4477     A->ops->getcolumnij       = NULL;
4478     A->ops->restorecolumnij   = NULL;
4479     A->ops->coloringpatch     = NULL;
4480     A->ops->multdiagonalblock = NULL;
4481     a->inode.node_count       = node_count;
4482     a->inode.size             = ns;
4483 
4484     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4485   }
4486   a->inode.checked = PETSC_TRUE;
4487   PetscFunctionReturn(PETSC_SUCCESS);
4488 }
4489 
4490 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4491 {
4492   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4493 
4494   PetscFunctionBegin;
4495   a->inode.ibdiagvalid = PETSC_FALSE;
4496   PetscFunctionReturn(PETSC_SUCCESS);
4497 }
4498 
4499 /*
4500      This is really ugly. if inodes are used this replaces the
4501   permutations with ones that correspond to rows/cols of the matrix
4502   rather then inode blocks
4503 */
4504 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4505 {
4506   PetscFunctionBegin;
4507   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4508   PetscFunctionReturn(PETSC_SUCCESS);
4509 }
4510 
4511 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4512 {
4513   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4514   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4515   const PetscInt *ridx, *cidx;
4516   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4517   PetscInt        nslim_col, *ns_col;
4518   IS              ris = *rperm, cis = *cperm;
4519 
4520   PetscFunctionBegin;
4521   if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS);           /* no inodes so return */
4522   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4523 
4524   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4525   PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
4526   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4527 
4528   PetscCall(ISGetIndices(ris, &ridx));
4529   PetscCall(ISGetIndices(cis, &cidx));
4530 
4531   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4532   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
4533 
4534   /* Construct the permutations for rows*/
4535   for (i = 0, row = 0; i < nslim_row; ++i) {
4536     indx      = ridx[i];
4537     start_val = tns[indx];
4538     end_val   = tns[indx + 1];
4539     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4540   }
4541 
4542   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4543   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
4544 
4545   /* Construct permutations for columns */
4546   for (i = 0, col = 0; i < nslim_col; ++i) {
4547     indx      = cidx[i];
4548     start_val = tns[indx];
4549     end_val   = tns[indx + 1];
4550     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4551   }
4552 
4553   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4554   PetscCall(ISSetPermutation(*rperm));
4555   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4556   PetscCall(ISSetPermutation(*cperm));
4557 
4558   PetscCall(ISRestoreIndices(ris, &ridx));
4559   PetscCall(ISRestoreIndices(cis, &cidx));
4560 
4561   PetscCall(PetscFree(ns_col));
4562   PetscCall(PetscFree2(permr, permc));
4563   PetscCall(ISDestroy(&cis));
4564   PetscCall(ISDestroy(&ris));
4565   PetscCall(PetscFree(tns));
4566   PetscFunctionReturn(PETSC_SUCCESS);
4567 }
4568 
4569 /*@C
4570   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4571 
4572   Not Collective
4573 
4574   Input Parameter:
4575 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4576 
4577   Output Parameters:
4578 + node_count - no of inodes present in the matrix.
4579 . sizes      - an array of size `node_count`, with the sizes of each inode.
4580 - limit      - the max size used to generate the inodes.
4581 
4582   Level: advanced
4583 
4584   Note:
4585   It should be called after the matrix is assembled.
4586   The contents of the sizes[] array should not be changed.
4587   `NULL` may be passed for information not needed
4588 
4589 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4590 @*/
4591 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4592 {
4593   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4594 
4595   PetscFunctionBegin;
4596   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4597   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4598   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4599   PetscFunctionReturn(PETSC_SUCCESS);
4600 }
4601 
4602 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4603 {
4604   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4605 
4606   PetscFunctionBegin;
4607   if (node_count) *node_count = a->inode.node_count;
4608   if (sizes) *sizes = a->inode.size;
4609   if (limit) *limit = a->inode.limit;
4610   PetscFunctionReturn(PETSC_SUCCESS);
4611 }
4612