xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 98d129c30f3ee9fdddc40fdbc5a989b7be64f888)
1 /*
2   This file provides high performance routines for the Inode format (compressed sparse row)
3   by taking advantage of rows with identical nonzero structure (I-nodes).
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #if defined(PETSC_HAVE_XMMINTRIN_H)
7   #include <xmmintrin.h>
8 #endif
9 
10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11 {
12   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
14 
15   PetscFunctionBegin;
16   n = A->cmap->n;
17   m = A->rmap->n;
18   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++);
24     for (; count + 1 < n; count++, i++);
25     if (count < n) i++;
26     *size = i;
27     PetscFunctionReturn(PETSC_SUCCESS);
28   }
29   PetscCall(PetscMalloc1(n + 1, &ns_col));
30 
31   /* Use the same row structure wherever feasible. */
32   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
33 
34   /* if m < n; pad up the remainder with inode_limit */
35   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
36   /* The last node is the odd ball. pad it up with the remaining rows; */
37   if (count < n) {
38     ns_col[i] = n - count;
39     i++;
40   } else if (count > n) {
41     /* Adjust for the over estimation */
42     ns_col[i - 1] += n - count;
43   }
44   *size = i;
45   *ns   = ns_col;
46   PetscFunctionReturn(PETSC_SUCCESS);
47 }
48 
49 /*
50       This builds symmetric version of nonzero structure,
51 */
52 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
53 {
54   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
55   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
56   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
57   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
58 
59   PetscFunctionBegin;
60   nslim_row = a->inode.node_count;
61   m         = A->rmap->n;
62   n         = A->cmap->n;
63   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
64   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
65 
66   /* Use the row_inode as column_inode */
67   nslim_col = nslim_row;
68   ns_col    = ns_row;
69 
70   /* allocate space for reformatted inode structure */
71   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
72   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
73 
74   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
75     nsz = ns_col[i1];
76     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
77   }
78   /* allocate space for row pointers */
79   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
80   *iia = ia;
81   PetscCall(PetscMalloc1(nslim_row + 1, &work));
82 
83   /* determine the number of columns in each row */
84   ia[0] = oshift;
85   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
86     j    = aj + ai[row] + ishift;
87     jmax = aj + ai[row + 1] + ishift;
88     if (j == jmax) continue; /* empty row */
89     col = *j++ + ishift;
90     i2  = tvc[col];
91     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
92       ia[i1 + 1]++;
93       ia[i2 + 1]++;
94       i2++; /* Start col of next node */
95       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
96       i2 = tvc[col];
97     }
98     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
99   }
100 
101   /* shift ia[i] to point to next row */
102   for (i1 = 1; i1 < nslim_row + 1; i1++) {
103     row = ia[i1 - 1];
104     ia[i1] += row;
105     work[i1 - 1] = row - oshift;
106   }
107 
108   /* allocate space for column pointers */
109   nz = ia[nslim_row] + (!ishift);
110   PetscCall(PetscMalloc1(nz, &ja));
111   *jja = ja;
112 
113   /* loop over lower triangular part putting into ja */
114   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
115     j    = aj + ai[row] + ishift;
116     jmax = aj + ai[row + 1] + ishift;
117     if (j == jmax) continue; /* empty row */
118     col = *j++ + ishift;
119     i2  = tvc[col];
120     while (i2 < i1 && j < jmax) {
121       ja[work[i2]++] = i1 + oshift;
122       ja[work[i1]++] = i2 + oshift;
123       ++i2;
124       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
125       i2 = tvc[col];
126     }
127     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
128   }
129   PetscCall(PetscFree(work));
130   PetscCall(PetscFree2(tns, tvc));
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 /*
135       This builds nonsymmetric version of nonzero structure,
136 */
137 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
138 {
139   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
140   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
141   PetscInt       *tns, *tvc, nsz, i1, i2;
142   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
143 
144   PetscFunctionBegin;
145   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
146   nslim_row = a->inode.node_count;
147   n         = A->cmap->n;
148 
149   /* Create The column_inode for this matrix */
150   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
151 
152   /* allocate space for reformatted column_inode structure */
153   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
154   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
155 
156   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
157     nsz = ns_col[i1];
158     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
159   }
160   /* allocate space for row pointers */
161   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
162   *iia = ia;
163   PetscCall(PetscMalloc1(nslim_row + 1, &work));
164 
165   /* determine the number of columns in each row */
166   ia[0] = oshift;
167   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
168     j  = aj + ai[row] + ishift;
169     nz = ai[row + 1] - ai[row];
170     if (!nz) continue; /* empty row */
171     col = *j++ + ishift;
172     i2  = tvc[col];
173     while (nz-- > 0) { /* off-diagonal elements */
174       ia[i1 + 1]++;
175       i2++; /* Start col of next node */
176       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
177       if (nz > 0) i2 = tvc[col];
178     }
179   }
180 
181   /* shift ia[i] to point to next row */
182   for (i1 = 1; i1 < nslim_row + 1; i1++) {
183     row = ia[i1 - 1];
184     ia[i1] += row;
185     work[i1 - 1] = row - oshift;
186   }
187 
188   /* allocate space for column pointers */
189   nz = ia[nslim_row] + (!ishift);
190   PetscCall(PetscMalloc1(nz, &ja));
191   *jja = ja;
192 
193   /* loop over matrix putting into ja */
194   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
195     j  = aj + ai[row] + ishift;
196     nz = ai[row + 1] - ai[row];
197     if (!nz) continue; /* empty row */
198     col = *j++ + ishift;
199     i2  = tvc[col];
200     while (nz-- > 0) {
201       ja[work[i1]++] = i2 + oshift;
202       ++i2;
203       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
204       if (nz > 0) i2 = tvc[col];
205     }
206   }
207   PetscCall(PetscFree(ns_col));
208   PetscCall(PetscFree(work));
209   PetscCall(PetscFree2(tns, tvc));
210   PetscFunctionReturn(PETSC_SUCCESS);
211 }
212 
213 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
214 {
215   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
216 
217   PetscFunctionBegin;
218   if (n) *n = a->inode.node_count;
219   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
220   if (!blockcompressed) {
221     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
222   } else if (symmetric) {
223     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
224   } else {
225     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
226   }
227   PetscFunctionReturn(PETSC_SUCCESS);
228 }
229 
230 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
231 {
232   PetscFunctionBegin;
233   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
234 
235   if (!blockcompressed) {
236     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
237   } else {
238     PetscCall(PetscFree(*ia));
239     PetscCall(PetscFree(*ja));
240   }
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
245 {
246   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
247   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
248   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
249 
250   PetscFunctionBegin;
251   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
252   nslim_row = a->inode.node_count;
253   n         = A->cmap->n;
254 
255   /* Create The column_inode for this matrix */
256   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
257 
258   /* allocate space for reformatted column_inode structure */
259   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
260   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
261 
262   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
263     nsz = ns_col[i1];
264     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
265   }
266   /* allocate space for column pointers */
267   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
268   *iia = ia;
269   PetscCall(PetscMalloc1(nslim_col + 1, &work));
270 
271   /* determine the number of columns in each row */
272   ia[0] = oshift;
273   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
274     j   = aj + ai[row] + ishift;
275     col = *j++ + ishift;
276     i2  = tvc[col];
277     nz  = ai[row + 1] - ai[row];
278     while (nz-- > 0) { /* off-diagonal elements */
279       /* ia[i1+1]++; */
280       ia[i2 + 1]++;
281       i2++;
282       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
283       if (nz > 0) i2 = tvc[col];
284     }
285   }
286 
287   /* shift ia[i] to point to next col */
288   for (i1 = 1; i1 < nslim_col + 1; i1++) {
289     col = ia[i1 - 1];
290     ia[i1] += col;
291     work[i1 - 1] = col - oshift;
292   }
293 
294   /* allocate space for column pointers */
295   nz = ia[nslim_col] + (!ishift);
296   PetscCall(PetscMalloc1(nz, &ja));
297   *jja = ja;
298 
299   /* loop over matrix putting into ja */
300   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
301     j   = aj + ai[row] + ishift;
302     col = *j++ + ishift;
303     i2  = tvc[col];
304     nz  = ai[row + 1] - ai[row];
305     while (nz-- > 0) {
306       /* ja[work[i1]++] = i2 + oshift; */
307       ja[work[i2]++] = i1 + oshift;
308       i2++;
309       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
310       if (nz > 0) i2 = tvc[col];
311     }
312   }
313   PetscCall(PetscFree(ns_col));
314   PetscCall(PetscFree(work));
315   PetscCall(PetscFree2(tns, tvc));
316   PetscFunctionReturn(PETSC_SUCCESS);
317 }
318 
319 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
320 {
321   PetscFunctionBegin;
322   PetscCall(MatCreateColInode_Private(A, n, NULL));
323   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
324 
325   if (!blockcompressed) {
326     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
327   } else if (symmetric) {
328     /* Since the indices are symmetric it doesn't matter */
329     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
330   } else {
331     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
332   }
333   PetscFunctionReturn(PETSC_SUCCESS);
334 }
335 
336 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
337 {
338   PetscFunctionBegin;
339   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
340   if (!blockcompressed) {
341     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
342   } else {
343     PetscCall(PetscFree(*ia));
344     PetscCall(PetscFree(*ja));
345   }
346   PetscFunctionReturn(PETSC_SUCCESS);
347 }
348 
349 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
350 {
351   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
352   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
353   PetscScalar       *y;
354   const PetscScalar *x;
355   const MatScalar   *v1, *v2, *v3, *v4, *v5;
356   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
357   const PetscInt    *idx, *ns, *ii;
358 
359 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
360   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
361 #endif
362 
363   PetscFunctionBegin;
364   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
365   node_max = a->inode.node_count;
366   ns       = a->inode.size; /* Node Size array */
367   PetscCall(VecGetArrayRead(xx, &x));
368   PetscCall(VecGetArray(yy, &y));
369   idx = a->j;
370   v1  = a->a;
371   ii  = a->i;
372 
373   for (i = 0, row = 0; i < node_max; ++i) {
374     nsz = ns[i];
375     n   = ii[1] - ii[0];
376     nonzerorow += (n > 0) * nsz;
377     ii += nsz;
378     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
379     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
380     sz = n;                                                                /* No of non zeros in this row */
381                                                                            /* Switch on the size of Node */
382     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
383     case 1:
384       sum1 = 0.;
385 
386       for (n = 0; n < sz - 1; n += 2) {
387         i1 = idx[0]; /* The instructions are ordered to */
388         i2 = idx[1]; /* make the compiler's job easy */
389         idx += 2;
390         tmp0 = x[i1];
391         tmp1 = x[i2];
392         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
393         v1 += 2;
394       }
395 
396       if (n == sz - 1) { /* Take care of the last nonzero  */
397         tmp0 = x[*idx++];
398         sum1 += *v1++ * tmp0;
399       }
400       y[row++] = sum1;
401       break;
402     case 2:
403       sum1 = 0.;
404       sum2 = 0.;
405       v2   = v1 + n;
406 
407       for (n = 0; n < sz - 1; n += 2) {
408         i1 = idx[0];
409         i2 = idx[1];
410         idx += 2;
411         tmp0 = x[i1];
412         tmp1 = x[i2];
413         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
414         v1 += 2;
415         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
416         v2 += 2;
417       }
418       if (n == sz - 1) {
419         tmp0 = x[*idx++];
420         sum1 += *v1++ * tmp0;
421         sum2 += *v2++ * tmp0;
422       }
423       y[row++] = sum1;
424       y[row++] = sum2;
425       v1       = v2; /* Since the next block to be processed starts there*/
426       idx += sz;
427       break;
428     case 3:
429       sum1 = 0.;
430       sum2 = 0.;
431       sum3 = 0.;
432       v2   = v1 + n;
433       v3   = v2 + n;
434 
435       for (n = 0; n < sz - 1; n += 2) {
436         i1 = idx[0];
437         i2 = idx[1];
438         idx += 2;
439         tmp0 = x[i1];
440         tmp1 = x[i2];
441         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
442         v1 += 2;
443         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
444         v2 += 2;
445         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
446         v3 += 2;
447       }
448       if (n == sz - 1) {
449         tmp0 = x[*idx++];
450         sum1 += *v1++ * tmp0;
451         sum2 += *v2++ * tmp0;
452         sum3 += *v3++ * tmp0;
453       }
454       y[row++] = sum1;
455       y[row++] = sum2;
456       y[row++] = sum3;
457       v1       = v3; /* Since the next block to be processed starts there*/
458       idx += 2 * sz;
459       break;
460     case 4:
461       sum1 = 0.;
462       sum2 = 0.;
463       sum3 = 0.;
464       sum4 = 0.;
465       v2   = v1 + n;
466       v3   = v2 + n;
467       v4   = v3 + n;
468 
469       for (n = 0; n < sz - 1; n += 2) {
470         i1 = idx[0];
471         i2 = idx[1];
472         idx += 2;
473         tmp0 = x[i1];
474         tmp1 = x[i2];
475         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
476         v1 += 2;
477         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
478         v2 += 2;
479         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
480         v3 += 2;
481         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
482         v4 += 2;
483       }
484       if (n == sz - 1) {
485         tmp0 = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489         sum4 += *v4++ * tmp0;
490       }
491       y[row++] = sum1;
492       y[row++] = sum2;
493       y[row++] = sum3;
494       y[row++] = sum4;
495       v1       = v4; /* Since the next block to be processed starts there*/
496       idx += 3 * sz;
497       break;
498     case 5:
499       sum1 = 0.;
500       sum2 = 0.;
501       sum3 = 0.;
502       sum4 = 0.;
503       sum5 = 0.;
504       v2   = v1 + n;
505       v3   = v2 + n;
506       v4   = v3 + n;
507       v5   = v4 + n;
508 
509       for (n = 0; n < sz - 1; n += 2) {
510         i1 = idx[0];
511         i2 = idx[1];
512         idx += 2;
513         tmp0 = x[i1];
514         tmp1 = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
516         v1 += 2;
517         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
518         v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
520         v3 += 2;
521         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
522         v4 += 2;
523         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
524         v5 += 2;
525       }
526       if (n == sz - 1) {
527         tmp0 = x[*idx++];
528         sum1 += *v1++ * tmp0;
529         sum2 += *v2++ * tmp0;
530         sum3 += *v3++ * tmp0;
531         sum4 += *v4++ * tmp0;
532         sum5 += *v5++ * tmp0;
533       }
534       y[row++] = sum1;
535       y[row++] = sum2;
536       y[row++] = sum3;
537       y[row++] = sum4;
538       y[row++] = sum5;
539       v1       = v5; /* Since the next block to be processed starts there */
540       idx += 4 * sz;
541       break;
542     default:
543       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
544     }
545   }
546   PetscCall(VecRestoreArrayRead(xx, &x));
547   PetscCall(VecRestoreArray(yy, &y));
548   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
549   PetscFunctionReturn(PETSC_SUCCESS);
550 }
551 
552 /* Almost same code as the MatMult_SeqAIJ_Inode() */
553 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
554 {
555   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
556   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
557   const MatScalar   *v1, *v2, *v3, *v4, *v5;
558   const PetscScalar *x;
559   PetscScalar       *y, *z, *zt;
560   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
561   const PetscInt    *idx, *ns, *ii;
562 
563   PetscFunctionBegin;
564   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
565   node_max = a->inode.node_count;
566   ns       = a->inode.size; /* Node Size array */
567 
568   PetscCall(VecGetArrayRead(xx, &x));
569   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
570   zt = z;
571 
572   idx = a->j;
573   v1  = a->a;
574   ii  = a->i;
575 
576   for (i = 0, row = 0; i < node_max; ++i) {
577     nsz = ns[i];
578     n   = ii[1] - ii[0];
579     ii += nsz;
580     sz = n;        /* No of non zeros in this row */
581                    /* Switch on the size of Node */
582     switch (nsz) { /* Each loop in 'case' is unrolled */
583     case 1:
584       sum1 = *zt++;
585 
586       for (n = 0; n < sz - 1; n += 2) {
587         i1 = idx[0]; /* The instructions are ordered to */
588         i2 = idx[1]; /* make the compiler's job easy */
589         idx += 2;
590         tmp0 = x[i1];
591         tmp1 = x[i2];
592         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
593         v1 += 2;
594       }
595 
596       if (n == sz - 1) { /* Take care of the last nonzero  */
597         tmp0 = x[*idx++];
598         sum1 += *v1++ * tmp0;
599       }
600       y[row++] = sum1;
601       break;
602     case 2:
603       sum1 = *zt++;
604       sum2 = *zt++;
605       v2   = v1 + n;
606 
607       for (n = 0; n < sz - 1; n += 2) {
608         i1 = idx[0];
609         i2 = idx[1];
610         idx += 2;
611         tmp0 = x[i1];
612         tmp1 = x[i2];
613         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
614         v1 += 2;
615         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
616         v2 += 2;
617       }
618       if (n == sz - 1) {
619         tmp0 = x[*idx++];
620         sum1 += *v1++ * tmp0;
621         sum2 += *v2++ * tmp0;
622       }
623       y[row++] = sum1;
624       y[row++] = sum2;
625       v1       = v2; /* Since the next block to be processed starts there*/
626       idx += sz;
627       break;
628     case 3:
629       sum1 = *zt++;
630       sum2 = *zt++;
631       sum3 = *zt++;
632       v2   = v1 + n;
633       v3   = v2 + n;
634 
635       for (n = 0; n < sz - 1; n += 2) {
636         i1 = idx[0];
637         i2 = idx[1];
638         idx += 2;
639         tmp0 = x[i1];
640         tmp1 = x[i2];
641         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
642         v1 += 2;
643         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
644         v2 += 2;
645         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
646         v3 += 2;
647       }
648       if (n == sz - 1) {
649         tmp0 = x[*idx++];
650         sum1 += *v1++ * tmp0;
651         sum2 += *v2++ * tmp0;
652         sum3 += *v3++ * tmp0;
653       }
654       y[row++] = sum1;
655       y[row++] = sum2;
656       y[row++] = sum3;
657       v1       = v3; /* Since the next block to be processed starts there*/
658       idx += 2 * sz;
659       break;
660     case 4:
661       sum1 = *zt++;
662       sum2 = *zt++;
663       sum3 = *zt++;
664       sum4 = *zt++;
665       v2   = v1 + n;
666       v3   = v2 + n;
667       v4   = v3 + n;
668 
669       for (n = 0; n < sz - 1; n += 2) {
670         i1 = idx[0];
671         i2 = idx[1];
672         idx += 2;
673         tmp0 = x[i1];
674         tmp1 = x[i2];
675         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
676         v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
678         v2 += 2;
679         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
680         v3 += 2;
681         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
682         v4 += 2;
683       }
684       if (n == sz - 1) {
685         tmp0 = x[*idx++];
686         sum1 += *v1++ * tmp0;
687         sum2 += *v2++ * tmp0;
688         sum3 += *v3++ * tmp0;
689         sum4 += *v4++ * tmp0;
690       }
691       y[row++] = sum1;
692       y[row++] = sum2;
693       y[row++] = sum3;
694       y[row++] = sum4;
695       v1       = v4; /* Since the next block to be processed starts there*/
696       idx += 3 * sz;
697       break;
698     case 5:
699       sum1 = *zt++;
700       sum2 = *zt++;
701       sum3 = *zt++;
702       sum4 = *zt++;
703       sum5 = *zt++;
704       v2   = v1 + n;
705       v3   = v2 + n;
706       v4   = v3 + n;
707       v5   = v4 + n;
708 
709       for (n = 0; n < sz - 1; n += 2) {
710         i1 = idx[0];
711         i2 = idx[1];
712         idx += 2;
713         tmp0 = x[i1];
714         tmp1 = x[i2];
715         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
716         v1 += 2;
717         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
718         v2 += 2;
719         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
720         v3 += 2;
721         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
722         v4 += 2;
723         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
724         v5 += 2;
725       }
726       if (n == sz - 1) {
727         tmp0 = x[*idx++];
728         sum1 += *v1++ * tmp0;
729         sum2 += *v2++ * tmp0;
730         sum3 += *v3++ * tmp0;
731         sum4 += *v4++ * tmp0;
732         sum5 += *v5++ * tmp0;
733       }
734       y[row++] = sum1;
735       y[row++] = sum2;
736       y[row++] = sum3;
737       y[row++] = sum4;
738       y[row++] = sum5;
739       v1       = v5; /* Since the next block to be processed starts there */
740       idx += 4 * sz;
741       break;
742     default:
743       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
744     }
745   }
746   PetscCall(VecRestoreArrayRead(xx, &x));
747   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
748   PetscCall(PetscLogFlops(2.0 * a->nz));
749   PetscFunctionReturn(PETSC_SUCCESS);
750 }
751 
752 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
753 {
754   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
755   IS                 iscol = a->col, isrow = a->row;
756   const PetscInt    *r, *c, *rout, *cout;
757   PetscInt           i, j, n = A->rmap->n, nz;
758   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
759   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
760   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
761   PetscScalar        sum1, sum2, sum3, sum4, sum5;
762   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
763   const PetscScalar *b;
764 
765   PetscFunctionBegin;
766   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
767   node_max = a->inode.node_count;
768   ns       = a->inode.size; /* Node Size array */
769 
770   PetscCall(VecGetArrayRead(bb, &b));
771   PetscCall(VecGetArrayWrite(xx, &x));
772   tmp = a->solve_work;
773 
774   PetscCall(ISGetIndices(isrow, &rout));
775   r = rout;
776   PetscCall(ISGetIndices(iscol, &cout));
777   c = cout + (n - 1);
778 
779   /* forward solve the lower triangular */
780   tmps = tmp;
781   aa   = a_a;
782   aj   = a_j;
783   ad   = a->diag;
784 
785   for (i = 0, row = 0; i < node_max; ++i) {
786     nsz = ns[i];
787     aii = ai[row];
788     v1  = aa + aii;
789     vi  = aj + aii;
790     nz  = ad[row] - aii;
791     if (i < node_max - 1) {
792       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
793       * but our indexing to determine its size could. */
794       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
795       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
796       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
797       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
798     }
799 
800     switch (nsz) { /* Each loop in 'case' is unrolled */
801     case 1:
802       sum1 = b[*r++];
803       for (j = 0; j < nz - 1; j += 2) {
804         i0 = vi[0];
805         i1 = vi[1];
806         vi += 2;
807         tmp0 = tmps[i0];
808         tmp1 = tmps[i1];
809         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
810         v1 += 2;
811       }
812       if (j == nz - 1) {
813         tmp0 = tmps[*vi++];
814         sum1 -= *v1++ * tmp0;
815       }
816       tmp[row++] = sum1;
817       break;
818     case 2:
819       sum1 = b[*r++];
820       sum2 = b[*r++];
821       v2   = aa + ai[row + 1];
822 
823       for (j = 0; j < nz - 1; j += 2) {
824         i0 = vi[0];
825         i1 = vi[1];
826         vi += 2;
827         tmp0 = tmps[i0];
828         tmp1 = tmps[i1];
829         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
830         v1 += 2;
831         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
832         v2 += 2;
833       }
834       if (j == nz - 1) {
835         tmp0 = tmps[*vi++];
836         sum1 -= *v1++ * tmp0;
837         sum2 -= *v2++ * tmp0;
838       }
839       sum2 -= *v2++ * sum1;
840       tmp[row++] = sum1;
841       tmp[row++] = sum2;
842       break;
843     case 3:
844       sum1 = b[*r++];
845       sum2 = b[*r++];
846       sum3 = b[*r++];
847       v2   = aa + ai[row + 1];
848       v3   = aa + ai[row + 2];
849 
850       for (j = 0; j < nz - 1; j += 2) {
851         i0 = vi[0];
852         i1 = vi[1];
853         vi += 2;
854         tmp0 = tmps[i0];
855         tmp1 = tmps[i1];
856         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
857         v1 += 2;
858         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
859         v2 += 2;
860         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
861         v3 += 2;
862       }
863       if (j == nz - 1) {
864         tmp0 = tmps[*vi++];
865         sum1 -= *v1++ * tmp0;
866         sum2 -= *v2++ * tmp0;
867         sum3 -= *v3++ * tmp0;
868       }
869       sum2 -= *v2++ * sum1;
870       sum3 -= *v3++ * sum1;
871       sum3 -= *v3++ * sum2;
872 
873       tmp[row++] = sum1;
874       tmp[row++] = sum2;
875       tmp[row++] = sum3;
876       break;
877 
878     case 4:
879       sum1 = b[*r++];
880       sum2 = b[*r++];
881       sum3 = b[*r++];
882       sum4 = b[*r++];
883       v2   = aa + ai[row + 1];
884       v3   = aa + ai[row + 2];
885       v4   = aa + ai[row + 3];
886 
887       for (j = 0; j < nz - 1; j += 2) {
888         i0 = vi[0];
889         i1 = vi[1];
890         vi += 2;
891         tmp0 = tmps[i0];
892         tmp1 = tmps[i1];
893         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
894         v1 += 2;
895         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
896         v2 += 2;
897         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
898         v3 += 2;
899         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
900         v4 += 2;
901       }
902       if (j == nz - 1) {
903         tmp0 = tmps[*vi++];
904         sum1 -= *v1++ * tmp0;
905         sum2 -= *v2++ * tmp0;
906         sum3 -= *v3++ * tmp0;
907         sum4 -= *v4++ * tmp0;
908       }
909       sum2 -= *v2++ * sum1;
910       sum3 -= *v3++ * sum1;
911       sum4 -= *v4++ * sum1;
912       sum3 -= *v3++ * sum2;
913       sum4 -= *v4++ * sum2;
914       sum4 -= *v4++ * sum3;
915 
916       tmp[row++] = sum1;
917       tmp[row++] = sum2;
918       tmp[row++] = sum3;
919       tmp[row++] = sum4;
920       break;
921     case 5:
922       sum1 = b[*r++];
923       sum2 = b[*r++];
924       sum3 = b[*r++];
925       sum4 = b[*r++];
926       sum5 = b[*r++];
927       v2   = aa + ai[row + 1];
928       v3   = aa + ai[row + 2];
929       v4   = aa + ai[row + 3];
930       v5   = aa + ai[row + 4];
931 
932       for (j = 0; j < nz - 1; j += 2) {
933         i0 = vi[0];
934         i1 = vi[1];
935         vi += 2;
936         tmp0 = tmps[i0];
937         tmp1 = tmps[i1];
938         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
939         v1 += 2;
940         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
941         v2 += 2;
942         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
943         v3 += 2;
944         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
945         v4 += 2;
946         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
947         v5 += 2;
948       }
949       if (j == nz - 1) {
950         tmp0 = tmps[*vi++];
951         sum1 -= *v1++ * tmp0;
952         sum2 -= *v2++ * tmp0;
953         sum3 -= *v3++ * tmp0;
954         sum4 -= *v4++ * tmp0;
955         sum5 -= *v5++ * tmp0;
956       }
957 
958       sum2 -= *v2++ * sum1;
959       sum3 -= *v3++ * sum1;
960       sum4 -= *v4++ * sum1;
961       sum5 -= *v5++ * sum1;
962       sum3 -= *v3++ * sum2;
963       sum4 -= *v4++ * sum2;
964       sum5 -= *v5++ * sum2;
965       sum4 -= *v4++ * sum3;
966       sum5 -= *v5++ * sum3;
967       sum5 -= *v5++ * sum4;
968 
969       tmp[row++] = sum1;
970       tmp[row++] = sum2;
971       tmp[row++] = sum3;
972       tmp[row++] = sum4;
973       tmp[row++] = sum5;
974       break;
975     default:
976       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
977     }
978   }
979   /* backward solve the upper triangular */
980   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
981     nsz = ns[i];
982     aii = ai[row + 1] - 1;
983     v1  = aa + aii;
984     vi  = aj + aii;
985     nz  = aii - ad[row];
986     switch (nsz) { /* Each loop in 'case' is unrolled */
987     case 1:
988       sum1 = tmp[row];
989 
990       for (j = nz; j > 1; j -= 2) {
991         vi -= 2;
992         i0   = vi[2];
993         i1   = vi[1];
994         tmp0 = tmps[i0];
995         tmp1 = tmps[i1];
996         v1 -= 2;
997         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
998       }
999       if (j == 1) {
1000         tmp0 = tmps[*vi--];
1001         sum1 -= *v1-- * tmp0;
1002       }
1003       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1004       row--;
1005       break;
1006     case 2:
1007       sum1 = tmp[row];
1008       sum2 = tmp[row - 1];
1009       v2   = aa + ai[row] - 1;
1010       for (j = nz; j > 1; j -= 2) {
1011         vi -= 2;
1012         i0   = vi[2];
1013         i1   = vi[1];
1014         tmp0 = tmps[i0];
1015         tmp1 = tmps[i1];
1016         v1 -= 2;
1017         v2 -= 2;
1018         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1019         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1020       }
1021       if (j == 1) {
1022         tmp0 = tmps[*vi--];
1023         sum1 -= *v1-- * tmp0;
1024         sum2 -= *v2-- * tmp0;
1025       }
1026 
1027       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1028       row--;
1029       sum2 -= *v2-- * tmp0;
1030       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1031       row--;
1032       break;
1033     case 3:
1034       sum1 = tmp[row];
1035       sum2 = tmp[row - 1];
1036       sum3 = tmp[row - 2];
1037       v2   = aa + ai[row] - 1;
1038       v3   = aa + ai[row - 1] - 1;
1039       for (j = nz; j > 1; j -= 2) {
1040         vi -= 2;
1041         i0   = vi[2];
1042         i1   = vi[1];
1043         tmp0 = tmps[i0];
1044         tmp1 = tmps[i1];
1045         v1 -= 2;
1046         v2 -= 2;
1047         v3 -= 2;
1048         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1049         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1050         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1051       }
1052       if (j == 1) {
1053         tmp0 = tmps[*vi--];
1054         sum1 -= *v1-- * tmp0;
1055         sum2 -= *v2-- * tmp0;
1056         sum3 -= *v3-- * tmp0;
1057       }
1058       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1059       row--;
1060       sum2 -= *v2-- * tmp0;
1061       sum3 -= *v3-- * tmp0;
1062       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1063       row--;
1064       sum3 -= *v3-- * tmp0;
1065       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1066       row--;
1067 
1068       break;
1069     case 4:
1070       sum1 = tmp[row];
1071       sum2 = tmp[row - 1];
1072       sum3 = tmp[row - 2];
1073       sum4 = tmp[row - 3];
1074       v2   = aa + ai[row] - 1;
1075       v3   = aa + ai[row - 1] - 1;
1076       v4   = aa + ai[row - 2] - 1;
1077 
1078       for (j = nz; j > 1; j -= 2) {
1079         vi -= 2;
1080         i0   = vi[2];
1081         i1   = vi[1];
1082         tmp0 = tmps[i0];
1083         tmp1 = tmps[i1];
1084         v1 -= 2;
1085         v2 -= 2;
1086         v3 -= 2;
1087         v4 -= 2;
1088         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1089         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1090         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1091         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1092       }
1093       if (j == 1) {
1094         tmp0 = tmps[*vi--];
1095         sum1 -= *v1-- * tmp0;
1096         sum2 -= *v2-- * tmp0;
1097         sum3 -= *v3-- * tmp0;
1098         sum4 -= *v4-- * tmp0;
1099       }
1100 
1101       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1102       row--;
1103       sum2 -= *v2-- * tmp0;
1104       sum3 -= *v3-- * tmp0;
1105       sum4 -= *v4-- * tmp0;
1106       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1107       row--;
1108       sum3 -= *v3-- * tmp0;
1109       sum4 -= *v4-- * tmp0;
1110       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1111       row--;
1112       sum4 -= *v4-- * tmp0;
1113       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1114       row--;
1115       break;
1116     case 5:
1117       sum1 = tmp[row];
1118       sum2 = tmp[row - 1];
1119       sum3 = tmp[row - 2];
1120       sum4 = tmp[row - 3];
1121       sum5 = tmp[row - 4];
1122       v2   = aa + ai[row] - 1;
1123       v3   = aa + ai[row - 1] - 1;
1124       v4   = aa + ai[row - 2] - 1;
1125       v5   = aa + ai[row - 3] - 1;
1126       for (j = nz; j > 1; j -= 2) {
1127         vi -= 2;
1128         i0   = vi[2];
1129         i1   = vi[1];
1130         tmp0 = tmps[i0];
1131         tmp1 = tmps[i1];
1132         v1 -= 2;
1133         v2 -= 2;
1134         v3 -= 2;
1135         v4 -= 2;
1136         v5 -= 2;
1137         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142       }
1143       if (j == 1) {
1144         tmp0 = tmps[*vi--];
1145         sum1 -= *v1-- * tmp0;
1146         sum2 -= *v2-- * tmp0;
1147         sum3 -= *v3-- * tmp0;
1148         sum4 -= *v4-- * tmp0;
1149         sum5 -= *v5-- * tmp0;
1150       }
1151 
1152       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1153       row--;
1154       sum2 -= *v2-- * tmp0;
1155       sum3 -= *v3-- * tmp0;
1156       sum4 -= *v4-- * tmp0;
1157       sum5 -= *v5-- * tmp0;
1158       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1159       row--;
1160       sum3 -= *v3-- * tmp0;
1161       sum4 -= *v4-- * tmp0;
1162       sum5 -= *v5-- * tmp0;
1163       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1164       row--;
1165       sum4 -= *v4-- * tmp0;
1166       sum5 -= *v5-- * tmp0;
1167       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1168       row--;
1169       sum5 -= *v5-- * tmp0;
1170       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1171       row--;
1172       break;
1173     default:
1174       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1175     }
1176   }
1177   PetscCall(ISRestoreIndices(isrow, &rout));
1178   PetscCall(ISRestoreIndices(iscol, &cout));
1179   PetscCall(VecRestoreArrayRead(bb, &b));
1180   PetscCall(VecRestoreArrayWrite(xx, &x));
1181   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1182   PetscFunctionReturn(PETSC_SUCCESS);
1183 }
1184 
1185 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1186 {
1187   Mat              C = B;
1188   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1189   IS               isrow = b->row, isicol = b->icol;
1190   const PetscInt  *r, *ic, *ics;
1191   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1192   PetscInt         i, j, k, nz, nzL, row, *pj;
1193   const PetscInt  *ajtmp, *bjtmp;
1194   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1195   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1196   FactorShiftCtx   sctx;
1197   const PetscInt  *ddiag;
1198   PetscReal        rs;
1199   MatScalar        d;
1200   PetscInt         inod, nodesz, node_max, col;
1201   const PetscInt  *ns;
1202   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1203 
1204   PetscFunctionBegin;
1205   /* MatPivotSetUp(): initialize shift context sctx */
1206   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1207 
1208   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1209     ddiag          = a->diag;
1210     sctx.shift_top = info->zeropivot;
1211     for (i = 0; i < n; i++) {
1212       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1213       d  = (aa)[ddiag[i]];
1214       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1215       v  = aa + ai[i];
1216       nz = ai[i + 1] - ai[i];
1217       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1218       if (rs > sctx.shift_top) sctx.shift_top = rs;
1219     }
1220     sctx.shift_top *= 1.1;
1221     sctx.nshift_max = 5;
1222     sctx.shift_lo   = 0.;
1223     sctx.shift_hi   = 1.;
1224   }
1225 
1226   PetscCall(ISGetIndices(isrow, &r));
1227   PetscCall(ISGetIndices(isicol, &ic));
1228 
1229   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1230   ics = ic;
1231 
1232   node_max = a->inode.node_count;
1233   ns       = a->inode.size;
1234   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1235 
1236   /* If max inode size > 4, split it into two inodes.*/
1237   /* also map the inode sizes according to the ordering */
1238   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1239   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1240     if (ns[i] > 4) {
1241       tmp_vec1[j] = 4;
1242       ++j;
1243       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1244     } else {
1245       tmp_vec1[j] = ns[i];
1246     }
1247   }
1248   /* Use the correct node_max */
1249   node_max = j;
1250 
1251   /* Now reorder the inode info based on mat re-ordering info */
1252   /* First create a row -> inode_size_array_index map */
1253   PetscCall(PetscMalloc1(n + 1, &nsmap));
1254   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1255   for (i = 0, row = 0; i < node_max; i++) {
1256     nodesz = tmp_vec1[i];
1257     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1258   }
1259   /* Using nsmap, create a reordered ns structure */
1260   for (i = 0, j = 0; i < node_max; i++) {
1261     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1262     tmp_vec2[i] = nodesz;
1263     j += nodesz;
1264   }
1265   PetscCall(PetscFree(nsmap));
1266   PetscCall(PetscFree(tmp_vec1));
1267 
1268   /* Now use the correct ns */
1269   ns = tmp_vec2;
1270 
1271   do {
1272     sctx.newshift = PETSC_FALSE;
1273     /* Now loop over each block-row, and do the factorization */
1274     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1275       nodesz = ns[inod];
1276 
1277       switch (nodesz) {
1278       case 1:
1279         /* zero rtmp1 */
1280         /* L part */
1281         nz    = bi[i + 1] - bi[i];
1282         bjtmp = bj + bi[i];
1283         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1284 
1285         /* U part */
1286         nz    = bdiag[i] - bdiag[i + 1];
1287         bjtmp = bj + bdiag[i + 1] + 1;
1288         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1289 
1290         /* load in initial (unfactored row) */
1291         nz    = ai[r[i] + 1] - ai[r[i]];
1292         ajtmp = aj + ai[r[i]];
1293         v     = aa + ai[r[i]];
1294         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1295 
1296         /* ZeropivotApply() */
1297         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1298 
1299         /* elimination */
1300         bjtmp = bj + bi[i];
1301         row   = *bjtmp++;
1302         nzL   = bi[i + 1] - bi[i];
1303         for (k = 0; k < nzL; k++) {
1304           pc = rtmp1 + row;
1305           if (*pc != 0.0) {
1306             pv   = b->a + bdiag[row];
1307             mul1 = *pc * (*pv);
1308             *pc  = mul1;
1309             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1310             pv   = b->a + bdiag[row + 1] + 1;
1311             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1312             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1313             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1314           }
1315           row = *bjtmp++;
1316         }
1317 
1318         /* finished row so stick it into b->a */
1319         rs = 0.0;
1320         /* L part */
1321         pv = b->a + bi[i];
1322         pj = b->j + bi[i];
1323         nz = bi[i + 1] - bi[i];
1324         for (j = 0; j < nz; j++) {
1325           pv[j] = rtmp1[pj[j]];
1326           rs += PetscAbsScalar(pv[j]);
1327         }
1328 
1329         /* U part */
1330         pv = b->a + bdiag[i + 1] + 1;
1331         pj = b->j + bdiag[i + 1] + 1;
1332         nz = bdiag[i] - bdiag[i + 1] - 1;
1333         for (j = 0; j < nz; j++) {
1334           pv[j] = rtmp1[pj[j]];
1335           rs += PetscAbsScalar(pv[j]);
1336         }
1337 
1338         /* Check zero pivot */
1339         sctx.rs = rs;
1340         sctx.pv = rtmp1[i];
1341         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1342         if (sctx.newshift) break;
1343 
1344         /* Mark diagonal and invert diagonal for simpler triangular solves */
1345         pv  = b->a + bdiag[i];
1346         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1347         break;
1348 
1349       case 2:
1350         /* zero rtmp1 and rtmp2 */
1351         /* L part */
1352         nz    = bi[i + 1] - bi[i];
1353         bjtmp = bj + bi[i];
1354         for (j = 0; j < nz; j++) {
1355           col        = bjtmp[j];
1356           rtmp1[col] = 0.0;
1357           rtmp2[col] = 0.0;
1358         }
1359 
1360         /* U part */
1361         nz    = bdiag[i] - bdiag[i + 1];
1362         bjtmp = bj + bdiag[i + 1] + 1;
1363         for (j = 0; j < nz; j++) {
1364           col        = bjtmp[j];
1365           rtmp1[col] = 0.0;
1366           rtmp2[col] = 0.0;
1367         }
1368 
1369         /* load in initial (unfactored row) */
1370         nz    = ai[r[i] + 1] - ai[r[i]];
1371         ajtmp = aj + ai[r[i]];
1372         v1    = aa + ai[r[i]];
1373         v2    = aa + ai[r[i] + 1];
1374         for (j = 0; j < nz; j++) {
1375           col        = ics[ajtmp[j]];
1376           rtmp1[col] = v1[j];
1377           rtmp2[col] = v2[j];
1378         }
1379         /* ZeropivotApply(): shift the diagonal of the matrix  */
1380         rtmp1[i] += sctx.shift_amount;
1381         rtmp2[i + 1] += sctx.shift_amount;
1382 
1383         /* elimination */
1384         bjtmp = bj + bi[i];
1385         row   = *bjtmp++; /* pivot row */
1386         nzL   = bi[i + 1] - bi[i];
1387         for (k = 0; k < nzL; k++) {
1388           pc1 = rtmp1 + row;
1389           pc2 = rtmp2 + row;
1390           if (*pc1 != 0.0 || *pc2 != 0.0) {
1391             pv   = b->a + bdiag[row];
1392             mul1 = *pc1 * (*pv);
1393             mul2 = *pc2 * (*pv);
1394             *pc1 = mul1;
1395             *pc2 = mul2;
1396 
1397             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1398             pv = b->a + bdiag[row + 1] + 1;
1399             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1400             for (j = 0; j < nz; j++) {
1401               col = pj[j];
1402               rtmp1[col] -= mul1 * pv[j];
1403               rtmp2[col] -= mul2 * pv[j];
1404             }
1405             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1406           }
1407           row = *bjtmp++;
1408         }
1409 
1410         /* finished row i; check zero pivot, then stick row i into b->a */
1411         rs = 0.0;
1412         /* L part */
1413         pc1 = b->a + bi[i];
1414         pj  = b->j + bi[i];
1415         nz  = bi[i + 1] - bi[i];
1416         for (j = 0; j < nz; j++) {
1417           col    = pj[j];
1418           pc1[j] = rtmp1[col];
1419           rs += PetscAbsScalar(pc1[j]);
1420         }
1421         /* U part */
1422         pc1 = b->a + bdiag[i + 1] + 1;
1423         pj  = b->j + bdiag[i + 1] + 1;
1424         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1425         for (j = 0; j < nz; j++) {
1426           col    = pj[j];
1427           pc1[j] = rtmp1[col];
1428           rs += PetscAbsScalar(pc1[j]);
1429         }
1430 
1431         sctx.rs = rs;
1432         sctx.pv = rtmp1[i];
1433         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1434         if (sctx.newshift) break;
1435         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1436         *pc1 = 1.0 / sctx.pv;
1437 
1438         /* Now take care of diagonal 2x2 block. */
1439         pc2 = rtmp2 + i;
1440         if (*pc2 != 0.0) {
1441           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1442           *pc2 = mul1;                        /* insert L entry */
1443           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1444           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1445           for (j = 0; j < nz; j++) {
1446             col = pj[j];
1447             rtmp2[col] -= mul1 * rtmp1[col];
1448           }
1449           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1450         }
1451 
1452         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1453         rs = 0.0;
1454         /* L part */
1455         pc2 = b->a + bi[i + 1];
1456         pj  = b->j + bi[i + 1];
1457         nz  = bi[i + 2] - bi[i + 1];
1458         for (j = 0; j < nz; j++) {
1459           col    = pj[j];
1460           pc2[j] = rtmp2[col];
1461           rs += PetscAbsScalar(pc2[j]);
1462         }
1463         /* U part */
1464         pc2 = b->a + bdiag[i + 2] + 1;
1465         pj  = b->j + bdiag[i + 2] + 1;
1466         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1467         for (j = 0; j < nz; j++) {
1468           col    = pj[j];
1469           pc2[j] = rtmp2[col];
1470           rs += PetscAbsScalar(pc2[j]);
1471         }
1472 
1473         sctx.rs = rs;
1474         sctx.pv = rtmp2[i + 1];
1475         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1476         if (sctx.newshift) break;
1477         pc2  = b->a + bdiag[i + 1];
1478         *pc2 = 1.0 / sctx.pv;
1479         break;
1480 
1481       case 3:
1482         /* zero rtmp */
1483         /* L part */
1484         nz    = bi[i + 1] - bi[i];
1485         bjtmp = bj + bi[i];
1486         for (j = 0; j < nz; j++) {
1487           col        = bjtmp[j];
1488           rtmp1[col] = 0.0;
1489           rtmp2[col] = 0.0;
1490           rtmp3[col] = 0.0;
1491         }
1492 
1493         /* U part */
1494         nz    = bdiag[i] - bdiag[i + 1];
1495         bjtmp = bj + bdiag[i + 1] + 1;
1496         for (j = 0; j < nz; j++) {
1497           col        = bjtmp[j];
1498           rtmp1[col] = 0.0;
1499           rtmp2[col] = 0.0;
1500           rtmp3[col] = 0.0;
1501         }
1502 
1503         /* load in initial (unfactored row) */
1504         nz    = ai[r[i] + 1] - ai[r[i]];
1505         ajtmp = aj + ai[r[i]];
1506         v1    = aa + ai[r[i]];
1507         v2    = aa + ai[r[i] + 1];
1508         v3    = aa + ai[r[i] + 2];
1509         for (j = 0; j < nz; j++) {
1510           col        = ics[ajtmp[j]];
1511           rtmp1[col] = v1[j];
1512           rtmp2[col] = v2[j];
1513           rtmp3[col] = v3[j];
1514         }
1515         /* ZeropivotApply(): shift the diagonal of the matrix  */
1516         rtmp1[i] += sctx.shift_amount;
1517         rtmp2[i + 1] += sctx.shift_amount;
1518         rtmp3[i + 2] += sctx.shift_amount;
1519 
1520         /* elimination */
1521         bjtmp = bj + bi[i];
1522         row   = *bjtmp++; /* pivot row */
1523         nzL   = bi[i + 1] - bi[i];
1524         for (k = 0; k < nzL; k++) {
1525           pc1 = rtmp1 + row;
1526           pc2 = rtmp2 + row;
1527           pc3 = rtmp3 + row;
1528           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1529             pv   = b->a + bdiag[row];
1530             mul1 = *pc1 * (*pv);
1531             mul2 = *pc2 * (*pv);
1532             mul3 = *pc3 * (*pv);
1533             *pc1 = mul1;
1534             *pc2 = mul2;
1535             *pc3 = mul3;
1536 
1537             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1538             pv = b->a + bdiag[row + 1] + 1;
1539             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1540             for (j = 0; j < nz; j++) {
1541               col = pj[j];
1542               rtmp1[col] -= mul1 * pv[j];
1543               rtmp2[col] -= mul2 * pv[j];
1544               rtmp3[col] -= mul3 * pv[j];
1545             }
1546             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1547           }
1548           row = *bjtmp++;
1549         }
1550 
1551         /* finished row i; check zero pivot, then stick row i into b->a */
1552         rs = 0.0;
1553         /* L part */
1554         pc1 = b->a + bi[i];
1555         pj  = b->j + bi[i];
1556         nz  = bi[i + 1] - bi[i];
1557         for (j = 0; j < nz; j++) {
1558           col    = pj[j];
1559           pc1[j] = rtmp1[col];
1560           rs += PetscAbsScalar(pc1[j]);
1561         }
1562         /* U part */
1563         pc1 = b->a + bdiag[i + 1] + 1;
1564         pj  = b->j + bdiag[i + 1] + 1;
1565         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1566         for (j = 0; j < nz; j++) {
1567           col    = pj[j];
1568           pc1[j] = rtmp1[col];
1569           rs += PetscAbsScalar(pc1[j]);
1570         }
1571 
1572         sctx.rs = rs;
1573         sctx.pv = rtmp1[i];
1574         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1575         if (sctx.newshift) break;
1576         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1577         *pc1 = 1.0 / sctx.pv;
1578 
1579         /* Now take care of 1st column of diagonal 3x3 block. */
1580         pc2 = rtmp2 + i;
1581         pc3 = rtmp3 + i;
1582         if (*pc2 != 0.0 || *pc3 != 0.0) {
1583           mul2 = (*pc2) * (*pc1);
1584           *pc2 = mul2;
1585           mul3 = (*pc3) * (*pc1);
1586           *pc3 = mul3;
1587           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1588           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1589           for (j = 0; j < nz; j++) {
1590             col = pj[j];
1591             rtmp2[col] -= mul2 * rtmp1[col];
1592             rtmp3[col] -= mul3 * rtmp1[col];
1593           }
1594           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1595         }
1596 
1597         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1598         rs = 0.0;
1599         /* L part */
1600         pc2 = b->a + bi[i + 1];
1601         pj  = b->j + bi[i + 1];
1602         nz  = bi[i + 2] - bi[i + 1];
1603         for (j = 0; j < nz; j++) {
1604           col    = pj[j];
1605           pc2[j] = rtmp2[col];
1606           rs += PetscAbsScalar(pc2[j]);
1607         }
1608         /* U part */
1609         pc2 = b->a + bdiag[i + 2] + 1;
1610         pj  = b->j + bdiag[i + 2] + 1;
1611         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1612         for (j = 0; j < nz; j++) {
1613           col    = pj[j];
1614           pc2[j] = rtmp2[col];
1615           rs += PetscAbsScalar(pc2[j]);
1616         }
1617 
1618         sctx.rs = rs;
1619         sctx.pv = rtmp2[i + 1];
1620         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1621         if (sctx.newshift) break;
1622         pc2  = b->a + bdiag[i + 1];
1623         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1624 
1625         /* Now take care of 2nd column of diagonal 3x3 block. */
1626         pc3 = rtmp3 + i + 1;
1627         if (*pc3 != 0.0) {
1628           mul3 = (*pc3) * (*pc2);
1629           *pc3 = mul3;
1630           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1631           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1632           for (j = 0; j < nz; j++) {
1633             col = pj[j];
1634             rtmp3[col] -= mul3 * rtmp2[col];
1635           }
1636           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1637         }
1638 
1639         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1640         rs = 0.0;
1641         /* L part */
1642         pc3 = b->a + bi[i + 2];
1643         pj  = b->j + bi[i + 2];
1644         nz  = bi[i + 3] - bi[i + 2];
1645         for (j = 0; j < nz; j++) {
1646           col    = pj[j];
1647           pc3[j] = rtmp3[col];
1648           rs += PetscAbsScalar(pc3[j]);
1649         }
1650         /* U part */
1651         pc3 = b->a + bdiag[i + 3] + 1;
1652         pj  = b->j + bdiag[i + 3] + 1;
1653         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1654         for (j = 0; j < nz; j++) {
1655           col    = pj[j];
1656           pc3[j] = rtmp3[col];
1657           rs += PetscAbsScalar(pc3[j]);
1658         }
1659 
1660         sctx.rs = rs;
1661         sctx.pv = rtmp3[i + 2];
1662         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1663         if (sctx.newshift) break;
1664         pc3  = b->a + bdiag[i + 2];
1665         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1666         break;
1667       case 4:
1668         /* zero rtmp */
1669         /* L part */
1670         nz    = bi[i + 1] - bi[i];
1671         bjtmp = bj + bi[i];
1672         for (j = 0; j < nz; j++) {
1673           col        = bjtmp[j];
1674           rtmp1[col] = 0.0;
1675           rtmp2[col] = 0.0;
1676           rtmp3[col] = 0.0;
1677           rtmp4[col] = 0.0;
1678         }
1679 
1680         /* U part */
1681         nz    = bdiag[i] - bdiag[i + 1];
1682         bjtmp = bj + bdiag[i + 1] + 1;
1683         for (j = 0; j < nz; j++) {
1684           col        = bjtmp[j];
1685           rtmp1[col] = 0.0;
1686           rtmp2[col] = 0.0;
1687           rtmp3[col] = 0.0;
1688           rtmp4[col] = 0.0;
1689         }
1690 
1691         /* load in initial (unfactored row) */
1692         nz    = ai[r[i] + 1] - ai[r[i]];
1693         ajtmp = aj + ai[r[i]];
1694         v1    = aa + ai[r[i]];
1695         v2    = aa + ai[r[i] + 1];
1696         v3    = aa + ai[r[i] + 2];
1697         v4    = aa + ai[r[i] + 3];
1698         for (j = 0; j < nz; j++) {
1699           col        = ics[ajtmp[j]];
1700           rtmp1[col] = v1[j];
1701           rtmp2[col] = v2[j];
1702           rtmp3[col] = v3[j];
1703           rtmp4[col] = v4[j];
1704         }
1705         /* ZeropivotApply(): shift the diagonal of the matrix  */
1706         rtmp1[i] += sctx.shift_amount;
1707         rtmp2[i + 1] += sctx.shift_amount;
1708         rtmp3[i + 2] += sctx.shift_amount;
1709         rtmp4[i + 3] += sctx.shift_amount;
1710 
1711         /* elimination */
1712         bjtmp = bj + bi[i];
1713         row   = *bjtmp++; /* pivot row */
1714         nzL   = bi[i + 1] - bi[i];
1715         for (k = 0; k < nzL; k++) {
1716           pc1 = rtmp1 + row;
1717           pc2 = rtmp2 + row;
1718           pc3 = rtmp3 + row;
1719           pc4 = rtmp4 + row;
1720           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1721             pv   = b->a + bdiag[row];
1722             mul1 = *pc1 * (*pv);
1723             mul2 = *pc2 * (*pv);
1724             mul3 = *pc3 * (*pv);
1725             mul4 = *pc4 * (*pv);
1726             *pc1 = mul1;
1727             *pc2 = mul2;
1728             *pc3 = mul3;
1729             *pc4 = mul4;
1730 
1731             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1732             pv = b->a + bdiag[row + 1] + 1;
1733             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1734             for (j = 0; j < nz; j++) {
1735               col = pj[j];
1736               rtmp1[col] -= mul1 * pv[j];
1737               rtmp2[col] -= mul2 * pv[j];
1738               rtmp3[col] -= mul3 * pv[j];
1739               rtmp4[col] -= mul4 * pv[j];
1740             }
1741             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1742           }
1743           row = *bjtmp++;
1744         }
1745 
1746         /* finished row i; check zero pivot, then stick row i into b->a */
1747         rs = 0.0;
1748         /* L part */
1749         pc1 = b->a + bi[i];
1750         pj  = b->j + bi[i];
1751         nz  = bi[i + 1] - bi[i];
1752         for (j = 0; j < nz; j++) {
1753           col    = pj[j];
1754           pc1[j] = rtmp1[col];
1755           rs += PetscAbsScalar(pc1[j]);
1756         }
1757         /* U part */
1758         pc1 = b->a + bdiag[i + 1] + 1;
1759         pj  = b->j + bdiag[i + 1] + 1;
1760         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1761         for (j = 0; j < nz; j++) {
1762           col    = pj[j];
1763           pc1[j] = rtmp1[col];
1764           rs += PetscAbsScalar(pc1[j]);
1765         }
1766 
1767         sctx.rs = rs;
1768         sctx.pv = rtmp1[i];
1769         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1770         if (sctx.newshift) break;
1771         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1772         *pc1 = 1.0 / sctx.pv;
1773 
1774         /* Now take care of 1st column of diagonal 4x4 block. */
1775         pc2 = rtmp2 + i;
1776         pc3 = rtmp3 + i;
1777         pc4 = rtmp4 + i;
1778         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1779           mul2 = (*pc2) * (*pc1);
1780           *pc2 = mul2;
1781           mul3 = (*pc3) * (*pc1);
1782           *pc3 = mul3;
1783           mul4 = (*pc4) * (*pc1);
1784           *pc4 = mul4;
1785           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1786           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1787           for (j = 0; j < nz; j++) {
1788             col = pj[j];
1789             rtmp2[col] -= mul2 * rtmp1[col];
1790             rtmp3[col] -= mul3 * rtmp1[col];
1791             rtmp4[col] -= mul4 * rtmp1[col];
1792           }
1793           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1794         }
1795 
1796         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1797         rs = 0.0;
1798         /* L part */
1799         pc2 = b->a + bi[i + 1];
1800         pj  = b->j + bi[i + 1];
1801         nz  = bi[i + 2] - bi[i + 1];
1802         for (j = 0; j < nz; j++) {
1803           col    = pj[j];
1804           pc2[j] = rtmp2[col];
1805           rs += PetscAbsScalar(pc2[j]);
1806         }
1807         /* U part */
1808         pc2 = b->a + bdiag[i + 2] + 1;
1809         pj  = b->j + bdiag[i + 2] + 1;
1810         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1811         for (j = 0; j < nz; j++) {
1812           col    = pj[j];
1813           pc2[j] = rtmp2[col];
1814           rs += PetscAbsScalar(pc2[j]);
1815         }
1816 
1817         sctx.rs = rs;
1818         sctx.pv = rtmp2[i + 1];
1819         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1820         if (sctx.newshift) break;
1821         pc2  = b->a + bdiag[i + 1];
1822         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1823 
1824         /* Now take care of 2nd column of diagonal 4x4 block. */
1825         pc3 = rtmp3 + i + 1;
1826         pc4 = rtmp4 + i + 1;
1827         if (*pc3 != 0.0 || *pc4 != 0.0) {
1828           mul3 = (*pc3) * (*pc2);
1829           *pc3 = mul3;
1830           mul4 = (*pc4) * (*pc2);
1831           *pc4 = mul4;
1832           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1833           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1834           for (j = 0; j < nz; j++) {
1835             col = pj[j];
1836             rtmp3[col] -= mul3 * rtmp2[col];
1837             rtmp4[col] -= mul4 * rtmp2[col];
1838           }
1839           PetscCall(PetscLogFlops(4.0 * nz));
1840         }
1841 
1842         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1843         rs = 0.0;
1844         /* L part */
1845         pc3 = b->a + bi[i + 2];
1846         pj  = b->j + bi[i + 2];
1847         nz  = bi[i + 3] - bi[i + 2];
1848         for (j = 0; j < nz; j++) {
1849           col    = pj[j];
1850           pc3[j] = rtmp3[col];
1851           rs += PetscAbsScalar(pc3[j]);
1852         }
1853         /* U part */
1854         pc3 = b->a + bdiag[i + 3] + 1;
1855         pj  = b->j + bdiag[i + 3] + 1;
1856         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1857         for (j = 0; j < nz; j++) {
1858           col    = pj[j];
1859           pc3[j] = rtmp3[col];
1860           rs += PetscAbsScalar(pc3[j]);
1861         }
1862 
1863         sctx.rs = rs;
1864         sctx.pv = rtmp3[i + 2];
1865         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1866         if (sctx.newshift) break;
1867         pc3  = b->a + bdiag[i + 2];
1868         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1869 
1870         /* Now take care of 3rd column of diagonal 4x4 block. */
1871         pc4 = rtmp4 + i + 2;
1872         if (*pc4 != 0.0) {
1873           mul4 = (*pc4) * (*pc3);
1874           *pc4 = mul4;
1875           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1876           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1877           for (j = 0; j < nz; j++) {
1878             col = pj[j];
1879             rtmp4[col] -= mul4 * rtmp3[col];
1880           }
1881           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1882         }
1883 
1884         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1885         rs = 0.0;
1886         /* L part */
1887         pc4 = b->a + bi[i + 3];
1888         pj  = b->j + bi[i + 3];
1889         nz  = bi[i + 4] - bi[i + 3];
1890         for (j = 0; j < nz; j++) {
1891           col    = pj[j];
1892           pc4[j] = rtmp4[col];
1893           rs += PetscAbsScalar(pc4[j]);
1894         }
1895         /* U part */
1896         pc4 = b->a + bdiag[i + 4] + 1;
1897         pj  = b->j + bdiag[i + 4] + 1;
1898         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1899         for (j = 0; j < nz; j++) {
1900           col    = pj[j];
1901           pc4[j] = rtmp4[col];
1902           rs += PetscAbsScalar(pc4[j]);
1903         }
1904 
1905         sctx.rs = rs;
1906         sctx.pv = rtmp4[i + 3];
1907         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1908         if (sctx.newshift) break;
1909         pc4  = b->a + bdiag[i + 3];
1910         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1911         break;
1912 
1913       default:
1914         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1915       }
1916       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1917       i += nodesz;              /* Update the row */
1918     }
1919 
1920     /* MatPivotRefine() */
1921     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1922       /*
1923        * if no shift in this attempt & shifting & started shifting & can refine,
1924        * then try lower shift
1925        */
1926       sctx.shift_hi       = sctx.shift_fraction;
1927       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1928       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1929       sctx.newshift       = PETSC_TRUE;
1930       sctx.nshift++;
1931     }
1932   } while (sctx.newshift);
1933 
1934   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1935   PetscCall(PetscFree(tmp_vec2));
1936   PetscCall(ISRestoreIndices(isicol, &ic));
1937   PetscCall(ISRestoreIndices(isrow, &r));
1938 
1939   if (b->inode.size) {
1940     C->ops->solve = MatSolve_SeqAIJ_Inode;
1941   } else {
1942     C->ops->solve = MatSolve_SeqAIJ;
1943   }
1944   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1945   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1946   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1947   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1948   C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1949   C->assembled              = PETSC_TRUE;
1950   C->preallocated           = PETSC_TRUE;
1951 
1952   PetscCall(PetscLogFlops(C->cmap->n));
1953 
1954   /* MatShiftView(A,info,&sctx) */
1955   if (sctx.nshift) {
1956     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1957       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1958     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1959       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1960     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1961       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1962     }
1963   }
1964   PetscFunctionReturn(PETSC_SUCCESS);
1965 }
1966 
1967 #if 0
1968 // unused
1969 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1970 {
1971   Mat              C = B;
1972   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1973   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1974   const PetscInt  *r, *ic, *c, *ics;
1975   PetscInt         n = A->rmap->n, *bi = b->i;
1976   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1977   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1978   PetscInt        *ai = a->i, *aj = a->j;
1979   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1980   PetscScalar      mul1, mul2, mul3, tmp;
1981   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1982   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1983   PetscReal        rs = 0.0;
1984   FactorShiftCtx   sctx;
1985 
1986   PetscFunctionBegin;
1987   sctx.shift_top      = 0;
1988   sctx.nshift_max     = 0;
1989   sctx.shift_lo       = 0;
1990   sctx.shift_hi       = 0;
1991   sctx.shift_fraction = 0;
1992 
1993   /* if both shift schemes are chosen by user, only use info->shiftpd */
1994   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1995     sctx.shift_top = 0;
1996     for (i = 0; i < n; i++) {
1997       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1998       rs    = 0.0;
1999       ajtmp = aj + ai[i];
2000       rtmp1 = aa + ai[i];
2001       nz    = ai[i + 1] - ai[i];
2002       for (j = 0; j < nz; j++) {
2003         if (*ajtmp != i) {
2004           rs += PetscAbsScalar(*rtmp1++);
2005         } else {
2006           rs -= PetscRealPart(*rtmp1++);
2007         }
2008         ajtmp++;
2009       }
2010       if (rs > sctx.shift_top) sctx.shift_top = rs;
2011     }
2012     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2013     sctx.shift_top *= 1.1;
2014     sctx.nshift_max = 5;
2015     sctx.shift_lo   = 0.;
2016     sctx.shift_hi   = 1.;
2017   }
2018   sctx.shift_amount = 0;
2019   sctx.nshift       = 0;
2020 
2021   PetscCall(ISGetIndices(isrow, &r));
2022   PetscCall(ISGetIndices(iscol, &c));
2023   PetscCall(ISGetIndices(isicol, &ic));
2024   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2025   ics = ic;
2026 
2027   node_max = a->inode.node_count;
2028   ns       = a->inode.size;
2029   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2030 
2031   /* If max inode size > 3, split it into two inodes.*/
2032   /* also map the inode sizes according to the ordering */
2033   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2034   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2035     if (ns[i] > 3) {
2036       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2037       ++j;
2038       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2039     } else {
2040       tmp_vec1[j] = ns[i];
2041     }
2042   }
2043   /* Use the correct node_max */
2044   node_max = j;
2045 
2046   /* Now reorder the inode info based on mat re-ordering info */
2047   /* First create a row -> inode_size_array_index map */
2048   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2049   for (i = 0, row = 0; i < node_max; i++) {
2050     nodesz = tmp_vec1[i];
2051     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2052   }
2053   /* Using nsmap, create a reordered ns structure */
2054   for (i = 0, j = 0; i < node_max; i++) {
2055     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2056     tmp_vec2[i] = nodesz;
2057     j += nodesz;
2058   }
2059   PetscCall(PetscFree2(nsmap, tmp_vec1));
2060   /* Now use the correct ns */
2061   ns = tmp_vec2;
2062 
2063   do {
2064     sctx.newshift = PETSC_FALSE;
2065     /* Now loop over each block-row, and do the factorization */
2066     for (i = 0, row = 0; i < node_max; i++) {
2067       nodesz = ns[i];
2068       nz     = bi[row + 1] - bi[row];
2069       bjtmp  = bj + bi[row];
2070 
2071       switch (nodesz) {
2072       case 1:
2073         for (j = 0; j < nz; j++) {
2074           idx         = bjtmp[j];
2075           rtmp11[idx] = 0.0;
2076         }
2077 
2078         /* load in initial (unfactored row) */
2079         idx    = r[row];
2080         nz_tmp = ai[idx + 1] - ai[idx];
2081         ajtmp  = aj + ai[idx];
2082         v1     = aa + ai[idx];
2083 
2084         for (j = 0; j < nz_tmp; j++) {
2085           idx         = ics[ajtmp[j]];
2086           rtmp11[idx] = v1[j];
2087         }
2088         rtmp11[ics[r[row]]] += sctx.shift_amount;
2089 
2090         prow = *bjtmp++;
2091         while (prow < row) {
2092           pc1 = rtmp11 + prow;
2093           if (*pc1 != 0.0) {
2094             pv     = ba + bd[prow];
2095             pj     = nbj + bd[prow];
2096             mul1   = *pc1 * *pv++;
2097             *pc1   = mul1;
2098             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2099             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2100             for (j = 0; j < nz_tmp; j++) {
2101               tmp = pv[j];
2102               idx = pj[j];
2103               rtmp11[idx] -= mul1 * tmp;
2104             }
2105           }
2106           prow = *bjtmp++;
2107         }
2108         pj  = bj + bi[row];
2109         pc1 = ba + bi[row];
2110 
2111         sctx.pv     = rtmp11[row];
2112         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2113         rs          = 0.0;
2114         for (j = 0; j < nz; j++) {
2115           idx    = pj[j];
2116           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2117           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2118         }
2119         sctx.rs = rs;
2120         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2121         if (sctx.newshift) goto endofwhile;
2122         break;
2123 
2124       case 2:
2125         for (j = 0; j < nz; j++) {
2126           idx         = bjtmp[j];
2127           rtmp11[idx] = 0.0;
2128           rtmp22[idx] = 0.0;
2129         }
2130 
2131         /* load in initial (unfactored row) */
2132         idx    = r[row];
2133         nz_tmp = ai[idx + 1] - ai[idx];
2134         ajtmp  = aj + ai[idx];
2135         v1     = aa + ai[idx];
2136         v2     = aa + ai[idx + 1];
2137         for (j = 0; j < nz_tmp; j++) {
2138           idx         = ics[ajtmp[j]];
2139           rtmp11[idx] = v1[j];
2140           rtmp22[idx] = v2[j];
2141         }
2142         rtmp11[ics[r[row]]] += sctx.shift_amount;
2143         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2144 
2145         prow = *bjtmp++;
2146         while (prow < row) {
2147           pc1 = rtmp11 + prow;
2148           pc2 = rtmp22 + prow;
2149           if (*pc1 != 0.0 || *pc2 != 0.0) {
2150             pv   = ba + bd[prow];
2151             pj   = nbj + bd[prow];
2152             mul1 = *pc1 * *pv;
2153             mul2 = *pc2 * *pv;
2154             ++pv;
2155             *pc1 = mul1;
2156             *pc2 = mul2;
2157 
2158             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2159             for (j = 0; j < nz_tmp; j++) {
2160               tmp = pv[j];
2161               idx = pj[j];
2162               rtmp11[idx] -= mul1 * tmp;
2163               rtmp22[idx] -= mul2 * tmp;
2164             }
2165             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2166           }
2167           prow = *bjtmp++;
2168         }
2169 
2170         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2171         pc1 = rtmp11 + prow;
2172         pc2 = rtmp22 + prow;
2173 
2174         sctx.pv = *pc1;
2175         pj      = bj + bi[prow];
2176         rs      = 0.0;
2177         for (j = 0; j < nz; j++) {
2178           idx = pj[j];
2179           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2180         }
2181         sctx.rs = rs;
2182         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2183         if (sctx.newshift) goto endofwhile;
2184 
2185         if (*pc2 != 0.0) {
2186           pj     = nbj + bd[prow];
2187           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2188           *pc2   = mul2;
2189           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2190           for (j = 0; j < nz_tmp; j++) {
2191             idx = pj[j];
2192             tmp = rtmp11[idx];
2193             rtmp22[idx] -= mul2 * tmp;
2194           }
2195           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2196         }
2197 
2198         pj  = bj + bi[row];
2199         pc1 = ba + bi[row];
2200         pc2 = ba + bi[row + 1];
2201 
2202         sctx.pv         = rtmp22[row + 1];
2203         rs              = 0.0;
2204         rtmp11[row]     = 1.0 / rtmp11[row];
2205         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2206         /* copy row entries from dense representation to sparse */
2207         for (j = 0; j < nz; j++) {
2208           idx    = pj[j];
2209           pc1[j] = rtmp11[idx];
2210           pc2[j] = rtmp22[idx];
2211           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2212         }
2213         sctx.rs = rs;
2214         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2215         if (sctx.newshift) goto endofwhile;
2216         break;
2217 
2218       case 3:
2219         for (j = 0; j < nz; j++) {
2220           idx         = bjtmp[j];
2221           rtmp11[idx] = 0.0;
2222           rtmp22[idx] = 0.0;
2223           rtmp33[idx] = 0.0;
2224         }
2225         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2226         idx    = r[row];
2227         nz_tmp = ai[idx + 1] - ai[idx];
2228         ajtmp  = aj + ai[idx];
2229         v1     = aa + ai[idx];
2230         v2     = aa + ai[idx + 1];
2231         v3     = aa + ai[idx + 2];
2232         for (j = 0; j < nz_tmp; j++) {
2233           idx         = ics[ajtmp[j]];
2234           rtmp11[idx] = v1[j];
2235           rtmp22[idx] = v2[j];
2236           rtmp33[idx] = v3[j];
2237         }
2238         rtmp11[ics[r[row]]] += sctx.shift_amount;
2239         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2240         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2241 
2242         /* loop over all pivot row blocks above this row block */
2243         prow = *bjtmp++;
2244         while (prow < row) {
2245           pc1 = rtmp11 + prow;
2246           pc2 = rtmp22 + prow;
2247           pc3 = rtmp33 + prow;
2248           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2249             pv   = ba + bd[prow];
2250             pj   = nbj + bd[prow];
2251             mul1 = *pc1 * *pv;
2252             mul2 = *pc2 * *pv;
2253             mul3 = *pc3 * *pv;
2254             ++pv;
2255             *pc1 = mul1;
2256             *pc2 = mul2;
2257             *pc3 = mul3;
2258 
2259             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2260             /* update this row based on pivot row */
2261             for (j = 0; j < nz_tmp; j++) {
2262               tmp = pv[j];
2263               idx = pj[j];
2264               rtmp11[idx] -= mul1 * tmp;
2265               rtmp22[idx] -= mul2 * tmp;
2266               rtmp33[idx] -= mul3 * tmp;
2267             }
2268             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2269           }
2270           prow = *bjtmp++;
2271         }
2272 
2273         /* Now take care of diagonal 3x3 block in this set of rows */
2274         /* note: prow = row here */
2275         pc1 = rtmp11 + prow;
2276         pc2 = rtmp22 + prow;
2277         pc3 = rtmp33 + prow;
2278 
2279         sctx.pv = *pc1;
2280         pj      = bj + bi[prow];
2281         rs      = 0.0;
2282         for (j = 0; j < nz; j++) {
2283           idx = pj[j];
2284           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2285         }
2286         sctx.rs = rs;
2287         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2288         if (sctx.newshift) goto endofwhile;
2289 
2290         if (*pc2 != 0.0 || *pc3 != 0.0) {
2291           mul2   = (*pc2) / (*pc1);
2292           mul3   = (*pc3) / (*pc1);
2293           *pc2   = mul2;
2294           *pc3   = mul3;
2295           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2296           pj     = nbj + bd[prow];
2297           for (j = 0; j < nz_tmp; j++) {
2298             idx = pj[j];
2299             tmp = rtmp11[idx];
2300             rtmp22[idx] -= mul2 * tmp;
2301             rtmp33[idx] -= mul3 * tmp;
2302           }
2303           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2304         }
2305         ++prow;
2306 
2307         pc2     = rtmp22 + prow;
2308         pc3     = rtmp33 + prow;
2309         sctx.pv = *pc2;
2310         pj      = bj + bi[prow];
2311         rs      = 0.0;
2312         for (j = 0; j < nz; j++) {
2313           idx = pj[j];
2314           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2315         }
2316         sctx.rs = rs;
2317         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2318         if (sctx.newshift) goto endofwhile;
2319 
2320         if (*pc3 != 0.0) {
2321           mul3   = (*pc3) / (*pc2);
2322           *pc3   = mul3;
2323           pj     = nbj + bd[prow];
2324           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2325           for (j = 0; j < nz_tmp; j++) {
2326             idx = pj[j];
2327             tmp = rtmp22[idx];
2328             rtmp33[idx] -= mul3 * tmp;
2329           }
2330           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2331         }
2332 
2333         pj  = bj + bi[row];
2334         pc1 = ba + bi[row];
2335         pc2 = ba + bi[row + 1];
2336         pc3 = ba + bi[row + 2];
2337 
2338         sctx.pv         = rtmp33[row + 2];
2339         rs              = 0.0;
2340         rtmp11[row]     = 1.0 / rtmp11[row];
2341         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2342         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2343         /* copy row entries from dense representation to sparse */
2344         for (j = 0; j < nz; j++) {
2345           idx    = pj[j];
2346           pc1[j] = rtmp11[idx];
2347           pc2[j] = rtmp22[idx];
2348           pc3[j] = rtmp33[idx];
2349           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2350         }
2351 
2352         sctx.rs = rs;
2353         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2354         if (sctx.newshift) goto endofwhile;
2355         break;
2356 
2357       default:
2358         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2359       }
2360       row += nodesz; /* Update the row */
2361     }
2362   endofwhile:;
2363   } while (sctx.newshift);
2364   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2365   PetscCall(PetscFree(tmp_vec2));
2366   PetscCall(ISRestoreIndices(isicol, &ic));
2367   PetscCall(ISRestoreIndices(isrow, &r));
2368   PetscCall(ISRestoreIndices(iscol, &c));
2369 
2370   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2371   /* do not set solve add, since MatSolve_Inode + Add is faster */
2372   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2373   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2374   C->assembled              = PETSC_TRUE;
2375   C->preallocated           = PETSC_TRUE;
2376   if (sctx.nshift) {
2377     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2378       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2379     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2380       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2381     }
2382   }
2383   PetscCall(PetscLogFlops(C->cmap->n));
2384   PetscCall(MatSeqAIJCheckInode(C));
2385   PetscFunctionReturn(PETSC_SUCCESS);
2386 }
2387 #endif
2388 
2389 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2390 {
2391   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2392   IS                 iscol = a->col, isrow = a->row;
2393   const PetscInt    *r, *c, *rout, *cout;
2394   PetscInt           i, j, n = A->rmap->n;
2395   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2396   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2397   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2398   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2399   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2400   const PetscScalar *b;
2401 
2402   PetscFunctionBegin;
2403   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2404   node_max = a->inode.node_count;
2405   ns       = a->inode.size; /* Node Size array */
2406 
2407   PetscCall(VecGetArrayRead(bb, &b));
2408   PetscCall(VecGetArrayWrite(xx, &x));
2409   tmp = a->solve_work;
2410 
2411   PetscCall(ISGetIndices(isrow, &rout));
2412   r = rout;
2413   PetscCall(ISGetIndices(iscol, &cout));
2414   c = cout;
2415 
2416   /* forward solve the lower triangular */
2417   tmps = tmp;
2418   aa   = a_a;
2419   aj   = a_j;
2420   ad   = a->diag;
2421 
2422   for (i = 0, row = 0; i < node_max; ++i) {
2423     nsz = ns[i];
2424     aii = ai[row];
2425     v1  = aa + aii;
2426     vi  = aj + aii;
2427     nz  = ai[row + 1] - ai[row];
2428 
2429     if (i < node_max - 1) {
2430       /* Prefetch the indices for the next block */
2431       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2432       /* Prefetch the data for the next block */
2433       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2434     }
2435 
2436     switch (nsz) { /* Each loop in 'case' is unrolled */
2437     case 1:
2438       sum1 = b[r[row]];
2439       for (j = 0; j < nz - 1; j += 2) {
2440         i0   = vi[j];
2441         i1   = vi[j + 1];
2442         tmp0 = tmps[i0];
2443         tmp1 = tmps[i1];
2444         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2445       }
2446       if (j == nz - 1) {
2447         tmp0 = tmps[vi[j]];
2448         sum1 -= v1[j] * tmp0;
2449       }
2450       tmp[row++] = sum1;
2451       break;
2452     case 2:
2453       sum1 = b[r[row]];
2454       sum2 = b[r[row + 1]];
2455       v2   = aa + ai[row + 1];
2456 
2457       for (j = 0; j < nz - 1; j += 2) {
2458         i0   = vi[j];
2459         i1   = vi[j + 1];
2460         tmp0 = tmps[i0];
2461         tmp1 = tmps[i1];
2462         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2463         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2464       }
2465       if (j == nz - 1) {
2466         tmp0 = tmps[vi[j]];
2467         sum1 -= v1[j] * tmp0;
2468         sum2 -= v2[j] * tmp0;
2469       }
2470       sum2 -= v2[nz] * sum1;
2471       tmp[row++] = sum1;
2472       tmp[row++] = sum2;
2473       break;
2474     case 3:
2475       sum1 = b[r[row]];
2476       sum2 = b[r[row + 1]];
2477       sum3 = b[r[row + 2]];
2478       v2   = aa + ai[row + 1];
2479       v3   = aa + ai[row + 2];
2480 
2481       for (j = 0; j < nz - 1; j += 2) {
2482         i0   = vi[j];
2483         i1   = vi[j + 1];
2484         tmp0 = tmps[i0];
2485         tmp1 = tmps[i1];
2486         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2487         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2488         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2489       }
2490       if (j == nz - 1) {
2491         tmp0 = tmps[vi[j]];
2492         sum1 -= v1[j] * tmp0;
2493         sum2 -= v2[j] * tmp0;
2494         sum3 -= v3[j] * tmp0;
2495       }
2496       sum2 -= v2[nz] * sum1;
2497       sum3 -= v3[nz] * sum1;
2498       sum3 -= v3[nz + 1] * sum2;
2499       tmp[row++] = sum1;
2500       tmp[row++] = sum2;
2501       tmp[row++] = sum3;
2502       break;
2503 
2504     case 4:
2505       sum1 = b[r[row]];
2506       sum2 = b[r[row + 1]];
2507       sum3 = b[r[row + 2]];
2508       sum4 = b[r[row + 3]];
2509       v2   = aa + ai[row + 1];
2510       v3   = aa + ai[row + 2];
2511       v4   = aa + ai[row + 3];
2512 
2513       for (j = 0; j < nz - 1; j += 2) {
2514         i0   = vi[j];
2515         i1   = vi[j + 1];
2516         tmp0 = tmps[i0];
2517         tmp1 = tmps[i1];
2518         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2519         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2520         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2521         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2522       }
2523       if (j == nz - 1) {
2524         tmp0 = tmps[vi[j]];
2525         sum1 -= v1[j] * tmp0;
2526         sum2 -= v2[j] * tmp0;
2527         sum3 -= v3[j] * tmp0;
2528         sum4 -= v4[j] * tmp0;
2529       }
2530       sum2 -= v2[nz] * sum1;
2531       sum3 -= v3[nz] * sum1;
2532       sum4 -= v4[nz] * sum1;
2533       sum3 -= v3[nz + 1] * sum2;
2534       sum4 -= v4[nz + 1] * sum2;
2535       sum4 -= v4[nz + 2] * sum3;
2536 
2537       tmp[row++] = sum1;
2538       tmp[row++] = sum2;
2539       tmp[row++] = sum3;
2540       tmp[row++] = sum4;
2541       break;
2542     case 5:
2543       sum1 = b[r[row]];
2544       sum2 = b[r[row + 1]];
2545       sum3 = b[r[row + 2]];
2546       sum4 = b[r[row + 3]];
2547       sum5 = b[r[row + 4]];
2548       v2   = aa + ai[row + 1];
2549       v3   = aa + ai[row + 2];
2550       v4   = aa + ai[row + 3];
2551       v5   = aa + ai[row + 4];
2552 
2553       for (j = 0; j < nz - 1; j += 2) {
2554         i0   = vi[j];
2555         i1   = vi[j + 1];
2556         tmp0 = tmps[i0];
2557         tmp1 = tmps[i1];
2558         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2559         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2560         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2561         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2562         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2563       }
2564       if (j == nz - 1) {
2565         tmp0 = tmps[vi[j]];
2566         sum1 -= v1[j] * tmp0;
2567         sum2 -= v2[j] * tmp0;
2568         sum3 -= v3[j] * tmp0;
2569         sum4 -= v4[j] * tmp0;
2570         sum5 -= v5[j] * tmp0;
2571       }
2572 
2573       sum2 -= v2[nz] * sum1;
2574       sum3 -= v3[nz] * sum1;
2575       sum4 -= v4[nz] * sum1;
2576       sum5 -= v5[nz] * sum1;
2577       sum3 -= v3[nz + 1] * sum2;
2578       sum4 -= v4[nz + 1] * sum2;
2579       sum5 -= v5[nz + 1] * sum2;
2580       sum4 -= v4[nz + 2] * sum3;
2581       sum5 -= v5[nz + 2] * sum3;
2582       sum5 -= v5[nz + 3] * sum4;
2583 
2584       tmp[row++] = sum1;
2585       tmp[row++] = sum2;
2586       tmp[row++] = sum3;
2587       tmp[row++] = sum4;
2588       tmp[row++] = sum5;
2589       break;
2590     default:
2591       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2592     }
2593   }
2594   /* backward solve the upper triangular */
2595   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2596     nsz = ns[i];
2597     aii = ad[row + 1] + 1;
2598     v1  = aa + aii;
2599     vi  = aj + aii;
2600     nz  = ad[row] - ad[row + 1] - 1;
2601 
2602     if (i > 0) {
2603       /* Prefetch the indices for the next block */
2604       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2605       /* Prefetch the data for the next block */
2606       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2607     }
2608 
2609     switch (nsz) { /* Each loop in 'case' is unrolled */
2610     case 1:
2611       sum1 = tmp[row];
2612 
2613       for (j = 0; j < nz - 1; j += 2) {
2614         i0   = vi[j];
2615         i1   = vi[j + 1];
2616         tmp0 = tmps[i0];
2617         tmp1 = tmps[i1];
2618         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2619       }
2620       if (j == nz - 1) {
2621         tmp0 = tmps[vi[j]];
2622         sum1 -= v1[j] * tmp0;
2623       }
2624       x[c[row]] = tmp[row] = sum1 * v1[nz];
2625       row--;
2626       break;
2627     case 2:
2628       sum1 = tmp[row];
2629       sum2 = tmp[row - 1];
2630       v2   = aa + ad[row] + 1;
2631       for (j = 0; j < nz - 1; j += 2) {
2632         i0   = vi[j];
2633         i1   = vi[j + 1];
2634         tmp0 = tmps[i0];
2635         tmp1 = tmps[i1];
2636         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2637         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2638       }
2639       if (j == nz - 1) {
2640         tmp0 = tmps[vi[j]];
2641         sum1 -= v1[j] * tmp0;
2642         sum2 -= v2[j + 1] * tmp0;
2643       }
2644 
2645       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2646       row--;
2647       sum2 -= v2[0] * tmp0;
2648       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2649       row--;
2650       break;
2651     case 3:
2652       sum1 = tmp[row];
2653       sum2 = tmp[row - 1];
2654       sum3 = tmp[row - 2];
2655       v2   = aa + ad[row] + 1;
2656       v3   = aa + ad[row - 1] + 1;
2657       for (j = 0; j < nz - 1; j += 2) {
2658         i0   = vi[j];
2659         i1   = vi[j + 1];
2660         tmp0 = tmps[i0];
2661         tmp1 = tmps[i1];
2662         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2663         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2664         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2665       }
2666       if (j == nz - 1) {
2667         tmp0 = tmps[vi[j]];
2668         sum1 -= v1[j] * tmp0;
2669         sum2 -= v2[j + 1] * tmp0;
2670         sum3 -= v3[j + 2] * tmp0;
2671       }
2672       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2673       row--;
2674       sum2 -= v2[0] * tmp0;
2675       sum3 -= v3[1] * tmp0;
2676       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2677       row--;
2678       sum3 -= v3[0] * tmp0;
2679       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2680       row--;
2681 
2682       break;
2683     case 4:
2684       sum1 = tmp[row];
2685       sum2 = tmp[row - 1];
2686       sum3 = tmp[row - 2];
2687       sum4 = tmp[row - 3];
2688       v2   = aa + ad[row] + 1;
2689       v3   = aa + ad[row - 1] + 1;
2690       v4   = aa + ad[row - 2] + 1;
2691 
2692       for (j = 0; j < nz - 1; j += 2) {
2693         i0   = vi[j];
2694         i1   = vi[j + 1];
2695         tmp0 = tmps[i0];
2696         tmp1 = tmps[i1];
2697         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2698         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2699         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2700         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2701       }
2702       if (j == nz - 1) {
2703         tmp0 = tmps[vi[j]];
2704         sum1 -= v1[j] * tmp0;
2705         sum2 -= v2[j + 1] * tmp0;
2706         sum3 -= v3[j + 2] * tmp0;
2707         sum4 -= v4[j + 3] * tmp0;
2708       }
2709 
2710       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2711       row--;
2712       sum2 -= v2[0] * tmp0;
2713       sum3 -= v3[1] * tmp0;
2714       sum4 -= v4[2] * tmp0;
2715       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2716       row--;
2717       sum3 -= v3[0] * tmp0;
2718       sum4 -= v4[1] * tmp0;
2719       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2720       row--;
2721       sum4 -= v4[0] * tmp0;
2722       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2723       row--;
2724       break;
2725     case 5:
2726       sum1 = tmp[row];
2727       sum2 = tmp[row - 1];
2728       sum3 = tmp[row - 2];
2729       sum4 = tmp[row - 3];
2730       sum5 = tmp[row - 4];
2731       v2   = aa + ad[row] + 1;
2732       v3   = aa + ad[row - 1] + 1;
2733       v4   = aa + ad[row - 2] + 1;
2734       v5   = aa + ad[row - 3] + 1;
2735       for (j = 0; j < nz - 1; j += 2) {
2736         i0   = vi[j];
2737         i1   = vi[j + 1];
2738         tmp0 = tmps[i0];
2739         tmp1 = tmps[i1];
2740         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2741         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2742         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2743         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2744         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2745       }
2746       if (j == nz - 1) {
2747         tmp0 = tmps[vi[j]];
2748         sum1 -= v1[j] * tmp0;
2749         sum2 -= v2[j + 1] * tmp0;
2750         sum3 -= v3[j + 2] * tmp0;
2751         sum4 -= v4[j + 3] * tmp0;
2752         sum5 -= v5[j + 4] * tmp0;
2753       }
2754 
2755       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2756       row--;
2757       sum2 -= v2[0] * tmp0;
2758       sum3 -= v3[1] * tmp0;
2759       sum4 -= v4[2] * tmp0;
2760       sum5 -= v5[3] * tmp0;
2761       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2762       row--;
2763       sum3 -= v3[0] * tmp0;
2764       sum4 -= v4[1] * tmp0;
2765       sum5 -= v5[2] * tmp0;
2766       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2767       row--;
2768       sum4 -= v4[0] * tmp0;
2769       sum5 -= v5[1] * tmp0;
2770       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2771       row--;
2772       sum5 -= v5[0] * tmp0;
2773       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2774       row--;
2775       break;
2776     default:
2777       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2778     }
2779   }
2780   PetscCall(ISRestoreIndices(isrow, &rout));
2781   PetscCall(ISRestoreIndices(iscol, &cout));
2782   PetscCall(VecRestoreArrayRead(bb, &b));
2783   PetscCall(VecRestoreArrayWrite(xx, &x));
2784   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2785   PetscFunctionReturn(PETSC_SUCCESS);
2786 }
2787 
2788 /*
2789      Makes a longer coloring[] array and calls the usual code with that
2790 */
2791 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2792 {
2793   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2794   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2795   PetscInt        *colorused, i;
2796   ISColoringValue *newcolor;
2797 
2798   PetscFunctionBegin;
2799   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2800   PetscCall(PetscMalloc1(n + 1, &newcolor));
2801   /* loop over inodes, marking a color for each column*/
2802   row = 0;
2803   for (i = 0; i < m; i++) {
2804     for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2805   }
2806 
2807   /* eliminate unneeded colors */
2808   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2809   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2810 
2811   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2812   ncolors = colorused[5 * ncolors - 1];
2813   for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2814   PetscCall(PetscFree(colorused));
2815   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2816   PetscCall(PetscFree(coloring));
2817   PetscFunctionReturn(PETSC_SUCCESS);
2818 }
2819 
2820 #include <petsc/private/kernels/blockinvert.h>
2821 
2822 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2823 {
2824   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2825   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2826   MatScalar         *ibdiag, *bdiag, work[25], *t;
2827   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2828   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2829   const PetscScalar *xb, *b;
2830   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2831   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2832   PetscInt           sz, k, ipvt[5];
2833   PetscBool          allowzeropivot, zeropivotdetected;
2834   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
2835 
2836   PetscFunctionBegin;
2837   allowzeropivot = PetscNot(A->erroriffailure);
2838   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2839   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2840   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2841 
2842   if (!a->inode.ibdiagvalid) {
2843     if (!a->inode.ibdiag) {
2844       /* calculate space needed for diagonal blocks */
2845       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2846       a->inode.bdiagsize = cnt;
2847 
2848       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2849     }
2850 
2851     /* copy over the diagonal blocks and invert them */
2852     ibdiag = a->inode.ibdiag;
2853     bdiag  = a->inode.bdiag;
2854     cnt    = 0;
2855     for (i = 0, row = 0; i < m; i++) {
2856       for (j = 0; j < sizes[i]; j++) {
2857         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2858       }
2859       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
2860 
2861       switch (sizes[i]) {
2862       case 1:
2863         /* Create matrix data structure */
2864         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2865           if (allowzeropivot) {
2866             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2867             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2868             A->factorerror_zeropivot_row   = row;
2869             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2870           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2871         }
2872         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2873         break;
2874       case 2:
2875         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2876         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2877         break;
2878       case 3:
2879         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2880         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2881         break;
2882       case 4:
2883         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2884         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2885         break;
2886       case 5:
2887         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2888         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2889         break;
2890       default:
2891         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2892       }
2893       cnt += sizes[i] * sizes[i];
2894       row += sizes[i];
2895     }
2896     a->inode.ibdiagvalid = PETSC_TRUE;
2897   }
2898   ibdiag = a->inode.ibdiag;
2899   bdiag  = a->inode.bdiag;
2900   t      = a->inode.ssor_work;
2901 
2902   PetscCall(VecGetArray(xx, &x));
2903   PetscCall(VecGetArrayRead(bb, &b));
2904   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2905   if (flag & SOR_ZERO_INITIAL_GUESS) {
2906     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2907       for (i = 0, row = 0; i < m; i++) {
2908         sz  = diag[row] - ii[row];
2909         v1  = a->a + ii[row];
2910         idx = a->j + ii[row];
2911 
2912         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2913         switch (sizes[i]) {
2914         case 1:
2915 
2916           sum1 = b[row];
2917           for (n = 0; n < sz - 1; n += 2) {
2918             i1 = idx[0];
2919             i2 = idx[1];
2920             idx += 2;
2921             tmp0 = x[i1];
2922             tmp1 = x[i2];
2923             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2924             v1 += 2;
2925           }
2926 
2927           if (n == sz - 1) {
2928             tmp0 = x[*idx];
2929             sum1 -= *v1 * tmp0;
2930           }
2931           t[row]   = sum1;
2932           x[row++] = sum1 * (*ibdiag++);
2933           break;
2934         case 2:
2935           v2   = a->a + ii[row + 1];
2936           sum1 = b[row];
2937           sum2 = b[row + 1];
2938           for (n = 0; n < sz - 1; n += 2) {
2939             i1 = idx[0];
2940             i2 = idx[1];
2941             idx += 2;
2942             tmp0 = x[i1];
2943             tmp1 = x[i2];
2944             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2945             v1 += 2;
2946             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2947             v2 += 2;
2948           }
2949 
2950           if (n == sz - 1) {
2951             tmp0 = x[*idx];
2952             sum1 -= v1[0] * tmp0;
2953             sum2 -= v2[0] * tmp0;
2954           }
2955           t[row]     = sum1;
2956           t[row + 1] = sum2;
2957           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2958           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2959           ibdiag += 4;
2960           break;
2961         case 3:
2962           v2   = a->a + ii[row + 1];
2963           v3   = a->a + ii[row + 2];
2964           sum1 = b[row];
2965           sum2 = b[row + 1];
2966           sum3 = b[row + 2];
2967           for (n = 0; n < sz - 1; n += 2) {
2968             i1 = idx[0];
2969             i2 = idx[1];
2970             idx += 2;
2971             tmp0 = x[i1];
2972             tmp1 = x[i2];
2973             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2974             v1 += 2;
2975             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2976             v2 += 2;
2977             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2978             v3 += 2;
2979           }
2980 
2981           if (n == sz - 1) {
2982             tmp0 = x[*idx];
2983             sum1 -= v1[0] * tmp0;
2984             sum2 -= v2[0] * tmp0;
2985             sum3 -= v3[0] * tmp0;
2986           }
2987           t[row]     = sum1;
2988           t[row + 1] = sum2;
2989           t[row + 2] = sum3;
2990           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2991           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2992           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2993           ibdiag += 9;
2994           break;
2995         case 4:
2996           v2   = a->a + ii[row + 1];
2997           v3   = a->a + ii[row + 2];
2998           v4   = a->a + ii[row + 3];
2999           sum1 = b[row];
3000           sum2 = b[row + 1];
3001           sum3 = b[row + 2];
3002           sum4 = b[row + 3];
3003           for (n = 0; n < sz - 1; n += 2) {
3004             i1 = idx[0];
3005             i2 = idx[1];
3006             idx += 2;
3007             tmp0 = x[i1];
3008             tmp1 = x[i2];
3009             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3010             v1 += 2;
3011             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3012             v2 += 2;
3013             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3014             v3 += 2;
3015             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3016             v4 += 2;
3017           }
3018 
3019           if (n == sz - 1) {
3020             tmp0 = x[*idx];
3021             sum1 -= v1[0] * tmp0;
3022             sum2 -= v2[0] * tmp0;
3023             sum3 -= v3[0] * tmp0;
3024             sum4 -= v4[0] * tmp0;
3025           }
3026           t[row]     = sum1;
3027           t[row + 1] = sum2;
3028           t[row + 2] = sum3;
3029           t[row + 3] = sum4;
3030           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3031           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3032           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3033           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3034           ibdiag += 16;
3035           break;
3036         case 5:
3037           v2   = a->a + ii[row + 1];
3038           v3   = a->a + ii[row + 2];
3039           v4   = a->a + ii[row + 3];
3040           v5   = a->a + ii[row + 4];
3041           sum1 = b[row];
3042           sum2 = b[row + 1];
3043           sum3 = b[row + 2];
3044           sum4 = b[row + 3];
3045           sum5 = b[row + 4];
3046           for (n = 0; n < sz - 1; n += 2) {
3047             i1 = idx[0];
3048             i2 = idx[1];
3049             idx += 2;
3050             tmp0 = x[i1];
3051             tmp1 = x[i2];
3052             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3053             v1 += 2;
3054             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3055             v2 += 2;
3056             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3057             v3 += 2;
3058             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3059             v4 += 2;
3060             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3061             v5 += 2;
3062           }
3063 
3064           if (n == sz - 1) {
3065             tmp0 = x[*idx];
3066             sum1 -= v1[0] * tmp0;
3067             sum2 -= v2[0] * tmp0;
3068             sum3 -= v3[0] * tmp0;
3069             sum4 -= v4[0] * tmp0;
3070             sum5 -= v5[0] * tmp0;
3071           }
3072           t[row]     = sum1;
3073           t[row + 1] = sum2;
3074           t[row + 2] = sum3;
3075           t[row + 3] = sum4;
3076           t[row + 4] = sum5;
3077           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3078           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3079           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3080           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3081           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3082           ibdiag += 25;
3083           break;
3084         default:
3085           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3086         }
3087       }
3088 
3089       xb = t;
3090       PetscCall(PetscLogFlops(a->nz));
3091     } else xb = b;
3092     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3093       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3094       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3095         ibdiag -= sizes[i] * sizes[i];
3096         sz  = ii[row + 1] - diag[row] - 1;
3097         v1  = a->a + diag[row] + 1;
3098         idx = a->j + diag[row] + 1;
3099 
3100         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3101         switch (sizes[i]) {
3102         case 1:
3103 
3104           sum1 = xb[row];
3105           for (n = 0; n < sz - 1; n += 2) {
3106             i1 = idx[0];
3107             i2 = idx[1];
3108             idx += 2;
3109             tmp0 = x[i1];
3110             tmp1 = x[i2];
3111             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3112             v1 += 2;
3113           }
3114 
3115           if (n == sz - 1) {
3116             tmp0 = x[*idx];
3117             sum1 -= *v1 * tmp0;
3118           }
3119           x[row--] = sum1 * (*ibdiag);
3120           break;
3121 
3122         case 2:
3123 
3124           sum1 = xb[row];
3125           sum2 = xb[row - 1];
3126           /* note that sum1 is associated with the second of the two rows */
3127           v2 = a->a + diag[row - 1] + 2;
3128           for (n = 0; n < sz - 1; n += 2) {
3129             i1 = idx[0];
3130             i2 = idx[1];
3131             idx += 2;
3132             tmp0 = x[i1];
3133             tmp1 = x[i2];
3134             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3135             v1 += 2;
3136             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3137             v2 += 2;
3138           }
3139 
3140           if (n == sz - 1) {
3141             tmp0 = x[*idx];
3142             sum1 -= *v1 * tmp0;
3143             sum2 -= *v2 * tmp0;
3144           }
3145           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3146           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3147           break;
3148         case 3:
3149 
3150           sum1 = xb[row];
3151           sum2 = xb[row - 1];
3152           sum3 = xb[row - 2];
3153           v2   = a->a + diag[row - 1] + 2;
3154           v3   = a->a + diag[row - 2] + 3;
3155           for (n = 0; n < sz - 1; n += 2) {
3156             i1 = idx[0];
3157             i2 = idx[1];
3158             idx += 2;
3159             tmp0 = x[i1];
3160             tmp1 = x[i2];
3161             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3162             v1 += 2;
3163             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3164             v2 += 2;
3165             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3166             v3 += 2;
3167           }
3168 
3169           if (n == sz - 1) {
3170             tmp0 = x[*idx];
3171             sum1 -= *v1 * tmp0;
3172             sum2 -= *v2 * tmp0;
3173             sum3 -= *v3 * tmp0;
3174           }
3175           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3176           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3177           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3178           break;
3179         case 4:
3180 
3181           sum1 = xb[row];
3182           sum2 = xb[row - 1];
3183           sum3 = xb[row - 2];
3184           sum4 = xb[row - 3];
3185           v2   = a->a + diag[row - 1] + 2;
3186           v3   = a->a + diag[row - 2] + 3;
3187           v4   = a->a + diag[row - 3] + 4;
3188           for (n = 0; n < sz - 1; n += 2) {
3189             i1 = idx[0];
3190             i2 = idx[1];
3191             idx += 2;
3192             tmp0 = x[i1];
3193             tmp1 = x[i2];
3194             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3195             v1 += 2;
3196             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3197             v2 += 2;
3198             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3199             v3 += 2;
3200             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3201             v4 += 2;
3202           }
3203 
3204           if (n == sz - 1) {
3205             tmp0 = x[*idx];
3206             sum1 -= *v1 * tmp0;
3207             sum2 -= *v2 * tmp0;
3208             sum3 -= *v3 * tmp0;
3209             sum4 -= *v4 * tmp0;
3210           }
3211           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3212           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3213           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3214           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3215           break;
3216         case 5:
3217 
3218           sum1 = xb[row];
3219           sum2 = xb[row - 1];
3220           sum3 = xb[row - 2];
3221           sum4 = xb[row - 3];
3222           sum5 = xb[row - 4];
3223           v2   = a->a + diag[row - 1] + 2;
3224           v3   = a->a + diag[row - 2] + 3;
3225           v4   = a->a + diag[row - 3] + 4;
3226           v5   = a->a + diag[row - 4] + 5;
3227           for (n = 0; n < sz - 1; n += 2) {
3228             i1 = idx[0];
3229             i2 = idx[1];
3230             idx += 2;
3231             tmp0 = x[i1];
3232             tmp1 = x[i2];
3233             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3234             v1 += 2;
3235             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3236             v2 += 2;
3237             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3238             v3 += 2;
3239             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3240             v4 += 2;
3241             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3242             v5 += 2;
3243           }
3244 
3245           if (n == sz - 1) {
3246             tmp0 = x[*idx];
3247             sum1 -= *v1 * tmp0;
3248             sum2 -= *v2 * tmp0;
3249             sum3 -= *v3 * tmp0;
3250             sum4 -= *v4 * tmp0;
3251             sum5 -= *v5 * tmp0;
3252           }
3253           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3254           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3255           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3256           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3257           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3258           break;
3259         default:
3260           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3261         }
3262       }
3263 
3264       PetscCall(PetscLogFlops(a->nz));
3265     }
3266     its--;
3267   }
3268   while (its--) {
3269     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3270       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3271         sz  = diag[row] - ii[row];
3272         v1  = a->a + ii[row];
3273         idx = a->j + ii[row];
3274         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3275         switch (sizes[i]) {
3276         case 1:
3277           sum1 = b[row];
3278           for (n = 0; n < sz - 1; n += 2) {
3279             i1 = idx[0];
3280             i2 = idx[1];
3281             idx += 2;
3282             tmp0 = x[i1];
3283             tmp1 = x[i2];
3284             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3285             v1 += 2;
3286           }
3287           if (n == sz - 1) {
3288             tmp0 = x[*idx++];
3289             sum1 -= *v1 * tmp0;
3290             v1++;
3291           }
3292           t[row] = sum1;
3293           sz     = ii[row + 1] - diag[row] - 1;
3294           idx    = a->j + diag[row] + 1;
3295           v1 += 1;
3296           for (n = 0; n < sz - 1; n += 2) {
3297             i1 = idx[0];
3298             i2 = idx[1];
3299             idx += 2;
3300             tmp0 = x[i1];
3301             tmp1 = x[i2];
3302             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3303             v1 += 2;
3304           }
3305           if (n == sz - 1) {
3306             tmp0 = x[*idx++];
3307             sum1 -= *v1 * tmp0;
3308           }
3309           /* in MatSOR_SeqAIJ this line would be
3310            *
3311            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3312            *
3313            * but omega == 1, so this becomes
3314            *
3315            * x[row] = sum1*(*ibdiag++);
3316            *
3317            */
3318           x[row] = sum1 * (*ibdiag);
3319           break;
3320         case 2:
3321           v2   = a->a + ii[row + 1];
3322           sum1 = b[row];
3323           sum2 = b[row + 1];
3324           for (n = 0; n < sz - 1; n += 2) {
3325             i1 = idx[0];
3326             i2 = idx[1];
3327             idx += 2;
3328             tmp0 = x[i1];
3329             tmp1 = x[i2];
3330             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3331             v1 += 2;
3332             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3333             v2 += 2;
3334           }
3335           if (n == sz - 1) {
3336             tmp0 = x[*idx++];
3337             sum1 -= v1[0] * tmp0;
3338             sum2 -= v2[0] * tmp0;
3339             v1++;
3340             v2++;
3341           }
3342           t[row]     = sum1;
3343           t[row + 1] = sum2;
3344           sz         = ii[row + 1] - diag[row] - 2;
3345           idx        = a->j + diag[row] + 2;
3346           v1 += 2;
3347           v2 += 2;
3348           for (n = 0; n < sz - 1; n += 2) {
3349             i1 = idx[0];
3350             i2 = idx[1];
3351             idx += 2;
3352             tmp0 = x[i1];
3353             tmp1 = x[i2];
3354             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3355             v1 += 2;
3356             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3357             v2 += 2;
3358           }
3359           if (n == sz - 1) {
3360             tmp0 = x[*idx];
3361             sum1 -= v1[0] * tmp0;
3362             sum2 -= v2[0] * tmp0;
3363           }
3364           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3365           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3366           break;
3367         case 3:
3368           v2   = a->a + ii[row + 1];
3369           v3   = a->a + ii[row + 2];
3370           sum1 = b[row];
3371           sum2 = b[row + 1];
3372           sum3 = b[row + 2];
3373           for (n = 0; n < sz - 1; n += 2) {
3374             i1 = idx[0];
3375             i2 = idx[1];
3376             idx += 2;
3377             tmp0 = x[i1];
3378             tmp1 = x[i2];
3379             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3380             v1 += 2;
3381             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3382             v2 += 2;
3383             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3384             v3 += 2;
3385           }
3386           if (n == sz - 1) {
3387             tmp0 = x[*idx++];
3388             sum1 -= v1[0] * tmp0;
3389             sum2 -= v2[0] * tmp0;
3390             sum3 -= v3[0] * tmp0;
3391             v1++;
3392             v2++;
3393             v3++;
3394           }
3395           t[row]     = sum1;
3396           t[row + 1] = sum2;
3397           t[row + 2] = sum3;
3398           sz         = ii[row + 1] - diag[row] - 3;
3399           idx        = a->j + diag[row] + 3;
3400           v1 += 3;
3401           v2 += 3;
3402           v3 += 3;
3403           for (n = 0; n < sz - 1; n += 2) {
3404             i1 = idx[0];
3405             i2 = idx[1];
3406             idx += 2;
3407             tmp0 = x[i1];
3408             tmp1 = x[i2];
3409             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3410             v1 += 2;
3411             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3412             v2 += 2;
3413             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3414             v3 += 2;
3415           }
3416           if (n == sz - 1) {
3417             tmp0 = x[*idx];
3418             sum1 -= v1[0] * tmp0;
3419             sum2 -= v2[0] * tmp0;
3420             sum3 -= v3[0] * tmp0;
3421           }
3422           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3423           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3424           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3425           break;
3426         case 4:
3427           v2   = a->a + ii[row + 1];
3428           v3   = a->a + ii[row + 2];
3429           v4   = a->a + ii[row + 3];
3430           sum1 = b[row];
3431           sum2 = b[row + 1];
3432           sum3 = b[row + 2];
3433           sum4 = b[row + 3];
3434           for (n = 0; n < sz - 1; n += 2) {
3435             i1 = idx[0];
3436             i2 = idx[1];
3437             idx += 2;
3438             tmp0 = x[i1];
3439             tmp1 = x[i2];
3440             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3441             v1 += 2;
3442             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3443             v2 += 2;
3444             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3445             v3 += 2;
3446             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3447             v4 += 2;
3448           }
3449           if (n == sz - 1) {
3450             tmp0 = x[*idx++];
3451             sum1 -= v1[0] * tmp0;
3452             sum2 -= v2[0] * tmp0;
3453             sum3 -= v3[0] * tmp0;
3454             sum4 -= v4[0] * tmp0;
3455             v1++;
3456             v2++;
3457             v3++;
3458             v4++;
3459           }
3460           t[row]     = sum1;
3461           t[row + 1] = sum2;
3462           t[row + 2] = sum3;
3463           t[row + 3] = sum4;
3464           sz         = ii[row + 1] - diag[row] - 4;
3465           idx        = a->j + diag[row] + 4;
3466           v1 += 4;
3467           v2 += 4;
3468           v3 += 4;
3469           v4 += 4;
3470           for (n = 0; n < sz - 1; n += 2) {
3471             i1 = idx[0];
3472             i2 = idx[1];
3473             idx += 2;
3474             tmp0 = x[i1];
3475             tmp1 = x[i2];
3476             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3477             v1 += 2;
3478             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3479             v2 += 2;
3480             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3481             v3 += 2;
3482             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3483             v4 += 2;
3484           }
3485           if (n == sz - 1) {
3486             tmp0 = x[*idx];
3487             sum1 -= v1[0] * tmp0;
3488             sum2 -= v2[0] * tmp0;
3489             sum3 -= v3[0] * tmp0;
3490             sum4 -= v4[0] * tmp0;
3491           }
3492           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3493           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3494           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3495           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3496           break;
3497         case 5:
3498           v2   = a->a + ii[row + 1];
3499           v3   = a->a + ii[row + 2];
3500           v4   = a->a + ii[row + 3];
3501           v5   = a->a + ii[row + 4];
3502           sum1 = b[row];
3503           sum2 = b[row + 1];
3504           sum3 = b[row + 2];
3505           sum4 = b[row + 3];
3506           sum5 = b[row + 4];
3507           for (n = 0; n < sz - 1; n += 2) {
3508             i1 = idx[0];
3509             i2 = idx[1];
3510             idx += 2;
3511             tmp0 = x[i1];
3512             tmp1 = x[i2];
3513             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3514             v1 += 2;
3515             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3516             v2 += 2;
3517             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3518             v3 += 2;
3519             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3520             v4 += 2;
3521             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3522             v5 += 2;
3523           }
3524           if (n == sz - 1) {
3525             tmp0 = x[*idx++];
3526             sum1 -= v1[0] * tmp0;
3527             sum2 -= v2[0] * tmp0;
3528             sum3 -= v3[0] * tmp0;
3529             sum4 -= v4[0] * tmp0;
3530             sum5 -= v5[0] * tmp0;
3531             v1++;
3532             v2++;
3533             v3++;
3534             v4++;
3535             v5++;
3536           }
3537           t[row]     = sum1;
3538           t[row + 1] = sum2;
3539           t[row + 2] = sum3;
3540           t[row + 3] = sum4;
3541           t[row + 4] = sum5;
3542           sz         = ii[row + 1] - diag[row] - 5;
3543           idx        = a->j + diag[row] + 5;
3544           v1 += 5;
3545           v2 += 5;
3546           v3 += 5;
3547           v4 += 5;
3548           v5 += 5;
3549           for (n = 0; n < sz - 1; n += 2) {
3550             i1 = idx[0];
3551             i2 = idx[1];
3552             idx += 2;
3553             tmp0 = x[i1];
3554             tmp1 = x[i2];
3555             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3556             v1 += 2;
3557             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3558             v2 += 2;
3559             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3560             v3 += 2;
3561             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3562             v4 += 2;
3563             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3564             v5 += 2;
3565           }
3566           if (n == sz - 1) {
3567             tmp0 = x[*idx];
3568             sum1 -= v1[0] * tmp0;
3569             sum2 -= v2[0] * tmp0;
3570             sum3 -= v3[0] * tmp0;
3571             sum4 -= v4[0] * tmp0;
3572             sum5 -= v5[0] * tmp0;
3573           }
3574           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3575           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3576           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3577           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3578           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3579           break;
3580         default:
3581           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3582         }
3583       }
3584       xb = t;
3585       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3586     } else xb = b;
3587 
3588     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3589       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3590       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3591         ibdiag -= sizes[i] * sizes[i];
3592 
3593         /* set RHS */
3594         if (xb == b) {
3595           /* whole (old way) */
3596           sz  = ii[row + 1] - ii[row];
3597           idx = a->j + ii[row];
3598           switch (sizes[i]) {
3599           case 5:
3600             v5 = a->a + ii[row - 4]; /* fall through */
3601           case 4:
3602             v4 = a->a + ii[row - 3]; /* fall through */
3603           case 3:
3604             v3 = a->a + ii[row - 2]; /* fall through */
3605           case 2:
3606             v2 = a->a + ii[row - 1]; /* fall through */
3607           case 1:
3608             v1 = a->a + ii[row];
3609             break;
3610           default:
3611             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3612           }
3613         } else {
3614           /* upper, no diag */
3615           sz  = ii[row + 1] - diag[row] - 1;
3616           idx = a->j + diag[row] + 1;
3617           switch (sizes[i]) {
3618           case 5:
3619             v5 = a->a + diag[row - 4] + 5; /* fall through */
3620           case 4:
3621             v4 = a->a + diag[row - 3] + 4; /* fall through */
3622           case 3:
3623             v3 = a->a + diag[row - 2] + 3; /* fall through */
3624           case 2:
3625             v2 = a->a + diag[row - 1] + 2; /* fall through */
3626           case 1:
3627             v1 = a->a + diag[row] + 1;
3628           }
3629         }
3630         /* set sum */
3631         switch (sizes[i]) {
3632         case 5:
3633           sum5 = xb[row - 4]; /* fall through */
3634         case 4:
3635           sum4 = xb[row - 3]; /* fall through */
3636         case 3:
3637           sum3 = xb[row - 2]; /* fall through */
3638         case 2:
3639           sum2 = xb[row - 1]; /* fall through */
3640         case 1:
3641           /* note that sum1 is associated with the last row */
3642           sum1 = xb[row];
3643         }
3644         /* do sums */
3645         for (n = 0; n < sz - 1; n += 2) {
3646           i1 = idx[0];
3647           i2 = idx[1];
3648           idx += 2;
3649           tmp0 = x[i1];
3650           tmp1 = x[i2];
3651           switch (sizes[i]) {
3652           case 5:
3653             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3654             v5 += 2; /* fall through */
3655           case 4:
3656             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3657             v4 += 2; /* fall through */
3658           case 3:
3659             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3660             v3 += 2; /* fall through */
3661           case 2:
3662             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3663             v2 += 2; /* fall through */
3664           case 1:
3665             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3666             v1 += 2;
3667           }
3668         }
3669         /* ragged edge */
3670         if (n == sz - 1) {
3671           tmp0 = x[*idx];
3672           switch (sizes[i]) {
3673           case 5:
3674             sum5 -= *v5 * tmp0; /* fall through */
3675           case 4:
3676             sum4 -= *v4 * tmp0; /* fall through */
3677           case 3:
3678             sum3 -= *v3 * tmp0; /* fall through */
3679           case 2:
3680             sum2 -= *v2 * tmp0; /* fall through */
3681           case 1:
3682             sum1 -= *v1 * tmp0;
3683           }
3684         }
3685         /* update */
3686         if (xb == b) {
3687           /* whole (old way) w/ diag */
3688           switch (sizes[i]) {
3689           case 5:
3690             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3691             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3692             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3693             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3694             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3695             break;
3696           case 4:
3697             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3698             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3699             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3700             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3701             break;
3702           case 3:
3703             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3704             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3705             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3706             break;
3707           case 2:
3708             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3709             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3710             break;
3711           case 1:
3712             x[row--] += sum1 * (*ibdiag);
3713             break;
3714           }
3715         } else {
3716           /* no diag so set =  */
3717           switch (sizes[i]) {
3718           case 5:
3719             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3720             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3721             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3722             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3723             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3724             break;
3725           case 4:
3726             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3727             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3728             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3729             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3730             break;
3731           case 3:
3732             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3733             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3734             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3735             break;
3736           case 2:
3737             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3738             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3739             break;
3740           case 1:
3741             x[row--] = sum1 * (*ibdiag);
3742             break;
3743           }
3744         }
3745       }
3746       if (xb == b) {
3747         PetscCall(PetscLogFlops(2.0 * a->nz));
3748       } else {
3749         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3750       }
3751     }
3752   }
3753   if (flag & SOR_EISENSTAT) {
3754     /*
3755           Apply  (U + D)^-1  where D is now the block diagonal
3756     */
3757     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3758     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3759       ibdiag -= sizes[i] * sizes[i];
3760       sz  = ii[row + 1] - diag[row] - 1;
3761       v1  = a->a + diag[row] + 1;
3762       idx = a->j + diag[row] + 1;
3763       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3764       switch (sizes[i]) {
3765       case 1:
3766 
3767         sum1 = b[row];
3768         for (n = 0; n < sz - 1; n += 2) {
3769           i1 = idx[0];
3770           i2 = idx[1];
3771           idx += 2;
3772           tmp0 = x[i1];
3773           tmp1 = x[i2];
3774           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3775           v1 += 2;
3776         }
3777 
3778         if (n == sz - 1) {
3779           tmp0 = x[*idx];
3780           sum1 -= *v1 * tmp0;
3781         }
3782         x[row] = sum1 * (*ibdiag);
3783         row--;
3784         break;
3785 
3786       case 2:
3787 
3788         sum1 = b[row];
3789         sum2 = b[row - 1];
3790         /* note that sum1 is associated with the second of the two rows */
3791         v2 = a->a + diag[row - 1] + 2;
3792         for (n = 0; n < sz - 1; n += 2) {
3793           i1 = idx[0];
3794           i2 = idx[1];
3795           idx += 2;
3796           tmp0 = x[i1];
3797           tmp1 = x[i2];
3798           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3799           v1 += 2;
3800           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3801           v2 += 2;
3802         }
3803 
3804         if (n == sz - 1) {
3805           tmp0 = x[*idx];
3806           sum1 -= *v1 * tmp0;
3807           sum2 -= *v2 * tmp0;
3808         }
3809         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3810         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3811         row -= 2;
3812         break;
3813       case 3:
3814 
3815         sum1 = b[row];
3816         sum2 = b[row - 1];
3817         sum3 = b[row - 2];
3818         v2   = a->a + diag[row - 1] + 2;
3819         v3   = a->a + diag[row - 2] + 3;
3820         for (n = 0; n < sz - 1; n += 2) {
3821           i1 = idx[0];
3822           i2 = idx[1];
3823           idx += 2;
3824           tmp0 = x[i1];
3825           tmp1 = x[i2];
3826           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3827           v1 += 2;
3828           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3829           v2 += 2;
3830           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3831           v3 += 2;
3832         }
3833 
3834         if (n == sz - 1) {
3835           tmp0 = x[*idx];
3836           sum1 -= *v1 * tmp0;
3837           sum2 -= *v2 * tmp0;
3838           sum3 -= *v3 * tmp0;
3839         }
3840         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3841         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3842         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3843         row -= 3;
3844         break;
3845       case 4:
3846 
3847         sum1 = b[row];
3848         sum2 = b[row - 1];
3849         sum3 = b[row - 2];
3850         sum4 = b[row - 3];
3851         v2   = a->a + diag[row - 1] + 2;
3852         v3   = a->a + diag[row - 2] + 3;
3853         v4   = a->a + diag[row - 3] + 4;
3854         for (n = 0; n < sz - 1; n += 2) {
3855           i1 = idx[0];
3856           i2 = idx[1];
3857           idx += 2;
3858           tmp0 = x[i1];
3859           tmp1 = x[i2];
3860           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3861           v1 += 2;
3862           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3863           v2 += 2;
3864           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3865           v3 += 2;
3866           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3867           v4 += 2;
3868         }
3869 
3870         if (n == sz - 1) {
3871           tmp0 = x[*idx];
3872           sum1 -= *v1 * tmp0;
3873           sum2 -= *v2 * tmp0;
3874           sum3 -= *v3 * tmp0;
3875           sum4 -= *v4 * tmp0;
3876         }
3877         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3878         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3879         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3880         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3881         row -= 4;
3882         break;
3883       case 5:
3884 
3885         sum1 = b[row];
3886         sum2 = b[row - 1];
3887         sum3 = b[row - 2];
3888         sum4 = b[row - 3];
3889         sum5 = b[row - 4];
3890         v2   = a->a + diag[row - 1] + 2;
3891         v3   = a->a + diag[row - 2] + 3;
3892         v4   = a->a + diag[row - 3] + 4;
3893         v5   = a->a + diag[row - 4] + 5;
3894         for (n = 0; n < sz - 1; n += 2) {
3895           i1 = idx[0];
3896           i2 = idx[1];
3897           idx += 2;
3898           tmp0 = x[i1];
3899           tmp1 = x[i2];
3900           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3901           v1 += 2;
3902           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3903           v2 += 2;
3904           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3905           v3 += 2;
3906           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3907           v4 += 2;
3908           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3909           v5 += 2;
3910         }
3911 
3912         if (n == sz - 1) {
3913           tmp0 = x[*idx];
3914           sum1 -= *v1 * tmp0;
3915           sum2 -= *v2 * tmp0;
3916           sum3 -= *v3 * tmp0;
3917           sum4 -= *v4 * tmp0;
3918           sum5 -= *v5 * tmp0;
3919         }
3920         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3921         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3922         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3923         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3924         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3925         row -= 5;
3926         break;
3927       default:
3928         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3929       }
3930     }
3931     PetscCall(PetscLogFlops(a->nz));
3932 
3933     /*
3934            t = b - D x    where D is the block diagonal
3935     */
3936     cnt = 0;
3937     for (i = 0, row = 0; i < m; i++) {
3938       switch (sizes[i]) {
3939       case 1:
3940         t[row] = b[row] - bdiag[cnt++] * x[row];
3941         row++;
3942         break;
3943       case 2:
3944         x1         = x[row];
3945         x2         = x[row + 1];
3946         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3947         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3948         t[row]     = b[row] - tmp1;
3949         t[row + 1] = b[row + 1] - tmp2;
3950         row += 2;
3951         cnt += 4;
3952         break;
3953       case 3:
3954         x1         = x[row];
3955         x2         = x[row + 1];
3956         x3         = x[row + 2];
3957         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3958         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3959         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3960         t[row]     = b[row] - tmp1;
3961         t[row + 1] = b[row + 1] - tmp2;
3962         t[row + 2] = b[row + 2] - tmp3;
3963         row += 3;
3964         cnt += 9;
3965         break;
3966       case 4:
3967         x1         = x[row];
3968         x2         = x[row + 1];
3969         x3         = x[row + 2];
3970         x4         = x[row + 3];
3971         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3972         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3973         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3974         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3975         t[row]     = b[row] - tmp1;
3976         t[row + 1] = b[row + 1] - tmp2;
3977         t[row + 2] = b[row + 2] - tmp3;
3978         t[row + 3] = b[row + 3] - tmp4;
3979         row += 4;
3980         cnt += 16;
3981         break;
3982       case 5:
3983         x1         = x[row];
3984         x2         = x[row + 1];
3985         x3         = x[row + 2];
3986         x4         = x[row + 3];
3987         x5         = x[row + 4];
3988         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3989         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3990         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3991         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3992         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3993         t[row]     = b[row] - tmp1;
3994         t[row + 1] = b[row + 1] - tmp2;
3995         t[row + 2] = b[row + 2] - tmp3;
3996         t[row + 3] = b[row + 3] - tmp4;
3997         t[row + 4] = b[row + 4] - tmp5;
3998         row += 5;
3999         cnt += 25;
4000         break;
4001       default:
4002         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4003       }
4004     }
4005     PetscCall(PetscLogFlops(m));
4006 
4007     /*
4008           Apply (L + D)^-1 where D is the block diagonal
4009     */
4010     for (i = 0, row = 0; i < m; i++) {
4011       sz  = diag[row] - ii[row];
4012       v1  = a->a + ii[row];
4013       idx = a->j + ii[row];
4014       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4015       switch (sizes[i]) {
4016       case 1:
4017 
4018         sum1 = t[row];
4019         for (n = 0; n < sz - 1; n += 2) {
4020           i1 = idx[0];
4021           i2 = idx[1];
4022           idx += 2;
4023           tmp0 = t[i1];
4024           tmp1 = t[i2];
4025           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4026           v1 += 2;
4027         }
4028 
4029         if (n == sz - 1) {
4030           tmp0 = t[*idx];
4031           sum1 -= *v1 * tmp0;
4032         }
4033         x[row] += t[row] = sum1 * (*ibdiag++);
4034         row++;
4035         break;
4036       case 2:
4037         v2   = a->a + ii[row + 1];
4038         sum1 = t[row];
4039         sum2 = t[row + 1];
4040         for (n = 0; n < sz - 1; n += 2) {
4041           i1 = idx[0];
4042           i2 = idx[1];
4043           idx += 2;
4044           tmp0 = t[i1];
4045           tmp1 = t[i2];
4046           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4047           v1 += 2;
4048           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4049           v2 += 2;
4050         }
4051 
4052         if (n == sz - 1) {
4053           tmp0 = t[*idx];
4054           sum1 -= v1[0] * tmp0;
4055           sum2 -= v2[0] * tmp0;
4056         }
4057         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4058         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4059         ibdiag += 4;
4060         row += 2;
4061         break;
4062       case 3:
4063         v2   = a->a + ii[row + 1];
4064         v3   = a->a + ii[row + 2];
4065         sum1 = t[row];
4066         sum2 = t[row + 1];
4067         sum3 = t[row + 2];
4068         for (n = 0; n < sz - 1; n += 2) {
4069           i1 = idx[0];
4070           i2 = idx[1];
4071           idx += 2;
4072           tmp0 = t[i1];
4073           tmp1 = t[i2];
4074           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4075           v1 += 2;
4076           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4077           v2 += 2;
4078           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4079           v3 += 2;
4080         }
4081 
4082         if (n == sz - 1) {
4083           tmp0 = t[*idx];
4084           sum1 -= v1[0] * tmp0;
4085           sum2 -= v2[0] * tmp0;
4086           sum3 -= v3[0] * tmp0;
4087         }
4088         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4089         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4090         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4091         ibdiag += 9;
4092         row += 3;
4093         break;
4094       case 4:
4095         v2   = a->a + ii[row + 1];
4096         v3   = a->a + ii[row + 2];
4097         v4   = a->a + ii[row + 3];
4098         sum1 = t[row];
4099         sum2 = t[row + 1];
4100         sum3 = t[row + 2];
4101         sum4 = t[row + 3];
4102         for (n = 0; n < sz - 1; n += 2) {
4103           i1 = idx[0];
4104           i2 = idx[1];
4105           idx += 2;
4106           tmp0 = t[i1];
4107           tmp1 = t[i2];
4108           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4109           v1 += 2;
4110           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4111           v2 += 2;
4112           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4113           v3 += 2;
4114           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4115           v4 += 2;
4116         }
4117 
4118         if (n == sz - 1) {
4119           tmp0 = t[*idx];
4120           sum1 -= v1[0] * tmp0;
4121           sum2 -= v2[0] * tmp0;
4122           sum3 -= v3[0] * tmp0;
4123           sum4 -= v4[0] * tmp0;
4124         }
4125         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4126         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4127         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4128         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4129         ibdiag += 16;
4130         row += 4;
4131         break;
4132       case 5:
4133         v2   = a->a + ii[row + 1];
4134         v3   = a->a + ii[row + 2];
4135         v4   = a->a + ii[row + 3];
4136         v5   = a->a + ii[row + 4];
4137         sum1 = t[row];
4138         sum2 = t[row + 1];
4139         sum3 = t[row + 2];
4140         sum4 = t[row + 3];
4141         sum5 = t[row + 4];
4142         for (n = 0; n < sz - 1; n += 2) {
4143           i1 = idx[0];
4144           i2 = idx[1];
4145           idx += 2;
4146           tmp0 = t[i1];
4147           tmp1 = t[i2];
4148           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4149           v1 += 2;
4150           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4151           v2 += 2;
4152           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4153           v3 += 2;
4154           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4155           v4 += 2;
4156           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4157           v5 += 2;
4158         }
4159 
4160         if (n == sz - 1) {
4161           tmp0 = t[*idx];
4162           sum1 -= v1[0] * tmp0;
4163           sum2 -= v2[0] * tmp0;
4164           sum3 -= v3[0] * tmp0;
4165           sum4 -= v4[0] * tmp0;
4166           sum5 -= v5[0] * tmp0;
4167         }
4168         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4169         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4170         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4171         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4172         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4173         ibdiag += 25;
4174         row += 5;
4175         break;
4176       default:
4177         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4178       }
4179     }
4180     PetscCall(PetscLogFlops(a->nz));
4181   }
4182   PetscCall(VecRestoreArray(xx, &x));
4183   PetscCall(VecRestoreArrayRead(bb, &b));
4184   PetscFunctionReturn(PETSC_SUCCESS);
4185 }
4186 
4187 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4188 {
4189   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4190   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4191   const MatScalar   *bdiag = a->inode.bdiag;
4192   const PetscScalar *b;
4193   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
4194   const PetscInt    *sizes = a->inode.size;
4195 
4196   PetscFunctionBegin;
4197   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4198   PetscCall(VecGetArray(xx, &x));
4199   PetscCall(VecGetArrayRead(bb, &b));
4200   cnt = 0;
4201   for (i = 0, row = 0; i < m; i++) {
4202     switch (sizes[i]) {
4203     case 1:
4204       x[row] = b[row] * bdiag[cnt++];
4205       row++;
4206       break;
4207     case 2:
4208       x1       = b[row];
4209       x2       = b[row + 1];
4210       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4211       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4212       x[row++] = tmp1;
4213       x[row++] = tmp2;
4214       cnt += 4;
4215       break;
4216     case 3:
4217       x1       = b[row];
4218       x2       = b[row + 1];
4219       x3       = b[row + 2];
4220       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4221       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4222       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4223       x[row++] = tmp1;
4224       x[row++] = tmp2;
4225       x[row++] = tmp3;
4226       cnt += 9;
4227       break;
4228     case 4:
4229       x1       = b[row];
4230       x2       = b[row + 1];
4231       x3       = b[row + 2];
4232       x4       = b[row + 3];
4233       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4234       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4235       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4236       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4237       x[row++] = tmp1;
4238       x[row++] = tmp2;
4239       x[row++] = tmp3;
4240       x[row++] = tmp4;
4241       cnt += 16;
4242       break;
4243     case 5:
4244       x1       = b[row];
4245       x2       = b[row + 1];
4246       x3       = b[row + 2];
4247       x4       = b[row + 3];
4248       x5       = b[row + 4];
4249       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4250       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4251       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4252       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4253       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4254       x[row++] = tmp1;
4255       x[row++] = tmp2;
4256       x[row++] = tmp3;
4257       x[row++] = tmp4;
4258       x[row++] = tmp5;
4259       cnt += 25;
4260       break;
4261     default:
4262       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4263     }
4264   }
4265   PetscCall(PetscLogFlops(2.0 * cnt));
4266   PetscCall(VecRestoreArray(xx, &x));
4267   PetscCall(VecRestoreArrayRead(bb, &b));
4268   PetscFunctionReturn(PETSC_SUCCESS);
4269 }
4270 
4271 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4272 {
4273   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4274 
4275   PetscFunctionBegin;
4276   a->inode.node_count       = 0;
4277   a->inode.use              = PETSC_FALSE;
4278   a->inode.checked          = PETSC_FALSE;
4279   a->inode.mat_nonzerostate = -1;
4280   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4281   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4282   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4283   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4284   A->ops->coloringpatch     = NULL;
4285   A->ops->multdiagonalblock = NULL;
4286   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4287   PetscFunctionReturn(PETSC_SUCCESS);
4288 }
4289 
4290 /*
4291     samestructure indicates that the matrix has not changed its nonzero structure so we
4292     do not need to recompute the inodes
4293 */
4294 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4295 {
4296   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4297   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4298   PetscBool       flag;
4299   const PetscInt *idx, *idy, *ii;
4300 
4301   PetscFunctionBegin;
4302   if (!a->inode.use) {
4303     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4304     PetscCall(PetscFree(a->inode.size));
4305     PetscFunctionReturn(PETSC_SUCCESS);
4306   }
4307   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4308 
4309   m = A->rmap->n;
4310   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4311   ns = a->inode.size;
4312 
4313   i          = 0;
4314   node_count = 0;
4315   idx        = a->j;
4316   ii         = a->i;
4317   if (idx) {
4318     while (i < m) {            /* For each row */
4319       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4320       /* Limits the number of elements in a node to 'a->inode.limit' */
4321       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4322         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4323         if (nzy != nzx) break;
4324         idy += nzx; /* Same nonzero pattern */
4325         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4326         if (!flag) break;
4327       }
4328       ns[node_count++] = blk_size;
4329       idx += blk_size * nzx;
4330       i = j;
4331     }
4332   }
4333   /* If not enough inodes found,, do not use inode version of the routines */
4334   if (!m || !idx || node_count > .8 * m) {
4335     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4336     PetscCall(PetscFree(a->inode.size));
4337     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4338   } else {
4339     if (!A->factortype) {
4340       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4341       if (A->rmap->n == A->cmap->n) {
4342         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4343         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4344         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4345         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4346         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4347       }
4348     } else {
4349       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4350     }
4351     a->inode.node_count = node_count;
4352     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4353   }
4354   a->inode.checked          = PETSC_TRUE;
4355   a->inode.mat_nonzerostate = A->nonzerostate;
4356   PetscFunctionReturn(PETSC_SUCCESS);
4357 }
4358 
4359 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4360 {
4361   Mat         B = *C;
4362   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4363   PetscInt    m = A->rmap->n;
4364 
4365   PetscFunctionBegin;
4366   c->inode.use              = a->inode.use;
4367   c->inode.limit            = a->inode.limit;
4368   c->inode.max_limit        = a->inode.max_limit;
4369   c->inode.checked          = PETSC_FALSE;
4370   c->inode.size             = NULL;
4371   c->inode.node_count       = 0;
4372   c->inode.ibdiagvalid      = PETSC_FALSE;
4373   c->inode.ibdiag           = NULL;
4374   c->inode.bdiag            = NULL;
4375   c->inode.mat_nonzerostate = -1;
4376   if (a->inode.use) {
4377     if (a->inode.checked && a->inode.size) {
4378       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4379       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4380 
4381       c->inode.checked          = PETSC_TRUE;
4382       c->inode.node_count       = a->inode.node_count;
4383       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4384     }
4385     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4386     if (!B->factortype) {
4387       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4388       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4389       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4390       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4391       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4392       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4393     } else {
4394       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4395     }
4396   }
4397   PetscFunctionReturn(PETSC_SUCCESS);
4398 }
4399 
4400 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4401 {
4402   PetscInt        k;
4403   const PetscInt *vi;
4404 
4405   PetscFunctionBegin;
4406   vi = aj + ai[row];
4407   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4408   vi        = aj + adiag[row];
4409   cols[nzl] = vi[0];
4410   vi        = aj + adiag[row + 1] + 1;
4411   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4412   PetscFunctionReturn(PETSC_SUCCESS);
4413 }
4414 /*
4415    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4416    Modified from MatSeqAIJCheckInode().
4417 
4418    Input Parameters:
4419 .  Mat A - ILU or LU matrix factor
4420 
4421 */
4422 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4423 {
4424   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4425   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4426   PetscInt       *cols1, *cols2, *ns;
4427   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4428   PetscBool       flag;
4429 
4430   PetscFunctionBegin;
4431   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4432   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4433 
4434   m = A->rmap->n;
4435   if (a->inode.size) ns = a->inode.size;
4436   else PetscCall(PetscMalloc1(m + 1, &ns));
4437 
4438   i          = 0;
4439   node_count = 0;
4440   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4441   while (i < m) {                       /* For each row */
4442     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4443     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4444     nzx  = nzl1 + nzu1 + 1;
4445     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4446 
4447     /* Limits the number of elements in a node to 'a->inode.limit' */
4448     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4449       nzl2 = ai[j + 1] - ai[j];
4450       nzu2 = adiag[j] - adiag[j + 1] - 1;
4451       nzy  = nzl2 + nzu2 + 1;
4452       if (nzy != nzx) break;
4453       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4454       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4455       if (!flag) break;
4456     }
4457     ns[node_count++] = blk_size;
4458     i                = j;
4459   }
4460   PetscCall(PetscFree2(cols1, cols2));
4461   /* If not enough inodes found,, do not use inode version of the routines */
4462   if (!m || node_count > .8 * m) {
4463     PetscCall(PetscFree(ns));
4464 
4465     a->inode.node_count = 0;
4466     a->inode.size       = NULL;
4467     a->inode.use        = PETSC_FALSE;
4468 
4469     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4470   } else {
4471     A->ops->mult              = NULL;
4472     A->ops->sor               = NULL;
4473     A->ops->multadd           = NULL;
4474     A->ops->getrowij          = NULL;
4475     A->ops->restorerowij      = NULL;
4476     A->ops->getcolumnij       = NULL;
4477     A->ops->restorecolumnij   = NULL;
4478     A->ops->coloringpatch     = NULL;
4479     A->ops->multdiagonalblock = NULL;
4480     a->inode.node_count       = node_count;
4481     a->inode.size             = ns;
4482 
4483     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4484   }
4485   a->inode.checked = PETSC_TRUE;
4486   PetscFunctionReturn(PETSC_SUCCESS);
4487 }
4488 
4489 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4490 {
4491   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4492 
4493   PetscFunctionBegin;
4494   a->inode.ibdiagvalid = PETSC_FALSE;
4495   PetscFunctionReturn(PETSC_SUCCESS);
4496 }
4497 
4498 /*
4499      This is really ugly. if inodes are used this replaces the
4500   permutations with ones that correspond to rows/cols of the matrix
4501   rather than inode blocks
4502 */
4503 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4504 {
4505   PetscFunctionBegin;
4506   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4507   PetscFunctionReturn(PETSC_SUCCESS);
4508 }
4509 
4510 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4511 {
4512   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4513   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4514   const PetscInt *ridx, *cidx;
4515   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4516   PetscInt        nslim_col, *ns_col;
4517   IS              ris = *rperm, cis = *cperm;
4518 
4519   PetscFunctionBegin;
4520   if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS);           /* no inodes so return */
4521   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4522 
4523   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4524   PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
4525   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4526 
4527   PetscCall(ISGetIndices(ris, &ridx));
4528   PetscCall(ISGetIndices(cis, &cidx));
4529 
4530   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4531   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
4532 
4533   /* Construct the permutations for rows*/
4534   for (i = 0, row = 0; i < nslim_row; ++i) {
4535     indx      = ridx[i];
4536     start_val = tns[indx];
4537     end_val   = tns[indx + 1];
4538     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4539   }
4540 
4541   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4542   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
4543 
4544   /* Construct permutations for columns */
4545   for (i = 0, col = 0; i < nslim_col; ++i) {
4546     indx      = cidx[i];
4547     start_val = tns[indx];
4548     end_val   = tns[indx + 1];
4549     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4550   }
4551 
4552   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4553   PetscCall(ISSetPermutation(*rperm));
4554   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4555   PetscCall(ISSetPermutation(*cperm));
4556 
4557   PetscCall(ISRestoreIndices(ris, &ridx));
4558   PetscCall(ISRestoreIndices(cis, &cidx));
4559 
4560   PetscCall(PetscFree(ns_col));
4561   PetscCall(PetscFree2(permr, permc));
4562   PetscCall(ISDestroy(&cis));
4563   PetscCall(ISDestroy(&ris));
4564   PetscCall(PetscFree(tns));
4565   PetscFunctionReturn(PETSC_SUCCESS);
4566 }
4567 
4568 /*@C
4569   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4570 
4571   Not Collective
4572 
4573   Input Parameter:
4574 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4575 
4576   Output Parameters:
4577 + node_count - no of inodes present in the matrix.
4578 . sizes      - an array of size `node_count`, with the sizes of each inode.
4579 - limit      - the max size used to generate the inodes.
4580 
4581   Level: advanced
4582 
4583   Note:
4584   It should be called after the matrix is assembled.
4585   The contents of the sizes[] array should not be changed.
4586   `NULL` may be passed for information not needed
4587 
4588 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4589 @*/
4590 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4591 {
4592   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4593 
4594   PetscFunctionBegin;
4595   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4596   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4597   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4598   PetscFunctionReturn(PETSC_SUCCESS);
4599 }
4600 
4601 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4602 {
4603   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4604 
4605   PetscFunctionBegin;
4606   if (node_count) *node_count = a->inode.node_count;
4607   if (sizes) *sizes = a->inode.size;
4608   if (limit) *limit = a->inode.limit;
4609   PetscFunctionReturn(PETSC_SUCCESS);
4610 }
4611