xref: /petsc/src/mat/impls/aij/seq/inode.c (revision ae51ba108530e1a8d2aeb8bc3771a24c2fcd801e)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
9 {
10   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
11   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
12 
13   PetscFunctionBegin;
14   n = A->cmap->n;
15   m = A->rmap->n;
16   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
17   ns_row = a->inode.size;
18 
19   min_mn = (m < n) ? m : n;
20   if (!ns) {
21     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
22       ;
23     for (; count + 1 < n; count++, i++)
24       ;
25     if (count < n) i++;
26     *size = i;
27     PetscFunctionReturn(PETSC_SUCCESS);
28   }
29   PetscCall(PetscMalloc1(n + 1, &ns_col));
30 
31   /* Use the same row structure wherever feasible. */
32   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
33 
34   /* if m < n; pad up the remainder with inode_limit */
35   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
36   /* The last node is the odd ball. padd it up with the remaining rows; */
37   if (count < n) {
38     ns_col[i] = n - count;
39     i++;
40   } else if (count > n) {
41     /* Adjust for the over estimation */
42     ns_col[i - 1] += n - count;
43   }
44   *size = i;
45   *ns   = ns_col;
46   PetscFunctionReturn(PETSC_SUCCESS);
47 }
48 
49 /*
50       This builds symmetric version of nonzero structure,
51 */
52 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
53 {
54   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
55   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
56   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
57   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
58 
59   PetscFunctionBegin;
60   nslim_row = a->inode.node_count;
61   m         = A->rmap->n;
62   n         = A->cmap->n;
63   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
64   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
65 
66   /* Use the row_inode as column_inode */
67   nslim_col = nslim_row;
68   ns_col    = ns_row;
69 
70   /* allocate space for reformatted inode structure */
71   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
72   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
73 
74   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
75     nsz = ns_col[i1];
76     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
77   }
78   /* allocate space for row pointers */
79   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
80   *iia = ia;
81   PetscCall(PetscMalloc1(nslim_row + 1, &work));
82 
83   /* determine the number of columns in each row */
84   ia[0] = oshift;
85   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
86     j    = aj + ai[row] + ishift;
87     jmax = aj + ai[row + 1] + ishift;
88     if (j == jmax) continue; /* empty row */
89     col = *j++ + ishift;
90     i2  = tvc[col];
91     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
92       ia[i1 + 1]++;
93       ia[i2 + 1]++;
94       i2++; /* Start col of next node */
95       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
96       i2 = tvc[col];
97     }
98     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
99   }
100 
101   /* shift ia[i] to point to next row */
102   for (i1 = 1; i1 < nslim_row + 1; i1++) {
103     row = ia[i1 - 1];
104     ia[i1] += row;
105     work[i1 - 1] = row - oshift;
106   }
107 
108   /* allocate space for column pointers */
109   nz = ia[nslim_row] + (!ishift);
110   PetscCall(PetscMalloc1(nz, &ja));
111   *jja = ja;
112 
113   /* loop over lower triangular part putting into ja */
114   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
115     j    = aj + ai[row] + ishift;
116     jmax = aj + ai[row + 1] + ishift;
117     if (j == jmax) continue; /* empty row */
118     col = *j++ + ishift;
119     i2  = tvc[col];
120     while (i2 < i1 && j < jmax) {
121       ja[work[i2]++] = i1 + oshift;
122       ja[work[i1]++] = i2 + oshift;
123       ++i2;
124       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
125       i2 = tvc[col];
126     }
127     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
128   }
129   PetscCall(PetscFree(work));
130   PetscCall(PetscFree2(tns, tvc));
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 /*
135       This builds nonsymmetric version of nonzero structure,
136 */
137 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
138 {
139   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
140   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
141   PetscInt       *tns, *tvc, nsz, i1, i2;
142   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
143 
144   PetscFunctionBegin;
145   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
146   nslim_row = a->inode.node_count;
147   n         = A->cmap->n;
148 
149   /* Create The column_inode for this matrix */
150   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
151 
152   /* allocate space for reformatted column_inode structure */
153   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
154   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
155 
156   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
157     nsz = ns_col[i1];
158     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
159   }
160   /* allocate space for row pointers */
161   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
162   *iia = ia;
163   PetscCall(PetscMalloc1(nslim_row + 1, &work));
164 
165   /* determine the number of columns in each row */
166   ia[0] = oshift;
167   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
168     j  = aj + ai[row] + ishift;
169     nz = ai[row + 1] - ai[row];
170     if (!nz) continue; /* empty row */
171     col = *j++ + ishift;
172     i2  = tvc[col];
173     while (nz-- > 0) { /* off-diagonal elements */
174       ia[i1 + 1]++;
175       i2++; /* Start col of next node */
176       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
177       if (nz > 0) i2 = tvc[col];
178     }
179   }
180 
181   /* shift ia[i] to point to next row */
182   for (i1 = 1; i1 < nslim_row + 1; i1++) {
183     row = ia[i1 - 1];
184     ia[i1] += row;
185     work[i1 - 1] = row - oshift;
186   }
187 
188   /* allocate space for column pointers */
189   nz = ia[nslim_row] + (!ishift);
190   PetscCall(PetscMalloc1(nz, &ja));
191   *jja = ja;
192 
193   /* loop over matrix putting into ja */
194   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
195     j  = aj + ai[row] + ishift;
196     nz = ai[row + 1] - ai[row];
197     if (!nz) continue; /* empty row */
198     col = *j++ + ishift;
199     i2  = tvc[col];
200     while (nz-- > 0) {
201       ja[work[i1]++] = i2 + oshift;
202       ++i2;
203       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
204       if (nz > 0) i2 = tvc[col];
205     }
206   }
207   PetscCall(PetscFree(ns_col));
208   PetscCall(PetscFree(work));
209   PetscCall(PetscFree2(tns, tvc));
210   PetscFunctionReturn(PETSC_SUCCESS);
211 }
212 
213 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
214 {
215   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
216 
217   PetscFunctionBegin;
218   if (n) *n = a->inode.node_count;
219   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
220   if (!blockcompressed) {
221     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
222   } else if (symmetric) {
223     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
224   } else {
225     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
226   }
227   PetscFunctionReturn(PETSC_SUCCESS);
228 }
229 
230 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
231 {
232   PetscFunctionBegin;
233   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
234 
235   if (!blockcompressed) {
236     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
237   } else {
238     PetscCall(PetscFree(*ia));
239     PetscCall(PetscFree(*ja));
240   }
241   PetscFunctionReturn(PETSC_SUCCESS);
242 }
243 
244 /* ----------------------------------------------------------- */
245 
246 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
247 {
248   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
249   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
250   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
251 
252   PetscFunctionBegin;
253   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
254   nslim_row = a->inode.node_count;
255   n         = A->cmap->n;
256 
257   /* Create The column_inode for this matrix */
258   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
259 
260   /* allocate space for reformatted column_inode structure */
261   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
262   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
263 
264   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
265     nsz = ns_col[i1];
266     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
267   }
268   /* allocate space for column pointers */
269   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
270   *iia = ia;
271   PetscCall(PetscMalloc1(nslim_col + 1, &work));
272 
273   /* determine the number of columns in each row */
274   ia[0] = oshift;
275   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
276     j   = aj + ai[row] + ishift;
277     col = *j++ + ishift;
278     i2  = tvc[col];
279     nz  = ai[row + 1] - ai[row];
280     while (nz-- > 0) { /* off-diagonal elements */
281       /* ia[i1+1]++; */
282       ia[i2 + 1]++;
283       i2++;
284       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
285       if (nz > 0) i2 = tvc[col];
286     }
287   }
288 
289   /* shift ia[i] to point to next col */
290   for (i1 = 1; i1 < nslim_col + 1; i1++) {
291     col = ia[i1 - 1];
292     ia[i1] += col;
293     work[i1 - 1] = col - oshift;
294   }
295 
296   /* allocate space for column pointers */
297   nz = ia[nslim_col] + (!ishift);
298   PetscCall(PetscMalloc1(nz, &ja));
299   *jja = ja;
300 
301   /* loop over matrix putting into ja */
302   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
303     j   = aj + ai[row] + ishift;
304     col = *j++ + ishift;
305     i2  = tvc[col];
306     nz  = ai[row + 1] - ai[row];
307     while (nz-- > 0) {
308       /* ja[work[i1]++] = i2 + oshift; */
309       ja[work[i2]++] = i1 + oshift;
310       i2++;
311       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
312       if (nz > 0) i2 = tvc[col];
313     }
314   }
315   PetscCall(PetscFree(ns_col));
316   PetscCall(PetscFree(work));
317   PetscCall(PetscFree2(tns, tvc));
318   PetscFunctionReturn(PETSC_SUCCESS);
319 }
320 
321 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
322 {
323   PetscFunctionBegin;
324   PetscCall(MatCreateColInode_Private(A, n, NULL));
325   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
326 
327   if (!blockcompressed) {
328     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
329   } else if (symmetric) {
330     /* Since the indices are symmetric it doesn't matter */
331     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
332   } else {
333     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
334   }
335   PetscFunctionReturn(PETSC_SUCCESS);
336 }
337 
338 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
339 {
340   PetscFunctionBegin;
341   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
342   if (!blockcompressed) {
343     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
344   } else {
345     PetscCall(PetscFree(*ia));
346     PetscCall(PetscFree(*ja));
347   }
348   PetscFunctionReturn(PETSC_SUCCESS);
349 }
350 
351 /* ----------------------------------------------------------- */
352 
353 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
354 {
355   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
356   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
357   PetscScalar       *y;
358   const PetscScalar *x;
359   const MatScalar   *v1, *v2, *v3, *v4, *v5;
360   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
361   const PetscInt    *idx, *ns, *ii;
362 
363 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
364   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
365 #endif
366 
367   PetscFunctionBegin;
368   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
369   node_max = a->inode.node_count;
370   ns       = a->inode.size; /* Node Size array */
371   PetscCall(VecGetArrayRead(xx, &x));
372   PetscCall(VecGetArray(yy, &y));
373   idx = a->j;
374   v1  = a->a;
375   ii  = a->i;
376 
377   for (i = 0, row = 0; i < node_max; ++i) {
378     nsz = ns[i];
379     n   = ii[1] - ii[0];
380     nonzerorow += (n > 0) * nsz;
381     ii += nsz;
382     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
383     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
384     sz = n;                                                                /* No of non zeros in this row */
385                                                                            /* Switch on the size of Node */
386     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
387     case 1:
388       sum1 = 0.;
389 
390       for (n = 0; n < sz - 1; n += 2) {
391         i1 = idx[0]; /* The instructions are ordered to */
392         i2 = idx[1]; /* make the compiler's job easy */
393         idx += 2;
394         tmp0 = x[i1];
395         tmp1 = x[i2];
396         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
397         v1 += 2;
398       }
399 
400       if (n == sz - 1) { /* Take care of the last nonzero  */
401         tmp0 = x[*idx++];
402         sum1 += *v1++ * tmp0;
403       }
404       y[row++] = sum1;
405       break;
406     case 2:
407       sum1 = 0.;
408       sum2 = 0.;
409       v2   = v1 + n;
410 
411       for (n = 0; n < sz - 1; n += 2) {
412         i1 = idx[0];
413         i2 = idx[1];
414         idx += 2;
415         tmp0 = x[i1];
416         tmp1 = x[i2];
417         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
418         v1 += 2;
419         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
420         v2 += 2;
421       }
422       if (n == sz - 1) {
423         tmp0 = x[*idx++];
424         sum1 += *v1++ * tmp0;
425         sum2 += *v2++ * tmp0;
426       }
427       y[row++] = sum1;
428       y[row++] = sum2;
429       v1       = v2; /* Since the next block to be processed starts there*/
430       idx += sz;
431       break;
432     case 3:
433       sum1 = 0.;
434       sum2 = 0.;
435       sum3 = 0.;
436       v2   = v1 + n;
437       v3   = v2 + n;
438 
439       for (n = 0; n < sz - 1; n += 2) {
440         i1 = idx[0];
441         i2 = idx[1];
442         idx += 2;
443         tmp0 = x[i1];
444         tmp1 = x[i2];
445         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
446         v1 += 2;
447         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
448         v2 += 2;
449         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
450         v3 += 2;
451       }
452       if (n == sz - 1) {
453         tmp0 = x[*idx++];
454         sum1 += *v1++ * tmp0;
455         sum2 += *v2++ * tmp0;
456         sum3 += *v3++ * tmp0;
457       }
458       y[row++] = sum1;
459       y[row++] = sum2;
460       y[row++] = sum3;
461       v1       = v3; /* Since the next block to be processed starts there*/
462       idx += 2 * sz;
463       break;
464     case 4:
465       sum1 = 0.;
466       sum2 = 0.;
467       sum3 = 0.;
468       sum4 = 0.;
469       v2   = v1 + n;
470       v3   = v2 + n;
471       v4   = v3 + n;
472 
473       for (n = 0; n < sz - 1; n += 2) {
474         i1 = idx[0];
475         i2 = idx[1];
476         idx += 2;
477         tmp0 = x[i1];
478         tmp1 = x[i2];
479         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
480         v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
482         v2 += 2;
483         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
484         v3 += 2;
485         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
486         v4 += 2;
487       }
488       if (n == sz - 1) {
489         tmp0 = x[*idx++];
490         sum1 += *v1++ * tmp0;
491         sum2 += *v2++ * tmp0;
492         sum3 += *v3++ * tmp0;
493         sum4 += *v4++ * tmp0;
494       }
495       y[row++] = sum1;
496       y[row++] = sum2;
497       y[row++] = sum3;
498       y[row++] = sum4;
499       v1       = v4; /* Since the next block to be processed starts there*/
500       idx += 3 * sz;
501       break;
502     case 5:
503       sum1 = 0.;
504       sum2 = 0.;
505       sum3 = 0.;
506       sum4 = 0.;
507       sum5 = 0.;
508       v2   = v1 + n;
509       v3   = v2 + n;
510       v4   = v3 + n;
511       v5   = v4 + n;
512 
513       for (n = 0; n < sz - 1; n += 2) {
514         i1 = idx[0];
515         i2 = idx[1];
516         idx += 2;
517         tmp0 = x[i1];
518         tmp1 = x[i2];
519         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
520         v1 += 2;
521         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
522         v2 += 2;
523         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
524         v3 += 2;
525         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
526         v4 += 2;
527         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
528         v5 += 2;
529       }
530       if (n == sz - 1) {
531         tmp0 = x[*idx++];
532         sum1 += *v1++ * tmp0;
533         sum2 += *v2++ * tmp0;
534         sum3 += *v3++ * tmp0;
535         sum4 += *v4++ * tmp0;
536         sum5 += *v5++ * tmp0;
537       }
538       y[row++] = sum1;
539       y[row++] = sum2;
540       y[row++] = sum3;
541       y[row++] = sum4;
542       y[row++] = sum5;
543       v1       = v5; /* Since the next block to be processed starts there */
544       idx += 4 * sz;
545       break;
546     default:
547       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
548     }
549   }
550   PetscCall(VecRestoreArrayRead(xx, &x));
551   PetscCall(VecRestoreArray(yy, &y));
552   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
553   PetscFunctionReturn(PETSC_SUCCESS);
554 }
555 /* ----------------------------------------------------------- */
556 /* Almost same code as the MatMult_SeqAIJ_Inode() */
557 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
558 {
559   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
560   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
561   const MatScalar   *v1, *v2, *v3, *v4, *v5;
562   const PetscScalar *x;
563   PetscScalar       *y, *z, *zt;
564   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
565   const PetscInt    *idx, *ns, *ii;
566 
567   PetscFunctionBegin;
568   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
569   node_max = a->inode.node_count;
570   ns       = a->inode.size; /* Node Size array */
571 
572   PetscCall(VecGetArrayRead(xx, &x));
573   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
574   zt = z;
575 
576   idx = a->j;
577   v1  = a->a;
578   ii  = a->i;
579 
580   for (i = 0, row = 0; i < node_max; ++i) {
581     nsz = ns[i];
582     n   = ii[1] - ii[0];
583     ii += nsz;
584     sz = n;        /* No of non zeros in this row */
585                    /* Switch on the size of Node */
586     switch (nsz) { /* Each loop in 'case' is unrolled */
587     case 1:
588       sum1 = *zt++;
589 
590       for (n = 0; n < sz - 1; n += 2) {
591         i1 = idx[0]; /* The instructions are ordered to */
592         i2 = idx[1]; /* make the compiler's job easy */
593         idx += 2;
594         tmp0 = x[i1];
595         tmp1 = x[i2];
596         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
597         v1 += 2;
598       }
599 
600       if (n == sz - 1) { /* Take care of the last nonzero  */
601         tmp0 = x[*idx++];
602         sum1 += *v1++ * tmp0;
603       }
604       y[row++] = sum1;
605       break;
606     case 2:
607       sum1 = *zt++;
608       sum2 = *zt++;
609       v2   = v1 + n;
610 
611       for (n = 0; n < sz - 1; n += 2) {
612         i1 = idx[0];
613         i2 = idx[1];
614         idx += 2;
615         tmp0 = x[i1];
616         tmp1 = x[i2];
617         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
618         v1 += 2;
619         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
620         v2 += 2;
621       }
622       if (n == sz - 1) {
623         tmp0 = x[*idx++];
624         sum1 += *v1++ * tmp0;
625         sum2 += *v2++ * tmp0;
626       }
627       y[row++] = sum1;
628       y[row++] = sum2;
629       v1       = v2; /* Since the next block to be processed starts there*/
630       idx += sz;
631       break;
632     case 3:
633       sum1 = *zt++;
634       sum2 = *zt++;
635       sum3 = *zt++;
636       v2   = v1 + n;
637       v3   = v2 + n;
638 
639       for (n = 0; n < sz - 1; n += 2) {
640         i1 = idx[0];
641         i2 = idx[1];
642         idx += 2;
643         tmp0 = x[i1];
644         tmp1 = x[i2];
645         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
646         v1 += 2;
647         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
648         v2 += 2;
649         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
650         v3 += 2;
651       }
652       if (n == sz - 1) {
653         tmp0 = x[*idx++];
654         sum1 += *v1++ * tmp0;
655         sum2 += *v2++ * tmp0;
656         sum3 += *v3++ * tmp0;
657       }
658       y[row++] = sum1;
659       y[row++] = sum2;
660       y[row++] = sum3;
661       v1       = v3; /* Since the next block to be processed starts there*/
662       idx += 2 * sz;
663       break;
664     case 4:
665       sum1 = *zt++;
666       sum2 = *zt++;
667       sum3 = *zt++;
668       sum4 = *zt++;
669       v2   = v1 + n;
670       v3   = v2 + n;
671       v4   = v3 + n;
672 
673       for (n = 0; n < sz - 1; n += 2) {
674         i1 = idx[0];
675         i2 = idx[1];
676         idx += 2;
677         tmp0 = x[i1];
678         tmp1 = x[i2];
679         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
680         v1 += 2;
681         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
682         v2 += 2;
683         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
684         v3 += 2;
685         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
686         v4 += 2;
687       }
688       if (n == sz - 1) {
689         tmp0 = x[*idx++];
690         sum1 += *v1++ * tmp0;
691         sum2 += *v2++ * tmp0;
692         sum3 += *v3++ * tmp0;
693         sum4 += *v4++ * tmp0;
694       }
695       y[row++] = sum1;
696       y[row++] = sum2;
697       y[row++] = sum3;
698       y[row++] = sum4;
699       v1       = v4; /* Since the next block to be processed starts there*/
700       idx += 3 * sz;
701       break;
702     case 5:
703       sum1 = *zt++;
704       sum2 = *zt++;
705       sum3 = *zt++;
706       sum4 = *zt++;
707       sum5 = *zt++;
708       v2   = v1 + n;
709       v3   = v2 + n;
710       v4   = v3 + n;
711       v5   = v4 + n;
712 
713       for (n = 0; n < sz - 1; n += 2) {
714         i1 = idx[0];
715         i2 = idx[1];
716         idx += 2;
717         tmp0 = x[i1];
718         tmp1 = x[i2];
719         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
720         v1 += 2;
721         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
722         v2 += 2;
723         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
724         v3 += 2;
725         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
726         v4 += 2;
727         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
728         v5 += 2;
729       }
730       if (n == sz - 1) {
731         tmp0 = x[*idx++];
732         sum1 += *v1++ * tmp0;
733         sum2 += *v2++ * tmp0;
734         sum3 += *v3++ * tmp0;
735         sum4 += *v4++ * tmp0;
736         sum5 += *v5++ * tmp0;
737       }
738       y[row++] = sum1;
739       y[row++] = sum2;
740       y[row++] = sum3;
741       y[row++] = sum4;
742       y[row++] = sum5;
743       v1       = v5; /* Since the next block to be processed starts there */
744       idx += 4 * sz;
745       break;
746     default:
747       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
748     }
749   }
750   PetscCall(VecRestoreArrayRead(xx, &x));
751   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
752   PetscCall(PetscLogFlops(2.0 * a->nz));
753   PetscFunctionReturn(PETSC_SUCCESS);
754 }
755 
756 /* ----------------------------------------------------------- */
757 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
758 {
759   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
760   IS                 iscol = a->col, isrow = a->row;
761   const PetscInt    *r, *c, *rout, *cout;
762   PetscInt           i, j, n = A->rmap->n, nz;
763   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
764   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
765   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
766   PetscScalar        sum1, sum2, sum3, sum4, sum5;
767   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
768   const PetscScalar *b;
769 
770   PetscFunctionBegin;
771   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
772   node_max = a->inode.node_count;
773   ns       = a->inode.size; /* Node Size array */
774 
775   PetscCall(VecGetArrayRead(bb, &b));
776   PetscCall(VecGetArrayWrite(xx, &x));
777   tmp = a->solve_work;
778 
779   PetscCall(ISGetIndices(isrow, &rout));
780   r = rout;
781   PetscCall(ISGetIndices(iscol, &cout));
782   c = cout + (n - 1);
783 
784   /* forward solve the lower triangular */
785   tmps = tmp;
786   aa   = a_a;
787   aj   = a_j;
788   ad   = a->diag;
789 
790   for (i = 0, row = 0; i < node_max; ++i) {
791     nsz = ns[i];
792     aii = ai[row];
793     v1  = aa + aii;
794     vi  = aj + aii;
795     nz  = ad[row] - aii;
796     if (i < node_max - 1) {
797       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
798       * but our indexing to determine it's size could. */
799       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
800       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
801       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
802       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
803     }
804 
805     switch (nsz) { /* Each loop in 'case' is unrolled */
806     case 1:
807       sum1 = b[*r++];
808       for (j = 0; j < nz - 1; j += 2) {
809         i0 = vi[0];
810         i1 = vi[1];
811         vi += 2;
812         tmp0 = tmps[i0];
813         tmp1 = tmps[i1];
814         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
815         v1 += 2;
816       }
817       if (j == nz - 1) {
818         tmp0 = tmps[*vi++];
819         sum1 -= *v1++ * tmp0;
820       }
821       tmp[row++] = sum1;
822       break;
823     case 2:
824       sum1 = b[*r++];
825       sum2 = b[*r++];
826       v2   = aa + ai[row + 1];
827 
828       for (j = 0; j < nz - 1; j += 2) {
829         i0 = vi[0];
830         i1 = vi[1];
831         vi += 2;
832         tmp0 = tmps[i0];
833         tmp1 = tmps[i1];
834         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
835         v1 += 2;
836         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
837         v2 += 2;
838       }
839       if (j == nz - 1) {
840         tmp0 = tmps[*vi++];
841         sum1 -= *v1++ * tmp0;
842         sum2 -= *v2++ * tmp0;
843       }
844       sum2 -= *v2++ * sum1;
845       tmp[row++] = sum1;
846       tmp[row++] = sum2;
847       break;
848     case 3:
849       sum1 = b[*r++];
850       sum2 = b[*r++];
851       sum3 = b[*r++];
852       v2   = aa + ai[row + 1];
853       v3   = aa + ai[row + 2];
854 
855       for (j = 0; j < nz - 1; j += 2) {
856         i0 = vi[0];
857         i1 = vi[1];
858         vi += 2;
859         tmp0 = tmps[i0];
860         tmp1 = tmps[i1];
861         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
862         v1 += 2;
863         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
864         v2 += 2;
865         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
866         v3 += 2;
867       }
868       if (j == nz - 1) {
869         tmp0 = tmps[*vi++];
870         sum1 -= *v1++ * tmp0;
871         sum2 -= *v2++ * tmp0;
872         sum3 -= *v3++ * tmp0;
873       }
874       sum2 -= *v2++ * sum1;
875       sum3 -= *v3++ * sum1;
876       sum3 -= *v3++ * sum2;
877 
878       tmp[row++] = sum1;
879       tmp[row++] = sum2;
880       tmp[row++] = sum3;
881       break;
882 
883     case 4:
884       sum1 = b[*r++];
885       sum2 = b[*r++];
886       sum3 = b[*r++];
887       sum4 = b[*r++];
888       v2   = aa + ai[row + 1];
889       v3   = aa + ai[row + 2];
890       v4   = aa + ai[row + 3];
891 
892       for (j = 0; j < nz - 1; j += 2) {
893         i0 = vi[0];
894         i1 = vi[1];
895         vi += 2;
896         tmp0 = tmps[i0];
897         tmp1 = tmps[i1];
898         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
899         v1 += 2;
900         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
901         v2 += 2;
902         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
903         v3 += 2;
904         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
905         v4 += 2;
906       }
907       if (j == nz - 1) {
908         tmp0 = tmps[*vi++];
909         sum1 -= *v1++ * tmp0;
910         sum2 -= *v2++ * tmp0;
911         sum3 -= *v3++ * tmp0;
912         sum4 -= *v4++ * tmp0;
913       }
914       sum2 -= *v2++ * sum1;
915       sum3 -= *v3++ * sum1;
916       sum4 -= *v4++ * sum1;
917       sum3 -= *v3++ * sum2;
918       sum4 -= *v4++ * sum2;
919       sum4 -= *v4++ * sum3;
920 
921       tmp[row++] = sum1;
922       tmp[row++] = sum2;
923       tmp[row++] = sum3;
924       tmp[row++] = sum4;
925       break;
926     case 5:
927       sum1 = b[*r++];
928       sum2 = b[*r++];
929       sum3 = b[*r++];
930       sum4 = b[*r++];
931       sum5 = b[*r++];
932       v2   = aa + ai[row + 1];
933       v3   = aa + ai[row + 2];
934       v4   = aa + ai[row + 3];
935       v5   = aa + ai[row + 4];
936 
937       for (j = 0; j < nz - 1; j += 2) {
938         i0 = vi[0];
939         i1 = vi[1];
940         vi += 2;
941         tmp0 = tmps[i0];
942         tmp1 = tmps[i1];
943         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
944         v1 += 2;
945         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
946         v2 += 2;
947         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
948         v3 += 2;
949         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
950         v4 += 2;
951         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
952         v5 += 2;
953       }
954       if (j == nz - 1) {
955         tmp0 = tmps[*vi++];
956         sum1 -= *v1++ * tmp0;
957         sum2 -= *v2++ * tmp0;
958         sum3 -= *v3++ * tmp0;
959         sum4 -= *v4++ * tmp0;
960         sum5 -= *v5++ * tmp0;
961       }
962 
963       sum2 -= *v2++ * sum1;
964       sum3 -= *v3++ * sum1;
965       sum4 -= *v4++ * sum1;
966       sum5 -= *v5++ * sum1;
967       sum3 -= *v3++ * sum2;
968       sum4 -= *v4++ * sum2;
969       sum5 -= *v5++ * sum2;
970       sum4 -= *v4++ * sum3;
971       sum5 -= *v5++ * sum3;
972       sum5 -= *v5++ * sum4;
973 
974       tmp[row++] = sum1;
975       tmp[row++] = sum2;
976       tmp[row++] = sum3;
977       tmp[row++] = sum4;
978       tmp[row++] = sum5;
979       break;
980     default:
981       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
982     }
983   }
984   /* backward solve the upper triangular */
985   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
986     nsz = ns[i];
987     aii = ai[row + 1] - 1;
988     v1  = aa + aii;
989     vi  = aj + aii;
990     nz  = aii - ad[row];
991     switch (nsz) { /* Each loop in 'case' is unrolled */
992     case 1:
993       sum1 = tmp[row];
994 
995       for (j = nz; j > 1; j -= 2) {
996         vi -= 2;
997         i0   = vi[2];
998         i1   = vi[1];
999         tmp0 = tmps[i0];
1000         tmp1 = tmps[i1];
1001         v1 -= 2;
1002         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1003       }
1004       if (j == 1) {
1005         tmp0 = tmps[*vi--];
1006         sum1 -= *v1-- * tmp0;
1007       }
1008       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1009       row--;
1010       break;
1011     case 2:
1012       sum1 = tmp[row];
1013       sum2 = tmp[row - 1];
1014       v2   = aa + ai[row] - 1;
1015       for (j = nz; j > 1; j -= 2) {
1016         vi -= 2;
1017         i0   = vi[2];
1018         i1   = vi[1];
1019         tmp0 = tmps[i0];
1020         tmp1 = tmps[i1];
1021         v1 -= 2;
1022         v2 -= 2;
1023         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1024         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1025       }
1026       if (j == 1) {
1027         tmp0 = tmps[*vi--];
1028         sum1 -= *v1-- * tmp0;
1029         sum2 -= *v2-- * tmp0;
1030       }
1031 
1032       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1033       row--;
1034       sum2 -= *v2-- * tmp0;
1035       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1036       row--;
1037       break;
1038     case 3:
1039       sum1 = tmp[row];
1040       sum2 = tmp[row - 1];
1041       sum3 = tmp[row - 2];
1042       v2   = aa + ai[row] - 1;
1043       v3   = aa + ai[row - 1] - 1;
1044       for (j = nz; j > 1; j -= 2) {
1045         vi -= 2;
1046         i0   = vi[2];
1047         i1   = vi[1];
1048         tmp0 = tmps[i0];
1049         tmp1 = tmps[i1];
1050         v1 -= 2;
1051         v2 -= 2;
1052         v3 -= 2;
1053         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1054         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1055         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1056       }
1057       if (j == 1) {
1058         tmp0 = tmps[*vi--];
1059         sum1 -= *v1-- * tmp0;
1060         sum2 -= *v2-- * tmp0;
1061         sum3 -= *v3-- * tmp0;
1062       }
1063       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1064       row--;
1065       sum2 -= *v2-- * tmp0;
1066       sum3 -= *v3-- * tmp0;
1067       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1068       row--;
1069       sum3 -= *v3-- * tmp0;
1070       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1071       row--;
1072 
1073       break;
1074     case 4:
1075       sum1 = tmp[row];
1076       sum2 = tmp[row - 1];
1077       sum3 = tmp[row - 2];
1078       sum4 = tmp[row - 3];
1079       v2   = aa + ai[row] - 1;
1080       v3   = aa + ai[row - 1] - 1;
1081       v4   = aa + ai[row - 2] - 1;
1082 
1083       for (j = nz; j > 1; j -= 2) {
1084         vi -= 2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1 -= 2;
1090         v2 -= 2;
1091         v3 -= 2;
1092         v4 -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j == 1) {
1099         tmp0 = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1107       row--;
1108       sum2 -= *v2-- * tmp0;
1109       sum3 -= *v3-- * tmp0;
1110       sum4 -= *v4-- * tmp0;
1111       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1112       row--;
1113       sum3 -= *v3-- * tmp0;
1114       sum4 -= *v4-- * tmp0;
1115       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1116       row--;
1117       sum4 -= *v4-- * tmp0;
1118       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1119       row--;
1120       break;
1121     case 5:
1122       sum1 = tmp[row];
1123       sum2 = tmp[row - 1];
1124       sum3 = tmp[row - 2];
1125       sum4 = tmp[row - 3];
1126       sum5 = tmp[row - 4];
1127       v2   = aa + ai[row] - 1;
1128       v3   = aa + ai[row - 1] - 1;
1129       v4   = aa + ai[row - 2] - 1;
1130       v5   = aa + ai[row - 3] - 1;
1131       for (j = nz; j > 1; j -= 2) {
1132         vi -= 2;
1133         i0   = vi[2];
1134         i1   = vi[1];
1135         tmp0 = tmps[i0];
1136         tmp1 = tmps[i1];
1137         v1 -= 2;
1138         v2 -= 2;
1139         v3 -= 2;
1140         v4 -= 2;
1141         v5 -= 2;
1142         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1143         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1144         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1145         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1146         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1147       }
1148       if (j == 1) {
1149         tmp0 = tmps[*vi--];
1150         sum1 -= *v1-- * tmp0;
1151         sum2 -= *v2-- * tmp0;
1152         sum3 -= *v3-- * tmp0;
1153         sum4 -= *v4-- * tmp0;
1154         sum5 -= *v5-- * tmp0;
1155       }
1156 
1157       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1158       row--;
1159       sum2 -= *v2-- * tmp0;
1160       sum3 -= *v3-- * tmp0;
1161       sum4 -= *v4-- * tmp0;
1162       sum5 -= *v5-- * tmp0;
1163       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1164       row--;
1165       sum3 -= *v3-- * tmp0;
1166       sum4 -= *v4-- * tmp0;
1167       sum5 -= *v5-- * tmp0;
1168       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1169       row--;
1170       sum4 -= *v4-- * tmp0;
1171       sum5 -= *v5-- * tmp0;
1172       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1173       row--;
1174       sum5 -= *v5-- * tmp0;
1175       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1176       row--;
1177       break;
1178     default:
1179       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1180     }
1181   }
1182   PetscCall(ISRestoreIndices(isrow, &rout));
1183   PetscCall(ISRestoreIndices(iscol, &cout));
1184   PetscCall(VecRestoreArrayRead(bb, &b));
1185   PetscCall(VecRestoreArrayWrite(xx, &x));
1186   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1187   PetscFunctionReturn(PETSC_SUCCESS);
1188 }
1189 
1190 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1191 {
1192   Mat              C = B;
1193   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1194   IS               isrow = b->row, isicol = b->icol;
1195   const PetscInt  *r, *ic, *ics;
1196   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1197   PetscInt         i, j, k, nz, nzL, row, *pj;
1198   const PetscInt  *ajtmp, *bjtmp;
1199   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1200   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1201   FactorShiftCtx   sctx;
1202   const PetscInt  *ddiag;
1203   PetscReal        rs;
1204   MatScalar        d;
1205   PetscInt         inod, nodesz, node_max, col;
1206   const PetscInt  *ns;
1207   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1208 
1209   PetscFunctionBegin;
1210   /* MatPivotSetUp(): initialize shift context sctx */
1211   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1212 
1213   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1214     ddiag          = a->diag;
1215     sctx.shift_top = info->zeropivot;
1216     for (i = 0; i < n; i++) {
1217       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1218       d  = (aa)[ddiag[i]];
1219       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1220       v  = aa + ai[i];
1221       nz = ai[i + 1] - ai[i];
1222       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1223       if (rs > sctx.shift_top) sctx.shift_top = rs;
1224     }
1225     sctx.shift_top *= 1.1;
1226     sctx.nshift_max = 5;
1227     sctx.shift_lo   = 0.;
1228     sctx.shift_hi   = 1.;
1229   }
1230 
1231   PetscCall(ISGetIndices(isrow, &r));
1232   PetscCall(ISGetIndices(isicol, &ic));
1233 
1234   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1235   ics = ic;
1236 
1237   node_max = a->inode.node_count;
1238   ns       = a->inode.size;
1239   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1240 
1241   /* If max inode size > 4, split it into two inodes.*/
1242   /* also map the inode sizes according to the ordering */
1243   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1244   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1245     if (ns[i] > 4) {
1246       tmp_vec1[j] = 4;
1247       ++j;
1248       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
1249     } else {
1250       tmp_vec1[j] = ns[i];
1251     }
1252   }
1253   /* Use the correct node_max */
1254   node_max = j;
1255 
1256   /* Now reorder the inode info based on mat re-ordering info */
1257   /* First create a row -> inode_size_array_index map */
1258   PetscCall(PetscMalloc1(n + 1, &nsmap));
1259   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1260   for (i = 0, row = 0; i < node_max; i++) {
1261     nodesz = tmp_vec1[i];
1262     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1263   }
1264   /* Using nsmap, create a reordered ns structure */
1265   for (i = 0, j = 0; i < node_max; i++) {
1266     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1267     tmp_vec2[i] = nodesz;
1268     j += nodesz;
1269   }
1270   PetscCall(PetscFree(nsmap));
1271   PetscCall(PetscFree(tmp_vec1));
1272 
1273   /* Now use the correct ns */
1274   ns = tmp_vec2;
1275 
1276   do {
1277     sctx.newshift = PETSC_FALSE;
1278     /* Now loop over each block-row, and do the factorization */
1279     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1280       nodesz = ns[inod];
1281 
1282       switch (nodesz) {
1283       case 1:
1284         /*----------*/
1285         /* zero rtmp1 */
1286         /* L part */
1287         nz    = bi[i + 1] - bi[i];
1288         bjtmp = bj + bi[i];
1289         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1290 
1291         /* U part */
1292         nz    = bdiag[i] - bdiag[i + 1];
1293         bjtmp = bj + bdiag[i + 1] + 1;
1294         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1295 
1296         /* load in initial (unfactored row) */
1297         nz    = ai[r[i] + 1] - ai[r[i]];
1298         ajtmp = aj + ai[r[i]];
1299         v     = aa + ai[r[i]];
1300         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1301 
1302         /* ZeropivotApply() */
1303         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1304 
1305         /* elimination */
1306         bjtmp = bj + bi[i];
1307         row   = *bjtmp++;
1308         nzL   = bi[i + 1] - bi[i];
1309         for (k = 0; k < nzL; k++) {
1310           pc = rtmp1 + row;
1311           if (*pc != 0.0) {
1312             pv   = b->a + bdiag[row];
1313             mul1 = *pc * (*pv);
1314             *pc  = mul1;
1315             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1316             pv   = b->a + bdiag[row + 1] + 1;
1317             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1318             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1319             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1320           }
1321           row = *bjtmp++;
1322         }
1323 
1324         /* finished row so stick it into b->a */
1325         rs = 0.0;
1326         /* L part */
1327         pv = b->a + bi[i];
1328         pj = b->j + bi[i];
1329         nz = bi[i + 1] - bi[i];
1330         for (j = 0; j < nz; j++) {
1331           pv[j] = rtmp1[pj[j]];
1332           rs += PetscAbsScalar(pv[j]);
1333         }
1334 
1335         /* U part */
1336         pv = b->a + bdiag[i + 1] + 1;
1337         pj = b->j + bdiag[i + 1] + 1;
1338         nz = bdiag[i] - bdiag[i + 1] - 1;
1339         for (j = 0; j < nz; j++) {
1340           pv[j] = rtmp1[pj[j]];
1341           rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* Check zero pivot */
1345         sctx.rs = rs;
1346         sctx.pv = rtmp1[i];
1347         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1348         if (sctx.newshift) break;
1349 
1350         /* Mark diagonal and invert diagonal for simpler triangular solves */
1351         pv  = b->a + bdiag[i];
1352         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1353         break;
1354 
1355       case 2:
1356         /*----------*/
1357         /* zero rtmp1 and rtmp2 */
1358         /* L part */
1359         nz    = bi[i + 1] - bi[i];
1360         bjtmp = bj + bi[i];
1361         for (j = 0; j < nz; j++) {
1362           col        = bjtmp[j];
1363           rtmp1[col] = 0.0;
1364           rtmp2[col] = 0.0;
1365         }
1366 
1367         /* U part */
1368         nz    = bdiag[i] - bdiag[i + 1];
1369         bjtmp = bj + bdiag[i + 1] + 1;
1370         for (j = 0; j < nz; j++) {
1371           col        = bjtmp[j];
1372           rtmp1[col] = 0.0;
1373           rtmp2[col] = 0.0;
1374         }
1375 
1376         /* load in initial (unfactored row) */
1377         nz    = ai[r[i] + 1] - ai[r[i]];
1378         ajtmp = aj + ai[r[i]];
1379         v1    = aa + ai[r[i]];
1380         v2    = aa + ai[r[i] + 1];
1381         for (j = 0; j < nz; j++) {
1382           col        = ics[ajtmp[j]];
1383           rtmp1[col] = v1[j];
1384           rtmp2[col] = v2[j];
1385         }
1386         /* ZeropivotApply(): shift the diagonal of the matrix  */
1387         rtmp1[i] += sctx.shift_amount;
1388         rtmp2[i + 1] += sctx.shift_amount;
1389 
1390         /* elimination */
1391         bjtmp = bj + bi[i];
1392         row   = *bjtmp++; /* pivot row */
1393         nzL   = bi[i + 1] - bi[i];
1394         for (k = 0; k < nzL; k++) {
1395           pc1 = rtmp1 + row;
1396           pc2 = rtmp2 + row;
1397           if (*pc1 != 0.0 || *pc2 != 0.0) {
1398             pv   = b->a + bdiag[row];
1399             mul1 = *pc1 * (*pv);
1400             mul2 = *pc2 * (*pv);
1401             *pc1 = mul1;
1402             *pc2 = mul2;
1403 
1404             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1405             pv = b->a + bdiag[row + 1] + 1;
1406             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1407             for (j = 0; j < nz; j++) {
1408               col = pj[j];
1409               rtmp1[col] -= mul1 * pv[j];
1410               rtmp2[col] -= mul2 * pv[j];
1411             }
1412             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1413           }
1414           row = *bjtmp++;
1415         }
1416 
1417         /* finished row i; check zero pivot, then stick row i into b->a */
1418         rs = 0.0;
1419         /* L part */
1420         pc1 = b->a + bi[i];
1421         pj  = b->j + bi[i];
1422         nz  = bi[i + 1] - bi[i];
1423         for (j = 0; j < nz; j++) {
1424           col    = pj[j];
1425           pc1[j] = rtmp1[col];
1426           rs += PetscAbsScalar(pc1[j]);
1427         }
1428         /* U part */
1429         pc1 = b->a + bdiag[i + 1] + 1;
1430         pj  = b->j + bdiag[i + 1] + 1;
1431         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1432         for (j = 0; j < nz; j++) {
1433           col    = pj[j];
1434           pc1[j] = rtmp1[col];
1435           rs += PetscAbsScalar(pc1[j]);
1436         }
1437 
1438         sctx.rs = rs;
1439         sctx.pv = rtmp1[i];
1440         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1441         if (sctx.newshift) break;
1442         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1443         *pc1 = 1.0 / sctx.pv;
1444 
1445         /* Now take care of diagonal 2x2 block. */
1446         pc2 = rtmp2 + i;
1447         if (*pc2 != 0.0) {
1448           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1449           *pc2 = mul1;                        /* insert L entry */
1450           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1451           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1452           for (j = 0; j < nz; j++) {
1453             col = pj[j];
1454             rtmp2[col] -= mul1 * rtmp1[col];
1455           }
1456           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1457         }
1458 
1459         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1460         rs = 0.0;
1461         /* L part */
1462         pc2 = b->a + bi[i + 1];
1463         pj  = b->j + bi[i + 1];
1464         nz  = bi[i + 2] - bi[i + 1];
1465         for (j = 0; j < nz; j++) {
1466           col    = pj[j];
1467           pc2[j] = rtmp2[col];
1468           rs += PetscAbsScalar(pc2[j]);
1469         }
1470         /* U part */
1471         pc2 = b->a + bdiag[i + 2] + 1;
1472         pj  = b->j + bdiag[i + 2] + 1;
1473         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1474         for (j = 0; j < nz; j++) {
1475           col    = pj[j];
1476           pc2[j] = rtmp2[col];
1477           rs += PetscAbsScalar(pc2[j]);
1478         }
1479 
1480         sctx.rs = rs;
1481         sctx.pv = rtmp2[i + 1];
1482         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1483         if (sctx.newshift) break;
1484         pc2  = b->a + bdiag[i + 1];
1485         *pc2 = 1.0 / sctx.pv;
1486         break;
1487 
1488       case 3:
1489         /*----------*/
1490         /* zero rtmp */
1491         /* L part */
1492         nz    = bi[i + 1] - bi[i];
1493         bjtmp = bj + bi[i];
1494         for (j = 0; j < nz; j++) {
1495           col        = bjtmp[j];
1496           rtmp1[col] = 0.0;
1497           rtmp2[col] = 0.0;
1498           rtmp3[col] = 0.0;
1499         }
1500 
1501         /* U part */
1502         nz    = bdiag[i] - bdiag[i + 1];
1503         bjtmp = bj + bdiag[i + 1] + 1;
1504         for (j = 0; j < nz; j++) {
1505           col        = bjtmp[j];
1506           rtmp1[col] = 0.0;
1507           rtmp2[col] = 0.0;
1508           rtmp3[col] = 0.0;
1509         }
1510 
1511         /* load in initial (unfactored row) */
1512         nz    = ai[r[i] + 1] - ai[r[i]];
1513         ajtmp = aj + ai[r[i]];
1514         v1    = aa + ai[r[i]];
1515         v2    = aa + ai[r[i] + 1];
1516         v3    = aa + ai[r[i] + 2];
1517         for (j = 0; j < nz; j++) {
1518           col        = ics[ajtmp[j]];
1519           rtmp1[col] = v1[j];
1520           rtmp2[col] = v2[j];
1521           rtmp3[col] = v3[j];
1522         }
1523         /* ZeropivotApply(): shift the diagonal of the matrix  */
1524         rtmp1[i] += sctx.shift_amount;
1525         rtmp2[i + 1] += sctx.shift_amount;
1526         rtmp3[i + 2] += sctx.shift_amount;
1527 
1528         /* elimination */
1529         bjtmp = bj + bi[i];
1530         row   = *bjtmp++; /* pivot row */
1531         nzL   = bi[i + 1] - bi[i];
1532         for (k = 0; k < nzL; k++) {
1533           pc1 = rtmp1 + row;
1534           pc2 = rtmp2 + row;
1535           pc3 = rtmp3 + row;
1536           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1537             pv   = b->a + bdiag[row];
1538             mul1 = *pc1 * (*pv);
1539             mul2 = *pc2 * (*pv);
1540             mul3 = *pc3 * (*pv);
1541             *pc1 = mul1;
1542             *pc2 = mul2;
1543             *pc3 = mul3;
1544 
1545             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1546             pv = b->a + bdiag[row + 1] + 1;
1547             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1548             for (j = 0; j < nz; j++) {
1549               col = pj[j];
1550               rtmp1[col] -= mul1 * pv[j];
1551               rtmp2[col] -= mul2 * pv[j];
1552               rtmp3[col] -= mul3 * pv[j];
1553             }
1554             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1555           }
1556           row = *bjtmp++;
1557         }
1558 
1559         /* finished row i; check zero pivot, then stick row i into b->a */
1560         rs = 0.0;
1561         /* L part */
1562         pc1 = b->a + bi[i];
1563         pj  = b->j + bi[i];
1564         nz  = bi[i + 1] - bi[i];
1565         for (j = 0; j < nz; j++) {
1566           col    = pj[j];
1567           pc1[j] = rtmp1[col];
1568           rs += PetscAbsScalar(pc1[j]);
1569         }
1570         /* U part */
1571         pc1 = b->a + bdiag[i + 1] + 1;
1572         pj  = b->j + bdiag[i + 1] + 1;
1573         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1574         for (j = 0; j < nz; j++) {
1575           col    = pj[j];
1576           pc1[j] = rtmp1[col];
1577           rs += PetscAbsScalar(pc1[j]);
1578         }
1579 
1580         sctx.rs = rs;
1581         sctx.pv = rtmp1[i];
1582         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1583         if (sctx.newshift) break;
1584         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1585         *pc1 = 1.0 / sctx.pv;
1586 
1587         /* Now take care of 1st column of diagonal 3x3 block. */
1588         pc2 = rtmp2 + i;
1589         pc3 = rtmp3 + i;
1590         if (*pc2 != 0.0 || *pc3 != 0.0) {
1591           mul2 = (*pc2) * (*pc1);
1592           *pc2 = mul2;
1593           mul3 = (*pc3) * (*pc1);
1594           *pc3 = mul3;
1595           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1596           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1597           for (j = 0; j < nz; j++) {
1598             col = pj[j];
1599             rtmp2[col] -= mul2 * rtmp1[col];
1600             rtmp3[col] -= mul3 * rtmp1[col];
1601           }
1602           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1603         }
1604 
1605         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1606         rs = 0.0;
1607         /* L part */
1608         pc2 = b->a + bi[i + 1];
1609         pj  = b->j + bi[i + 1];
1610         nz  = bi[i + 2] - bi[i + 1];
1611         for (j = 0; j < nz; j++) {
1612           col    = pj[j];
1613           pc2[j] = rtmp2[col];
1614           rs += PetscAbsScalar(pc2[j]);
1615         }
1616         /* U part */
1617         pc2 = b->a + bdiag[i + 2] + 1;
1618         pj  = b->j + bdiag[i + 2] + 1;
1619         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1620         for (j = 0; j < nz; j++) {
1621           col    = pj[j];
1622           pc2[j] = rtmp2[col];
1623           rs += PetscAbsScalar(pc2[j]);
1624         }
1625 
1626         sctx.rs = rs;
1627         sctx.pv = rtmp2[i + 1];
1628         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1629         if (sctx.newshift) break;
1630         pc2  = b->a + bdiag[i + 1];
1631         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1632 
1633         /* Now take care of 2nd column of diagonal 3x3 block. */
1634         pc3 = rtmp3 + i + 1;
1635         if (*pc3 != 0.0) {
1636           mul3 = (*pc3) * (*pc2);
1637           *pc3 = mul3;
1638           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1639           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1640           for (j = 0; j < nz; j++) {
1641             col = pj[j];
1642             rtmp3[col] -= mul3 * rtmp2[col];
1643           }
1644           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1645         }
1646 
1647         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1648         rs = 0.0;
1649         /* L part */
1650         pc3 = b->a + bi[i + 2];
1651         pj  = b->j + bi[i + 2];
1652         nz  = bi[i + 3] - bi[i + 2];
1653         for (j = 0; j < nz; j++) {
1654           col    = pj[j];
1655           pc3[j] = rtmp3[col];
1656           rs += PetscAbsScalar(pc3[j]);
1657         }
1658         /* U part */
1659         pc3 = b->a + bdiag[i + 3] + 1;
1660         pj  = b->j + bdiag[i + 3] + 1;
1661         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1662         for (j = 0; j < nz; j++) {
1663           col    = pj[j];
1664           pc3[j] = rtmp3[col];
1665           rs += PetscAbsScalar(pc3[j]);
1666         }
1667 
1668         sctx.rs = rs;
1669         sctx.pv = rtmp3[i + 2];
1670         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1671         if (sctx.newshift) break;
1672         pc3  = b->a + bdiag[i + 2];
1673         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1674         break;
1675       case 4:
1676         /*----------*/
1677         /* zero rtmp */
1678         /* L part */
1679         nz    = bi[i + 1] - bi[i];
1680         bjtmp = bj + bi[i];
1681         for (j = 0; j < nz; j++) {
1682           col        = bjtmp[j];
1683           rtmp1[col] = 0.0;
1684           rtmp2[col] = 0.0;
1685           rtmp3[col] = 0.0;
1686           rtmp4[col] = 0.0;
1687         }
1688 
1689         /* U part */
1690         nz    = bdiag[i] - bdiag[i + 1];
1691         bjtmp = bj + bdiag[i + 1] + 1;
1692         for (j = 0; j < nz; j++) {
1693           col        = bjtmp[j];
1694           rtmp1[col] = 0.0;
1695           rtmp2[col] = 0.0;
1696           rtmp3[col] = 0.0;
1697           rtmp4[col] = 0.0;
1698         }
1699 
1700         /* load in initial (unfactored row) */
1701         nz    = ai[r[i] + 1] - ai[r[i]];
1702         ajtmp = aj + ai[r[i]];
1703         v1    = aa + ai[r[i]];
1704         v2    = aa + ai[r[i] + 1];
1705         v3    = aa + ai[r[i] + 2];
1706         v4    = aa + ai[r[i] + 3];
1707         for (j = 0; j < nz; j++) {
1708           col        = ics[ajtmp[j]];
1709           rtmp1[col] = v1[j];
1710           rtmp2[col] = v2[j];
1711           rtmp3[col] = v3[j];
1712           rtmp4[col] = v4[j];
1713         }
1714         /* ZeropivotApply(): shift the diagonal of the matrix  */
1715         rtmp1[i] += sctx.shift_amount;
1716         rtmp2[i + 1] += sctx.shift_amount;
1717         rtmp3[i + 2] += sctx.shift_amount;
1718         rtmp4[i + 3] += sctx.shift_amount;
1719 
1720         /* elimination */
1721         bjtmp = bj + bi[i];
1722         row   = *bjtmp++; /* pivot row */
1723         nzL   = bi[i + 1] - bi[i];
1724         for (k = 0; k < nzL; k++) {
1725           pc1 = rtmp1 + row;
1726           pc2 = rtmp2 + row;
1727           pc3 = rtmp3 + row;
1728           pc4 = rtmp4 + row;
1729           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1730             pv   = b->a + bdiag[row];
1731             mul1 = *pc1 * (*pv);
1732             mul2 = *pc2 * (*pv);
1733             mul3 = *pc3 * (*pv);
1734             mul4 = *pc4 * (*pv);
1735             *pc1 = mul1;
1736             *pc2 = mul2;
1737             *pc3 = mul3;
1738             *pc4 = mul4;
1739 
1740             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1741             pv = b->a + bdiag[row + 1] + 1;
1742             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1743             for (j = 0; j < nz; j++) {
1744               col = pj[j];
1745               rtmp1[col] -= mul1 * pv[j];
1746               rtmp2[col] -= mul2 * pv[j];
1747               rtmp3[col] -= mul3 * pv[j];
1748               rtmp4[col] -= mul4 * pv[j];
1749             }
1750             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1751           }
1752           row = *bjtmp++;
1753         }
1754 
1755         /* finished row i; check zero pivot, then stick row i into b->a */
1756         rs = 0.0;
1757         /* L part */
1758         pc1 = b->a + bi[i];
1759         pj  = b->j + bi[i];
1760         nz  = bi[i + 1] - bi[i];
1761         for (j = 0; j < nz; j++) {
1762           col    = pj[j];
1763           pc1[j] = rtmp1[col];
1764           rs += PetscAbsScalar(pc1[j]);
1765         }
1766         /* U part */
1767         pc1 = b->a + bdiag[i + 1] + 1;
1768         pj  = b->j + bdiag[i + 1] + 1;
1769         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1770         for (j = 0; j < nz; j++) {
1771           col    = pj[j];
1772           pc1[j] = rtmp1[col];
1773           rs += PetscAbsScalar(pc1[j]);
1774         }
1775 
1776         sctx.rs = rs;
1777         sctx.pv = rtmp1[i];
1778         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1779         if (sctx.newshift) break;
1780         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1781         *pc1 = 1.0 / sctx.pv;
1782 
1783         /* Now take care of 1st column of diagonal 4x4 block. */
1784         pc2 = rtmp2 + i;
1785         pc3 = rtmp3 + i;
1786         pc4 = rtmp4 + i;
1787         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1788           mul2 = (*pc2) * (*pc1);
1789           *pc2 = mul2;
1790           mul3 = (*pc3) * (*pc1);
1791           *pc3 = mul3;
1792           mul4 = (*pc4) * (*pc1);
1793           *pc4 = mul4;
1794           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1795           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1796           for (j = 0; j < nz; j++) {
1797             col = pj[j];
1798             rtmp2[col] -= mul2 * rtmp1[col];
1799             rtmp3[col] -= mul3 * rtmp1[col];
1800             rtmp4[col] -= mul4 * rtmp1[col];
1801           }
1802           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1803         }
1804 
1805         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1806         rs = 0.0;
1807         /* L part */
1808         pc2 = b->a + bi[i + 1];
1809         pj  = b->j + bi[i + 1];
1810         nz  = bi[i + 2] - bi[i + 1];
1811         for (j = 0; j < nz; j++) {
1812           col    = pj[j];
1813           pc2[j] = rtmp2[col];
1814           rs += PetscAbsScalar(pc2[j]);
1815         }
1816         /* U part */
1817         pc2 = b->a + bdiag[i + 2] + 1;
1818         pj  = b->j + bdiag[i + 2] + 1;
1819         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1820         for (j = 0; j < nz; j++) {
1821           col    = pj[j];
1822           pc2[j] = rtmp2[col];
1823           rs += PetscAbsScalar(pc2[j]);
1824         }
1825 
1826         sctx.rs = rs;
1827         sctx.pv = rtmp2[i + 1];
1828         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1829         if (sctx.newshift) break;
1830         pc2  = b->a + bdiag[i + 1];
1831         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1832 
1833         /* Now take care of 2nd column of diagonal 4x4 block. */
1834         pc3 = rtmp3 + i + 1;
1835         pc4 = rtmp4 + i + 1;
1836         if (*pc3 != 0.0 || *pc4 != 0.0) {
1837           mul3 = (*pc3) * (*pc2);
1838           *pc3 = mul3;
1839           mul4 = (*pc4) * (*pc2);
1840           *pc4 = mul4;
1841           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1842           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1843           for (j = 0; j < nz; j++) {
1844             col = pj[j];
1845             rtmp3[col] -= mul3 * rtmp2[col];
1846             rtmp4[col] -= mul4 * rtmp2[col];
1847           }
1848           PetscCall(PetscLogFlops(4.0 * nz));
1849         }
1850 
1851         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1852         rs = 0.0;
1853         /* L part */
1854         pc3 = b->a + bi[i + 2];
1855         pj  = b->j + bi[i + 2];
1856         nz  = bi[i + 3] - bi[i + 2];
1857         for (j = 0; j < nz; j++) {
1858           col    = pj[j];
1859           pc3[j] = rtmp3[col];
1860           rs += PetscAbsScalar(pc3[j]);
1861         }
1862         /* U part */
1863         pc3 = b->a + bdiag[i + 3] + 1;
1864         pj  = b->j + bdiag[i + 3] + 1;
1865         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1866         for (j = 0; j < nz; j++) {
1867           col    = pj[j];
1868           pc3[j] = rtmp3[col];
1869           rs += PetscAbsScalar(pc3[j]);
1870         }
1871 
1872         sctx.rs = rs;
1873         sctx.pv = rtmp3[i + 2];
1874         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1875         if (sctx.newshift) break;
1876         pc3  = b->a + bdiag[i + 2];
1877         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1878 
1879         /* Now take care of 3rd column of diagonal 4x4 block. */
1880         pc4 = rtmp4 + i + 2;
1881         if (*pc4 != 0.0) {
1882           mul4 = (*pc4) * (*pc3);
1883           *pc4 = mul4;
1884           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1885           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1886           for (j = 0; j < nz; j++) {
1887             col = pj[j];
1888             rtmp4[col] -= mul4 * rtmp3[col];
1889           }
1890           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1891         }
1892 
1893         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1894         rs = 0.0;
1895         /* L part */
1896         pc4 = b->a + bi[i + 3];
1897         pj  = b->j + bi[i + 3];
1898         nz  = bi[i + 4] - bi[i + 3];
1899         for (j = 0; j < nz; j++) {
1900           col    = pj[j];
1901           pc4[j] = rtmp4[col];
1902           rs += PetscAbsScalar(pc4[j]);
1903         }
1904         /* U part */
1905         pc4 = b->a + bdiag[i + 4] + 1;
1906         pj  = b->j + bdiag[i + 4] + 1;
1907         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1908         for (j = 0; j < nz; j++) {
1909           col    = pj[j];
1910           pc4[j] = rtmp4[col];
1911           rs += PetscAbsScalar(pc4[j]);
1912         }
1913 
1914         sctx.rs = rs;
1915         sctx.pv = rtmp4[i + 3];
1916         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1917         if (sctx.newshift) break;
1918         pc4  = b->a + bdiag[i + 3];
1919         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1920         break;
1921 
1922       default:
1923         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1924       }
1925       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1926       i += nodesz;              /* Update the row */
1927     }
1928 
1929     /* MatPivotRefine() */
1930     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1931       /*
1932        * if no shift in this attempt & shifting & started shifting & can refine,
1933        * then try lower shift
1934        */
1935       sctx.shift_hi       = sctx.shift_fraction;
1936       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1937       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1938       sctx.newshift       = PETSC_TRUE;
1939       sctx.nshift++;
1940     }
1941   } while (sctx.newshift);
1942 
1943   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1944   PetscCall(PetscFree(tmp_vec2));
1945   PetscCall(ISRestoreIndices(isicol, &ic));
1946   PetscCall(ISRestoreIndices(isrow, &r));
1947 
1948   if (b->inode.size) {
1949     C->ops->solve = MatSolve_SeqAIJ_Inode;
1950   } else {
1951     C->ops->solve = MatSolve_SeqAIJ;
1952   }
1953   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1954   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1955   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1956   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1957   C->assembled              = PETSC_TRUE;
1958   C->preallocated           = PETSC_TRUE;
1959 
1960   PetscCall(PetscLogFlops(C->cmap->n));
1961 
1962   /* MatShiftView(A,info,&sctx) */
1963   if (sctx.nshift) {
1964     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1965       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1966     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1967       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1968     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1969       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1970     }
1971   }
1972   PetscFunctionReturn(PETSC_SUCCESS);
1973 }
1974 
1975 #if 0
1976 // unused
1977 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1978 {
1979   Mat              C = B;
1980   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1981   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1982   const PetscInt  *r, *ic, *c, *ics;
1983   PetscInt         n = A->rmap->n, *bi = b->i;
1984   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
1985   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
1986   PetscInt        *ai = a->i, *aj = a->j;
1987   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1988   PetscScalar      mul1, mul2, mul3, tmp;
1989   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1990   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1991   PetscReal        rs = 0.0;
1992   FactorShiftCtx   sctx;
1993 
1994   PetscFunctionBegin;
1995   sctx.shift_top      = 0;
1996   sctx.nshift_max     = 0;
1997   sctx.shift_lo       = 0;
1998   sctx.shift_hi       = 0;
1999   sctx.shift_fraction = 0;
2000 
2001   /* if both shift schemes are chosen by user, only use info->shiftpd */
2002   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2003     sctx.shift_top = 0;
2004     for (i = 0; i < n; i++) {
2005       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2006       rs    = 0.0;
2007       ajtmp = aj + ai[i];
2008       rtmp1 = aa + ai[i];
2009       nz    = ai[i + 1] - ai[i];
2010       for (j = 0; j < nz; j++) {
2011         if (*ajtmp != i) {
2012           rs += PetscAbsScalar(*rtmp1++);
2013         } else {
2014           rs -= PetscRealPart(*rtmp1++);
2015         }
2016         ajtmp++;
2017       }
2018       if (rs > sctx.shift_top) sctx.shift_top = rs;
2019     }
2020     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2021     sctx.shift_top *= 1.1;
2022     sctx.nshift_max = 5;
2023     sctx.shift_lo   = 0.;
2024     sctx.shift_hi   = 1.;
2025   }
2026   sctx.shift_amount = 0;
2027   sctx.nshift       = 0;
2028 
2029   PetscCall(ISGetIndices(isrow, &r));
2030   PetscCall(ISGetIndices(iscol, &c));
2031   PetscCall(ISGetIndices(isicol, &ic));
2032   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2033   ics = ic;
2034 
2035   node_max = a->inode.node_count;
2036   ns       = a->inode.size;
2037   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2038 
2039   /* If max inode size > 3, split it into two inodes.*/
2040   /* also map the inode sizes according to the ordering */
2041   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2042   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2043     if (ns[i] > 3) {
2044       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2045       ++j;
2046       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2047     } else {
2048       tmp_vec1[j] = ns[i];
2049     }
2050   }
2051   /* Use the correct node_max */
2052   node_max = j;
2053 
2054   /* Now reorder the inode info based on mat re-ordering info */
2055   /* First create a row -> inode_size_array_index map */
2056   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2057   for (i = 0, row = 0; i < node_max; i++) {
2058     nodesz = tmp_vec1[i];
2059     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2060   }
2061   /* Using nsmap, create a reordered ns structure */
2062   for (i = 0, j = 0; i < node_max; i++) {
2063     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2064     tmp_vec2[i] = nodesz;
2065     j += nodesz;
2066   }
2067   PetscCall(PetscFree2(nsmap, tmp_vec1));
2068   /* Now use the correct ns */
2069   ns = tmp_vec2;
2070 
2071   do {
2072     sctx.newshift = PETSC_FALSE;
2073     /* Now loop over each block-row, and do the factorization */
2074     for (i = 0, row = 0; i < node_max; i++) {
2075       nodesz = ns[i];
2076       nz     = bi[row + 1] - bi[row];
2077       bjtmp  = bj + bi[row];
2078 
2079       switch (nodesz) {
2080       case 1:
2081         for (j = 0; j < nz; j++) {
2082           idx         = bjtmp[j];
2083           rtmp11[idx] = 0.0;
2084         }
2085 
2086         /* load in initial (unfactored row) */
2087         idx    = r[row];
2088         nz_tmp = ai[idx + 1] - ai[idx];
2089         ajtmp  = aj + ai[idx];
2090         v1     = aa + ai[idx];
2091 
2092         for (j = 0; j < nz_tmp; j++) {
2093           idx         = ics[ajtmp[j]];
2094           rtmp11[idx] = v1[j];
2095         }
2096         rtmp11[ics[r[row]]] += sctx.shift_amount;
2097 
2098         prow = *bjtmp++;
2099         while (prow < row) {
2100           pc1 = rtmp11 + prow;
2101           if (*pc1 != 0.0) {
2102             pv     = ba + bd[prow];
2103             pj     = nbj + bd[prow];
2104             mul1   = *pc1 * *pv++;
2105             *pc1   = mul1;
2106             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2107             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2108             for (j = 0; j < nz_tmp; j++) {
2109               tmp = pv[j];
2110               idx = pj[j];
2111               rtmp11[idx] -= mul1 * tmp;
2112             }
2113           }
2114           prow = *bjtmp++;
2115         }
2116         pj  = bj + bi[row];
2117         pc1 = ba + bi[row];
2118 
2119         sctx.pv     = rtmp11[row];
2120         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2121         rs          = 0.0;
2122         for (j = 0; j < nz; j++) {
2123           idx    = pj[j];
2124           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2125           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2126         }
2127         sctx.rs = rs;
2128         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2129         if (sctx.newshift) goto endofwhile;
2130         break;
2131 
2132       case 2:
2133         for (j = 0; j < nz; j++) {
2134           idx         = bjtmp[j];
2135           rtmp11[idx] = 0.0;
2136           rtmp22[idx] = 0.0;
2137         }
2138 
2139         /* load in initial (unfactored row) */
2140         idx    = r[row];
2141         nz_tmp = ai[idx + 1] - ai[idx];
2142         ajtmp  = aj + ai[idx];
2143         v1     = aa + ai[idx];
2144         v2     = aa + ai[idx + 1];
2145         for (j = 0; j < nz_tmp; j++) {
2146           idx         = ics[ajtmp[j]];
2147           rtmp11[idx] = v1[j];
2148           rtmp22[idx] = v2[j];
2149         }
2150         rtmp11[ics[r[row]]] += sctx.shift_amount;
2151         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2152 
2153         prow = *bjtmp++;
2154         while (prow < row) {
2155           pc1 = rtmp11 + prow;
2156           pc2 = rtmp22 + prow;
2157           if (*pc1 != 0.0 || *pc2 != 0.0) {
2158             pv   = ba + bd[prow];
2159             pj   = nbj + bd[prow];
2160             mul1 = *pc1 * *pv;
2161             mul2 = *pc2 * *pv;
2162             ++pv;
2163             *pc1 = mul1;
2164             *pc2 = mul2;
2165 
2166             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2167             for (j = 0; j < nz_tmp; j++) {
2168               tmp = pv[j];
2169               idx = pj[j];
2170               rtmp11[idx] -= mul1 * tmp;
2171               rtmp22[idx] -= mul2 * tmp;
2172             }
2173             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2174           }
2175           prow = *bjtmp++;
2176         }
2177 
2178         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2179         pc1 = rtmp11 + prow;
2180         pc2 = rtmp22 + prow;
2181 
2182         sctx.pv = *pc1;
2183         pj      = bj + bi[prow];
2184         rs      = 0.0;
2185         for (j = 0; j < nz; j++) {
2186           idx = pj[j];
2187           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2188         }
2189         sctx.rs = rs;
2190         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2191         if (sctx.newshift) goto endofwhile;
2192 
2193         if (*pc2 != 0.0) {
2194           pj     = nbj + bd[prow];
2195           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2196           *pc2   = mul2;
2197           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2198           for (j = 0; j < nz_tmp; j++) {
2199             idx = pj[j];
2200             tmp = rtmp11[idx];
2201             rtmp22[idx] -= mul2 * tmp;
2202           }
2203           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2204         }
2205 
2206         pj  = bj + bi[row];
2207         pc1 = ba + bi[row];
2208         pc2 = ba + bi[row + 1];
2209 
2210         sctx.pv         = rtmp22[row + 1];
2211         rs              = 0.0;
2212         rtmp11[row]     = 1.0 / rtmp11[row];
2213         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2214         /* copy row entries from dense representation to sparse */
2215         for (j = 0; j < nz; j++) {
2216           idx    = pj[j];
2217           pc1[j] = rtmp11[idx];
2218           pc2[j] = rtmp22[idx];
2219           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2220         }
2221         sctx.rs = rs;
2222         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2223         if (sctx.newshift) goto endofwhile;
2224         break;
2225 
2226       case 3:
2227         for (j = 0; j < nz; j++) {
2228           idx         = bjtmp[j];
2229           rtmp11[idx] = 0.0;
2230           rtmp22[idx] = 0.0;
2231           rtmp33[idx] = 0.0;
2232         }
2233         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2234         idx    = r[row];
2235         nz_tmp = ai[idx + 1] - ai[idx];
2236         ajtmp  = aj + ai[idx];
2237         v1     = aa + ai[idx];
2238         v2     = aa + ai[idx + 1];
2239         v3     = aa + ai[idx + 2];
2240         for (j = 0; j < nz_tmp; j++) {
2241           idx         = ics[ajtmp[j]];
2242           rtmp11[idx] = v1[j];
2243           rtmp22[idx] = v2[j];
2244           rtmp33[idx] = v3[j];
2245         }
2246         rtmp11[ics[r[row]]] += sctx.shift_amount;
2247         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2248         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2249 
2250         /* loop over all pivot row blocks above this row block */
2251         prow = *bjtmp++;
2252         while (prow < row) {
2253           pc1 = rtmp11 + prow;
2254           pc2 = rtmp22 + prow;
2255           pc3 = rtmp33 + prow;
2256           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2257             pv   = ba + bd[prow];
2258             pj   = nbj + bd[prow];
2259             mul1 = *pc1 * *pv;
2260             mul2 = *pc2 * *pv;
2261             mul3 = *pc3 * *pv;
2262             ++pv;
2263             *pc1 = mul1;
2264             *pc2 = mul2;
2265             *pc3 = mul3;
2266 
2267             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2268             /* update this row based on pivot row */
2269             for (j = 0; j < nz_tmp; j++) {
2270               tmp = pv[j];
2271               idx = pj[j];
2272               rtmp11[idx] -= mul1 * tmp;
2273               rtmp22[idx] -= mul2 * tmp;
2274               rtmp33[idx] -= mul3 * tmp;
2275             }
2276             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2277           }
2278           prow = *bjtmp++;
2279         }
2280 
2281         /* Now take care of diagonal 3x3 block in this set of rows */
2282         /* note: prow = row here */
2283         pc1 = rtmp11 + prow;
2284         pc2 = rtmp22 + prow;
2285         pc3 = rtmp33 + prow;
2286 
2287         sctx.pv = *pc1;
2288         pj      = bj + bi[prow];
2289         rs      = 0.0;
2290         for (j = 0; j < nz; j++) {
2291           idx = pj[j];
2292           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2293         }
2294         sctx.rs = rs;
2295         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
2296         if (sctx.newshift) goto endofwhile;
2297 
2298         if (*pc2 != 0.0 || *pc3 != 0.0) {
2299           mul2   = (*pc2) / (*pc1);
2300           mul3   = (*pc3) / (*pc1);
2301           *pc2   = mul2;
2302           *pc3   = mul3;
2303           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2304           pj     = nbj + bd[prow];
2305           for (j = 0; j < nz_tmp; j++) {
2306             idx = pj[j];
2307             tmp = rtmp11[idx];
2308             rtmp22[idx] -= mul2 * tmp;
2309             rtmp33[idx] -= mul3 * tmp;
2310           }
2311           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2312         }
2313         ++prow;
2314 
2315         pc2     = rtmp22 + prow;
2316         pc3     = rtmp33 + prow;
2317         sctx.pv = *pc2;
2318         pj      = bj + bi[prow];
2319         rs      = 0.0;
2320         for (j = 0; j < nz; j++) {
2321           idx = pj[j];
2322           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2323         }
2324         sctx.rs = rs;
2325         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
2326         if (sctx.newshift) goto endofwhile;
2327 
2328         if (*pc3 != 0.0) {
2329           mul3   = (*pc3) / (*pc2);
2330           *pc3   = mul3;
2331           pj     = nbj + bd[prow];
2332           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2333           for (j = 0; j < nz_tmp; j++) {
2334             idx = pj[j];
2335             tmp = rtmp22[idx];
2336             rtmp33[idx] -= mul3 * tmp;
2337           }
2338           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2339         }
2340 
2341         pj  = bj + bi[row];
2342         pc1 = ba + bi[row];
2343         pc2 = ba + bi[row + 1];
2344         pc3 = ba + bi[row + 2];
2345 
2346         sctx.pv         = rtmp33[row + 2];
2347         rs              = 0.0;
2348         rtmp11[row]     = 1.0 / rtmp11[row];
2349         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2350         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2351         /* copy row entries from dense representation to sparse */
2352         for (j = 0; j < nz; j++) {
2353           idx    = pj[j];
2354           pc1[j] = rtmp11[idx];
2355           pc2[j] = rtmp22[idx];
2356           pc3[j] = rtmp33[idx];
2357           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2358         }
2359 
2360         sctx.rs = rs;
2361         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
2362         if (sctx.newshift) goto endofwhile;
2363         break;
2364 
2365       default:
2366         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2367       }
2368       row += nodesz; /* Update the row */
2369     }
2370   endofwhile:;
2371   } while (sctx.newshift);
2372   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
2373   PetscCall(PetscFree(tmp_vec2));
2374   PetscCall(ISRestoreIndices(isicol, &ic));
2375   PetscCall(ISRestoreIndices(isrow, &r));
2376   PetscCall(ISRestoreIndices(iscol, &c));
2377 
2378   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2379   /* do not set solve add, since MatSolve_Inode + Add is faster */
2380   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2381   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2382   C->assembled              = PETSC_TRUE;
2383   C->preallocated           = PETSC_TRUE;
2384   if (sctx.nshift) {
2385     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2386       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2387     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2388       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2389     }
2390   }
2391   PetscCall(PetscLogFlops(C->cmap->n));
2392   PetscCall(MatSeqAIJCheckInode(C));
2393   PetscFunctionReturn(PETSC_SUCCESS);
2394 }
2395 #endif
2396 
2397 /* ----------------------------------------------------------- */
2398 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2399 {
2400   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2401   IS                 iscol = a->col, isrow = a->row;
2402   const PetscInt    *r, *c, *rout, *cout;
2403   PetscInt           i, j, n = A->rmap->n;
2404   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
2405   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2406   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2407   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2408   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2409   const PetscScalar *b;
2410 
2411   PetscFunctionBegin;
2412   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2413   node_max = a->inode.node_count;
2414   ns       = a->inode.size; /* Node Size array */
2415 
2416   PetscCall(VecGetArrayRead(bb, &b));
2417   PetscCall(VecGetArrayWrite(xx, &x));
2418   tmp = a->solve_work;
2419 
2420   PetscCall(ISGetIndices(isrow, &rout));
2421   r = rout;
2422   PetscCall(ISGetIndices(iscol, &cout));
2423   c = cout;
2424 
2425   /* forward solve the lower triangular */
2426   tmps = tmp;
2427   aa   = a_a;
2428   aj   = a_j;
2429   ad   = a->diag;
2430 
2431   for (i = 0, row = 0; i < node_max; ++i) {
2432     nsz = ns[i];
2433     aii = ai[row];
2434     v1  = aa + aii;
2435     vi  = aj + aii;
2436     nz  = ai[row + 1] - ai[row];
2437 
2438     if (i < node_max - 1) {
2439       /* Prefetch the indices for the next block */
2440       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2441       /* Prefetch the data for the next block */
2442       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2443     }
2444 
2445     switch (nsz) { /* Each loop in 'case' is unrolled */
2446     case 1:
2447       sum1 = b[r[row]];
2448       for (j = 0; j < nz - 1; j += 2) {
2449         i0   = vi[j];
2450         i1   = vi[j + 1];
2451         tmp0 = tmps[i0];
2452         tmp1 = tmps[i1];
2453         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2454       }
2455       if (j == nz - 1) {
2456         tmp0 = tmps[vi[j]];
2457         sum1 -= v1[j] * tmp0;
2458       }
2459       tmp[row++] = sum1;
2460       break;
2461     case 2:
2462       sum1 = b[r[row]];
2463       sum2 = b[r[row + 1]];
2464       v2   = aa + ai[row + 1];
2465 
2466       for (j = 0; j < nz - 1; j += 2) {
2467         i0   = vi[j];
2468         i1   = vi[j + 1];
2469         tmp0 = tmps[i0];
2470         tmp1 = tmps[i1];
2471         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2472         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2473       }
2474       if (j == nz - 1) {
2475         tmp0 = tmps[vi[j]];
2476         sum1 -= v1[j] * tmp0;
2477         sum2 -= v2[j] * tmp0;
2478       }
2479       sum2 -= v2[nz] * sum1;
2480       tmp[row++] = sum1;
2481       tmp[row++] = sum2;
2482       break;
2483     case 3:
2484       sum1 = b[r[row]];
2485       sum2 = b[r[row + 1]];
2486       sum3 = b[r[row + 2]];
2487       v2   = aa + ai[row + 1];
2488       v3   = aa + ai[row + 2];
2489 
2490       for (j = 0; j < nz - 1; j += 2) {
2491         i0   = vi[j];
2492         i1   = vi[j + 1];
2493         tmp0 = tmps[i0];
2494         tmp1 = tmps[i1];
2495         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2496         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2497         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2498       }
2499       if (j == nz - 1) {
2500         tmp0 = tmps[vi[j]];
2501         sum1 -= v1[j] * tmp0;
2502         sum2 -= v2[j] * tmp0;
2503         sum3 -= v3[j] * tmp0;
2504       }
2505       sum2 -= v2[nz] * sum1;
2506       sum3 -= v3[nz] * sum1;
2507       sum3 -= v3[nz + 1] * sum2;
2508       tmp[row++] = sum1;
2509       tmp[row++] = sum2;
2510       tmp[row++] = sum3;
2511       break;
2512 
2513     case 4:
2514       sum1 = b[r[row]];
2515       sum2 = b[r[row + 1]];
2516       sum3 = b[r[row + 2]];
2517       sum4 = b[r[row + 3]];
2518       v2   = aa + ai[row + 1];
2519       v3   = aa + ai[row + 2];
2520       v4   = aa + ai[row + 3];
2521 
2522       for (j = 0; j < nz - 1; j += 2) {
2523         i0   = vi[j];
2524         i1   = vi[j + 1];
2525         tmp0 = tmps[i0];
2526         tmp1 = tmps[i1];
2527         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2528         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2529         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2530         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2531       }
2532       if (j == nz - 1) {
2533         tmp0 = tmps[vi[j]];
2534         sum1 -= v1[j] * tmp0;
2535         sum2 -= v2[j] * tmp0;
2536         sum3 -= v3[j] * tmp0;
2537         sum4 -= v4[j] * tmp0;
2538       }
2539       sum2 -= v2[nz] * sum1;
2540       sum3 -= v3[nz] * sum1;
2541       sum4 -= v4[nz] * sum1;
2542       sum3 -= v3[nz + 1] * sum2;
2543       sum4 -= v4[nz + 1] * sum2;
2544       sum4 -= v4[nz + 2] * sum3;
2545 
2546       tmp[row++] = sum1;
2547       tmp[row++] = sum2;
2548       tmp[row++] = sum3;
2549       tmp[row++] = sum4;
2550       break;
2551     case 5:
2552       sum1 = b[r[row]];
2553       sum2 = b[r[row + 1]];
2554       sum3 = b[r[row + 2]];
2555       sum4 = b[r[row + 3]];
2556       sum5 = b[r[row + 4]];
2557       v2   = aa + ai[row + 1];
2558       v3   = aa + ai[row + 2];
2559       v4   = aa + ai[row + 3];
2560       v5   = aa + ai[row + 4];
2561 
2562       for (j = 0; j < nz - 1; j += 2) {
2563         i0   = vi[j];
2564         i1   = vi[j + 1];
2565         tmp0 = tmps[i0];
2566         tmp1 = tmps[i1];
2567         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2568         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2569         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2570         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2571         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2572       }
2573       if (j == nz - 1) {
2574         tmp0 = tmps[vi[j]];
2575         sum1 -= v1[j] * tmp0;
2576         sum2 -= v2[j] * tmp0;
2577         sum3 -= v3[j] * tmp0;
2578         sum4 -= v4[j] * tmp0;
2579         sum5 -= v5[j] * tmp0;
2580       }
2581 
2582       sum2 -= v2[nz] * sum1;
2583       sum3 -= v3[nz] * sum1;
2584       sum4 -= v4[nz] * sum1;
2585       sum5 -= v5[nz] * sum1;
2586       sum3 -= v3[nz + 1] * sum2;
2587       sum4 -= v4[nz + 1] * sum2;
2588       sum5 -= v5[nz + 1] * sum2;
2589       sum4 -= v4[nz + 2] * sum3;
2590       sum5 -= v5[nz + 2] * sum3;
2591       sum5 -= v5[nz + 3] * sum4;
2592 
2593       tmp[row++] = sum1;
2594       tmp[row++] = sum2;
2595       tmp[row++] = sum3;
2596       tmp[row++] = sum4;
2597       tmp[row++] = sum5;
2598       break;
2599     default:
2600       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2601     }
2602   }
2603   /* backward solve the upper triangular */
2604   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2605     nsz = ns[i];
2606     aii = ad[row + 1] + 1;
2607     v1  = aa + aii;
2608     vi  = aj + aii;
2609     nz  = ad[row] - ad[row + 1] - 1;
2610 
2611     if (i > 0) {
2612       /* Prefetch the indices for the next block */
2613       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2614       /* Prefetch the data for the next block */
2615       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2616     }
2617 
2618     switch (nsz) { /* Each loop in 'case' is unrolled */
2619     case 1:
2620       sum1 = tmp[row];
2621 
2622       for (j = 0; j < nz - 1; j += 2) {
2623         i0   = vi[j];
2624         i1   = vi[j + 1];
2625         tmp0 = tmps[i0];
2626         tmp1 = tmps[i1];
2627         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2628       }
2629       if (j == nz - 1) {
2630         tmp0 = tmps[vi[j]];
2631         sum1 -= v1[j] * tmp0;
2632       }
2633       x[c[row]] = tmp[row] = sum1 * v1[nz];
2634       row--;
2635       break;
2636     case 2:
2637       sum1 = tmp[row];
2638       sum2 = tmp[row - 1];
2639       v2   = aa + ad[row] + 1;
2640       for (j = 0; j < nz - 1; j += 2) {
2641         i0   = vi[j];
2642         i1   = vi[j + 1];
2643         tmp0 = tmps[i0];
2644         tmp1 = tmps[i1];
2645         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2646         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2647       }
2648       if (j == nz - 1) {
2649         tmp0 = tmps[vi[j]];
2650         sum1 -= v1[j] * tmp0;
2651         sum2 -= v2[j + 1] * tmp0;
2652       }
2653 
2654       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2655       row--;
2656       sum2 -= v2[0] * tmp0;
2657       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2658       row--;
2659       break;
2660     case 3:
2661       sum1 = tmp[row];
2662       sum2 = tmp[row - 1];
2663       sum3 = tmp[row - 2];
2664       v2   = aa + ad[row] + 1;
2665       v3   = aa + ad[row - 1] + 1;
2666       for (j = 0; j < nz - 1; j += 2) {
2667         i0   = vi[j];
2668         i1   = vi[j + 1];
2669         tmp0 = tmps[i0];
2670         tmp1 = tmps[i1];
2671         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2672         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2673         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2674       }
2675       if (j == nz - 1) {
2676         tmp0 = tmps[vi[j]];
2677         sum1 -= v1[j] * tmp0;
2678         sum2 -= v2[j + 1] * tmp0;
2679         sum3 -= v3[j + 2] * tmp0;
2680       }
2681       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2682       row--;
2683       sum2 -= v2[0] * tmp0;
2684       sum3 -= v3[1] * tmp0;
2685       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2686       row--;
2687       sum3 -= v3[0] * tmp0;
2688       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2689       row--;
2690 
2691       break;
2692     case 4:
2693       sum1 = tmp[row];
2694       sum2 = tmp[row - 1];
2695       sum3 = tmp[row - 2];
2696       sum4 = tmp[row - 3];
2697       v2   = aa + ad[row] + 1;
2698       v3   = aa + ad[row - 1] + 1;
2699       v4   = aa + ad[row - 2] + 1;
2700 
2701       for (j = 0; j < nz - 1; j += 2) {
2702         i0   = vi[j];
2703         i1   = vi[j + 1];
2704         tmp0 = tmps[i0];
2705         tmp1 = tmps[i1];
2706         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2707         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2708         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2709         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2710       }
2711       if (j == nz - 1) {
2712         tmp0 = tmps[vi[j]];
2713         sum1 -= v1[j] * tmp0;
2714         sum2 -= v2[j + 1] * tmp0;
2715         sum3 -= v3[j + 2] * tmp0;
2716         sum4 -= v4[j + 3] * tmp0;
2717       }
2718 
2719       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2720       row--;
2721       sum2 -= v2[0] * tmp0;
2722       sum3 -= v3[1] * tmp0;
2723       sum4 -= v4[2] * tmp0;
2724       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2725       row--;
2726       sum3 -= v3[0] * tmp0;
2727       sum4 -= v4[1] * tmp0;
2728       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2729       row--;
2730       sum4 -= v4[0] * tmp0;
2731       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2732       row--;
2733       break;
2734     case 5:
2735       sum1 = tmp[row];
2736       sum2 = tmp[row - 1];
2737       sum3 = tmp[row - 2];
2738       sum4 = tmp[row - 3];
2739       sum5 = tmp[row - 4];
2740       v2   = aa + ad[row] + 1;
2741       v3   = aa + ad[row - 1] + 1;
2742       v4   = aa + ad[row - 2] + 1;
2743       v5   = aa + ad[row - 3] + 1;
2744       for (j = 0; j < nz - 1; j += 2) {
2745         i0   = vi[j];
2746         i1   = vi[j + 1];
2747         tmp0 = tmps[i0];
2748         tmp1 = tmps[i1];
2749         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2750         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2751         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2752         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2753         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2754       }
2755       if (j == nz - 1) {
2756         tmp0 = tmps[vi[j]];
2757         sum1 -= v1[j] * tmp0;
2758         sum2 -= v2[j + 1] * tmp0;
2759         sum3 -= v3[j + 2] * tmp0;
2760         sum4 -= v4[j + 3] * tmp0;
2761         sum5 -= v5[j + 4] * tmp0;
2762       }
2763 
2764       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2765       row--;
2766       sum2 -= v2[0] * tmp0;
2767       sum3 -= v3[1] * tmp0;
2768       sum4 -= v4[2] * tmp0;
2769       sum5 -= v5[3] * tmp0;
2770       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2771       row--;
2772       sum3 -= v3[0] * tmp0;
2773       sum4 -= v4[1] * tmp0;
2774       sum5 -= v5[2] * tmp0;
2775       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2776       row--;
2777       sum4 -= v4[0] * tmp0;
2778       sum5 -= v5[1] * tmp0;
2779       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2780       row--;
2781       sum5 -= v5[0] * tmp0;
2782       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2783       row--;
2784       break;
2785     default:
2786       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2787     }
2788   }
2789   PetscCall(ISRestoreIndices(isrow, &rout));
2790   PetscCall(ISRestoreIndices(iscol, &cout));
2791   PetscCall(VecRestoreArrayRead(bb, &b));
2792   PetscCall(VecRestoreArrayWrite(xx, &x));
2793   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2794   PetscFunctionReturn(PETSC_SUCCESS);
2795 }
2796 
2797 /*
2798      Makes a longer coloring[] array and calls the usual code with that
2799 */
2800 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2801 {
2802   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2803   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
2804   PetscInt        *colorused, i;
2805   ISColoringValue *newcolor;
2806 
2807   PetscFunctionBegin;
2808   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2809   PetscCall(PetscMalloc1(n + 1, &newcolor));
2810   /* loop over inodes, marking a color for each column*/
2811   row = 0;
2812   for (i = 0; i < m; i++) {
2813     for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
2814   }
2815 
2816   /* eliminate unneeded colors */
2817   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2818   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2819 
2820   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2821   ncolors = colorused[5 * ncolors - 1];
2822   for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
2823   PetscCall(PetscFree(colorused));
2824   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2825   PetscCall(PetscFree(coloring));
2826   PetscFunctionReturn(PETSC_SUCCESS);
2827 }
2828 
2829 #include <petsc/private/kernels/blockinvert.h>
2830 
2831 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2832 {
2833   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2834   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2835   MatScalar         *ibdiag, *bdiag, work[25], *t;
2836   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2837   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2838   const PetscScalar *xb, *b;
2839   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2840   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
2841   PetscInt           sz, k, ipvt[5];
2842   PetscBool          allowzeropivot, zeropivotdetected;
2843   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
2844 
2845   PetscFunctionBegin;
2846   allowzeropivot = PetscNot(A->erroriffailure);
2847   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2848   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2849   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2850 
2851   if (!a->inode.ibdiagvalid) {
2852     if (!a->inode.ibdiag) {
2853       /* calculate space needed for diagonal blocks */
2854       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2855       a->inode.bdiagsize = cnt;
2856 
2857       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2858     }
2859 
2860     /* copy over the diagonal blocks and invert them */
2861     ibdiag = a->inode.ibdiag;
2862     bdiag  = a->inode.bdiag;
2863     cnt    = 0;
2864     for (i = 0, row = 0; i < m; i++) {
2865       for (j = 0; j < sizes[i]; j++) {
2866         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
2867       }
2868       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
2869 
2870       switch (sizes[i]) {
2871       case 1:
2872         /* Create matrix data structure */
2873         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2874           if (allowzeropivot) {
2875             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2876             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2877             A->factorerror_zeropivot_row   = row;
2878             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2879           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2880         }
2881         ibdiag[cnt] = 1.0 / ibdiag[cnt];
2882         break;
2883       case 2:
2884         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2885         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2886         break;
2887       case 3:
2888         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2889         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2890         break;
2891       case 4:
2892         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2893         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2894         break;
2895       case 5:
2896         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2897         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2898         break;
2899       default:
2900         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
2901       }
2902       cnt += sizes[i] * sizes[i];
2903       row += sizes[i];
2904     }
2905     a->inode.ibdiagvalid = PETSC_TRUE;
2906   }
2907   ibdiag = a->inode.ibdiag;
2908   bdiag  = a->inode.bdiag;
2909   t      = a->inode.ssor_work;
2910 
2911   PetscCall(VecGetArray(xx, &x));
2912   PetscCall(VecGetArrayRead(bb, &b));
2913   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2914   if (flag & SOR_ZERO_INITIAL_GUESS) {
2915     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2916       for (i = 0, row = 0; i < m; i++) {
2917         sz  = diag[row] - ii[row];
2918         v1  = a->a + ii[row];
2919         idx = a->j + ii[row];
2920 
2921         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2922         switch (sizes[i]) {
2923         case 1:
2924 
2925           sum1 = b[row];
2926           for (n = 0; n < sz - 1; n += 2) {
2927             i1 = idx[0];
2928             i2 = idx[1];
2929             idx += 2;
2930             tmp0 = x[i1];
2931             tmp1 = x[i2];
2932             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2933             v1 += 2;
2934           }
2935 
2936           if (n == sz - 1) {
2937             tmp0 = x[*idx];
2938             sum1 -= *v1 * tmp0;
2939           }
2940           t[row]   = sum1;
2941           x[row++] = sum1 * (*ibdiag++);
2942           break;
2943         case 2:
2944           v2   = a->a + ii[row + 1];
2945           sum1 = b[row];
2946           sum2 = b[row + 1];
2947           for (n = 0; n < sz - 1; n += 2) {
2948             i1 = idx[0];
2949             i2 = idx[1];
2950             idx += 2;
2951             tmp0 = x[i1];
2952             tmp1 = x[i2];
2953             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2954             v1 += 2;
2955             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2956             v2 += 2;
2957           }
2958 
2959           if (n == sz - 1) {
2960             tmp0 = x[*idx];
2961             sum1 -= v1[0] * tmp0;
2962             sum2 -= v2[0] * tmp0;
2963           }
2964           t[row]     = sum1;
2965           t[row + 1] = sum2;
2966           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2967           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2968           ibdiag += 4;
2969           break;
2970         case 3:
2971           v2   = a->a + ii[row + 1];
2972           v3   = a->a + ii[row + 2];
2973           sum1 = b[row];
2974           sum2 = b[row + 1];
2975           sum3 = b[row + 2];
2976           for (n = 0; n < sz - 1; n += 2) {
2977             i1 = idx[0];
2978             i2 = idx[1];
2979             idx += 2;
2980             tmp0 = x[i1];
2981             tmp1 = x[i2];
2982             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2983             v1 += 2;
2984             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2985             v2 += 2;
2986             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2987             v3 += 2;
2988           }
2989 
2990           if (n == sz - 1) {
2991             tmp0 = x[*idx];
2992             sum1 -= v1[0] * tmp0;
2993             sum2 -= v2[0] * tmp0;
2994             sum3 -= v3[0] * tmp0;
2995           }
2996           t[row]     = sum1;
2997           t[row + 1] = sum2;
2998           t[row + 2] = sum3;
2999           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3000           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3001           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3002           ibdiag += 9;
3003           break;
3004         case 4:
3005           v2   = a->a + ii[row + 1];
3006           v3   = a->a + ii[row + 2];
3007           v4   = a->a + ii[row + 3];
3008           sum1 = b[row];
3009           sum2 = b[row + 1];
3010           sum3 = b[row + 2];
3011           sum4 = b[row + 3];
3012           for (n = 0; n < sz - 1; n += 2) {
3013             i1 = idx[0];
3014             i2 = idx[1];
3015             idx += 2;
3016             tmp0 = x[i1];
3017             tmp1 = x[i2];
3018             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3019             v1 += 2;
3020             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3021             v2 += 2;
3022             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3023             v3 += 2;
3024             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3025             v4 += 2;
3026           }
3027 
3028           if (n == sz - 1) {
3029             tmp0 = x[*idx];
3030             sum1 -= v1[0] * tmp0;
3031             sum2 -= v2[0] * tmp0;
3032             sum3 -= v3[0] * tmp0;
3033             sum4 -= v4[0] * tmp0;
3034           }
3035           t[row]     = sum1;
3036           t[row + 1] = sum2;
3037           t[row + 2] = sum3;
3038           t[row + 3] = sum4;
3039           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3040           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3041           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3042           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3043           ibdiag += 16;
3044           break;
3045         case 5:
3046           v2   = a->a + ii[row + 1];
3047           v3   = a->a + ii[row + 2];
3048           v4   = a->a + ii[row + 3];
3049           v5   = a->a + ii[row + 4];
3050           sum1 = b[row];
3051           sum2 = b[row + 1];
3052           sum3 = b[row + 2];
3053           sum4 = b[row + 3];
3054           sum5 = b[row + 4];
3055           for (n = 0; n < sz - 1; n += 2) {
3056             i1 = idx[0];
3057             i2 = idx[1];
3058             idx += 2;
3059             tmp0 = x[i1];
3060             tmp1 = x[i2];
3061             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3062             v1 += 2;
3063             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3064             v2 += 2;
3065             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3066             v3 += 2;
3067             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3068             v4 += 2;
3069             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3070             v5 += 2;
3071           }
3072 
3073           if (n == sz - 1) {
3074             tmp0 = x[*idx];
3075             sum1 -= v1[0] * tmp0;
3076             sum2 -= v2[0] * tmp0;
3077             sum3 -= v3[0] * tmp0;
3078             sum4 -= v4[0] * tmp0;
3079             sum5 -= v5[0] * tmp0;
3080           }
3081           t[row]     = sum1;
3082           t[row + 1] = sum2;
3083           t[row + 2] = sum3;
3084           t[row + 3] = sum4;
3085           t[row + 4] = sum5;
3086           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3087           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3088           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3089           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3090           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3091           ibdiag += 25;
3092           break;
3093         default:
3094           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3095         }
3096       }
3097 
3098       xb = t;
3099       PetscCall(PetscLogFlops(a->nz));
3100     } else xb = b;
3101     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3102       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3103       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3104         ibdiag -= sizes[i] * sizes[i];
3105         sz  = ii[row + 1] - diag[row] - 1;
3106         v1  = a->a + diag[row] + 1;
3107         idx = a->j + diag[row] + 1;
3108 
3109         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3110         switch (sizes[i]) {
3111         case 1:
3112 
3113           sum1 = xb[row];
3114           for (n = 0; n < sz - 1; n += 2) {
3115             i1 = idx[0];
3116             i2 = idx[1];
3117             idx += 2;
3118             tmp0 = x[i1];
3119             tmp1 = x[i2];
3120             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3121             v1 += 2;
3122           }
3123 
3124           if (n == sz - 1) {
3125             tmp0 = x[*idx];
3126             sum1 -= *v1 * tmp0;
3127           }
3128           x[row--] = sum1 * (*ibdiag);
3129           break;
3130 
3131         case 2:
3132 
3133           sum1 = xb[row];
3134           sum2 = xb[row - 1];
3135           /* note that sum1 is associated with the second of the two rows */
3136           v2 = a->a + diag[row - 1] + 2;
3137           for (n = 0; n < sz - 1; n += 2) {
3138             i1 = idx[0];
3139             i2 = idx[1];
3140             idx += 2;
3141             tmp0 = x[i1];
3142             tmp1 = x[i2];
3143             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3144             v1 += 2;
3145             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3146             v2 += 2;
3147           }
3148 
3149           if (n == sz - 1) {
3150             tmp0 = x[*idx];
3151             sum1 -= *v1 * tmp0;
3152             sum2 -= *v2 * tmp0;
3153           }
3154           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3155           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3156           break;
3157         case 3:
3158 
3159           sum1 = xb[row];
3160           sum2 = xb[row - 1];
3161           sum3 = xb[row - 2];
3162           v2   = a->a + diag[row - 1] + 2;
3163           v3   = a->a + diag[row - 2] + 3;
3164           for (n = 0; n < sz - 1; n += 2) {
3165             i1 = idx[0];
3166             i2 = idx[1];
3167             idx += 2;
3168             tmp0 = x[i1];
3169             tmp1 = x[i2];
3170             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3171             v1 += 2;
3172             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3173             v2 += 2;
3174             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3175             v3 += 2;
3176           }
3177 
3178           if (n == sz - 1) {
3179             tmp0 = x[*idx];
3180             sum1 -= *v1 * tmp0;
3181             sum2 -= *v2 * tmp0;
3182             sum3 -= *v3 * tmp0;
3183           }
3184           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3185           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3186           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3187           break;
3188         case 4:
3189 
3190           sum1 = xb[row];
3191           sum2 = xb[row - 1];
3192           sum3 = xb[row - 2];
3193           sum4 = xb[row - 3];
3194           v2   = a->a + diag[row - 1] + 2;
3195           v3   = a->a + diag[row - 2] + 3;
3196           v4   = a->a + diag[row - 3] + 4;
3197           for (n = 0; n < sz - 1; n += 2) {
3198             i1 = idx[0];
3199             i2 = idx[1];
3200             idx += 2;
3201             tmp0 = x[i1];
3202             tmp1 = x[i2];
3203             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3204             v1 += 2;
3205             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3206             v2 += 2;
3207             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3208             v3 += 2;
3209             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3210             v4 += 2;
3211           }
3212 
3213           if (n == sz - 1) {
3214             tmp0 = x[*idx];
3215             sum1 -= *v1 * tmp0;
3216             sum2 -= *v2 * tmp0;
3217             sum3 -= *v3 * tmp0;
3218             sum4 -= *v4 * tmp0;
3219           }
3220           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3221           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3222           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3223           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3224           break;
3225         case 5:
3226 
3227           sum1 = xb[row];
3228           sum2 = xb[row - 1];
3229           sum3 = xb[row - 2];
3230           sum4 = xb[row - 3];
3231           sum5 = xb[row - 4];
3232           v2   = a->a + diag[row - 1] + 2;
3233           v3   = a->a + diag[row - 2] + 3;
3234           v4   = a->a + diag[row - 3] + 4;
3235           v5   = a->a + diag[row - 4] + 5;
3236           for (n = 0; n < sz - 1; n += 2) {
3237             i1 = idx[0];
3238             i2 = idx[1];
3239             idx += 2;
3240             tmp0 = x[i1];
3241             tmp1 = x[i2];
3242             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3243             v1 += 2;
3244             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3245             v2 += 2;
3246             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3247             v3 += 2;
3248             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3249             v4 += 2;
3250             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3251             v5 += 2;
3252           }
3253 
3254           if (n == sz - 1) {
3255             tmp0 = x[*idx];
3256             sum1 -= *v1 * tmp0;
3257             sum2 -= *v2 * tmp0;
3258             sum3 -= *v3 * tmp0;
3259             sum4 -= *v4 * tmp0;
3260             sum5 -= *v5 * tmp0;
3261           }
3262           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3263           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3264           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3265           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3266           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3267           break;
3268         default:
3269           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3270         }
3271       }
3272 
3273       PetscCall(PetscLogFlops(a->nz));
3274     }
3275     its--;
3276   }
3277   while (its--) {
3278     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3279       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3280         sz  = diag[row] - ii[row];
3281         v1  = a->a + ii[row];
3282         idx = a->j + ii[row];
3283         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3284         switch (sizes[i]) {
3285         case 1:
3286           sum1 = b[row];
3287           for (n = 0; n < sz - 1; n += 2) {
3288             i1 = idx[0];
3289             i2 = idx[1];
3290             idx += 2;
3291             tmp0 = x[i1];
3292             tmp1 = x[i2];
3293             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3294             v1 += 2;
3295           }
3296           if (n == sz - 1) {
3297             tmp0 = x[*idx++];
3298             sum1 -= *v1 * tmp0;
3299             v1++;
3300           }
3301           t[row] = sum1;
3302           sz     = ii[row + 1] - diag[row] - 1;
3303           idx    = a->j + diag[row] + 1;
3304           v1 += 1;
3305           for (n = 0; n < sz - 1; n += 2) {
3306             i1 = idx[0];
3307             i2 = idx[1];
3308             idx += 2;
3309             tmp0 = x[i1];
3310             tmp1 = x[i2];
3311             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3312             v1 += 2;
3313           }
3314           if (n == sz - 1) {
3315             tmp0 = x[*idx++];
3316             sum1 -= *v1 * tmp0;
3317           }
3318           /* in MatSOR_SeqAIJ this line would be
3319            *
3320            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3321            *
3322            * but omega == 1, so this becomes
3323            *
3324            * x[row] = sum1*(*ibdiag++);
3325            *
3326            */
3327           x[row] = sum1 * (*ibdiag);
3328           break;
3329         case 2:
3330           v2   = a->a + ii[row + 1];
3331           sum1 = b[row];
3332           sum2 = b[row + 1];
3333           for (n = 0; n < sz - 1; n += 2) {
3334             i1 = idx[0];
3335             i2 = idx[1];
3336             idx += 2;
3337             tmp0 = x[i1];
3338             tmp1 = x[i2];
3339             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3340             v1 += 2;
3341             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3342             v2 += 2;
3343           }
3344           if (n == sz - 1) {
3345             tmp0 = x[*idx++];
3346             sum1 -= v1[0] * tmp0;
3347             sum2 -= v2[0] * tmp0;
3348             v1++;
3349             v2++;
3350           }
3351           t[row]     = sum1;
3352           t[row + 1] = sum2;
3353           sz         = ii[row + 1] - diag[row] - 2;
3354           idx        = a->j + diag[row] + 2;
3355           v1 += 2;
3356           v2 += 2;
3357           for (n = 0; n < sz - 1; n += 2) {
3358             i1 = idx[0];
3359             i2 = idx[1];
3360             idx += 2;
3361             tmp0 = x[i1];
3362             tmp1 = x[i2];
3363             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3364             v1 += 2;
3365             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3366             v2 += 2;
3367           }
3368           if (n == sz - 1) {
3369             tmp0 = x[*idx];
3370             sum1 -= v1[0] * tmp0;
3371             sum2 -= v2[0] * tmp0;
3372           }
3373           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3374           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3375           break;
3376         case 3:
3377           v2   = a->a + ii[row + 1];
3378           v3   = a->a + ii[row + 2];
3379           sum1 = b[row];
3380           sum2 = b[row + 1];
3381           sum3 = b[row + 2];
3382           for (n = 0; n < sz - 1; n += 2) {
3383             i1 = idx[0];
3384             i2 = idx[1];
3385             idx += 2;
3386             tmp0 = x[i1];
3387             tmp1 = x[i2];
3388             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3389             v1 += 2;
3390             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3391             v2 += 2;
3392             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3393             v3 += 2;
3394           }
3395           if (n == sz - 1) {
3396             tmp0 = x[*idx++];
3397             sum1 -= v1[0] * tmp0;
3398             sum2 -= v2[0] * tmp0;
3399             sum3 -= v3[0] * tmp0;
3400             v1++;
3401             v2++;
3402             v3++;
3403           }
3404           t[row]     = sum1;
3405           t[row + 1] = sum2;
3406           t[row + 2] = sum3;
3407           sz         = ii[row + 1] - diag[row] - 3;
3408           idx        = a->j + diag[row] + 3;
3409           v1 += 3;
3410           v2 += 3;
3411           v3 += 3;
3412           for (n = 0; n < sz - 1; n += 2) {
3413             i1 = idx[0];
3414             i2 = idx[1];
3415             idx += 2;
3416             tmp0 = x[i1];
3417             tmp1 = x[i2];
3418             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3419             v1 += 2;
3420             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3421             v2 += 2;
3422             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3423             v3 += 2;
3424           }
3425           if (n == sz - 1) {
3426             tmp0 = x[*idx];
3427             sum1 -= v1[0] * tmp0;
3428             sum2 -= v2[0] * tmp0;
3429             sum3 -= v3[0] * tmp0;
3430           }
3431           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3432           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3433           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3434           break;
3435         case 4:
3436           v2   = a->a + ii[row + 1];
3437           v3   = a->a + ii[row + 2];
3438           v4   = a->a + ii[row + 3];
3439           sum1 = b[row];
3440           sum2 = b[row + 1];
3441           sum3 = b[row + 2];
3442           sum4 = b[row + 3];
3443           for (n = 0; n < sz - 1; n += 2) {
3444             i1 = idx[0];
3445             i2 = idx[1];
3446             idx += 2;
3447             tmp0 = x[i1];
3448             tmp1 = x[i2];
3449             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3450             v1 += 2;
3451             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3452             v2 += 2;
3453             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3454             v3 += 2;
3455             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3456             v4 += 2;
3457           }
3458           if (n == sz - 1) {
3459             tmp0 = x[*idx++];
3460             sum1 -= v1[0] * tmp0;
3461             sum2 -= v2[0] * tmp0;
3462             sum3 -= v3[0] * tmp0;
3463             sum4 -= v4[0] * tmp0;
3464             v1++;
3465             v2++;
3466             v3++;
3467             v4++;
3468           }
3469           t[row]     = sum1;
3470           t[row + 1] = sum2;
3471           t[row + 2] = sum3;
3472           t[row + 3] = sum4;
3473           sz         = ii[row + 1] - diag[row] - 4;
3474           idx        = a->j + diag[row] + 4;
3475           v1 += 4;
3476           v2 += 4;
3477           v3 += 4;
3478           v4 += 4;
3479           for (n = 0; n < sz - 1; n += 2) {
3480             i1 = idx[0];
3481             i2 = idx[1];
3482             idx += 2;
3483             tmp0 = x[i1];
3484             tmp1 = x[i2];
3485             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3486             v1 += 2;
3487             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3488             v2 += 2;
3489             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3490             v3 += 2;
3491             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3492             v4 += 2;
3493           }
3494           if (n == sz - 1) {
3495             tmp0 = x[*idx];
3496             sum1 -= v1[0] * tmp0;
3497             sum2 -= v2[0] * tmp0;
3498             sum3 -= v3[0] * tmp0;
3499             sum4 -= v4[0] * tmp0;
3500           }
3501           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3502           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3503           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3504           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3505           break;
3506         case 5:
3507           v2   = a->a + ii[row + 1];
3508           v3   = a->a + ii[row + 2];
3509           v4   = a->a + ii[row + 3];
3510           v5   = a->a + ii[row + 4];
3511           sum1 = b[row];
3512           sum2 = b[row + 1];
3513           sum3 = b[row + 2];
3514           sum4 = b[row + 3];
3515           sum5 = b[row + 4];
3516           for (n = 0; n < sz - 1; n += 2) {
3517             i1 = idx[0];
3518             i2 = idx[1];
3519             idx += 2;
3520             tmp0 = x[i1];
3521             tmp1 = x[i2];
3522             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3523             v1 += 2;
3524             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3525             v2 += 2;
3526             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3527             v3 += 2;
3528             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3529             v4 += 2;
3530             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3531             v5 += 2;
3532           }
3533           if (n == sz - 1) {
3534             tmp0 = x[*idx++];
3535             sum1 -= v1[0] * tmp0;
3536             sum2 -= v2[0] * tmp0;
3537             sum3 -= v3[0] * tmp0;
3538             sum4 -= v4[0] * tmp0;
3539             sum5 -= v5[0] * tmp0;
3540             v1++;
3541             v2++;
3542             v3++;
3543             v4++;
3544             v5++;
3545           }
3546           t[row]     = sum1;
3547           t[row + 1] = sum2;
3548           t[row + 2] = sum3;
3549           t[row + 3] = sum4;
3550           t[row + 4] = sum5;
3551           sz         = ii[row + 1] - diag[row] - 5;
3552           idx        = a->j + diag[row] + 5;
3553           v1 += 5;
3554           v2 += 5;
3555           v3 += 5;
3556           v4 += 5;
3557           v5 += 5;
3558           for (n = 0; n < sz - 1; n += 2) {
3559             i1 = idx[0];
3560             i2 = idx[1];
3561             idx += 2;
3562             tmp0 = x[i1];
3563             tmp1 = x[i2];
3564             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3565             v1 += 2;
3566             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3567             v2 += 2;
3568             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3569             v3 += 2;
3570             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3571             v4 += 2;
3572             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3573             v5 += 2;
3574           }
3575           if (n == sz - 1) {
3576             tmp0 = x[*idx];
3577             sum1 -= v1[0] * tmp0;
3578             sum2 -= v2[0] * tmp0;
3579             sum3 -= v3[0] * tmp0;
3580             sum4 -= v4[0] * tmp0;
3581             sum5 -= v5[0] * tmp0;
3582           }
3583           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3584           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3585           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3586           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3587           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3588           break;
3589         default:
3590           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3591         }
3592       }
3593       xb = t;
3594       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3595     } else xb = b;
3596 
3597     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3598       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3599       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3600         ibdiag -= sizes[i] * sizes[i];
3601 
3602         /* set RHS */
3603         if (xb == b) {
3604           /* whole (old way) */
3605           sz  = ii[row + 1] - ii[row];
3606           idx = a->j + ii[row];
3607           switch (sizes[i]) {
3608           case 5:
3609             v5 = a->a + ii[row - 4]; /* fall through */
3610           case 4:
3611             v4 = a->a + ii[row - 3]; /* fall through */
3612           case 3:
3613             v3 = a->a + ii[row - 2]; /* fall through */
3614           case 2:
3615             v2 = a->a + ii[row - 1]; /* fall through */
3616           case 1:
3617             v1 = a->a + ii[row];
3618             break;
3619           default:
3620             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3621           }
3622         } else {
3623           /* upper, no diag */
3624           sz  = ii[row + 1] - diag[row] - 1;
3625           idx = a->j + diag[row] + 1;
3626           switch (sizes[i]) {
3627           case 5:
3628             v5 = a->a + diag[row - 4] + 5; /* fall through */
3629           case 4:
3630             v4 = a->a + diag[row - 3] + 4; /* fall through */
3631           case 3:
3632             v3 = a->a + diag[row - 2] + 3; /* fall through */
3633           case 2:
3634             v2 = a->a + diag[row - 1] + 2; /* fall through */
3635           case 1:
3636             v1 = a->a + diag[row] + 1;
3637           }
3638         }
3639         /* set sum */
3640         switch (sizes[i]) {
3641         case 5:
3642           sum5 = xb[row - 4]; /* fall through */
3643         case 4:
3644           sum4 = xb[row - 3]; /* fall through */
3645         case 3:
3646           sum3 = xb[row - 2]; /* fall through */
3647         case 2:
3648           sum2 = xb[row - 1]; /* fall through */
3649         case 1:
3650           /* note that sum1 is associated with the last row */
3651           sum1 = xb[row];
3652         }
3653         /* do sums */
3654         for (n = 0; n < sz - 1; n += 2) {
3655           i1 = idx[0];
3656           i2 = idx[1];
3657           idx += 2;
3658           tmp0 = x[i1];
3659           tmp1 = x[i2];
3660           switch (sizes[i]) {
3661           case 5:
3662             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3663             v5 += 2; /* fall through */
3664           case 4:
3665             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3666             v4 += 2; /* fall through */
3667           case 3:
3668             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3669             v3 += 2; /* fall through */
3670           case 2:
3671             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3672             v2 += 2; /* fall through */
3673           case 1:
3674             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3675             v1 += 2;
3676           }
3677         }
3678         /* ragged edge */
3679         if (n == sz - 1) {
3680           tmp0 = x[*idx];
3681           switch (sizes[i]) {
3682           case 5:
3683             sum5 -= *v5 * tmp0; /* fall through */
3684           case 4:
3685             sum4 -= *v4 * tmp0; /* fall through */
3686           case 3:
3687             sum3 -= *v3 * tmp0; /* fall through */
3688           case 2:
3689             sum2 -= *v2 * tmp0; /* fall through */
3690           case 1:
3691             sum1 -= *v1 * tmp0;
3692           }
3693         }
3694         /* update */
3695         if (xb == b) {
3696           /* whole (old way) w/ diag */
3697           switch (sizes[i]) {
3698           case 5:
3699             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3700             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3701             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3702             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3703             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3704             break;
3705           case 4:
3706             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3707             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3708             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3709             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3710             break;
3711           case 3:
3712             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3713             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3714             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3715             break;
3716           case 2:
3717             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3718             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3719             break;
3720           case 1:
3721             x[row--] += sum1 * (*ibdiag);
3722             break;
3723           }
3724         } else {
3725           /* no diag so set =  */
3726           switch (sizes[i]) {
3727           case 5:
3728             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3729             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3730             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3731             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3732             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3733             break;
3734           case 4:
3735             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3736             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3737             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3738             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3739             break;
3740           case 3:
3741             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3742             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3743             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3744             break;
3745           case 2:
3746             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3747             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3748             break;
3749           case 1:
3750             x[row--] = sum1 * (*ibdiag);
3751             break;
3752           }
3753         }
3754       }
3755       if (xb == b) {
3756         PetscCall(PetscLogFlops(2.0 * a->nz));
3757       } else {
3758         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3759       }
3760     }
3761   }
3762   if (flag & SOR_EISENSTAT) {
3763     /*
3764           Apply  (U + D)^-1  where D is now the block diagonal
3765     */
3766     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3767     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3768       ibdiag -= sizes[i] * sizes[i];
3769       sz  = ii[row + 1] - diag[row] - 1;
3770       v1  = a->a + diag[row] + 1;
3771       idx = a->j + diag[row] + 1;
3772       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3773       switch (sizes[i]) {
3774       case 1:
3775 
3776         sum1 = b[row];
3777         for (n = 0; n < sz - 1; n += 2) {
3778           i1 = idx[0];
3779           i2 = idx[1];
3780           idx += 2;
3781           tmp0 = x[i1];
3782           tmp1 = x[i2];
3783           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3784           v1 += 2;
3785         }
3786 
3787         if (n == sz - 1) {
3788           tmp0 = x[*idx];
3789           sum1 -= *v1 * tmp0;
3790         }
3791         x[row] = sum1 * (*ibdiag);
3792         row--;
3793         break;
3794 
3795       case 2:
3796 
3797         sum1 = b[row];
3798         sum2 = b[row - 1];
3799         /* note that sum1 is associated with the second of the two rows */
3800         v2 = a->a + diag[row - 1] + 2;
3801         for (n = 0; n < sz - 1; n += 2) {
3802           i1 = idx[0];
3803           i2 = idx[1];
3804           idx += 2;
3805           tmp0 = x[i1];
3806           tmp1 = x[i2];
3807           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3808           v1 += 2;
3809           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3810           v2 += 2;
3811         }
3812 
3813         if (n == sz - 1) {
3814           tmp0 = x[*idx];
3815           sum1 -= *v1 * tmp0;
3816           sum2 -= *v2 * tmp0;
3817         }
3818         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3819         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3820         row -= 2;
3821         break;
3822       case 3:
3823 
3824         sum1 = b[row];
3825         sum2 = b[row - 1];
3826         sum3 = b[row - 2];
3827         v2   = a->a + diag[row - 1] + 2;
3828         v3   = a->a + diag[row - 2] + 3;
3829         for (n = 0; n < sz - 1; n += 2) {
3830           i1 = idx[0];
3831           i2 = idx[1];
3832           idx += 2;
3833           tmp0 = x[i1];
3834           tmp1 = x[i2];
3835           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3836           v1 += 2;
3837           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3838           v2 += 2;
3839           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3840           v3 += 2;
3841         }
3842 
3843         if (n == sz - 1) {
3844           tmp0 = x[*idx];
3845           sum1 -= *v1 * tmp0;
3846           sum2 -= *v2 * tmp0;
3847           sum3 -= *v3 * tmp0;
3848         }
3849         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3850         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3851         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3852         row -= 3;
3853         break;
3854       case 4:
3855 
3856         sum1 = b[row];
3857         sum2 = b[row - 1];
3858         sum3 = b[row - 2];
3859         sum4 = b[row - 3];
3860         v2   = a->a + diag[row - 1] + 2;
3861         v3   = a->a + diag[row - 2] + 3;
3862         v4   = a->a + diag[row - 3] + 4;
3863         for (n = 0; n < sz - 1; n += 2) {
3864           i1 = idx[0];
3865           i2 = idx[1];
3866           idx += 2;
3867           tmp0 = x[i1];
3868           tmp1 = x[i2];
3869           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3870           v1 += 2;
3871           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3872           v2 += 2;
3873           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3874           v3 += 2;
3875           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3876           v4 += 2;
3877         }
3878 
3879         if (n == sz - 1) {
3880           tmp0 = x[*idx];
3881           sum1 -= *v1 * tmp0;
3882           sum2 -= *v2 * tmp0;
3883           sum3 -= *v3 * tmp0;
3884           sum4 -= *v4 * tmp0;
3885         }
3886         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3887         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3888         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3889         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3890         row -= 4;
3891         break;
3892       case 5:
3893 
3894         sum1 = b[row];
3895         sum2 = b[row - 1];
3896         sum3 = b[row - 2];
3897         sum4 = b[row - 3];
3898         sum5 = b[row - 4];
3899         v2   = a->a + diag[row - 1] + 2;
3900         v3   = a->a + diag[row - 2] + 3;
3901         v4   = a->a + diag[row - 3] + 4;
3902         v5   = a->a + diag[row - 4] + 5;
3903         for (n = 0; n < sz - 1; n += 2) {
3904           i1 = idx[0];
3905           i2 = idx[1];
3906           idx += 2;
3907           tmp0 = x[i1];
3908           tmp1 = x[i2];
3909           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3910           v1 += 2;
3911           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3912           v2 += 2;
3913           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3914           v3 += 2;
3915           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3916           v4 += 2;
3917           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3918           v5 += 2;
3919         }
3920 
3921         if (n == sz - 1) {
3922           tmp0 = x[*idx];
3923           sum1 -= *v1 * tmp0;
3924           sum2 -= *v2 * tmp0;
3925           sum3 -= *v3 * tmp0;
3926           sum4 -= *v4 * tmp0;
3927           sum5 -= *v5 * tmp0;
3928         }
3929         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3930         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3931         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3932         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3933         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3934         row -= 5;
3935         break;
3936       default:
3937         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3938       }
3939     }
3940     PetscCall(PetscLogFlops(a->nz));
3941 
3942     /*
3943            t = b - D x    where D is the block diagonal
3944     */
3945     cnt = 0;
3946     for (i = 0, row = 0; i < m; i++) {
3947       switch (sizes[i]) {
3948       case 1:
3949         t[row] = b[row] - bdiag[cnt++] * x[row];
3950         row++;
3951         break;
3952       case 2:
3953         x1         = x[row];
3954         x2         = x[row + 1];
3955         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3956         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3957         t[row]     = b[row] - tmp1;
3958         t[row + 1] = b[row + 1] - tmp2;
3959         row += 2;
3960         cnt += 4;
3961         break;
3962       case 3:
3963         x1         = x[row];
3964         x2         = x[row + 1];
3965         x3         = x[row + 2];
3966         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3967         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3968         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3969         t[row]     = b[row] - tmp1;
3970         t[row + 1] = b[row + 1] - tmp2;
3971         t[row + 2] = b[row + 2] - tmp3;
3972         row += 3;
3973         cnt += 9;
3974         break;
3975       case 4:
3976         x1         = x[row];
3977         x2         = x[row + 1];
3978         x3         = x[row + 2];
3979         x4         = x[row + 3];
3980         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3981         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3982         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3983         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3984         t[row]     = b[row] - tmp1;
3985         t[row + 1] = b[row + 1] - tmp2;
3986         t[row + 2] = b[row + 2] - tmp3;
3987         t[row + 3] = b[row + 3] - tmp4;
3988         row += 4;
3989         cnt += 16;
3990         break;
3991       case 5:
3992         x1         = x[row];
3993         x2         = x[row + 1];
3994         x3         = x[row + 2];
3995         x4         = x[row + 3];
3996         x5         = x[row + 4];
3997         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3998         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3999         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4000         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4001         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4002         t[row]     = b[row] - tmp1;
4003         t[row + 1] = b[row + 1] - tmp2;
4004         t[row + 2] = b[row + 2] - tmp3;
4005         t[row + 3] = b[row + 3] - tmp4;
4006         t[row + 4] = b[row + 4] - tmp5;
4007         row += 5;
4008         cnt += 25;
4009         break;
4010       default:
4011         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4012       }
4013     }
4014     PetscCall(PetscLogFlops(m));
4015 
4016     /*
4017           Apply (L + D)^-1 where D is the block diagonal
4018     */
4019     for (i = 0, row = 0; i < m; i++) {
4020       sz  = diag[row] - ii[row];
4021       v1  = a->a + ii[row];
4022       idx = a->j + ii[row];
4023       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4024       switch (sizes[i]) {
4025       case 1:
4026 
4027         sum1 = t[row];
4028         for (n = 0; n < sz - 1; n += 2) {
4029           i1 = idx[0];
4030           i2 = idx[1];
4031           idx += 2;
4032           tmp0 = t[i1];
4033           tmp1 = t[i2];
4034           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4035           v1 += 2;
4036         }
4037 
4038         if (n == sz - 1) {
4039           tmp0 = t[*idx];
4040           sum1 -= *v1 * tmp0;
4041         }
4042         x[row] += t[row] = sum1 * (*ibdiag++);
4043         row++;
4044         break;
4045       case 2:
4046         v2   = a->a + ii[row + 1];
4047         sum1 = t[row];
4048         sum2 = t[row + 1];
4049         for (n = 0; n < sz - 1; n += 2) {
4050           i1 = idx[0];
4051           i2 = idx[1];
4052           idx += 2;
4053           tmp0 = t[i1];
4054           tmp1 = t[i2];
4055           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4056           v1 += 2;
4057           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4058           v2 += 2;
4059         }
4060 
4061         if (n == sz - 1) {
4062           tmp0 = t[*idx];
4063           sum1 -= v1[0] * tmp0;
4064           sum2 -= v2[0] * tmp0;
4065         }
4066         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
4067         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
4068         ibdiag += 4;
4069         row += 2;
4070         break;
4071       case 3:
4072         v2   = a->a + ii[row + 1];
4073         v3   = a->a + ii[row + 2];
4074         sum1 = t[row];
4075         sum2 = t[row + 1];
4076         sum3 = t[row + 2];
4077         for (n = 0; n < sz - 1; n += 2) {
4078           i1 = idx[0];
4079           i2 = idx[1];
4080           idx += 2;
4081           tmp0 = t[i1];
4082           tmp1 = t[i2];
4083           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4084           v1 += 2;
4085           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4086           v2 += 2;
4087           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4088           v3 += 2;
4089         }
4090 
4091         if (n == sz - 1) {
4092           tmp0 = t[*idx];
4093           sum1 -= v1[0] * tmp0;
4094           sum2 -= v2[0] * tmp0;
4095           sum3 -= v3[0] * tmp0;
4096         }
4097         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
4098         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
4099         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
4100         ibdiag += 9;
4101         row += 3;
4102         break;
4103       case 4:
4104         v2   = a->a + ii[row + 1];
4105         v3   = a->a + ii[row + 2];
4106         v4   = a->a + ii[row + 3];
4107         sum1 = t[row];
4108         sum2 = t[row + 1];
4109         sum3 = t[row + 2];
4110         sum4 = t[row + 3];
4111         for (n = 0; n < sz - 1; n += 2) {
4112           i1 = idx[0];
4113           i2 = idx[1];
4114           idx += 2;
4115           tmp0 = t[i1];
4116           tmp1 = t[i2];
4117           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4118           v1 += 2;
4119           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4120           v2 += 2;
4121           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4122           v3 += 2;
4123           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4124           v4 += 2;
4125         }
4126 
4127         if (n == sz - 1) {
4128           tmp0 = t[*idx];
4129           sum1 -= v1[0] * tmp0;
4130           sum2 -= v2[0] * tmp0;
4131           sum3 -= v3[0] * tmp0;
4132           sum4 -= v4[0] * tmp0;
4133         }
4134         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
4135         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
4136         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
4137         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
4138         ibdiag += 16;
4139         row += 4;
4140         break;
4141       case 5:
4142         v2   = a->a + ii[row + 1];
4143         v3   = a->a + ii[row + 2];
4144         v4   = a->a + ii[row + 3];
4145         v5   = a->a + ii[row + 4];
4146         sum1 = t[row];
4147         sum2 = t[row + 1];
4148         sum3 = t[row + 2];
4149         sum4 = t[row + 3];
4150         sum5 = t[row + 4];
4151         for (n = 0; n < sz - 1; n += 2) {
4152           i1 = idx[0];
4153           i2 = idx[1];
4154           idx += 2;
4155           tmp0 = t[i1];
4156           tmp1 = t[i2];
4157           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
4158           v1 += 2;
4159           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
4160           v2 += 2;
4161           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
4162           v3 += 2;
4163           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
4164           v4 += 2;
4165           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
4166           v5 += 2;
4167         }
4168 
4169         if (n == sz - 1) {
4170           tmp0 = t[*idx];
4171           sum1 -= v1[0] * tmp0;
4172           sum2 -= v2[0] * tmp0;
4173           sum3 -= v3[0] * tmp0;
4174           sum4 -= v4[0] * tmp0;
4175           sum5 -= v5[0] * tmp0;
4176         }
4177         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
4178         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
4179         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
4180         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
4181         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
4182         ibdiag += 25;
4183         row += 5;
4184         break;
4185       default:
4186         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4187       }
4188     }
4189     PetscCall(PetscLogFlops(a->nz));
4190   }
4191   PetscCall(VecRestoreArray(xx, &x));
4192   PetscCall(VecRestoreArrayRead(bb, &b));
4193   PetscFunctionReturn(PETSC_SUCCESS);
4194 }
4195 
4196 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4197 {
4198   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
4199   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
4200   const MatScalar   *bdiag = a->inode.bdiag;
4201   const PetscScalar *b;
4202   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
4203   const PetscInt    *sizes = a->inode.size;
4204 
4205   PetscFunctionBegin;
4206   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
4207   PetscCall(VecGetArray(xx, &x));
4208   PetscCall(VecGetArrayRead(bb, &b));
4209   cnt = 0;
4210   for (i = 0, row = 0; i < m; i++) {
4211     switch (sizes[i]) {
4212     case 1:
4213       x[row] = b[row] * bdiag[cnt++];
4214       row++;
4215       break;
4216     case 2:
4217       x1       = b[row];
4218       x2       = b[row + 1];
4219       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
4220       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
4221       x[row++] = tmp1;
4222       x[row++] = tmp2;
4223       cnt += 4;
4224       break;
4225     case 3:
4226       x1       = b[row];
4227       x2       = b[row + 1];
4228       x3       = b[row + 2];
4229       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
4230       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
4231       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
4232       x[row++] = tmp1;
4233       x[row++] = tmp2;
4234       x[row++] = tmp3;
4235       cnt += 9;
4236       break;
4237     case 4:
4238       x1       = b[row];
4239       x2       = b[row + 1];
4240       x3       = b[row + 2];
4241       x4       = b[row + 3];
4242       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
4243       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
4244       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
4245       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
4246       x[row++] = tmp1;
4247       x[row++] = tmp2;
4248       x[row++] = tmp3;
4249       x[row++] = tmp4;
4250       cnt += 16;
4251       break;
4252     case 5:
4253       x1       = b[row];
4254       x2       = b[row + 1];
4255       x3       = b[row + 2];
4256       x4       = b[row + 3];
4257       x5       = b[row + 4];
4258       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
4259       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
4260       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
4261       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
4262       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
4263       x[row++] = tmp1;
4264       x[row++] = tmp2;
4265       x[row++] = tmp3;
4266       x[row++] = tmp4;
4267       x[row++] = tmp5;
4268       cnt += 25;
4269       break;
4270     default:
4271       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
4272     }
4273   }
4274   PetscCall(PetscLogFlops(2.0 * cnt));
4275   PetscCall(VecRestoreArray(xx, &x));
4276   PetscCall(VecRestoreArrayRead(bb, &b));
4277   PetscFunctionReturn(PETSC_SUCCESS);
4278 }
4279 
4280 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4281 {
4282   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4283 
4284   PetscFunctionBegin;
4285   a->inode.node_count       = 0;
4286   a->inode.use              = PETSC_FALSE;
4287   a->inode.checked          = PETSC_FALSE;
4288   a->inode.mat_nonzerostate = -1;
4289   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4290   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4291   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4292   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4293   A->ops->coloringpatch     = NULL;
4294   A->ops->multdiagonalblock = NULL;
4295   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4296   PetscFunctionReturn(PETSC_SUCCESS);
4297 }
4298 
4299 /*
4300     samestructure indicates that the matrix has not changed its nonzero structure so we
4301     do not need to recompute the inodes
4302 */
4303 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4304 {
4305   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4306   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4307   PetscBool       flag;
4308   const PetscInt *idx, *idy, *ii;
4309 
4310   PetscFunctionBegin;
4311   if (!a->inode.use) {
4312     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4313     PetscCall(PetscFree(a->inode.size));
4314     PetscFunctionReturn(PETSC_SUCCESS);
4315   }
4316   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
4317 
4318   m = A->rmap->n;
4319   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4320   ns = a->inode.size;
4321 
4322   i          = 0;
4323   node_count = 0;
4324   idx        = a->j;
4325   ii         = a->i;
4326   while (i < m) {            /* For each row */
4327     nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
4328     /* Limits the number of elements in a node to 'a->inode.limit' */
4329     for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4330       nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
4331       if (nzy != nzx) break;
4332       idy += nzx; /* Same nonzero pattern */
4333       PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
4334       if (!flag) break;
4335     }
4336     ns[node_count++] = blk_size;
4337     idx += blk_size * nzx;
4338     i = j;
4339   }
4340 
4341   /* If not enough inodes found,, do not use inode version of the routines */
4342   if (!m || node_count > .8 * m) {
4343     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4344     PetscCall(PetscFree(a->inode.size));
4345     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4346   } else {
4347     if (!A->factortype) {
4348       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4349       if (A->rmap->n == A->cmap->n) {
4350         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4351         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4352         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4353         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4354         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4355       }
4356     } else {
4357       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4358     }
4359     a->inode.node_count = node_count;
4360     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4361   }
4362   a->inode.checked          = PETSC_TRUE;
4363   a->inode.mat_nonzerostate = A->nonzerostate;
4364   PetscFunctionReturn(PETSC_SUCCESS);
4365 }
4366 
4367 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4368 {
4369   Mat         B = *C;
4370   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4371   PetscInt    m = A->rmap->n;
4372 
4373   PetscFunctionBegin;
4374   c->inode.use              = a->inode.use;
4375   c->inode.limit            = a->inode.limit;
4376   c->inode.max_limit        = a->inode.max_limit;
4377   c->inode.checked          = PETSC_FALSE;
4378   c->inode.size             = NULL;
4379   c->inode.node_count       = 0;
4380   c->inode.ibdiagvalid      = PETSC_FALSE;
4381   c->inode.ibdiag           = NULL;
4382   c->inode.bdiag            = NULL;
4383   c->inode.mat_nonzerostate = -1;
4384   if (a->inode.use) {
4385     if (a->inode.checked && a->inode.size) {
4386       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
4387       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4388 
4389       c->inode.checked          = PETSC_TRUE;
4390       c->inode.node_count       = a->inode.node_count;
4391       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4392     }
4393     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4394     if (!B->factortype) {
4395       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4396       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4397       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4398       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4399       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4400       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4401     } else {
4402       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4403     }
4404   }
4405   PetscFunctionReturn(PETSC_SUCCESS);
4406 }
4407 
4408 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4409 {
4410   PetscInt        k;
4411   const PetscInt *vi;
4412 
4413   PetscFunctionBegin;
4414   vi = aj + ai[row];
4415   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4416   vi        = aj + adiag[row];
4417   cols[nzl] = vi[0];
4418   vi        = aj + adiag[row + 1] + 1;
4419   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4420   PetscFunctionReturn(PETSC_SUCCESS);
4421 }
4422 /*
4423    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4424    Modified from MatSeqAIJCheckInode().
4425 
4426    Input Parameters:
4427 .  Mat A - ILU or LU matrix factor
4428 
4429 */
4430 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4431 {
4432   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4433   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4434   PetscInt       *cols1, *cols2, *ns;
4435   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4436   PetscBool       flag;
4437 
4438   PetscFunctionBegin;
4439   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4440   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4441 
4442   m = A->rmap->n;
4443   if (a->inode.size) ns = a->inode.size;
4444   else PetscCall(PetscMalloc1(m + 1, &ns));
4445 
4446   i          = 0;
4447   node_count = 0;
4448   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4449   while (i < m) {                       /* For each row */
4450     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4451     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4452     nzx  = nzl1 + nzu1 + 1;
4453     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4454 
4455     /* Limits the number of elements in a node to 'a->inode.limit' */
4456     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4457       nzl2 = ai[j + 1] - ai[j];
4458       nzu2 = adiag[j] - adiag[j + 1] - 1;
4459       nzy  = nzl2 + nzu2 + 1;
4460       if (nzy != nzx) break;
4461       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4462       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4463       if (!flag) break;
4464     }
4465     ns[node_count++] = blk_size;
4466     i                = j;
4467   }
4468   PetscCall(PetscFree2(cols1, cols2));
4469   /* If not enough inodes found,, do not use inode version of the routines */
4470   if (!m || node_count > .8 * m) {
4471     PetscCall(PetscFree(ns));
4472 
4473     a->inode.node_count = 0;
4474     a->inode.size       = NULL;
4475     a->inode.use        = PETSC_FALSE;
4476 
4477     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4478   } else {
4479     A->ops->mult              = NULL;
4480     A->ops->sor               = NULL;
4481     A->ops->multadd           = NULL;
4482     A->ops->getrowij          = NULL;
4483     A->ops->restorerowij      = NULL;
4484     A->ops->getcolumnij       = NULL;
4485     A->ops->restorecolumnij   = NULL;
4486     A->ops->coloringpatch     = NULL;
4487     A->ops->multdiagonalblock = NULL;
4488     a->inode.node_count       = node_count;
4489     a->inode.size             = ns;
4490 
4491     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4492   }
4493   a->inode.checked = PETSC_TRUE;
4494   PetscFunctionReturn(PETSC_SUCCESS);
4495 }
4496 
4497 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4498 {
4499   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4500 
4501   PetscFunctionBegin;
4502   a->inode.ibdiagvalid = PETSC_FALSE;
4503   PetscFunctionReturn(PETSC_SUCCESS);
4504 }
4505 
4506 /*
4507      This is really ugly. if inodes are used this replaces the
4508   permutations with ones that correspond to rows/cols of the matrix
4509   rather then inode blocks
4510 */
4511 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4512 {
4513   PetscFunctionBegin;
4514   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4515   PetscFunctionReturn(PETSC_SUCCESS);
4516 }
4517 
4518 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4519 {
4520   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4521   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4522   const PetscInt *ridx, *cidx;
4523   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
4524   PetscInt        nslim_col, *ns_col;
4525   IS              ris = *rperm, cis = *cperm;
4526 
4527   PetscFunctionBegin;
4528   if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS);           /* no inodes so return */
4529   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4530 
4531   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4532   PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
4533   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4534 
4535   PetscCall(ISGetIndices(ris, &ridx));
4536   PetscCall(ISGetIndices(cis, &cidx));
4537 
4538   /* Form the inode structure for the rows of permuted matric using inv perm*/
4539   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
4540 
4541   /* Construct the permutations for rows*/
4542   for (i = 0, row = 0; i < nslim_row; ++i) {
4543     indx      = ridx[i];
4544     start_val = tns[indx];
4545     end_val   = tns[indx + 1];
4546     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4547   }
4548 
4549   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4550   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
4551 
4552   /* Construct permutations for columns */
4553   for (i = 0, col = 0; i < nslim_col; ++i) {
4554     indx      = cidx[i];
4555     start_val = tns[indx];
4556     end_val   = tns[indx + 1];
4557     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4558   }
4559 
4560   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4561   PetscCall(ISSetPermutation(*rperm));
4562   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4563   PetscCall(ISSetPermutation(*cperm));
4564 
4565   PetscCall(ISRestoreIndices(ris, &ridx));
4566   PetscCall(ISRestoreIndices(cis, &cidx));
4567 
4568   PetscCall(PetscFree(ns_col));
4569   PetscCall(PetscFree2(permr, permc));
4570   PetscCall(ISDestroy(&cis));
4571   PetscCall(ISDestroy(&ris));
4572   PetscCall(PetscFree(tns));
4573   PetscFunctionReturn(PETSC_SUCCESS);
4574 }
4575 
4576 /*@C
4577    MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4578 
4579    Not Collective
4580 
4581    Input Parameter:
4582 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4583 
4584    Output Parameters:
4585 +  node_count - no of inodes present in the matrix.
4586 .  sizes      - an array of size node_count,with sizes of each inode.
4587 -  limit      - the max size used to generate the inodes.
4588 
4589    Level: advanced
4590 
4591    Note:
4592     This routine returns some internal storage information
4593    of the matrix, it is intended to be used by advanced users.
4594    It should be called after the matrix is assembled.
4595    The contents of the sizes[] array should not be changed.
4596    NULL may be passed for information not requested.
4597 
4598 .seealso: `MatGetInfo()`
4599 @*/
4600 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4601 {
4602   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4603 
4604   PetscFunctionBegin;
4605   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4606   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4607   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4608   PetscFunctionReturn(PETSC_SUCCESS);
4609 }
4610 
4611 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4612 {
4613   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4614 
4615   PetscFunctionBegin;
4616   if (node_count) *node_count = a->inode.node_count;
4617   if (sizes) *sizes = a->inode.size;
4618   if (limit) *limit = a->inode.limit;
4619   PetscFunctionReturn(PETSC_SUCCESS);
4620 }
4621