xref: /petsc/src/mat/impls/aij/seq/inode.c (revision e8c0849ab8fe171bed529bea27238c9b402db591)
1 /*
2   This file provides high performance routines for the Inode format (compressed sparse row)
3   by taking advantage of rows with identical nonzero structure (I-nodes).
4 */
5 #include <../src/mat/impls/aij/seq/aij.h>
6 #if defined(PETSC_HAVE_XMMINTRIN_H)
7   #include <xmmintrin.h>
8 #endif
9 
MatCreateColInode_Private(Mat A,PetscInt * size,PetscInt ** ns)10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11 {
12   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
13   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
14 
15   PetscFunctionBegin;
16   n = A->cmap->n;
17   m = A->rmap->n;
18   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19   ns_row = a->inode.size_csr;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24     for (; count + 1 < n; count++, i++);
25     if (count < n) i++;
26     *size = i;
27     PetscFunctionReturn(PETSC_SUCCESS);
28   }
29   PetscCall(PetscMalloc1(n + 1, &ns_col));
30   ns_col[0] = 0;
31 
32   /* Use the same row structure wherever feasible. */
33   for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
34 
35   /* if m < n; pad up the remainder with inode_limit */
36   for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37   /* The last node is the odd ball. pad it up with the remaining rows; */
38   if (count < n) {
39     ns_col[i + 1] = ns_col[i] + (n - count);
40     i++;
41   } else if (count > n) {
42     /* Adjust for the over estimation */
43     ns_col[i] += n - count;
44   }
45   *size = i;
46   *ns   = ns_col;
47   PetscFunctionReturn(PETSC_SUCCESS);
48 }
49 
50 /*
51       This builds symmetric version of nonzero structure,
52 */
MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)53 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54 {
55   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
56   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
57   PetscInt       *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
58   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
59 
60   PetscFunctionBegin;
61   nslim_row = a->inode.node_count;
62   m         = A->rmap->n;
63   n         = A->cmap->n;
64   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
65   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
66 
67   /* Use the row_inode as column_inode */
68   nslim_col = nslim_row;
69   ns_col    = ns_row;
70 
71   /* allocate space for reformatted inode structure */
72   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
73   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
74 
75   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
76     nsz = ns_col[i1 + 1] - ns_col[i1];
77     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
78   }
79   /* allocate space for row pointers */
80   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
81   *iia = ia;
82   PetscCall(PetscMalloc1(nslim_row + 1, &work));
83 
84   /* determine the number of columns in each row */
85   ia[0] = oshift;
86   for (i1 = 0; i1 < nslim_row; i1++) {
87     row  = ns_row[i1];
88     j    = aj + ai[row] + ishift;
89     jmax = aj + ai[row + 1] + ishift;
90     if (j == jmax) continue; /* empty row */
91     col = *j++ + ishift;
92     i2  = tvc[col];
93     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
94       ia[i1 + 1]++;
95       ia[i2 + 1]++;
96       i2++; /* Start col of next node */
97       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
98       i2 = tvc[col];
99     }
100     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
101   }
102 
103   /* shift ia[i] to point to next row */
104   for (i1 = 1; i1 < nslim_row + 1; i1++) {
105     row = ia[i1 - 1];
106     ia[i1] += row;
107     work[i1 - 1] = row - oshift;
108   }
109 
110   /* allocate space for column pointers */
111   nz = ia[nslim_row] + (!ishift);
112   PetscCall(PetscMalloc1(nz, &ja));
113   *jja = ja;
114 
115   /* loop over lower triangular part putting into ja */
116   for (i1 = 0; i1 < nslim_row; i1++) {
117     row  = ns_row[i1];
118     j    = aj + ai[row] + ishift;
119     jmax = aj + ai[row + 1] + ishift;
120     if (j == jmax) continue; /* empty row */
121     col = *j++ + ishift;
122     i2  = tvc[col];
123     while (i2 < i1 && j < jmax) {
124       ja[work[i2]++] = i1 + oshift;
125       ja[work[i1]++] = i2 + oshift;
126       ++i2;
127       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
128       i2 = tvc[col];
129     }
130     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
131   }
132   PetscCall(PetscFree(work));
133   PetscCall(PetscFree2(tns, tvc));
134   PetscFunctionReturn(PETSC_SUCCESS);
135 }
136 
137 /*
138       This builds nonsymmetric version of nonzero structure,
139 */
MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141 {
142   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
143   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
144   PetscInt       *tns, *tvc, nsz, i1, i2;
145   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
146 
147   PetscFunctionBegin;
148   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
149   nslim_row = a->inode.node_count;
150   n         = A->cmap->n;
151 
152   /* Create The column_inode for this matrix */
153   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
154 
155   /* allocate space for reformatted column_inode structure */
156   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
158 
159   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160     nsz = ns_col[i1 + 1] - ns_col[i1];
161     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
162   }
163   /* allocate space for row pointers */
164   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
165   *iia = ia;
166   PetscCall(PetscMalloc1(nslim_row + 1, &work));
167 
168   /* determine the number of columns in each row */
169   ia[0] = oshift;
170   for (i1 = 0; i1 < nslim_row; i1++) {
171     row = ns_row[i1];
172     j   = aj + ai[row] + ishift;
173     nz  = ai[row + 1] - ai[row];
174     if (!nz) continue; /* empty row */
175     col = *j++ + ishift;
176     i2  = tvc[col];
177     while (nz-- > 0) { /* off-diagonal elements */
178       ia[i1 + 1]++;
179       i2++; /* Start col of next node */
180       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
181       if (nz > 0) i2 = tvc[col];
182     }
183   }
184 
185   /* shift ia[i] to point to next row */
186   for (i1 = 1; i1 < nslim_row + 1; i1++) {
187     row = ia[i1 - 1];
188     ia[i1] += row;
189     work[i1 - 1] = row - oshift;
190   }
191 
192   /* allocate space for column pointers */
193   nz = ia[nslim_row] + (!ishift);
194   PetscCall(PetscMalloc1(nz, &ja));
195   *jja = ja;
196 
197   /* loop over matrix putting into ja */
198   for (i1 = 0; i1 < nslim_row; i1++) {
199     row = ns_row[i1];
200     j   = aj + ai[row] + ishift;
201     nz  = ai[row + 1] - ai[row];
202     if (!nz) continue; /* empty row */
203     col = *j++ + ishift;
204     i2  = tvc[col];
205     while (nz-- > 0) {
206       ja[work[i1]++] = i2 + oshift;
207       ++i2;
208       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
209       if (nz > 0) i2 = tvc[col];
210     }
211   }
212   PetscCall(PetscFree(ns_col));
213   PetscCall(PetscFree(work));
214   PetscCall(PetscFree2(tns, tvc));
215   PetscFunctionReturn(PETSC_SUCCESS);
216 }
217 
MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)218 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219 {
220   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
221 
222   PetscFunctionBegin;
223   if (n) *n = a->inode.node_count;
224   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
225   if (!blockcompressed) {
226     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
227   } else if (symmetric) {
228     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
229   } else {
230     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
231   }
232   PetscFunctionReturn(PETSC_SUCCESS);
233 }
234 
MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)235 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236 {
237   PetscFunctionBegin;
238   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
239 
240   if (!blockcompressed) {
241     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
242   } else {
243     PetscCall(PetscFree(*ia));
244     PetscCall(PetscFree(*ja));
245   }
246   PetscFunctionReturn(PETSC_SUCCESS);
247 }
248 
MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)249 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250 {
251   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253   PetscInt   *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
254 
255   PetscFunctionBegin;
256   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
257   nslim_row = a->inode.node_count;
258   n         = A->cmap->n;
259 
260   /* Create The column_inode for this matrix */
261   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
262 
263   /* allocate space for reformatted column_inode structure */
264   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
266 
267   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268     nsz = ns_col[i1 + 1] - ns_col[i1];
269     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
270   }
271   /* allocate space for column pointers */
272   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
273   *iia = ia;
274   PetscCall(PetscMalloc1(nslim_col + 1, &work));
275 
276   /* determine the number of columns in each row */
277   ia[0] = oshift;
278   for (i1 = 0; i1 < nslim_row; i1++) {
279     row = ns_row[i1];
280     j   = aj + ai[row] + ishift;
281     col = *j++ + ishift;
282     i2  = tvc[col];
283     nz  = ai[row + 1] - ai[row];
284     while (nz-- > 0) { /* off-diagonal elements */
285       /* ia[i1+1]++; */
286       ia[i2 + 1]++;
287       i2++;
288       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
289       if (nz > 0) i2 = tvc[col];
290     }
291   }
292 
293   /* shift ia[i] to point to next col */
294   for (i1 = 1; i1 < nslim_col + 1; i1++) {
295     col = ia[i1 - 1];
296     ia[i1] += col;
297     work[i1 - 1] = col - oshift;
298   }
299 
300   /* allocate space for column pointers */
301   nz = ia[nslim_col] + (!ishift);
302   PetscCall(PetscMalloc1(nz, &ja));
303   *jja = ja;
304 
305   /* loop over matrix putting into ja */
306   for (i1 = 0; i1 < nslim_row; i1++) {
307     row = ns_row[i1];
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row + 1] - ai[row];
312     while (nz-- > 0) {
313       /* ja[work[i1]++] = i2 + oshift; */
314       ja[work[i2]++] = i1 + oshift;
315       i2++;
316       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320   PetscCall(PetscFree(ns_col));
321   PetscCall(PetscFree(work));
322   PetscCall(PetscFree2(tns, tvc));
323   PetscFunctionReturn(PETSC_SUCCESS);
324 }
325 
MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)326 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327 {
328   PetscFunctionBegin;
329   PetscCall(MatCreateColInode_Private(A, n, NULL));
330   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
331 
332   if (!blockcompressed) {
333     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
334   } else if (symmetric) {
335     /* Since the indices are symmetric it doesn't matter */
336     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
337   } else {
338     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
339   }
340   PetscFunctionReturn(PETSC_SUCCESS);
341 }
342 
MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)343 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344 {
345   PetscFunctionBegin;
346   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
347   if (!blockcompressed) {
348     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
349   } else {
350     PetscCall(PetscFree(*ia));
351     PetscCall(PetscFree(*ja));
352   }
353   PetscFunctionReturn(PETSC_SUCCESS);
354 }
355 
MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)356 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357 {
358   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
359   PetscScalar       *y;
360   const PetscScalar *x;
361   PetscInt           row, node_max, nonzerorow = 0;
362   PetscInt          *ns;
363 
364   PetscFunctionBegin;
365   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
366   node_max = a->inode.node_count;
367   ns       = a->inode.size_csr; /* Node Size array */
368   PetscCall(VecGetArrayRead(xx, &x));
369   PetscCall(VecGetArray(yy, &y));
370 
371   PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372   for (PetscInt i = 0; i < node_max; ++i) {
373     PetscInt         i1, i2, nsz, n, sz;
374     const MatScalar *v1, *v2, *v3, *v4, *v5;
375     PetscScalar      sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376     const PetscInt  *idx;
377 
378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380 #endif
381     row = ns[i];
382     nsz = ns[i + 1] - ns[i];
383     n   = a->i[row + 1] - a->i[row];
384     nonzerorow += (n > 0) * nsz;
385 
386     idx = &a->j[a->i[row]];
387     v1  = &a->a[a->i[row]];
388     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
389     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
390     sz = n;                                                                /* No of non zeros in this row */
391                                                                            /* Switch on the size of Node */
392     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
393     case 1:
394       sum1 = 0.;
395 
396       for (n = 0; n < sz - 1; n += 2) {
397         i1 = idx[0]; /* The instructions are ordered to */
398         i2 = idx[1]; /* make the compiler's job easy */
399         idx += 2;
400         tmp0 = x[i1];
401         tmp1 = x[i2];
402         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
403         v1 += 2;
404       }
405 
406       if (n == sz - 1) { /* Take care of the last nonzero  */
407         tmp0 = x[*idx++];
408         sum1 += *v1++ * tmp0;
409       }
410       y[row++] = sum1;
411       break;
412     case 2:
413       sum1 = 0.;
414       sum2 = 0.;
415       v2   = v1 + n;
416 
417       for (n = 0; n < sz - 1; n += 2) {
418         i1 = idx[0];
419         i2 = idx[1];
420         idx += 2;
421         tmp0 = x[i1];
422         tmp1 = x[i2];
423         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
424         v1 += 2;
425         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
426         v2 += 2;
427       }
428       if (n == sz - 1) {
429         tmp0 = x[*idx++];
430         sum1 += *v1++ * tmp0;
431         sum2 += *v2++ * tmp0;
432       }
433       y[row++] = sum1;
434       y[row++] = sum2;
435       v1       = v2; /* Since the next block to be processed starts there*/
436       idx += sz;
437       break;
438     case 3:
439       sum1 = 0.;
440       sum2 = 0.;
441       sum3 = 0.;
442       v2   = v1 + n;
443       v3   = v2 + n;
444 
445       for (n = 0; n < sz - 1; n += 2) {
446         i1 = idx[0];
447         i2 = idx[1];
448         idx += 2;
449         tmp0 = x[i1];
450         tmp1 = x[i2];
451         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
452         v1 += 2;
453         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
454         v2 += 2;
455         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
456         v3 += 2;
457       }
458       if (n == sz - 1) {
459         tmp0 = x[*idx++];
460         sum1 += *v1++ * tmp0;
461         sum2 += *v2++ * tmp0;
462         sum3 += *v3++ * tmp0;
463       }
464       y[row++] = sum1;
465       y[row++] = sum2;
466       y[row++] = sum3;
467       v1       = v3; /* Since the next block to be processed starts there*/
468       idx += 2 * sz;
469       break;
470     case 4:
471       sum1 = 0.;
472       sum2 = 0.;
473       sum3 = 0.;
474       sum4 = 0.;
475       v2   = v1 + n;
476       v3   = v2 + n;
477       v4   = v3 + n;
478 
479       for (n = 0; n < sz - 1; n += 2) {
480         i1 = idx[0];
481         i2 = idx[1];
482         idx += 2;
483         tmp0 = x[i1];
484         tmp1 = x[i2];
485         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
486         v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
488         v2 += 2;
489         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
490         v3 += 2;
491         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
492         v4 += 2;
493       }
494       if (n == sz - 1) {
495         tmp0 = x[*idx++];
496         sum1 += *v1++ * tmp0;
497         sum2 += *v2++ * tmp0;
498         sum3 += *v3++ * tmp0;
499         sum4 += *v4++ * tmp0;
500       }
501       y[row++] = sum1;
502       y[row++] = sum2;
503       y[row++] = sum3;
504       y[row++] = sum4;
505       v1       = v4; /* Since the next block to be processed starts there*/
506       idx += 3 * sz;
507       break;
508     case 5:
509       sum1 = 0.;
510       sum2 = 0.;
511       sum3 = 0.;
512       sum4 = 0.;
513       sum5 = 0.;
514       v2   = v1 + n;
515       v3   = v2 + n;
516       v4   = v3 + n;
517       v5   = v4 + n;
518 
519       for (n = 0; n < sz - 1; n += 2) {
520         i1 = idx[0];
521         i2 = idx[1];
522         idx += 2;
523         tmp0 = x[i1];
524         tmp1 = x[i2];
525         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
526         v1 += 2;
527         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
528         v2 += 2;
529         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
530         v3 += 2;
531         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
532         v4 += 2;
533         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
534         v5 += 2;
535       }
536       if (n == sz - 1) {
537         tmp0 = x[*idx++];
538         sum1 += *v1++ * tmp0;
539         sum2 += *v2++ * tmp0;
540         sum3 += *v3++ * tmp0;
541         sum4 += *v4++ * tmp0;
542         sum5 += *v5++ * tmp0;
543       }
544       y[row++] = sum1;
545       y[row++] = sum2;
546       y[row++] = sum3;
547       y[row++] = sum4;
548       y[row++] = sum5;
549       v1       = v5; /* Since the next block to be processed starts there */
550       idx += 4 * sz;
551       break;
552     default:
553       SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
554     }
555   }
556   PetscCall(VecRestoreArrayRead(xx, &x));
557   PetscCall(VecRestoreArray(yy, &y));
558   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
559   PetscFunctionReturn(PETSC_SUCCESS);
560 }
561 
562 /* Almost same code as the MatMult_SeqAIJ_Inode() */
MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)563 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564 {
565   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
566   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
567   const MatScalar   *v1, *v2, *v3, *v4, *v5;
568   const PetscScalar *x;
569   PetscScalar       *y, *z, *zt;
570   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
571   const PetscInt    *idx, *ns, *ii;
572 
573   PetscFunctionBegin;
574   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
575   node_max = a->inode.node_count;
576   ns       = a->inode.size_csr; /* Node Size array */
577 
578   PetscCall(VecGetArrayRead(xx, &x));
579   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
580   zt = z;
581 
582   idx = a->j;
583   v1  = a->a;
584   ii  = a->i;
585 
586   for (i = 0; i < node_max; ++i) {
587     row = ns[i];
588     nsz = ns[i + 1] - ns[i];
589     n   = ii[1] - ii[0];
590     ii += nsz;
591     sz = n;        /* No of non zeros in this row */
592                    /* Switch on the size of Node */
593     switch (nsz) { /* Each loop in 'case' is unrolled */
594     case 1:
595       sum1 = *zt++;
596 
597       for (n = 0; n < sz - 1; n += 2) {
598         i1 = idx[0]; /* The instructions are ordered to */
599         i2 = idx[1]; /* make the compiler's job easy */
600         idx += 2;
601         tmp0 = x[i1];
602         tmp1 = x[i2];
603         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
604         v1 += 2;
605       }
606 
607       if (n == sz - 1) { /* Take care of the last nonzero  */
608         tmp0 = x[*idx++];
609         sum1 += *v1++ * tmp0;
610       }
611       y[row++] = sum1;
612       break;
613     case 2:
614       sum1 = *zt++;
615       sum2 = *zt++;
616       v2   = v1 + n;
617 
618       for (n = 0; n < sz - 1; n += 2) {
619         i1 = idx[0];
620         i2 = idx[1];
621         idx += 2;
622         tmp0 = x[i1];
623         tmp1 = x[i2];
624         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
625         v1 += 2;
626         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
627         v2 += 2;
628       }
629       if (n == sz - 1) {
630         tmp0 = x[*idx++];
631         sum1 += *v1++ * tmp0;
632         sum2 += *v2++ * tmp0;
633       }
634       y[row++] = sum1;
635       y[row++] = sum2;
636       v1       = v2; /* Since the next block to be processed starts there*/
637       idx += sz;
638       break;
639     case 3:
640       sum1 = *zt++;
641       sum2 = *zt++;
642       sum3 = *zt++;
643       v2   = v1 + n;
644       v3   = v2 + n;
645 
646       for (n = 0; n < sz - 1; n += 2) {
647         i1 = idx[0];
648         i2 = idx[1];
649         idx += 2;
650         tmp0 = x[i1];
651         tmp1 = x[i2];
652         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
653         v1 += 2;
654         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
655         v2 += 2;
656         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
657         v3 += 2;
658       }
659       if (n == sz - 1) {
660         tmp0 = x[*idx++];
661         sum1 += *v1++ * tmp0;
662         sum2 += *v2++ * tmp0;
663         sum3 += *v3++ * tmp0;
664       }
665       y[row++] = sum1;
666       y[row++] = sum2;
667       y[row++] = sum3;
668       v1       = v3; /* Since the next block to be processed starts there*/
669       idx += 2 * sz;
670       break;
671     case 4:
672       sum1 = *zt++;
673       sum2 = *zt++;
674       sum3 = *zt++;
675       sum4 = *zt++;
676       v2   = v1 + n;
677       v3   = v2 + n;
678       v4   = v3 + n;
679 
680       for (n = 0; n < sz - 1; n += 2) {
681         i1 = idx[0];
682         i2 = idx[1];
683         idx += 2;
684         tmp0 = x[i1];
685         tmp1 = x[i2];
686         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
687         v1 += 2;
688         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
689         v2 += 2;
690         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
691         v3 += 2;
692         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
693         v4 += 2;
694       }
695       if (n == sz - 1) {
696         tmp0 = x[*idx++];
697         sum1 += *v1++ * tmp0;
698         sum2 += *v2++ * tmp0;
699         sum3 += *v3++ * tmp0;
700         sum4 += *v4++ * tmp0;
701       }
702       y[row++] = sum1;
703       y[row++] = sum2;
704       y[row++] = sum3;
705       y[row++] = sum4;
706       v1       = v4; /* Since the next block to be processed starts there*/
707       idx += 3 * sz;
708       break;
709     case 5:
710       sum1 = *zt++;
711       sum2 = *zt++;
712       sum3 = *zt++;
713       sum4 = *zt++;
714       sum5 = *zt++;
715       v2   = v1 + n;
716       v3   = v2 + n;
717       v4   = v3 + n;
718       v5   = v4 + n;
719 
720       for (n = 0; n < sz - 1; n += 2) {
721         i1 = idx[0];
722         i2 = idx[1];
723         idx += 2;
724         tmp0 = x[i1];
725         tmp1 = x[i2];
726         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
727         v1 += 2;
728         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
729         v2 += 2;
730         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
731         v3 += 2;
732         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
733         v4 += 2;
734         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
735         v5 += 2;
736       }
737       if (n == sz - 1) {
738         tmp0 = x[*idx++];
739         sum1 += *v1++ * tmp0;
740         sum2 += *v2++ * tmp0;
741         sum3 += *v3++ * tmp0;
742         sum4 += *v4++ * tmp0;
743         sum5 += *v5++ * tmp0;
744       }
745       y[row++] = sum1;
746       y[row++] = sum2;
747       y[row++] = sum3;
748       y[row++] = sum4;
749       y[row++] = sum5;
750       v1       = v5; /* Since the next block to be processed starts there */
751       idx += 4 * sz;
752       break;
753     default:
754       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
755     }
756   }
757   PetscCall(VecRestoreArrayRead(xx, &x));
758   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
759   PetscCall(PetscLogFlops(2.0 * a->nz));
760   PetscFunctionReturn(PETSC_SUCCESS);
761 }
762 
MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)763 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764 {
765   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
766   IS                 iscol = a->col, isrow = a->row;
767   const PetscInt    *r, *c, *rout, *cout;
768   PetscInt           i, j, n = A->rmap->n, nz;
769   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
770   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
772   PetscScalar        sum1, sum2, sum3, sum4, sum5;
773   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774   const PetscScalar *b;
775 
776   PetscFunctionBegin;
777   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
778   node_max = a->inode.node_count;
779   ns       = a->inode.size_csr; /* Node Size array */
780 
781   PetscCall(VecGetArrayRead(bb, &b));
782   PetscCall(VecGetArrayWrite(xx, &x));
783   tmp = a->solve_work;
784 
785   PetscCall(ISGetIndices(isrow, &rout));
786   r = rout;
787   PetscCall(ISGetIndices(iscol, &cout));
788   c = cout + (n - 1);
789 
790   /* forward solve the lower triangular */
791   tmps = tmp;
792   aa   = a_a;
793   aj   = a_j;
794   ad   = a->diag;
795 
796   for (i = 0, row = 0; i < node_max; ++i) {
797     row = ns[i];
798     nsz = ns[i + 1] - ns[i];
799     aii = ai[row];
800     v1  = aa + aii;
801     vi  = aj + aii;
802     nz  = ad[row] - aii;
803     if (i < node_max - 1) {
804       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
805       * but our indexing to determine its size could. */
806       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
807       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
808       PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
809       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
810     }
811 
812     switch (nsz) { /* Each loop in 'case' is unrolled */
813     case 1:
814       sum1 = b[*r++];
815       for (j = 0; j < nz - 1; j += 2) {
816         i0 = vi[0];
817         i1 = vi[1];
818         vi += 2;
819         tmp0 = tmps[i0];
820         tmp1 = tmps[i1];
821         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
822         v1 += 2;
823       }
824       if (j == nz - 1) {
825         tmp0 = tmps[*vi++];
826         sum1 -= *v1++ * tmp0;
827       }
828       tmp[row++] = sum1;
829       break;
830     case 2:
831       sum1 = b[*r++];
832       sum2 = b[*r++];
833       v2   = aa + ai[row + 1];
834 
835       for (j = 0; j < nz - 1; j += 2) {
836         i0 = vi[0];
837         i1 = vi[1];
838         vi += 2;
839         tmp0 = tmps[i0];
840         tmp1 = tmps[i1];
841         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
842         v1 += 2;
843         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
844         v2 += 2;
845       }
846       if (j == nz - 1) {
847         tmp0 = tmps[*vi++];
848         sum1 -= *v1++ * tmp0;
849         sum2 -= *v2++ * tmp0;
850       }
851       sum2 -= *v2++ * sum1;
852       tmp[row++] = sum1;
853       tmp[row++] = sum2;
854       break;
855     case 3:
856       sum1 = b[*r++];
857       sum2 = b[*r++];
858       sum3 = b[*r++];
859       v2   = aa + ai[row + 1];
860       v3   = aa + ai[row + 2];
861 
862       for (j = 0; j < nz - 1; j += 2) {
863         i0 = vi[0];
864         i1 = vi[1];
865         vi += 2;
866         tmp0 = tmps[i0];
867         tmp1 = tmps[i1];
868         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
869         v1 += 2;
870         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
871         v2 += 2;
872         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
873         v3 += 2;
874       }
875       if (j == nz - 1) {
876         tmp0 = tmps[*vi++];
877         sum1 -= *v1++ * tmp0;
878         sum2 -= *v2++ * tmp0;
879         sum3 -= *v3++ * tmp0;
880       }
881       sum2 -= *v2++ * sum1;
882       sum3 -= *v3++ * sum1;
883       sum3 -= *v3++ * sum2;
884 
885       tmp[row++] = sum1;
886       tmp[row++] = sum2;
887       tmp[row++] = sum3;
888       break;
889 
890     case 4:
891       sum1 = b[*r++];
892       sum2 = b[*r++];
893       sum3 = b[*r++];
894       sum4 = b[*r++];
895       v2   = aa + ai[row + 1];
896       v3   = aa + ai[row + 2];
897       v4   = aa + ai[row + 3];
898 
899       for (j = 0; j < nz - 1; j += 2) {
900         i0 = vi[0];
901         i1 = vi[1];
902         vi += 2;
903         tmp0 = tmps[i0];
904         tmp1 = tmps[i1];
905         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
906         v1 += 2;
907         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
908         v2 += 2;
909         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
910         v3 += 2;
911         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
912         v4 += 2;
913       }
914       if (j == nz - 1) {
915         tmp0 = tmps[*vi++];
916         sum1 -= *v1++ * tmp0;
917         sum2 -= *v2++ * tmp0;
918         sum3 -= *v3++ * tmp0;
919         sum4 -= *v4++ * tmp0;
920       }
921       sum2 -= *v2++ * sum1;
922       sum3 -= *v3++ * sum1;
923       sum4 -= *v4++ * sum1;
924       sum3 -= *v3++ * sum2;
925       sum4 -= *v4++ * sum2;
926       sum4 -= *v4++ * sum3;
927 
928       tmp[row++] = sum1;
929       tmp[row++] = sum2;
930       tmp[row++] = sum3;
931       tmp[row++] = sum4;
932       break;
933     case 5:
934       sum1 = b[*r++];
935       sum2 = b[*r++];
936       sum3 = b[*r++];
937       sum4 = b[*r++];
938       sum5 = b[*r++];
939       v2   = aa + ai[row + 1];
940       v3   = aa + ai[row + 2];
941       v4   = aa + ai[row + 3];
942       v5   = aa + ai[row + 4];
943 
944       for (j = 0; j < nz - 1; j += 2) {
945         i0 = vi[0];
946         i1 = vi[1];
947         vi += 2;
948         tmp0 = tmps[i0];
949         tmp1 = tmps[i1];
950         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
951         v1 += 2;
952         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
953         v2 += 2;
954         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
955         v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
957         v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
959         v5 += 2;
960       }
961       if (j == nz - 1) {
962         tmp0 = tmps[*vi++];
963         sum1 -= *v1++ * tmp0;
964         sum2 -= *v2++ * tmp0;
965         sum3 -= *v3++ * tmp0;
966         sum4 -= *v4++ * tmp0;
967         sum5 -= *v5++ * tmp0;
968       }
969 
970       sum2 -= *v2++ * sum1;
971       sum3 -= *v3++ * sum1;
972       sum4 -= *v4++ * sum1;
973       sum5 -= *v5++ * sum1;
974       sum3 -= *v3++ * sum2;
975       sum4 -= *v4++ * sum2;
976       sum5 -= *v5++ * sum2;
977       sum4 -= *v4++ * sum3;
978       sum5 -= *v5++ * sum3;
979       sum5 -= *v5++ * sum4;
980 
981       tmp[row++] = sum1;
982       tmp[row++] = sum2;
983       tmp[row++] = sum3;
984       tmp[row++] = sum4;
985       tmp[row++] = sum5;
986       break;
987     default:
988       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
989     }
990   }
991   /* backward solve the upper triangular */
992   for (i = node_max - 1; i >= 0; i--) {
993     row = ns[i + 1];
994     nsz = ns[i + 1] - ns[i];
995     aii = ai[row + 1] - 1;
996     v1  = aa + aii;
997     vi  = aj + aii;
998     nz  = aii - ad[row];
999     switch (nsz) { /* Each loop in 'case' is unrolled */
1000     case 1:
1001       sum1 = tmp[row];
1002 
1003       for (j = nz; j > 1; j -= 2) {
1004         vi -= 2;
1005         i0   = vi[2];
1006         i1   = vi[1];
1007         tmp0 = tmps[i0];
1008         tmp1 = tmps[i1];
1009         v1 -= 2;
1010         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1011       }
1012       if (j == 1) {
1013         tmp0 = tmps[*vi--];
1014         sum1 -= *v1-- * tmp0;
1015       }
1016       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1017       row--;
1018       break;
1019     case 2:
1020       sum1 = tmp[row];
1021       sum2 = tmp[row - 1];
1022       v2   = aa + ai[row] - 1;
1023       for (j = nz; j > 1; j -= 2) {
1024         vi -= 2;
1025         i0   = vi[2];
1026         i1   = vi[1];
1027         tmp0 = tmps[i0];
1028         tmp1 = tmps[i1];
1029         v1 -= 2;
1030         v2 -= 2;
1031         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1032         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1033       }
1034       if (j == 1) {
1035         tmp0 = tmps[*vi--];
1036         sum1 -= *v1-- * tmp0;
1037         sum2 -= *v2-- * tmp0;
1038       }
1039 
1040       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1041       row--;
1042       sum2 -= *v2-- * tmp0;
1043       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1044       row--;
1045       break;
1046     case 3:
1047       sum1 = tmp[row];
1048       sum2 = tmp[row - 1];
1049       sum3 = tmp[row - 2];
1050       v2   = aa + ai[row] - 1;
1051       v3   = aa + ai[row - 1] - 1;
1052       for (j = nz; j > 1; j -= 2) {
1053         vi -= 2;
1054         i0   = vi[2];
1055         i1   = vi[1];
1056         tmp0 = tmps[i0];
1057         tmp1 = tmps[i1];
1058         v1 -= 2;
1059         v2 -= 2;
1060         v3 -= 2;
1061         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064       }
1065       if (j == 1) {
1066         tmp0 = tmps[*vi--];
1067         sum1 -= *v1-- * tmp0;
1068         sum2 -= *v2-- * tmp0;
1069         sum3 -= *v3-- * tmp0;
1070       }
1071       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1072       row--;
1073       sum2 -= *v2-- * tmp0;
1074       sum3 -= *v3-- * tmp0;
1075       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1076       row--;
1077       sum3 -= *v3-- * tmp0;
1078       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1079       row--;
1080 
1081       break;
1082     case 4:
1083       sum1 = tmp[row];
1084       sum2 = tmp[row - 1];
1085       sum3 = tmp[row - 2];
1086       sum4 = tmp[row - 3];
1087       v2   = aa + ai[row] - 1;
1088       v3   = aa + ai[row - 1] - 1;
1089       v4   = aa + ai[row - 2] - 1;
1090 
1091       for (j = nz; j > 1; j -= 2) {
1092         vi -= 2;
1093         i0   = vi[2];
1094         i1   = vi[1];
1095         tmp0 = tmps[i0];
1096         tmp1 = tmps[i1];
1097         v1 -= 2;
1098         v2 -= 2;
1099         v3 -= 2;
1100         v4 -= 2;
1101         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1102         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1103         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1104         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1105       }
1106       if (j == 1) {
1107         tmp0 = tmps[*vi--];
1108         sum1 -= *v1-- * tmp0;
1109         sum2 -= *v2-- * tmp0;
1110         sum3 -= *v3-- * tmp0;
1111         sum4 -= *v4-- * tmp0;
1112       }
1113 
1114       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1115       row--;
1116       sum2 -= *v2-- * tmp0;
1117       sum3 -= *v3-- * tmp0;
1118       sum4 -= *v4-- * tmp0;
1119       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1120       row--;
1121       sum3 -= *v3-- * tmp0;
1122       sum4 -= *v4-- * tmp0;
1123       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1124       row--;
1125       sum4 -= *v4-- * tmp0;
1126       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1127       row--;
1128       break;
1129     case 5:
1130       sum1 = tmp[row];
1131       sum2 = tmp[row - 1];
1132       sum3 = tmp[row - 2];
1133       sum4 = tmp[row - 3];
1134       sum5 = tmp[row - 4];
1135       v2   = aa + ai[row] - 1;
1136       v3   = aa + ai[row - 1] - 1;
1137       v4   = aa + ai[row - 2] - 1;
1138       v5   = aa + ai[row - 3] - 1;
1139       for (j = nz; j > 1; j -= 2) {
1140         vi -= 2;
1141         i0   = vi[2];
1142         i1   = vi[1];
1143         tmp0 = tmps[i0];
1144         tmp1 = tmps[i1];
1145         v1 -= 2;
1146         v2 -= 2;
1147         v3 -= 2;
1148         v4 -= 2;
1149         v5 -= 2;
1150         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1151         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1152         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1153         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1154         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1155       }
1156       if (j == 1) {
1157         tmp0 = tmps[*vi--];
1158         sum1 -= *v1-- * tmp0;
1159         sum2 -= *v2-- * tmp0;
1160         sum3 -= *v3-- * tmp0;
1161         sum4 -= *v4-- * tmp0;
1162         sum5 -= *v5-- * tmp0;
1163       }
1164 
1165       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
1166       row--;
1167       sum2 -= *v2-- * tmp0;
1168       sum3 -= *v3-- * tmp0;
1169       sum4 -= *v4-- * tmp0;
1170       sum5 -= *v5-- * tmp0;
1171       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
1172       row--;
1173       sum3 -= *v3-- * tmp0;
1174       sum4 -= *v4-- * tmp0;
1175       sum5 -= *v5-- * tmp0;
1176       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
1177       row--;
1178       sum4 -= *v4-- * tmp0;
1179       sum5 -= *v5-- * tmp0;
1180       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
1181       row--;
1182       sum5 -= *v5-- * tmp0;
1183       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
1184       row--;
1185       break;
1186     default:
1187       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
1188     }
1189   }
1190   PetscCall(ISRestoreIndices(isrow, &rout));
1191   PetscCall(ISRestoreIndices(iscol, &cout));
1192   PetscCall(VecRestoreArrayRead(bb, &b));
1193   PetscCall(VecRestoreArrayWrite(xx, &x));
1194   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1195   PetscFunctionReturn(PETSC_SUCCESS);
1196 }
1197 
MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo * info)1198 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199 {
1200   Mat              C = B;
1201   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1202   IS               isrow = b->row, isicol = b->icol;
1203   const PetscInt  *r, *ic, *ics;
1204   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
1205   PetscInt         i, j, k, nz, nzL, row, *pj;
1206   const PetscInt  *ajtmp, *bjtmp;
1207   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
1208   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
1209   FactorShiftCtx   sctx;
1210   const PetscInt  *ddiag;
1211   PetscReal        rs;
1212   MatScalar        d;
1213   PetscInt         inod, nodesz, node_max, col;
1214   const PetscInt  *ns;
1215   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
1216 
1217   PetscFunctionBegin;
1218   /* MatPivotSetUp(): initialize shift context sctx */
1219   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
1220 
1221   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1222     ddiag          = a->diag;
1223     sctx.shift_top = info->zeropivot;
1224     for (i = 0; i < n; i++) {
1225       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1226       d  = (aa)[ddiag[i]];
1227       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1228       v  = aa + ai[i];
1229       nz = ai[i + 1] - ai[i];
1230       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
1231       if (rs > sctx.shift_top) sctx.shift_top = rs;
1232     }
1233     sctx.shift_top *= 1.1;
1234     sctx.nshift_max = 5;
1235     sctx.shift_lo   = 0.;
1236     sctx.shift_hi   = 1.;
1237   }
1238 
1239   PetscCall(ISGetIndices(isrow, &r));
1240   PetscCall(ISGetIndices(isicol, &ic));
1241 
1242   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
1243   ics = ic;
1244 
1245   node_max = a->inode.node_count;
1246   ns       = a->inode.size_csr;
1247   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
1248 
1249   /* If max inode size > 4, split it into two inodes.*/
1250   /* also map the inode sizes according to the ordering */
1251   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
1252   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1253     nodesz = ns[i + 1] - ns[i];
1254     if (nodesz > 4) {
1255       tmp_vec1[j] = 4;
1256       ++j;
1257       tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
1258     } else {
1259       tmp_vec1[j] = nodesz;
1260     }
1261   }
1262   /* Use the correct node_max */
1263   node_max = j;
1264 
1265   /* Now reorder the inode info based on mat re-ordering info */
1266   /* First create a row -> inode_size_array_index map */
1267   PetscCall(PetscMalloc1(n + 1, &nsmap));
1268   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1269   tmp_vec2[0] = 0;
1270   for (i = 0, row = 0; i < node_max; i++) {
1271     nodesz = tmp_vec1[i];
1272     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
1273   }
1274   /* Using nsmap, create a reordered ns structure */
1275   for (i = 0, j = 0; i < node_max; i++) {
1276     nodesz          = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1277     tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
1278     j += nodesz;
1279   }
1280   PetscCall(PetscFree(nsmap));
1281   PetscCall(PetscFree(tmp_vec1));
1282 
1283   /* Now use the correct ns */
1284   ns = tmp_vec2;
1285 
1286   do {
1287     sctx.newshift = PETSC_FALSE;
1288     /* Now loop over each block-row, and do the factorization */
1289     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1290       nodesz = ns[inod + 1] - ns[inod];
1291 
1292       switch (nodesz) {
1293       case 1:
1294         /* zero rtmp1 */
1295         /* L part */
1296         nz    = bi[i + 1] - bi[i];
1297         bjtmp = bj + bi[i];
1298         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1299 
1300         /* U part */
1301         nz    = bdiag[i] - bdiag[i + 1];
1302         bjtmp = bj + bdiag[i + 1] + 1;
1303         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
1304 
1305         /* load in initial (unfactored row) */
1306         nz    = ai[r[i] + 1] - ai[r[i]];
1307         ajtmp = aj + ai[r[i]];
1308         v     = aa + ai[r[i]];
1309         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1310 
1311         /* ZeropivotApply() */
1312         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1313 
1314         /* elimination */
1315         bjtmp = bj + bi[i];
1316         row   = *bjtmp++;
1317         nzL   = bi[i + 1] - bi[i];
1318         for (k = 0; k < nzL; k++) {
1319           pc = rtmp1 + row;
1320           if (*pc != 0.0) {
1321             pv   = b->a + bdiag[row];
1322             mul1 = *pc * (*pv);
1323             *pc  = mul1;
1324             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1325             pv   = b->a + bdiag[row + 1] + 1;
1326             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1328             PetscCall(PetscLogFlops(1 + 2.0 * nz));
1329           }
1330           row = *bjtmp++;
1331         }
1332 
1333         /* finished row so stick it into b->a */
1334         rs = 0.0;
1335         /* L part */
1336         pv = b->a + bi[i];
1337         pj = b->j + bi[i];
1338         nz = bi[i + 1] - bi[i];
1339         for (j = 0; j < nz; j++) {
1340           pv[j] = rtmp1[pj[j]];
1341           rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* U part */
1345         pv = b->a + bdiag[i + 1] + 1;
1346         pj = b->j + bdiag[i + 1] + 1;
1347         nz = bdiag[i] - bdiag[i + 1] - 1;
1348         for (j = 0; j < nz; j++) {
1349           pv[j] = rtmp1[pj[j]];
1350           rs += PetscAbsScalar(pv[j]);
1351         }
1352 
1353         /* Check zero pivot */
1354         sctx.rs = rs;
1355         sctx.pv = rtmp1[i];
1356         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1357         if (sctx.newshift) break;
1358 
1359         /* Mark diagonal and invert diagonal for simpler triangular solves */
1360         pv  = b->a + bdiag[i];
1361         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1362         break;
1363 
1364       case 2:
1365         /* zero rtmp1 and rtmp2 */
1366         /* L part */
1367         nz    = bi[i + 1] - bi[i];
1368         bjtmp = bj + bi[i];
1369         for (j = 0; j < nz; j++) {
1370           col        = bjtmp[j];
1371           rtmp1[col] = 0.0;
1372           rtmp2[col] = 0.0;
1373         }
1374 
1375         /* U part */
1376         nz    = bdiag[i] - bdiag[i + 1];
1377         bjtmp = bj + bdiag[i + 1] + 1;
1378         for (j = 0; j < nz; j++) {
1379           col        = bjtmp[j];
1380           rtmp1[col] = 0.0;
1381           rtmp2[col] = 0.0;
1382         }
1383 
1384         /* load in initial (unfactored row) */
1385         nz    = ai[r[i] + 1] - ai[r[i]];
1386         ajtmp = aj + ai[r[i]];
1387         v1    = aa + ai[r[i]];
1388         v2    = aa + ai[r[i + 1]];
1389         for (j = 0; j < nz; j++) {
1390           col        = ics[ajtmp[j]];
1391           rtmp1[col] = v1[j];
1392           rtmp2[col] = v2[j];
1393         }
1394         /* ZeropivotApply(): shift the diagonal of the matrix  */
1395         rtmp1[i] += sctx.shift_amount;
1396         rtmp2[i + 1] += sctx.shift_amount;
1397 
1398         /* elimination */
1399         bjtmp = bj + bi[i];
1400         row   = *bjtmp++; /* pivot row */
1401         nzL   = bi[i + 1] - bi[i];
1402         for (k = 0; k < nzL; k++) {
1403           pc1 = rtmp1 + row;
1404           pc2 = rtmp2 + row;
1405           if (*pc1 != 0.0 || *pc2 != 0.0) {
1406             pv   = b->a + bdiag[row];
1407             mul1 = *pc1 * (*pv);
1408             mul2 = *pc2 * (*pv);
1409             *pc1 = mul1;
1410             *pc2 = mul2;
1411 
1412             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1413             pv = b->a + bdiag[row + 1] + 1;
1414             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1415             for (j = 0; j < nz; j++) {
1416               col = pj[j];
1417               rtmp1[col] -= mul1 * pv[j];
1418               rtmp2[col] -= mul2 * pv[j];
1419             }
1420             PetscCall(PetscLogFlops(2 + 4.0 * nz));
1421           }
1422           row = *bjtmp++;
1423         }
1424 
1425         /* finished row i; check zero pivot, then stick row i into b->a */
1426         rs = 0.0;
1427         /* L part */
1428         pc1 = b->a + bi[i];
1429         pj  = b->j + bi[i];
1430         nz  = bi[i + 1] - bi[i];
1431         for (j = 0; j < nz; j++) {
1432           col    = pj[j];
1433           pc1[j] = rtmp1[col];
1434           rs += PetscAbsScalar(pc1[j]);
1435         }
1436         /* U part */
1437         pc1 = b->a + bdiag[i + 1] + 1;
1438         pj  = b->j + bdiag[i + 1] + 1;
1439         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1440         for (j = 0; j < nz; j++) {
1441           col    = pj[j];
1442           pc1[j] = rtmp1[col];
1443           rs += PetscAbsScalar(pc1[j]);
1444         }
1445 
1446         sctx.rs = rs;
1447         sctx.pv = rtmp1[i];
1448         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1449         if (sctx.newshift) break;
1450         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1451         *pc1 = 1.0 / sctx.pv;
1452 
1453         /* Now take care of diagonal 2x2 block. */
1454         pc2 = rtmp2 + i;
1455         if (*pc2 != 0.0) {
1456           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1457           *pc2 = mul1;                        /* insert L entry */
1458           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1459           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460           for (j = 0; j < nz; j++) {
1461             col = pj[j];
1462             rtmp2[col] -= mul1 * rtmp1[col];
1463           }
1464           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465         }
1466 
1467         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468         rs = 0.0;
1469         /* L part */
1470         pc2 = b->a + bi[i + 1];
1471         pj  = b->j + bi[i + 1];
1472         nz  = bi[i + 2] - bi[i + 1];
1473         for (j = 0; j < nz; j++) {
1474           col    = pj[j];
1475           pc2[j] = rtmp2[col];
1476           rs += PetscAbsScalar(pc2[j]);
1477         }
1478         /* U part */
1479         pc2 = b->a + bdiag[i + 2] + 1;
1480         pj  = b->j + bdiag[i + 2] + 1;
1481         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482         for (j = 0; j < nz; j++) {
1483           col    = pj[j];
1484           pc2[j] = rtmp2[col];
1485           rs += PetscAbsScalar(pc2[j]);
1486         }
1487 
1488         sctx.rs = rs;
1489         sctx.pv = rtmp2[i + 1];
1490         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1491         if (sctx.newshift) break;
1492         pc2  = b->a + bdiag[i + 1];
1493         *pc2 = 1.0 / sctx.pv;
1494         break;
1495 
1496       case 3:
1497         /* zero rtmp */
1498         /* L part */
1499         nz    = bi[i + 1] - bi[i];
1500         bjtmp = bj + bi[i];
1501         for (j = 0; j < nz; j++) {
1502           col        = bjtmp[j];
1503           rtmp1[col] = 0.0;
1504           rtmp2[col] = 0.0;
1505           rtmp3[col] = 0.0;
1506         }
1507 
1508         /* U part */
1509         nz    = bdiag[i] - bdiag[i + 1];
1510         bjtmp = bj + bdiag[i + 1] + 1;
1511         for (j = 0; j < nz; j++) {
1512           col        = bjtmp[j];
1513           rtmp1[col] = 0.0;
1514           rtmp2[col] = 0.0;
1515           rtmp3[col] = 0.0;
1516         }
1517 
1518         /* load in initial (unfactored row) */
1519         nz    = ai[r[i] + 1] - ai[r[i]];
1520         ajtmp = aj + ai[r[i]];
1521         v1    = aa + ai[r[i]];
1522         v2    = aa + ai[r[i + 1]];
1523         v3    = aa + ai[r[i + 2]];
1524         for (j = 0; j < nz; j++) {
1525           col        = ics[ajtmp[j]];
1526           rtmp1[col] = v1[j];
1527           rtmp2[col] = v2[j];
1528           rtmp3[col] = v3[j];
1529         }
1530         /* ZeropivotApply(): shift the diagonal of the matrix  */
1531         rtmp1[i] += sctx.shift_amount;
1532         rtmp2[i + 1] += sctx.shift_amount;
1533         rtmp3[i + 2] += sctx.shift_amount;
1534 
1535         /* elimination */
1536         bjtmp = bj + bi[i];
1537         row   = *bjtmp++; /* pivot row */
1538         nzL   = bi[i + 1] - bi[i];
1539         for (k = 0; k < nzL; k++) {
1540           pc1 = rtmp1 + row;
1541           pc2 = rtmp2 + row;
1542           pc3 = rtmp3 + row;
1543           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1544             pv   = b->a + bdiag[row];
1545             mul1 = *pc1 * (*pv);
1546             mul2 = *pc2 * (*pv);
1547             mul3 = *pc3 * (*pv);
1548             *pc1 = mul1;
1549             *pc2 = mul2;
1550             *pc3 = mul3;
1551 
1552             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1553             pv = b->a + bdiag[row + 1] + 1;
1554             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1555             for (j = 0; j < nz; j++) {
1556               col = pj[j];
1557               rtmp1[col] -= mul1 * pv[j];
1558               rtmp2[col] -= mul2 * pv[j];
1559               rtmp3[col] -= mul3 * pv[j];
1560             }
1561             PetscCall(PetscLogFlops(3 + 6.0 * nz));
1562           }
1563           row = *bjtmp++;
1564         }
1565 
1566         /* finished row i; check zero pivot, then stick row i into b->a */
1567         rs = 0.0;
1568         /* L part */
1569         pc1 = b->a + bi[i];
1570         pj  = b->j + bi[i];
1571         nz  = bi[i + 1] - bi[i];
1572         for (j = 0; j < nz; j++) {
1573           col    = pj[j];
1574           pc1[j] = rtmp1[col];
1575           rs += PetscAbsScalar(pc1[j]);
1576         }
1577         /* U part */
1578         pc1 = b->a + bdiag[i + 1] + 1;
1579         pj  = b->j + bdiag[i + 1] + 1;
1580         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581         for (j = 0; j < nz; j++) {
1582           col    = pj[j];
1583           pc1[j] = rtmp1[col];
1584           rs += PetscAbsScalar(pc1[j]);
1585         }
1586 
1587         sctx.rs = rs;
1588         sctx.pv = rtmp1[i];
1589         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1590         if (sctx.newshift) break;
1591         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1592         *pc1 = 1.0 / sctx.pv;
1593 
1594         /* Now take care of 1st column of diagonal 3x3 block. */
1595         pc2 = rtmp2 + i;
1596         pc3 = rtmp3 + i;
1597         if (*pc2 != 0.0 || *pc3 != 0.0) {
1598           mul2 = (*pc2) * (*pc1);
1599           *pc2 = mul2;
1600           mul3 = (*pc3) * (*pc1);
1601           *pc3 = mul3;
1602           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1603           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1604           for (j = 0; j < nz; j++) {
1605             col = pj[j];
1606             rtmp2[col] -= mul2 * rtmp1[col];
1607             rtmp3[col] -= mul3 * rtmp1[col];
1608           }
1609           PetscCall(PetscLogFlops(2 + 4.0 * nz));
1610         }
1611 
1612         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613         rs = 0.0;
1614         /* L part */
1615         pc2 = b->a + bi[i + 1];
1616         pj  = b->j + bi[i + 1];
1617         nz  = bi[i + 2] - bi[i + 1];
1618         for (j = 0; j < nz; j++) {
1619           col    = pj[j];
1620           pc2[j] = rtmp2[col];
1621           rs += PetscAbsScalar(pc2[j]);
1622         }
1623         /* U part */
1624         pc2 = b->a + bdiag[i + 2] + 1;
1625         pj  = b->j + bdiag[i + 2] + 1;
1626         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627         for (j = 0; j < nz; j++) {
1628           col    = pj[j];
1629           pc2[j] = rtmp2[col];
1630           rs += PetscAbsScalar(pc2[j]);
1631         }
1632 
1633         sctx.rs = rs;
1634         sctx.pv = rtmp2[i + 1];
1635         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1636         if (sctx.newshift) break;
1637         pc2  = b->a + bdiag[i + 1];
1638         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1639 
1640         /* Now take care of 2nd column of diagonal 3x3 block. */
1641         pc3 = rtmp3 + i + 1;
1642         if (*pc3 != 0.0) {
1643           mul3 = (*pc3) * (*pc2);
1644           *pc3 = mul3;
1645           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1646           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1647           for (j = 0; j < nz; j++) {
1648             col = pj[j];
1649             rtmp3[col] -= mul3 * rtmp2[col];
1650           }
1651           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1652         }
1653 
1654         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1655         rs = 0.0;
1656         /* L part */
1657         pc3 = b->a + bi[i + 2];
1658         pj  = b->j + bi[i + 2];
1659         nz  = bi[i + 3] - bi[i + 2];
1660         for (j = 0; j < nz; j++) {
1661           col    = pj[j];
1662           pc3[j] = rtmp3[col];
1663           rs += PetscAbsScalar(pc3[j]);
1664         }
1665         /* U part */
1666         pc3 = b->a + bdiag[i + 3] + 1;
1667         pj  = b->j + bdiag[i + 3] + 1;
1668         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1669         for (j = 0; j < nz; j++) {
1670           col    = pj[j];
1671           pc3[j] = rtmp3[col];
1672           rs += PetscAbsScalar(pc3[j]);
1673         }
1674 
1675         sctx.rs = rs;
1676         sctx.pv = rtmp3[i + 2];
1677         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1678         if (sctx.newshift) break;
1679         pc3  = b->a + bdiag[i + 2];
1680         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1681         break;
1682       case 4:
1683         /* zero rtmp */
1684         /* L part */
1685         nz    = bi[i + 1] - bi[i];
1686         bjtmp = bj + bi[i];
1687         for (j = 0; j < nz; j++) {
1688           col        = bjtmp[j];
1689           rtmp1[col] = 0.0;
1690           rtmp2[col] = 0.0;
1691           rtmp3[col] = 0.0;
1692           rtmp4[col] = 0.0;
1693         }
1694 
1695         /* U part */
1696         nz    = bdiag[i] - bdiag[i + 1];
1697         bjtmp = bj + bdiag[i + 1] + 1;
1698         for (j = 0; j < nz; j++) {
1699           col        = bjtmp[j];
1700           rtmp1[col] = 0.0;
1701           rtmp2[col] = 0.0;
1702           rtmp3[col] = 0.0;
1703           rtmp4[col] = 0.0;
1704         }
1705 
1706         /* load in initial (unfactored row) */
1707         nz    = ai[r[i] + 1] - ai[r[i]];
1708         ajtmp = aj + ai[r[i]];
1709         v1    = aa + ai[r[i]];
1710         v2    = aa + ai[r[i + 1]];
1711         v3    = aa + ai[r[i + 2]];
1712         v4    = aa + ai[r[i + 3]];
1713         for (j = 0; j < nz; j++) {
1714           col        = ics[ajtmp[j]];
1715           rtmp1[col] = v1[j];
1716           rtmp2[col] = v2[j];
1717           rtmp3[col] = v3[j];
1718           rtmp4[col] = v4[j];
1719         }
1720         /* ZeropivotApply(): shift the diagonal of the matrix  */
1721         rtmp1[i] += sctx.shift_amount;
1722         rtmp2[i + 1] += sctx.shift_amount;
1723         rtmp3[i + 2] += sctx.shift_amount;
1724         rtmp4[i + 3] += sctx.shift_amount;
1725 
1726         /* elimination */
1727         bjtmp = bj + bi[i];
1728         row   = *bjtmp++; /* pivot row */
1729         nzL   = bi[i + 1] - bi[i];
1730         for (k = 0; k < nzL; k++) {
1731           pc1 = rtmp1 + row;
1732           pc2 = rtmp2 + row;
1733           pc3 = rtmp3 + row;
1734           pc4 = rtmp4 + row;
1735           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1736             pv   = b->a + bdiag[row];
1737             mul1 = *pc1 * (*pv);
1738             mul2 = *pc2 * (*pv);
1739             mul3 = *pc3 * (*pv);
1740             mul4 = *pc4 * (*pv);
1741             *pc1 = mul1;
1742             *pc2 = mul2;
1743             *pc3 = mul3;
1744             *pc4 = mul4;
1745 
1746             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
1747             pv = b->a + bdiag[row + 1] + 1;
1748             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1749             for (j = 0; j < nz; j++) {
1750               col = pj[j];
1751               rtmp1[col] -= mul1 * pv[j];
1752               rtmp2[col] -= mul2 * pv[j];
1753               rtmp3[col] -= mul3 * pv[j];
1754               rtmp4[col] -= mul4 * pv[j];
1755             }
1756             PetscCall(PetscLogFlops(4 + 8.0 * nz));
1757           }
1758           row = *bjtmp++;
1759         }
1760 
1761         /* finished row i; check zero pivot, then stick row i into b->a */
1762         rs = 0.0;
1763         /* L part */
1764         pc1 = b->a + bi[i];
1765         pj  = b->j + bi[i];
1766         nz  = bi[i + 1] - bi[i];
1767         for (j = 0; j < nz; j++) {
1768           col    = pj[j];
1769           pc1[j] = rtmp1[col];
1770           rs += PetscAbsScalar(pc1[j]);
1771         }
1772         /* U part */
1773         pc1 = b->a + bdiag[i + 1] + 1;
1774         pj  = b->j + bdiag[i + 1] + 1;
1775         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1776         for (j = 0; j < nz; j++) {
1777           col    = pj[j];
1778           pc1[j] = rtmp1[col];
1779           rs += PetscAbsScalar(pc1[j]);
1780         }
1781 
1782         sctx.rs = rs;
1783         sctx.pv = rtmp1[i];
1784         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
1785         if (sctx.newshift) break;
1786         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1787         *pc1 = 1.0 / sctx.pv;
1788 
1789         /* Now take care of 1st column of diagonal 4x4 block. */
1790         pc2 = rtmp2 + i;
1791         pc3 = rtmp3 + i;
1792         pc4 = rtmp4 + i;
1793         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1794           mul2 = (*pc2) * (*pc1);
1795           *pc2 = mul2;
1796           mul3 = (*pc3) * (*pc1);
1797           *pc3 = mul3;
1798           mul4 = (*pc4) * (*pc1);
1799           *pc4 = mul4;
1800           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1801           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1802           for (j = 0; j < nz; j++) {
1803             col = pj[j];
1804             rtmp2[col] -= mul2 * rtmp1[col];
1805             rtmp3[col] -= mul3 * rtmp1[col];
1806             rtmp4[col] -= mul4 * rtmp1[col];
1807           }
1808           PetscCall(PetscLogFlops(3 + 6.0 * nz));
1809         }
1810 
1811         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1812         rs = 0.0;
1813         /* L part */
1814         pc2 = b->a + bi[i + 1];
1815         pj  = b->j + bi[i + 1];
1816         nz  = bi[i + 2] - bi[i + 1];
1817         for (j = 0; j < nz; j++) {
1818           col    = pj[j];
1819           pc2[j] = rtmp2[col];
1820           rs += PetscAbsScalar(pc2[j]);
1821         }
1822         /* U part */
1823         pc2 = b->a + bdiag[i + 2] + 1;
1824         pj  = b->j + bdiag[i + 2] + 1;
1825         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1826         for (j = 0; j < nz; j++) {
1827           col    = pj[j];
1828           pc2[j] = rtmp2[col];
1829           rs += PetscAbsScalar(pc2[j]);
1830         }
1831 
1832         sctx.rs = rs;
1833         sctx.pv = rtmp2[i + 1];
1834         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
1835         if (sctx.newshift) break;
1836         pc2  = b->a + bdiag[i + 1];
1837         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1838 
1839         /* Now take care of 2nd column of diagonal 4x4 block. */
1840         pc3 = rtmp3 + i + 1;
1841         pc4 = rtmp4 + i + 1;
1842         if (*pc3 != 0.0 || *pc4 != 0.0) {
1843           mul3 = (*pc3) * (*pc2);
1844           *pc3 = mul3;
1845           mul4 = (*pc4) * (*pc2);
1846           *pc4 = mul4;
1847           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
1848           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
1849           for (j = 0; j < nz; j++) {
1850             col = pj[j];
1851             rtmp3[col] -= mul3 * rtmp2[col];
1852             rtmp4[col] -= mul4 * rtmp2[col];
1853           }
1854           PetscCall(PetscLogFlops(4.0 * nz));
1855         }
1856 
1857         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1858         rs = 0.0;
1859         /* L part */
1860         pc3 = b->a + bi[i + 2];
1861         pj  = b->j + bi[i + 2];
1862         nz  = bi[i + 3] - bi[i + 2];
1863         for (j = 0; j < nz; j++) {
1864           col    = pj[j];
1865           pc3[j] = rtmp3[col];
1866           rs += PetscAbsScalar(pc3[j]);
1867         }
1868         /* U part */
1869         pc3 = b->a + bdiag[i + 3] + 1;
1870         pj  = b->j + bdiag[i + 3] + 1;
1871         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
1872         for (j = 0; j < nz; j++) {
1873           col    = pj[j];
1874           pc3[j] = rtmp3[col];
1875           rs += PetscAbsScalar(pc3[j]);
1876         }
1877 
1878         sctx.rs = rs;
1879         sctx.pv = rtmp3[i + 2];
1880         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
1881         if (sctx.newshift) break;
1882         pc3  = b->a + bdiag[i + 2];
1883         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
1884 
1885         /* Now take care of 3rd column of diagonal 4x4 block. */
1886         pc4 = rtmp4 + i + 2;
1887         if (*pc4 != 0.0) {
1888           mul4 = (*pc4) * (*pc3);
1889           *pc4 = mul4;
1890           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
1891           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
1892           for (j = 0; j < nz; j++) {
1893             col = pj[j];
1894             rtmp4[col] -= mul4 * rtmp3[col];
1895           }
1896           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1897         }
1898 
1899         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1900         rs = 0.0;
1901         /* L part */
1902         pc4 = b->a + bi[i + 3];
1903         pj  = b->j + bi[i + 3];
1904         nz  = bi[i + 4] - bi[i + 3];
1905         for (j = 0; j < nz; j++) {
1906           col    = pj[j];
1907           pc4[j] = rtmp4[col];
1908           rs += PetscAbsScalar(pc4[j]);
1909         }
1910         /* U part */
1911         pc4 = b->a + bdiag[i + 4] + 1;
1912         pj  = b->j + bdiag[i + 4] + 1;
1913         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
1914         for (j = 0; j < nz; j++) {
1915           col    = pj[j];
1916           pc4[j] = rtmp4[col];
1917           rs += PetscAbsScalar(pc4[j]);
1918         }
1919 
1920         sctx.rs = rs;
1921         sctx.pv = rtmp4[i + 3];
1922         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
1923         if (sctx.newshift) break;
1924         pc4  = b->a + bdiag[i + 3];
1925         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
1926         break;
1927 
1928       default:
1929         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
1930       }
1931       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1932       i += nodesz;              /* Update the row */
1933     }
1934 
1935     /* MatPivotRefine() */
1936     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
1937       /*
1938        * if no shift in this attempt & shifting & started shifting & can refine,
1939        * then try lower shift
1940        */
1941       sctx.shift_hi       = sctx.shift_fraction;
1942       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
1943       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1944       sctx.newshift       = PETSC_TRUE;
1945       sctx.nshift++;
1946     }
1947   } while (sctx.newshift);
1948 
1949   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
1950   PetscCall(PetscFree(tmp_vec2));
1951   PetscCall(ISRestoreIndices(isicol, &ic));
1952   PetscCall(ISRestoreIndices(isrow, &r));
1953 
1954   if (b->inode.size_csr) {
1955     C->ops->solve = MatSolve_SeqAIJ_Inode;
1956   } else {
1957     C->ops->solve = MatSolve_SeqAIJ;
1958   }
1959   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1960   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1961   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1962   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1963   C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
1964   C->assembled              = PETSC_TRUE;
1965   C->preallocated           = PETSC_TRUE;
1966 
1967   PetscCall(PetscLogFlops(C->cmap->n));
1968 
1969   /* MatShiftView(A,info,&sctx) */
1970   if (sctx.nshift) {
1971     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
1972       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1974       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1976       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
1977     }
1978   }
1979   PetscFunctionReturn(PETSC_SUCCESS);
1980 }
1981 
MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)1982 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
1983 {
1984   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
1985   IS                 iscol = a->col, isrow = a->row;
1986   const PetscInt    *r, *c, *rout, *cout;
1987   PetscInt           i, j;
1988   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
1989   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
1990   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
1991   PetscScalar        sum1, sum2, sum3, sum4, sum5;
1992   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
1993   const PetscScalar *b;
1994 
1995   PetscFunctionBegin;
1996   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
1997   node_max = a->inode.node_count;
1998   ns       = a->inode.size_csr; /* Node Size array */
1999 
2000   PetscCall(VecGetArrayRead(bb, &b));
2001   PetscCall(VecGetArrayWrite(xx, &x));
2002   tmp = a->solve_work;
2003 
2004   PetscCall(ISGetIndices(isrow, &rout));
2005   r = rout;
2006   PetscCall(ISGetIndices(iscol, &cout));
2007   c = cout;
2008 
2009   /* forward solve the lower triangular */
2010   tmps = tmp;
2011   aa   = a_a;
2012   aj   = a_j;
2013   ad   = a->diag;
2014 
2015   for (i = 0; i < node_max; ++i) {
2016     row = ns[i];
2017     nsz = ns[i + 1] - ns[i];
2018     aii = ai[row];
2019     v1  = aa + aii;
2020     vi  = aj + aii;
2021     nz  = ai[row + 1] - ai[row];
2022 
2023     if (i < node_max - 1) {
2024       /* Prefetch the indices for the next block */
2025       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
2026       /* Prefetch the data for the next block */
2027       PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
2028     }
2029 
2030     switch (nsz) { /* Each loop in 'case' is unrolled */
2031     case 1:
2032       sum1 = b[r[row]];
2033       for (j = 0; j < nz - 1; j += 2) {
2034         i0   = vi[j];
2035         i1   = vi[j + 1];
2036         tmp0 = tmps[i0];
2037         tmp1 = tmps[i1];
2038         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2039       }
2040       if (j == nz - 1) {
2041         tmp0 = tmps[vi[j]];
2042         sum1 -= v1[j] * tmp0;
2043       }
2044       tmp[row++] = sum1;
2045       break;
2046     case 2:
2047       sum1 = b[r[row]];
2048       sum2 = b[r[row + 1]];
2049       v2   = aa + ai[row + 1];
2050 
2051       for (j = 0; j < nz - 1; j += 2) {
2052         i0   = vi[j];
2053         i1   = vi[j + 1];
2054         tmp0 = tmps[i0];
2055         tmp1 = tmps[i1];
2056         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2057         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2058       }
2059       if (j == nz - 1) {
2060         tmp0 = tmps[vi[j]];
2061         sum1 -= v1[j] * tmp0;
2062         sum2 -= v2[j] * tmp0;
2063       }
2064       sum2 -= v2[nz] * sum1;
2065       tmp[row++] = sum1;
2066       tmp[row++] = sum2;
2067       break;
2068     case 3:
2069       sum1 = b[r[row]];
2070       sum2 = b[r[row + 1]];
2071       sum3 = b[r[row + 2]];
2072       v2   = aa + ai[row + 1];
2073       v3   = aa + ai[row + 2];
2074 
2075       for (j = 0; j < nz - 1; j += 2) {
2076         i0   = vi[j];
2077         i1   = vi[j + 1];
2078         tmp0 = tmps[i0];
2079         tmp1 = tmps[i1];
2080         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2081         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2082         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2083       }
2084       if (j == nz - 1) {
2085         tmp0 = tmps[vi[j]];
2086         sum1 -= v1[j] * tmp0;
2087         sum2 -= v2[j] * tmp0;
2088         sum3 -= v3[j] * tmp0;
2089       }
2090       sum2 -= v2[nz] * sum1;
2091       sum3 -= v3[nz] * sum1;
2092       sum3 -= v3[nz + 1] * sum2;
2093       tmp[row++] = sum1;
2094       tmp[row++] = sum2;
2095       tmp[row++] = sum3;
2096       break;
2097 
2098     case 4:
2099       sum1 = b[r[row]];
2100       sum2 = b[r[row + 1]];
2101       sum3 = b[r[row + 2]];
2102       sum4 = b[r[row + 3]];
2103       v2   = aa + ai[row + 1];
2104       v3   = aa + ai[row + 2];
2105       v4   = aa + ai[row + 3];
2106 
2107       for (j = 0; j < nz - 1; j += 2) {
2108         i0   = vi[j];
2109         i1   = vi[j + 1];
2110         tmp0 = tmps[i0];
2111         tmp1 = tmps[i1];
2112         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2113         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2114         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2115         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2116       }
2117       if (j == nz - 1) {
2118         tmp0 = tmps[vi[j]];
2119         sum1 -= v1[j] * tmp0;
2120         sum2 -= v2[j] * tmp0;
2121         sum3 -= v3[j] * tmp0;
2122         sum4 -= v4[j] * tmp0;
2123       }
2124       sum2 -= v2[nz] * sum1;
2125       sum3 -= v3[nz] * sum1;
2126       sum4 -= v4[nz] * sum1;
2127       sum3 -= v3[nz + 1] * sum2;
2128       sum4 -= v4[nz + 1] * sum2;
2129       sum4 -= v4[nz + 2] * sum3;
2130 
2131       tmp[row++] = sum1;
2132       tmp[row++] = sum2;
2133       tmp[row++] = sum3;
2134       tmp[row++] = sum4;
2135       break;
2136     case 5:
2137       sum1 = b[r[row]];
2138       sum2 = b[r[row + 1]];
2139       sum3 = b[r[row + 2]];
2140       sum4 = b[r[row + 3]];
2141       sum5 = b[r[row + 4]];
2142       v2   = aa + ai[row + 1];
2143       v3   = aa + ai[row + 2];
2144       v4   = aa + ai[row + 3];
2145       v5   = aa + ai[row + 4];
2146 
2147       for (j = 0; j < nz - 1; j += 2) {
2148         i0   = vi[j];
2149         i1   = vi[j + 1];
2150         tmp0 = tmps[i0];
2151         tmp1 = tmps[i1];
2152         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2153         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2154         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2155         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2156         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2157       }
2158       if (j == nz - 1) {
2159         tmp0 = tmps[vi[j]];
2160         sum1 -= v1[j] * tmp0;
2161         sum2 -= v2[j] * tmp0;
2162         sum3 -= v3[j] * tmp0;
2163         sum4 -= v4[j] * tmp0;
2164         sum5 -= v5[j] * tmp0;
2165       }
2166 
2167       sum2 -= v2[nz] * sum1;
2168       sum3 -= v3[nz] * sum1;
2169       sum4 -= v4[nz] * sum1;
2170       sum5 -= v5[nz] * sum1;
2171       sum3 -= v3[nz + 1] * sum2;
2172       sum4 -= v4[nz + 1] * sum2;
2173       sum5 -= v5[nz + 1] * sum2;
2174       sum4 -= v4[nz + 2] * sum3;
2175       sum5 -= v5[nz + 2] * sum3;
2176       sum5 -= v5[nz + 3] * sum4;
2177 
2178       tmp[row++] = sum1;
2179       tmp[row++] = sum2;
2180       tmp[row++] = sum3;
2181       tmp[row++] = sum4;
2182       tmp[row++] = sum5;
2183       break;
2184     default:
2185       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2186     }
2187   }
2188   /* backward solve the upper triangular */
2189   for (i = node_max - 1; i >= 0; i--) {
2190     row = ns[i + 1] - 1;
2191     nsz = ns[i + 1] - ns[i];
2192     aii = ad[row + 1] + 1;
2193     v1  = aa + aii;
2194     vi  = aj + aii;
2195     nz  = ad[row] - ad[row + 1] - 1;
2196 
2197     if (i > 0) {
2198       /* Prefetch the indices for the next block */
2199       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2200       /* Prefetch the data for the next block */
2201       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
2202     }
2203 
2204     switch (nsz) { /* Each loop in 'case' is unrolled */
2205     case 1:
2206       sum1 = tmp[row];
2207 
2208       for (j = 0; j < nz - 1; j += 2) {
2209         i0   = vi[j];
2210         i1   = vi[j + 1];
2211         tmp0 = tmps[i0];
2212         tmp1 = tmps[i1];
2213         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2214       }
2215       if (j == nz - 1) {
2216         tmp0 = tmps[vi[j]];
2217         sum1 -= v1[j] * tmp0;
2218       }
2219       x[c[row]] = tmp[row] = sum1 * v1[nz];
2220       row--;
2221       break;
2222     case 2:
2223       sum1 = tmp[row];
2224       sum2 = tmp[row - 1];
2225       v2   = aa + ad[row] + 1;
2226       for (j = 0; j < nz - 1; j += 2) {
2227         i0   = vi[j];
2228         i1   = vi[j + 1];
2229         tmp0 = tmps[i0];
2230         tmp1 = tmps[i1];
2231         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2232         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2233       }
2234       if (j == nz - 1) {
2235         tmp0 = tmps[vi[j]];
2236         sum1 -= v1[j] * tmp0;
2237         sum2 -= v2[j + 1] * tmp0;
2238       }
2239 
2240       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2241       row--;
2242       sum2 -= v2[0] * tmp0;
2243       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2244       row--;
2245       break;
2246     case 3:
2247       sum1 = tmp[row];
2248       sum2 = tmp[row - 1];
2249       sum3 = tmp[row - 2];
2250       v2   = aa + ad[row] + 1;
2251       v3   = aa + ad[row - 1] + 1;
2252       for (j = 0; j < nz - 1; j += 2) {
2253         i0   = vi[j];
2254         i1   = vi[j + 1];
2255         tmp0 = tmps[i0];
2256         tmp1 = tmps[i1];
2257         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2258         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2259         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2260       }
2261       if (j == nz - 1) {
2262         tmp0 = tmps[vi[j]];
2263         sum1 -= v1[j] * tmp0;
2264         sum2 -= v2[j + 1] * tmp0;
2265         sum3 -= v3[j + 2] * tmp0;
2266       }
2267       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2268       row--;
2269       sum2 -= v2[0] * tmp0;
2270       sum3 -= v3[1] * tmp0;
2271       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2272       row--;
2273       sum3 -= v3[0] * tmp0;
2274       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2275       row--;
2276 
2277       break;
2278     case 4:
2279       sum1 = tmp[row];
2280       sum2 = tmp[row - 1];
2281       sum3 = tmp[row - 2];
2282       sum4 = tmp[row - 3];
2283       v2   = aa + ad[row] + 1;
2284       v3   = aa + ad[row - 1] + 1;
2285       v4   = aa + ad[row - 2] + 1;
2286 
2287       for (j = 0; j < nz - 1; j += 2) {
2288         i0   = vi[j];
2289         i1   = vi[j + 1];
2290         tmp0 = tmps[i0];
2291         tmp1 = tmps[i1];
2292         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2293         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2294         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2295         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2296       }
2297       if (j == nz - 1) {
2298         tmp0 = tmps[vi[j]];
2299         sum1 -= v1[j] * tmp0;
2300         sum2 -= v2[j + 1] * tmp0;
2301         sum3 -= v3[j + 2] * tmp0;
2302         sum4 -= v4[j + 3] * tmp0;
2303       }
2304 
2305       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2306       row--;
2307       sum2 -= v2[0] * tmp0;
2308       sum3 -= v3[1] * tmp0;
2309       sum4 -= v4[2] * tmp0;
2310       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2311       row--;
2312       sum3 -= v3[0] * tmp0;
2313       sum4 -= v4[1] * tmp0;
2314       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2315       row--;
2316       sum4 -= v4[0] * tmp0;
2317       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2318       row--;
2319       break;
2320     case 5:
2321       sum1 = tmp[row];
2322       sum2 = tmp[row - 1];
2323       sum3 = tmp[row - 2];
2324       sum4 = tmp[row - 3];
2325       sum5 = tmp[row - 4];
2326       v2   = aa + ad[row] + 1;
2327       v3   = aa + ad[row - 1] + 1;
2328       v4   = aa + ad[row - 2] + 1;
2329       v5   = aa + ad[row - 3] + 1;
2330       for (j = 0; j < nz - 1; j += 2) {
2331         i0   = vi[j];
2332         i1   = vi[j + 1];
2333         tmp0 = tmps[i0];
2334         tmp1 = tmps[i1];
2335         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2336         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2337         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2338         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2339         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2340       }
2341       if (j == nz - 1) {
2342         tmp0 = tmps[vi[j]];
2343         sum1 -= v1[j] * tmp0;
2344         sum2 -= v2[j + 1] * tmp0;
2345         sum3 -= v3[j + 2] * tmp0;
2346         sum4 -= v4[j + 3] * tmp0;
2347         sum5 -= v5[j + 4] * tmp0;
2348       }
2349 
2350       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
2351       row--;
2352       sum2 -= v2[0] * tmp0;
2353       sum3 -= v3[1] * tmp0;
2354       sum4 -= v4[2] * tmp0;
2355       sum5 -= v5[3] * tmp0;
2356       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
2357       row--;
2358       sum3 -= v3[0] * tmp0;
2359       sum4 -= v4[1] * tmp0;
2360       sum5 -= v5[2] * tmp0;
2361       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
2362       row--;
2363       sum4 -= v4[0] * tmp0;
2364       sum5 -= v5[1] * tmp0;
2365       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
2366       row--;
2367       sum5 -= v5[0] * tmp0;
2368       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
2369       row--;
2370       break;
2371     default:
2372       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2373     }
2374   }
2375   PetscCall(ISRestoreIndices(isrow, &rout));
2376   PetscCall(ISRestoreIndices(iscol, &cout));
2377   PetscCall(VecRestoreArrayRead(bb, &b));
2378   PetscCall(VecRestoreArrayWrite(xx, &x));
2379   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2380   PetscFunctionReturn(PETSC_SUCCESS);
2381 }
2382 
2383 /*
2384      Makes a longer coloring[] array and calls the usual code with that
2385 */
MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring * iscoloring)2386 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2387 {
2388   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2389   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
2390   PetscInt        *colorused, i;
2391   ISColoringValue *newcolor;
2392 
2393   PetscFunctionBegin;
2394   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2395   PetscCall(PetscMalloc1(n + 1, &newcolor));
2396   /* loop over inodes, marking a color for each column*/
2397   row = 0;
2398   for (i = 0; i < m; i++) {
2399     for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
2400   }
2401 
2402   /* eliminate unneeded colors */
2403   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2404   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
2405 
2406   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
2407   ncolors = colorused[5 * ncolors - 1];
2408   for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
2409   PetscCall(PetscFree(colorused));
2410   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
2411   PetscCall(PetscFree(coloring));
2412   PetscFunctionReturn(PETSC_SUCCESS);
2413 }
2414 
2415 #include <petsc/private/kernels/blockinvert.h>
2416 
2417 /*
2418    Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
2419 */
MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A,PetscScalar omega,PetscScalar fshift)2420 static PetscErrorCode MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A, PetscScalar omega, PetscScalar fshift)
2421 {
2422   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data;
2423   MatScalar       *ibdiag, *bdiag, work[25];
2424   const MatScalar *v         = a->a;
2425   PetscReal        zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2426   PetscInt         m = a->inode.node_count, cnt = 0, i, j, row, nodesz;
2427   PetscInt         k, ipvt[5];
2428   PetscBool        allowzeropivot = PetscNot(A->erroriffailure), zeropivotdetected;
2429   const PetscInt  *sizes          = a->inode.size_csr, *diag;
2430 
2431   PetscFunctionBegin;
2432   if (a->idiagState == ((PetscObject)A)->state) PetscFunctionReturn(PETSC_SUCCESS);
2433   PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &diag, NULL));
2434   if (!a->inode.ibdiag) {
2435     /* calculate space needed for diagonal blocks */
2436     for (i = 0; i < m; i++) {
2437       nodesz = sizes[i + 1] - sizes[i];
2438       cnt += nodesz * nodesz;
2439     }
2440     a->inode.bdiagsize = cnt;
2441     PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
2442   }
2443 
2444   /* copy over the diagonal blocks and invert them */
2445   ibdiag = a->inode.ibdiag;
2446   bdiag  = a->inode.bdiag;
2447   cnt    = 0;
2448   for (i = 0, row = 0; i < m; i++) {
2449     nodesz = sizes[i + 1] - sizes[i];
2450     for (j = 0; j < nodesz; j++) {
2451       for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
2452     }
2453     PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
2454 
2455     switch (nodesz) {
2456     case 1:
2457       /* Create matrix data structure */
2458       if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2459         PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
2460         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2461         A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2462         A->factorerror_zeropivot_row   = row;
2463         PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
2464       }
2465       ibdiag[cnt] = 1.0 / ibdiag[cnt];
2466       break;
2467     case 2:
2468       PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2469       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2470       break;
2471     case 3:
2472       PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2473       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2474       break;
2475     case 4:
2476       PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
2477       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2478       break;
2479     case 5:
2480       PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
2481       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2482       break;
2483     default:
2484       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2485     }
2486     cnt += nodesz * nodesz;
2487     row += nodesz;
2488   }
2489   a->inode.ibdiagState = ((PetscObject)A)->state;
2490   PetscFunctionReturn(PETSC_SUCCESS);
2491 }
2492 
MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)2493 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2494 {
2495   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
2496   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
2497   MatScalar         *ibdiag, *bdiag, *t;
2498   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
2499   const MatScalar   *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
2500   const PetscScalar *xb, *b;
2501   PetscInt           n, m = a->inode.node_count, cnt = 0, i, row, i1, i2, nodesz;
2502   PetscInt           sz;
2503   const PetscInt    *sizes = a->inode.size_csr, *idx, *diag, *ii = a->i;
2504 
2505   PetscFunctionBegin;
2506   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2507   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
2508   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
2509   PetscCall(MatInvertDiagonalForSOR_SeqAIJ_Inode(A, omega, fshift));
2510   diag = a->diag;
2511 
2512   ibdiag = a->inode.ibdiag;
2513   bdiag  = a->inode.bdiag;
2514   t      = a->inode.ssor_work;
2515 
2516   PetscCall(VecGetArray(xx, &x));
2517   PetscCall(VecGetArrayRead(bb, &b));
2518   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2519   if (flag & SOR_ZERO_INITIAL_GUESS) {
2520     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2521       for (i = 0, row = 0; i < m; i++) {
2522         sz  = diag[row] - ii[row];
2523         v1  = a->a + ii[row];
2524         idx = a->j + ii[row];
2525 
2526         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2527         nodesz = sizes[i + 1] - sizes[i];
2528         switch (nodesz) {
2529         case 1:
2530 
2531           sum1 = b[row];
2532           for (n = 0; n < sz - 1; n += 2) {
2533             i1 = idx[0];
2534             i2 = idx[1];
2535             idx += 2;
2536             tmp0 = x[i1];
2537             tmp1 = x[i2];
2538             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2539             v1 += 2;
2540           }
2541 
2542           if (n == sz - 1) {
2543             tmp0 = x[*idx];
2544             sum1 -= *v1 * tmp0;
2545           }
2546           t[row]   = sum1;
2547           x[row++] = sum1 * (*ibdiag++);
2548           break;
2549         case 2:
2550           v2   = a->a + ii[row + 1];
2551           sum1 = b[row];
2552           sum2 = b[row + 1];
2553           for (n = 0; n < sz - 1; n += 2) {
2554             i1 = idx[0];
2555             i2 = idx[1];
2556             idx += 2;
2557             tmp0 = x[i1];
2558             tmp1 = x[i2];
2559             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2560             v1 += 2;
2561             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2562             v2 += 2;
2563           }
2564 
2565           if (n == sz - 1) {
2566             tmp0 = x[*idx];
2567             sum1 -= v1[0] * tmp0;
2568             sum2 -= v2[0] * tmp0;
2569           }
2570           t[row]     = sum1;
2571           t[row + 1] = sum2;
2572           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2573           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2574           ibdiag += 4;
2575           break;
2576         case 3:
2577           v2   = a->a + ii[row + 1];
2578           v3   = a->a + ii[row + 2];
2579           sum1 = b[row];
2580           sum2 = b[row + 1];
2581           sum3 = b[row + 2];
2582           for (n = 0; n < sz - 1; n += 2) {
2583             i1 = idx[0];
2584             i2 = idx[1];
2585             idx += 2;
2586             tmp0 = x[i1];
2587             tmp1 = x[i2];
2588             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2589             v1 += 2;
2590             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2591             v2 += 2;
2592             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2593             v3 += 2;
2594           }
2595 
2596           if (n == sz - 1) {
2597             tmp0 = x[*idx];
2598             sum1 -= v1[0] * tmp0;
2599             sum2 -= v2[0] * tmp0;
2600             sum3 -= v3[0] * tmp0;
2601           }
2602           t[row]     = sum1;
2603           t[row + 1] = sum2;
2604           t[row + 2] = sum3;
2605           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2606           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2607           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2608           ibdiag += 9;
2609           break;
2610         case 4:
2611           v2   = a->a + ii[row + 1];
2612           v3   = a->a + ii[row + 2];
2613           v4   = a->a + ii[row + 3];
2614           sum1 = b[row];
2615           sum2 = b[row + 1];
2616           sum3 = b[row + 2];
2617           sum4 = b[row + 3];
2618           for (n = 0; n < sz - 1; n += 2) {
2619             i1 = idx[0];
2620             i2 = idx[1];
2621             idx += 2;
2622             tmp0 = x[i1];
2623             tmp1 = x[i2];
2624             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2625             v1 += 2;
2626             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2627             v2 += 2;
2628             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2629             v3 += 2;
2630             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2631             v4 += 2;
2632           }
2633 
2634           if (n == sz - 1) {
2635             tmp0 = x[*idx];
2636             sum1 -= v1[0] * tmp0;
2637             sum2 -= v2[0] * tmp0;
2638             sum3 -= v3[0] * tmp0;
2639             sum4 -= v4[0] * tmp0;
2640           }
2641           t[row]     = sum1;
2642           t[row + 1] = sum2;
2643           t[row + 2] = sum3;
2644           t[row + 3] = sum4;
2645           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
2646           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
2647           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
2648           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
2649           ibdiag += 16;
2650           break;
2651         case 5:
2652           v2   = a->a + ii[row + 1];
2653           v3   = a->a + ii[row + 2];
2654           v4   = a->a + ii[row + 3];
2655           v5   = a->a + ii[row + 4];
2656           sum1 = b[row];
2657           sum2 = b[row + 1];
2658           sum3 = b[row + 2];
2659           sum4 = b[row + 3];
2660           sum5 = b[row + 4];
2661           for (n = 0; n < sz - 1; n += 2) {
2662             i1 = idx[0];
2663             i2 = idx[1];
2664             idx += 2;
2665             tmp0 = x[i1];
2666             tmp1 = x[i2];
2667             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2668             v1 += 2;
2669             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2670             v2 += 2;
2671             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2672             v3 += 2;
2673             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2674             v4 += 2;
2675             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
2676             v5 += 2;
2677           }
2678 
2679           if (n == sz - 1) {
2680             tmp0 = x[*idx];
2681             sum1 -= v1[0] * tmp0;
2682             sum2 -= v2[0] * tmp0;
2683             sum3 -= v3[0] * tmp0;
2684             sum4 -= v4[0] * tmp0;
2685             sum5 -= v5[0] * tmp0;
2686           }
2687           t[row]     = sum1;
2688           t[row + 1] = sum2;
2689           t[row + 2] = sum3;
2690           t[row + 3] = sum4;
2691           t[row + 4] = sum5;
2692           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
2693           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
2694           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
2695           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
2696           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
2697           ibdiag += 25;
2698           break;
2699         default:
2700           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2701         }
2702       }
2703 
2704       xb = t;
2705       PetscCall(PetscLogFlops(a->nz));
2706     } else xb = b;
2707     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2708       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
2709       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
2710         nodesz = sizes[i + 1] - sizes[i];
2711         ibdiag -= nodesz * nodesz;
2712         sz  = ii[row + 1] - diag[row] - 1;
2713         v1  = a->a + diag[row] + 1;
2714         idx = a->j + diag[row] + 1;
2715 
2716         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2717         switch (nodesz) {
2718         case 1:
2719 
2720           sum1 = xb[row];
2721           for (n = 0; n < sz - 1; n += 2) {
2722             i1 = idx[0];
2723             i2 = idx[1];
2724             idx += 2;
2725             tmp0 = x[i1];
2726             tmp1 = x[i2];
2727             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2728             v1 += 2;
2729           }
2730 
2731           if (n == sz - 1) {
2732             tmp0 = x[*idx];
2733             sum1 -= *v1 * tmp0;
2734           }
2735           x[row--] = sum1 * (*ibdiag);
2736           break;
2737 
2738         case 2:
2739 
2740           sum1 = xb[row];
2741           sum2 = xb[row - 1];
2742           /* note that sum1 is associated with the second of the two rows */
2743           v2 = a->a + diag[row - 1] + 2;
2744           for (n = 0; n < sz - 1; n += 2) {
2745             i1 = idx[0];
2746             i2 = idx[1];
2747             idx += 2;
2748             tmp0 = x[i1];
2749             tmp1 = x[i2];
2750             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2751             v1 += 2;
2752             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2753             v2 += 2;
2754           }
2755 
2756           if (n == sz - 1) {
2757             tmp0 = x[*idx];
2758             sum1 -= *v1 * tmp0;
2759             sum2 -= *v2 * tmp0;
2760           }
2761           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
2762           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
2763           break;
2764         case 3:
2765 
2766           sum1 = xb[row];
2767           sum2 = xb[row - 1];
2768           sum3 = xb[row - 2];
2769           v2   = a->a + diag[row - 1] + 2;
2770           v3   = a->a + diag[row - 2] + 3;
2771           for (n = 0; n < sz - 1; n += 2) {
2772             i1 = idx[0];
2773             i2 = idx[1];
2774             idx += 2;
2775             tmp0 = x[i1];
2776             tmp1 = x[i2];
2777             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2778             v1 += 2;
2779             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2780             v2 += 2;
2781             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2782             v3 += 2;
2783           }
2784 
2785           if (n == sz - 1) {
2786             tmp0 = x[*idx];
2787             sum1 -= *v1 * tmp0;
2788             sum2 -= *v2 * tmp0;
2789             sum3 -= *v3 * tmp0;
2790           }
2791           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
2792           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
2793           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
2794           break;
2795         case 4:
2796 
2797           sum1 = xb[row];
2798           sum2 = xb[row - 1];
2799           sum3 = xb[row - 2];
2800           sum4 = xb[row - 3];
2801           v2   = a->a + diag[row - 1] + 2;
2802           v3   = a->a + diag[row - 2] + 3;
2803           v4   = a->a + diag[row - 3] + 4;
2804           for (n = 0; n < sz - 1; n += 2) {
2805             i1 = idx[0];
2806             i2 = idx[1];
2807             idx += 2;
2808             tmp0 = x[i1];
2809             tmp1 = x[i2];
2810             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2811             v1 += 2;
2812             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2813             v2 += 2;
2814             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2815             v3 += 2;
2816             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2817             v4 += 2;
2818           }
2819 
2820           if (n == sz - 1) {
2821             tmp0 = x[*idx];
2822             sum1 -= *v1 * tmp0;
2823             sum2 -= *v2 * tmp0;
2824             sum3 -= *v3 * tmp0;
2825             sum4 -= *v4 * tmp0;
2826           }
2827           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
2828           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
2829           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
2830           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
2831           break;
2832         case 5:
2833 
2834           sum1 = xb[row];
2835           sum2 = xb[row - 1];
2836           sum3 = xb[row - 2];
2837           sum4 = xb[row - 3];
2838           sum5 = xb[row - 4];
2839           v2   = a->a + diag[row - 1] + 2;
2840           v3   = a->a + diag[row - 2] + 3;
2841           v4   = a->a + diag[row - 3] + 4;
2842           v5   = a->a + diag[row - 4] + 5;
2843           for (n = 0; n < sz - 1; n += 2) {
2844             i1 = idx[0];
2845             i2 = idx[1];
2846             idx += 2;
2847             tmp0 = x[i1];
2848             tmp1 = x[i2];
2849             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2850             v1 += 2;
2851             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2852             v2 += 2;
2853             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
2854             v3 += 2;
2855             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
2856             v4 += 2;
2857             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
2858             v5 += 2;
2859           }
2860 
2861           if (n == sz - 1) {
2862             tmp0 = x[*idx];
2863             sum1 -= *v1 * tmp0;
2864             sum2 -= *v2 * tmp0;
2865             sum3 -= *v3 * tmp0;
2866             sum4 -= *v4 * tmp0;
2867             sum5 -= *v5 * tmp0;
2868           }
2869           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
2870           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
2871           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
2872           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
2873           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
2874           break;
2875         default:
2876           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
2877         }
2878       }
2879 
2880       PetscCall(PetscLogFlops(a->nz));
2881     }
2882     its--;
2883   }
2884   while (its--) {
2885     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2886       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
2887         nodesz = sizes[i + 1] - sizes[i];
2888         sz     = diag[row] - ii[row];
2889         v1     = a->a + ii[row];
2890         idx    = a->j + ii[row];
2891         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2892         switch (nodesz) {
2893         case 1:
2894           sum1 = b[row];
2895           for (n = 0; n < sz - 1; n += 2) {
2896             i1 = idx[0];
2897             i2 = idx[1];
2898             idx += 2;
2899             tmp0 = x[i1];
2900             tmp1 = x[i2];
2901             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2902             v1 += 2;
2903           }
2904           if (n == sz - 1) {
2905             tmp0 = x[*idx++];
2906             sum1 -= *v1 * tmp0;
2907             v1++;
2908           }
2909           t[row] = sum1;
2910           sz     = ii[row + 1] - diag[row] - 1;
2911           idx    = a->j + diag[row] + 1;
2912           v1 += 1;
2913           for (n = 0; n < sz - 1; n += 2) {
2914             i1 = idx[0];
2915             i2 = idx[1];
2916             idx += 2;
2917             tmp0 = x[i1];
2918             tmp1 = x[i2];
2919             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2920             v1 += 2;
2921           }
2922           if (n == sz - 1) {
2923             tmp0 = x[*idx++];
2924             sum1 -= *v1 * tmp0;
2925           }
2926           /* in MatSOR_SeqAIJ this line would be
2927            *
2928            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
2929            *
2930            * but omega == 1, so this becomes
2931            *
2932            * x[row] = sum1*(*ibdiag++);
2933            *
2934            */
2935           x[row] = sum1 * (*ibdiag);
2936           break;
2937         case 2:
2938           v2   = a->a + ii[row + 1];
2939           sum1 = b[row];
2940           sum2 = b[row + 1];
2941           for (n = 0; n < sz - 1; n += 2) {
2942             i1 = idx[0];
2943             i2 = idx[1];
2944             idx += 2;
2945             tmp0 = x[i1];
2946             tmp1 = x[i2];
2947             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2948             v1 += 2;
2949             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2950             v2 += 2;
2951           }
2952           if (n == sz - 1) {
2953             tmp0 = x[*idx++];
2954             sum1 -= v1[0] * tmp0;
2955             sum2 -= v2[0] * tmp0;
2956             v1++;
2957             v2++;
2958           }
2959           t[row]     = sum1;
2960           t[row + 1] = sum2;
2961           sz         = ii[row + 1] - diag[row] - 2;
2962           idx        = a->j + diag[row] + 2;
2963           v1 += 2;
2964           v2 += 2;
2965           for (n = 0; n < sz - 1; n += 2) {
2966             i1 = idx[0];
2967             i2 = idx[1];
2968             idx += 2;
2969             tmp0 = x[i1];
2970             tmp1 = x[i2];
2971             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2972             v1 += 2;
2973             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2974             v2 += 2;
2975           }
2976           if (n == sz - 1) {
2977             tmp0 = x[*idx];
2978             sum1 -= v1[0] * tmp0;
2979             sum2 -= v2[0] * tmp0;
2980           }
2981           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2982           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2983           break;
2984         case 3:
2985           v2   = a->a + ii[row + 1];
2986           v3   = a->a + ii[row + 2];
2987           sum1 = b[row];
2988           sum2 = b[row + 1];
2989           sum3 = b[row + 2];
2990           for (n = 0; n < sz - 1; n += 2) {
2991             i1 = idx[0];
2992             i2 = idx[1];
2993             idx += 2;
2994             tmp0 = x[i1];
2995             tmp1 = x[i2];
2996             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
2997             v1 += 2;
2998             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
2999             v2 += 2;
3000             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3001             v3 += 2;
3002           }
3003           if (n == sz - 1) {
3004             tmp0 = x[*idx++];
3005             sum1 -= v1[0] * tmp0;
3006             sum2 -= v2[0] * tmp0;
3007             sum3 -= v3[0] * tmp0;
3008             v1++;
3009             v2++;
3010             v3++;
3011           }
3012           t[row]     = sum1;
3013           t[row + 1] = sum2;
3014           t[row + 2] = sum3;
3015           sz         = ii[row + 1] - diag[row] - 3;
3016           idx        = a->j + diag[row] + 3;
3017           v1 += 3;
3018           v2 += 3;
3019           v3 += 3;
3020           for (n = 0; n < sz - 1; n += 2) {
3021             i1 = idx[0];
3022             i2 = idx[1];
3023             idx += 2;
3024             tmp0 = x[i1];
3025             tmp1 = x[i2];
3026             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3027             v1 += 2;
3028             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3029             v2 += 2;
3030             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3031             v3 += 2;
3032           }
3033           if (n == sz - 1) {
3034             tmp0 = x[*idx];
3035             sum1 -= v1[0] * tmp0;
3036             sum2 -= v2[0] * tmp0;
3037             sum3 -= v3[0] * tmp0;
3038           }
3039           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3040           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3041           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3042           break;
3043         case 4:
3044           v2   = a->a + ii[row + 1];
3045           v3   = a->a + ii[row + 2];
3046           v4   = a->a + ii[row + 3];
3047           sum1 = b[row];
3048           sum2 = b[row + 1];
3049           sum3 = b[row + 2];
3050           sum4 = b[row + 3];
3051           for (n = 0; n < sz - 1; n += 2) {
3052             i1 = idx[0];
3053             i2 = idx[1];
3054             idx += 2;
3055             tmp0 = x[i1];
3056             tmp1 = x[i2];
3057             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3058             v1 += 2;
3059             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3060             v2 += 2;
3061             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3062             v3 += 2;
3063             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3064             v4 += 2;
3065           }
3066           if (n == sz - 1) {
3067             tmp0 = x[*idx++];
3068             sum1 -= v1[0] * tmp0;
3069             sum2 -= v2[0] * tmp0;
3070             sum3 -= v3[0] * tmp0;
3071             sum4 -= v4[0] * tmp0;
3072             v1++;
3073             v2++;
3074             v3++;
3075             v4++;
3076           }
3077           t[row]     = sum1;
3078           t[row + 1] = sum2;
3079           t[row + 2] = sum3;
3080           t[row + 3] = sum4;
3081           sz         = ii[row + 1] - diag[row] - 4;
3082           idx        = a->j + diag[row] + 4;
3083           v1 += 4;
3084           v2 += 4;
3085           v3 += 4;
3086           v4 += 4;
3087           for (n = 0; n < sz - 1; n += 2) {
3088             i1 = idx[0];
3089             i2 = idx[1];
3090             idx += 2;
3091             tmp0 = x[i1];
3092             tmp1 = x[i2];
3093             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3094             v1 += 2;
3095             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3096             v2 += 2;
3097             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3098             v3 += 2;
3099             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3100             v4 += 2;
3101           }
3102           if (n == sz - 1) {
3103             tmp0 = x[*idx];
3104             sum1 -= v1[0] * tmp0;
3105             sum2 -= v2[0] * tmp0;
3106             sum3 -= v3[0] * tmp0;
3107             sum4 -= v4[0] * tmp0;
3108           }
3109           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3110           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3111           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3112           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3113           break;
3114         case 5:
3115           v2   = a->a + ii[row + 1];
3116           v3   = a->a + ii[row + 2];
3117           v4   = a->a + ii[row + 3];
3118           v5   = a->a + ii[row + 4];
3119           sum1 = b[row];
3120           sum2 = b[row + 1];
3121           sum3 = b[row + 2];
3122           sum4 = b[row + 3];
3123           sum5 = b[row + 4];
3124           for (n = 0; n < sz - 1; n += 2) {
3125             i1 = idx[0];
3126             i2 = idx[1];
3127             idx += 2;
3128             tmp0 = x[i1];
3129             tmp1 = x[i2];
3130             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3131             v1 += 2;
3132             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3133             v2 += 2;
3134             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3135             v3 += 2;
3136             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3137             v4 += 2;
3138             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3139             v5 += 2;
3140           }
3141           if (n == sz - 1) {
3142             tmp0 = x[*idx++];
3143             sum1 -= v1[0] * tmp0;
3144             sum2 -= v2[0] * tmp0;
3145             sum3 -= v3[0] * tmp0;
3146             sum4 -= v4[0] * tmp0;
3147             sum5 -= v5[0] * tmp0;
3148             v1++;
3149             v2++;
3150             v3++;
3151             v4++;
3152             v5++;
3153           }
3154           t[row]     = sum1;
3155           t[row + 1] = sum2;
3156           t[row + 2] = sum3;
3157           t[row + 3] = sum4;
3158           t[row + 4] = sum5;
3159           sz         = ii[row + 1] - diag[row] - 5;
3160           idx        = a->j + diag[row] + 5;
3161           v1 += 5;
3162           v2 += 5;
3163           v3 += 5;
3164           v4 += 5;
3165           v5 += 5;
3166           for (n = 0; n < sz - 1; n += 2) {
3167             i1 = idx[0];
3168             i2 = idx[1];
3169             idx += 2;
3170             tmp0 = x[i1];
3171             tmp1 = x[i2];
3172             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3173             v1 += 2;
3174             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3175             v2 += 2;
3176             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3177             v3 += 2;
3178             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3179             v4 += 2;
3180             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3181             v5 += 2;
3182           }
3183           if (n == sz - 1) {
3184             tmp0 = x[*idx];
3185             sum1 -= v1[0] * tmp0;
3186             sum2 -= v2[0] * tmp0;
3187             sum3 -= v3[0] * tmp0;
3188             sum4 -= v4[0] * tmp0;
3189             sum5 -= v5[0] * tmp0;
3190           }
3191           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3192           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3193           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3194           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3195           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3196           break;
3197         default:
3198           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3199         }
3200       }
3201       xb = t;
3202       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3203     } else xb = b;
3204 
3205     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3206       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3207       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3208         nodesz = sizes[i + 1] - sizes[i];
3209         ibdiag -= nodesz * nodesz;
3210 
3211         /* set RHS */
3212         if (xb == b) {
3213           /* whole (old way) */
3214           sz  = ii[row + 1] - ii[row];
3215           idx = a->j + ii[row];
3216           switch (nodesz) {
3217           case 5:
3218             v5 = a->a + ii[row - 4]; /* fall through */
3219           case 4:
3220             v4 = a->a + ii[row - 3]; /* fall through */
3221           case 3:
3222             v3 = a->a + ii[row - 2]; /* fall through */
3223           case 2:
3224             v2 = a->a + ii[row - 1]; /* fall through */
3225           case 1:
3226             v1 = a->a + ii[row];
3227             break;
3228           default:
3229             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3230           }
3231         } else {
3232           /* upper, no diag */
3233           sz  = ii[row + 1] - diag[row] - 1;
3234           idx = a->j + diag[row] + 1;
3235           switch (nodesz) {
3236           case 5:
3237             v5 = a->a + diag[row - 4] + 5; /* fall through */
3238           case 4:
3239             v4 = a->a + diag[row - 3] + 4; /* fall through */
3240           case 3:
3241             v3 = a->a + diag[row - 2] + 3; /* fall through */
3242           case 2:
3243             v2 = a->a + diag[row - 1] + 2; /* fall through */
3244           case 1:
3245             v1 = a->a + diag[row] + 1;
3246           }
3247         }
3248         /* set sum */
3249         switch (nodesz) {
3250         case 5:
3251           sum5 = xb[row - 4]; /* fall through */
3252         case 4:
3253           sum4 = xb[row - 3]; /* fall through */
3254         case 3:
3255           sum3 = xb[row - 2]; /* fall through */
3256         case 2:
3257           sum2 = xb[row - 1]; /* fall through */
3258         case 1:
3259           /* note that sum1 is associated with the last row */
3260           sum1 = xb[row];
3261         }
3262         /* do sums */
3263         for (n = 0; n < sz - 1; n += 2) {
3264           i1 = idx[0];
3265           i2 = idx[1];
3266           idx += 2;
3267           tmp0 = x[i1];
3268           tmp1 = x[i2];
3269           switch (nodesz) {
3270           case 5:
3271             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3272             v5 += 2; /* fall through */
3273           case 4:
3274             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3275             v4 += 2; /* fall through */
3276           case 3:
3277             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3278             v3 += 2; /* fall through */
3279           case 2:
3280             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3281             v2 += 2; /* fall through */
3282           case 1:
3283             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3284             v1 += 2;
3285           }
3286         }
3287         /* ragged edge */
3288         if (n == sz - 1) {
3289           tmp0 = x[*idx];
3290           switch (nodesz) {
3291           case 5:
3292             sum5 -= *v5 * tmp0; /* fall through */
3293           case 4:
3294             sum4 -= *v4 * tmp0; /* fall through */
3295           case 3:
3296             sum3 -= *v3 * tmp0; /* fall through */
3297           case 2:
3298             sum2 -= *v2 * tmp0; /* fall through */
3299           case 1:
3300             sum1 -= *v1 * tmp0;
3301           }
3302         }
3303         /* update */
3304         if (xb == b) {
3305           /* whole (old way) w/ diag */
3306           switch (nodesz) {
3307           case 5:
3308             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3309             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3310             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3311             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3312             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3313             break;
3314           case 4:
3315             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3316             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3317             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3318             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3319             break;
3320           case 3:
3321             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3322             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3323             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3324             break;
3325           case 2:
3326             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3327             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3328             break;
3329           case 1:
3330             x[row--] += sum1 * (*ibdiag);
3331             break;
3332           }
3333         } else {
3334           /* no diag so set =  */
3335           switch (nodesz) {
3336           case 5:
3337             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3338             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3339             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3340             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3341             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3342             break;
3343           case 4:
3344             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3345             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3346             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3347             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3348             break;
3349           case 3:
3350             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3351             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3352             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3353             break;
3354           case 2:
3355             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3356             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3357             break;
3358           case 1:
3359             x[row--] = sum1 * (*ibdiag);
3360             break;
3361           }
3362         }
3363       }
3364       if (xb == b) {
3365         PetscCall(PetscLogFlops(2.0 * a->nz));
3366       } else {
3367         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3368       }
3369     }
3370   }
3371   if (flag & SOR_EISENSTAT) {
3372     /*
3373           Apply  (U + D)^-1  where D is now the block diagonal
3374     */
3375     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3376     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3377       nodesz = sizes[i + 1] - sizes[i];
3378       ibdiag -= nodesz * nodesz;
3379       sz  = ii[row + 1] - diag[row] - 1;
3380       v1  = a->a + diag[row] + 1;
3381       idx = a->j + diag[row] + 1;
3382       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3383       switch (nodesz) {
3384       case 1:
3385 
3386         sum1 = b[row];
3387         for (n = 0; n < sz - 1; n += 2) {
3388           i1 = idx[0];
3389           i2 = idx[1];
3390           idx += 2;
3391           tmp0 = x[i1];
3392           tmp1 = x[i2];
3393           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3394           v1 += 2;
3395         }
3396 
3397         if (n == sz - 1) {
3398           tmp0 = x[*idx];
3399           sum1 -= *v1 * tmp0;
3400         }
3401         x[row] = sum1 * (*ibdiag);
3402         row--;
3403         break;
3404 
3405       case 2:
3406 
3407         sum1 = b[row];
3408         sum2 = b[row - 1];
3409         /* note that sum1 is associated with the second of the two rows */
3410         v2 = a->a + diag[row - 1] + 2;
3411         for (n = 0; n < sz - 1; n += 2) {
3412           i1 = idx[0];
3413           i2 = idx[1];
3414           idx += 2;
3415           tmp0 = x[i1];
3416           tmp1 = x[i2];
3417           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3418           v1 += 2;
3419           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3420           v2 += 2;
3421         }
3422 
3423         if (n == sz - 1) {
3424           tmp0 = x[*idx];
3425           sum1 -= *v1 * tmp0;
3426           sum2 -= *v2 * tmp0;
3427         }
3428         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3429         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3430         row -= 2;
3431         break;
3432       case 3:
3433 
3434         sum1 = b[row];
3435         sum2 = b[row - 1];
3436         sum3 = b[row - 2];
3437         v2   = a->a + diag[row - 1] + 2;
3438         v3   = a->a + diag[row - 2] + 3;
3439         for (n = 0; n < sz - 1; n += 2) {
3440           i1 = idx[0];
3441           i2 = idx[1];
3442           idx += 2;
3443           tmp0 = x[i1];
3444           tmp1 = x[i2];
3445           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3446           v1 += 2;
3447           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3448           v2 += 2;
3449           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3450           v3 += 2;
3451         }
3452 
3453         if (n == sz - 1) {
3454           tmp0 = x[*idx];
3455           sum1 -= *v1 * tmp0;
3456           sum2 -= *v2 * tmp0;
3457           sum3 -= *v3 * tmp0;
3458         }
3459         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3460         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3461         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3462         row -= 3;
3463         break;
3464       case 4:
3465 
3466         sum1 = b[row];
3467         sum2 = b[row - 1];
3468         sum3 = b[row - 2];
3469         sum4 = b[row - 3];
3470         v2   = a->a + diag[row - 1] + 2;
3471         v3   = a->a + diag[row - 2] + 3;
3472         v4   = a->a + diag[row - 3] + 4;
3473         for (n = 0; n < sz - 1; n += 2) {
3474           i1 = idx[0];
3475           i2 = idx[1];
3476           idx += 2;
3477           tmp0 = x[i1];
3478           tmp1 = x[i2];
3479           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3480           v1 += 2;
3481           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3482           v2 += 2;
3483           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3484           v3 += 2;
3485           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3486           v4 += 2;
3487         }
3488 
3489         if (n == sz - 1) {
3490           tmp0 = x[*idx];
3491           sum1 -= *v1 * tmp0;
3492           sum2 -= *v2 * tmp0;
3493           sum3 -= *v3 * tmp0;
3494           sum4 -= *v4 * tmp0;
3495         }
3496         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3497         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3498         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3499         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3500         row -= 4;
3501         break;
3502       case 5:
3503 
3504         sum1 = b[row];
3505         sum2 = b[row - 1];
3506         sum3 = b[row - 2];
3507         sum4 = b[row - 3];
3508         sum5 = b[row - 4];
3509         v2   = a->a + diag[row - 1] + 2;
3510         v3   = a->a + diag[row - 2] + 3;
3511         v4   = a->a + diag[row - 3] + 4;
3512         v5   = a->a + diag[row - 4] + 5;
3513         for (n = 0; n < sz - 1; n += 2) {
3514           i1 = idx[0];
3515           i2 = idx[1];
3516           idx += 2;
3517           tmp0 = x[i1];
3518           tmp1 = x[i2];
3519           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3520           v1 += 2;
3521           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3522           v2 += 2;
3523           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3524           v3 += 2;
3525           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3526           v4 += 2;
3527           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3528           v5 += 2;
3529         }
3530 
3531         if (n == sz - 1) {
3532           tmp0 = x[*idx];
3533           sum1 -= *v1 * tmp0;
3534           sum2 -= *v2 * tmp0;
3535           sum3 -= *v3 * tmp0;
3536           sum4 -= *v4 * tmp0;
3537           sum5 -= *v5 * tmp0;
3538         }
3539         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3540         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3541         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3542         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3543         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3544         row -= 5;
3545         break;
3546       default:
3547         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3548       }
3549     }
3550     PetscCall(PetscLogFlops(a->nz));
3551 
3552     /*
3553            t = b - D x    where D is the block diagonal
3554     */
3555     cnt = 0;
3556     for (i = 0, row = 0; i < m; i++) {
3557       nodesz = sizes[i + 1] - sizes[i];
3558       switch (nodesz) {
3559       case 1:
3560         t[row] = b[row] - bdiag[cnt++] * x[row];
3561         row++;
3562         break;
3563       case 2:
3564         x1         = x[row];
3565         x2         = x[row + 1];
3566         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3567         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3568         t[row]     = b[row] - tmp1;
3569         t[row + 1] = b[row + 1] - tmp2;
3570         row += 2;
3571         cnt += 4;
3572         break;
3573       case 3:
3574         x1         = x[row];
3575         x2         = x[row + 1];
3576         x3         = x[row + 2];
3577         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3578         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3579         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3580         t[row]     = b[row] - tmp1;
3581         t[row + 1] = b[row + 1] - tmp2;
3582         t[row + 2] = b[row + 2] - tmp3;
3583         row += 3;
3584         cnt += 9;
3585         break;
3586       case 4:
3587         x1         = x[row];
3588         x2         = x[row + 1];
3589         x3         = x[row + 2];
3590         x4         = x[row + 3];
3591         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3592         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3593         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3594         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3595         t[row]     = b[row] - tmp1;
3596         t[row + 1] = b[row + 1] - tmp2;
3597         t[row + 2] = b[row + 2] - tmp3;
3598         t[row + 3] = b[row + 3] - tmp4;
3599         row += 4;
3600         cnt += 16;
3601         break;
3602       case 5:
3603         x1         = x[row];
3604         x2         = x[row + 1];
3605         x3         = x[row + 2];
3606         x4         = x[row + 3];
3607         x5         = x[row + 4];
3608         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3609         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3610         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3611         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3612         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3613         t[row]     = b[row] - tmp1;
3614         t[row + 1] = b[row + 1] - tmp2;
3615         t[row + 2] = b[row + 2] - tmp3;
3616         t[row + 3] = b[row + 3] - tmp4;
3617         t[row + 4] = b[row + 4] - tmp5;
3618         row += 5;
3619         cnt += 25;
3620         break;
3621       default:
3622         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3623       }
3624     }
3625     PetscCall(PetscLogFlops(m));
3626 
3627     /*
3628           Apply (L + D)^-1 where D is the block diagonal
3629     */
3630     for (i = 0, row = 0; i < m; i++) {
3631       nodesz = sizes[i + 1] - sizes[i];
3632       sz     = diag[row] - ii[row];
3633       v1     = a->a + ii[row];
3634       idx    = a->j + ii[row];
3635       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3636       switch (nodesz) {
3637       case 1:
3638 
3639         sum1 = t[row];
3640         for (n = 0; n < sz - 1; n += 2) {
3641           i1 = idx[0];
3642           i2 = idx[1];
3643           idx += 2;
3644           tmp0 = t[i1];
3645           tmp1 = t[i2];
3646           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3647           v1 += 2;
3648         }
3649 
3650         if (n == sz - 1) {
3651           tmp0 = t[*idx];
3652           sum1 -= *v1 * tmp0;
3653         }
3654         x[row] += t[row] = sum1 * (*ibdiag++);
3655         row++;
3656         break;
3657       case 2:
3658         v2   = a->a + ii[row + 1];
3659         sum1 = t[row];
3660         sum2 = t[row + 1];
3661         for (n = 0; n < sz - 1; n += 2) {
3662           i1 = idx[0];
3663           i2 = idx[1];
3664           idx += 2;
3665           tmp0 = t[i1];
3666           tmp1 = t[i2];
3667           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3668           v1 += 2;
3669           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3670           v2 += 2;
3671         }
3672 
3673         if (n == sz - 1) {
3674           tmp0 = t[*idx];
3675           sum1 -= v1[0] * tmp0;
3676           sum2 -= v2[0] * tmp0;
3677         }
3678         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3679         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
3680         ibdiag += 4;
3681         row += 2;
3682         break;
3683       case 3:
3684         v2   = a->a + ii[row + 1];
3685         v3   = a->a + ii[row + 2];
3686         sum1 = t[row];
3687         sum2 = t[row + 1];
3688         sum3 = t[row + 2];
3689         for (n = 0; n < sz - 1; n += 2) {
3690           i1 = idx[0];
3691           i2 = idx[1];
3692           idx += 2;
3693           tmp0 = t[i1];
3694           tmp1 = t[i2];
3695           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3696           v1 += 2;
3697           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3698           v2 += 2;
3699           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3700           v3 += 2;
3701         }
3702 
3703         if (n == sz - 1) {
3704           tmp0 = t[*idx];
3705           sum1 -= v1[0] * tmp0;
3706           sum2 -= v2[0] * tmp0;
3707           sum3 -= v3[0] * tmp0;
3708         }
3709         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3710         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3711         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3712         ibdiag += 9;
3713         row += 3;
3714         break;
3715       case 4:
3716         v2   = a->a + ii[row + 1];
3717         v3   = a->a + ii[row + 2];
3718         v4   = a->a + ii[row + 3];
3719         sum1 = t[row];
3720         sum2 = t[row + 1];
3721         sum3 = t[row + 2];
3722         sum4 = t[row + 3];
3723         for (n = 0; n < sz - 1; n += 2) {
3724           i1 = idx[0];
3725           i2 = idx[1];
3726           idx += 2;
3727           tmp0 = t[i1];
3728           tmp1 = t[i2];
3729           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3730           v1 += 2;
3731           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3732           v2 += 2;
3733           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3734           v3 += 2;
3735           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3736           v4 += 2;
3737         }
3738 
3739         if (n == sz - 1) {
3740           tmp0 = t[*idx];
3741           sum1 -= v1[0] * tmp0;
3742           sum2 -= v2[0] * tmp0;
3743           sum3 -= v3[0] * tmp0;
3744           sum4 -= v4[0] * tmp0;
3745         }
3746         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3747         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3748         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3749         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3750         ibdiag += 16;
3751         row += 4;
3752         break;
3753       case 5:
3754         v2   = a->a + ii[row + 1];
3755         v3   = a->a + ii[row + 2];
3756         v4   = a->a + ii[row + 3];
3757         v5   = a->a + ii[row + 4];
3758         sum1 = t[row];
3759         sum2 = t[row + 1];
3760         sum3 = t[row + 2];
3761         sum4 = t[row + 3];
3762         sum5 = t[row + 4];
3763         for (n = 0; n < sz - 1; n += 2) {
3764           i1 = idx[0];
3765           i2 = idx[1];
3766           idx += 2;
3767           tmp0 = t[i1];
3768           tmp1 = t[i2];
3769           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3770           v1 += 2;
3771           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3772           v2 += 2;
3773           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3774           v3 += 2;
3775           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3776           v4 += 2;
3777           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3778           v5 += 2;
3779         }
3780 
3781         if (n == sz - 1) {
3782           tmp0 = t[*idx];
3783           sum1 -= v1[0] * tmp0;
3784           sum2 -= v2[0] * tmp0;
3785           sum3 -= v3[0] * tmp0;
3786           sum4 -= v4[0] * tmp0;
3787           sum5 -= v5[0] * tmp0;
3788         }
3789         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3790         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3791         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3792         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3793         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3794         ibdiag += 25;
3795         row += 5;
3796         break;
3797       default:
3798         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3799       }
3800     }
3801     PetscCall(PetscLogFlops(a->nz));
3802   }
3803   PetscCall(VecRestoreArray(xx, &x));
3804   PetscCall(VecRestoreArrayRead(bb, &b));
3805   PetscFunctionReturn(PETSC_SUCCESS);
3806 }
3807 
MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)3808 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
3809 {
3810   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
3811   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
3812   const MatScalar   *bdiag = a->inode.bdiag;
3813   const PetscScalar *b;
3814   PetscInt           m = a->inode.node_count, cnt = 0, i, row, nodesz;
3815   const PetscInt    *sizes = a->inode.size_csr;
3816 
3817   PetscFunctionBegin;
3818   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
3819   PetscCall(VecGetArray(xx, &x));
3820   PetscCall(VecGetArrayRead(bb, &b));
3821   cnt = 0;
3822   for (i = 0, row = 0; i < m; i++) {
3823     nodesz = sizes[i + 1] - sizes[i];
3824     switch (nodesz) {
3825     case 1:
3826       x[row] = b[row] * bdiag[cnt++];
3827       row++;
3828       break;
3829     case 2:
3830       x1       = b[row];
3831       x2       = b[row + 1];
3832       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
3833       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
3834       x[row++] = tmp1;
3835       x[row++] = tmp2;
3836       cnt += 4;
3837       break;
3838     case 3:
3839       x1       = b[row];
3840       x2       = b[row + 1];
3841       x3       = b[row + 2];
3842       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
3843       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
3844       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
3845       x[row++] = tmp1;
3846       x[row++] = tmp2;
3847       x[row++] = tmp3;
3848       cnt += 9;
3849       break;
3850     case 4:
3851       x1       = b[row];
3852       x2       = b[row + 1];
3853       x3       = b[row + 2];
3854       x4       = b[row + 3];
3855       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
3856       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
3857       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
3858       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
3859       x[row++] = tmp1;
3860       x[row++] = tmp2;
3861       x[row++] = tmp3;
3862       x[row++] = tmp4;
3863       cnt += 16;
3864       break;
3865     case 5:
3866       x1       = b[row];
3867       x2       = b[row + 1];
3868       x3       = b[row + 2];
3869       x4       = b[row + 3];
3870       x5       = b[row + 4];
3871       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
3872       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
3873       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
3874       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
3875       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
3876       x[row++] = tmp1;
3877       x[row++] = tmp2;
3878       x[row++] = tmp3;
3879       x[row++] = tmp4;
3880       x[row++] = tmp5;
3881       cnt += 25;
3882       break;
3883     default:
3884       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3885     }
3886   }
3887   PetscCall(PetscLogFlops(2.0 * cnt));
3888   PetscCall(VecRestoreArray(xx, &x));
3889   PetscCall(VecRestoreArrayRead(bb, &b));
3890   PetscFunctionReturn(PETSC_SUCCESS);
3891 }
3892 
MatSeqAIJ_Inode_ResetOps(Mat A)3893 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
3894 {
3895   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3896 
3897   PetscFunctionBegin;
3898   a->inode.node_count       = 0;
3899   a->inode.use              = PETSC_FALSE;
3900   a->inode.checked          = PETSC_FALSE;
3901   a->inode.mat_nonzerostate = -1;
3902   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
3903   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
3904   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
3905   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
3906   A->ops->coloringpatch     = NULL;
3907   A->ops->multdiagonalblock = NULL;
3908   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
3909   PetscFunctionReturn(PETSC_SUCCESS);
3910 }
3911 
3912 /*
3913     samestructure indicates that the matrix has not changed its nonzero structure so we
3914     do not need to recompute the inodes
3915 */
MatSeqAIJCheckInode(Mat A)3916 PetscErrorCode MatSeqAIJCheckInode(Mat A)
3917 {
3918   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
3919   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
3920   PetscBool       flag;
3921   const PetscInt *idx, *idy, *ii;
3922 
3923   PetscFunctionBegin;
3924   if (!a->inode.use) {
3925     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
3926     PetscCall(PetscFree(a->inode.size_csr));
3927     PetscFunctionReturn(PETSC_SUCCESS);
3928   }
3929   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
3930 
3931   m = A->rmap->n;
3932   if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
3933   ns    = a->inode.size_csr;
3934   ns[0] = 0;
3935 
3936   i          = 0;
3937   node_count = 0;
3938   idx        = a->j;
3939   ii         = a->i;
3940   if (idx) {
3941     while (i < m) {            /* For each row */
3942       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
3943       /* Limits the number of elements in a node to 'a->inode.limit' */
3944       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
3945         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
3946         if (nzy != nzx) break;
3947         idy += nzx; /* Same nonzero pattern */
3948         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
3949         if (!flag) break;
3950       }
3951       ns[node_count + 1] = ns[node_count] + blk_size;
3952       node_count++;
3953       idx += blk_size * nzx;
3954       i = j;
3955     }
3956   }
3957   /* If not enough inodes found,, do not use inode version of the routines */
3958   if (!m || !idx || node_count > .8 * m) {
3959     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
3960     PetscCall(PetscFree(a->inode.size_csr));
3961     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
3962   } else {
3963     if (!A->factortype) {
3964       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3965       if (A->rmap->n == A->cmap->n) {
3966         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
3967         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
3968         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
3969         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
3970         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
3971       }
3972     } else {
3973       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
3974     }
3975     a->inode.node_count = node_count;
3976     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
3977   }
3978   a->inode.checked          = PETSC_TRUE;
3979   a->inode.mat_nonzerostate = A->nonzerostate;
3980   PetscFunctionReturn(PETSC_SUCCESS);
3981 }
3982 
MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat * C)3983 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
3984 {
3985   Mat         B = *C;
3986   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
3987   PetscInt    m = A->rmap->n;
3988 
3989   PetscFunctionBegin;
3990   c->inode.use              = a->inode.use;
3991   c->inode.limit            = a->inode.limit;
3992   c->inode.max_limit        = a->inode.max_limit;
3993   c->inode.checked          = PETSC_FALSE;
3994   c->inode.size_csr         = NULL;
3995   c->inode.node_count       = 0;
3996   c->inode.ibdiag           = NULL;
3997   c->inode.bdiag            = NULL;
3998   c->inode.mat_nonzerostate = -1;
3999   if (a->inode.use) {
4000     if (a->inode.checked && a->inode.size_csr) {
4001       PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4002       PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4003 
4004       c->inode.checked          = PETSC_TRUE;
4005       c->inode.node_count       = a->inode.node_count;
4006       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4007     }
4008     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4009     if (!B->factortype) {
4010       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4011       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4012       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4013       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4014       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4015       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4016     } else {
4017       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4018     }
4019   }
4020   PetscFunctionReturn(PETSC_SUCCESS);
4021 }
4022 
MatGetRow_FactoredLU(PetscInt * cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt * ai,const PetscInt * aj,const PetscInt * adiag,PetscInt row)4023 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4024 {
4025   PetscInt        k;
4026   const PetscInt *vi;
4027 
4028   PetscFunctionBegin;
4029   vi = aj + ai[row];
4030   for (k = 0; k < nzl; k++) cols[k] = vi[k];
4031   vi        = aj + adiag[row];
4032   cols[nzl] = vi[0];
4033   vi        = aj + adiag[row + 1] + 1;
4034   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
4035   PetscFunctionReturn(PETSC_SUCCESS);
4036 }
4037 /*
4038    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4039    Modified from MatSeqAIJCheckInode().
4040 
4041    Input Parameters:
4042 .  Mat A - ILU or LU matrix factor
4043 
4044 */
MatSeqAIJCheckInode_FactorLU(Mat A)4045 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4046 {
4047   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4048   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
4049   PetscInt       *cols1, *cols2, *ns;
4050   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4051   PetscBool       flag;
4052 
4053   PetscFunctionBegin;
4054   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
4055   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4056 
4057   m = A->rmap->n;
4058   if (a->inode.size_csr) ns = a->inode.size_csr;
4059   else PetscCall(PetscMalloc1(m + 1, &ns));
4060   ns[0] = 0;
4061 
4062   i          = 0;
4063   node_count = 0;
4064   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4065   while (i < m) {                       /* For each row */
4066     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4067     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4068     nzx  = nzl1 + nzu1 + 1;
4069     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4070 
4071     /* Limits the number of elements in a node to 'a->inode.limit' */
4072     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4073       nzl2 = ai[j + 1] - ai[j];
4074       nzu2 = adiag[j] - adiag[j + 1] - 1;
4075       nzy  = nzl2 + nzu2 + 1;
4076       if (nzy != nzx) break;
4077       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
4078       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
4079       if (!flag) break;
4080     }
4081     ns[node_count + 1] = ns[node_count] + blk_size;
4082     node_count++;
4083     i = j;
4084   }
4085   PetscCall(PetscFree2(cols1, cols2));
4086   /* If not enough inodes found,, do not use inode version of the routines */
4087   if (!m || node_count > .8 * m) {
4088     PetscCall(PetscFree(ns));
4089 
4090     a->inode.node_count = 0;
4091     a->inode.size_csr   = NULL;
4092     a->inode.use        = PETSC_FALSE;
4093 
4094     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4095   } else {
4096     A->ops->mult              = NULL;
4097     A->ops->sor               = NULL;
4098     A->ops->multadd           = NULL;
4099     A->ops->getrowij          = NULL;
4100     A->ops->restorerowij      = NULL;
4101     A->ops->getcolumnij       = NULL;
4102     A->ops->restorecolumnij   = NULL;
4103     A->ops->coloringpatch     = NULL;
4104     A->ops->multdiagonalblock = NULL;
4105     a->inode.node_count       = node_count;
4106     a->inode.size_csr         = ns;
4107     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4108   }
4109   a->inode.checked = PETSC_TRUE;
4110   PetscFunctionReturn(PETSC_SUCCESS);
4111 }
4112 
4113 /*
4114      This is really ugly. if inodes are used this replaces the
4115   permutations with ones that correspond to rows/cols of the matrix
4116   rather than inode blocks
4117 */
MatInodeAdjustForInodes(Mat A,IS * rperm,IS * cperm)4118 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4119 {
4120   PetscFunctionBegin;
4121   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
4122   PetscFunctionReturn(PETSC_SUCCESS);
4123 }
4124 
MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS * rperm,IS * cperm)4125 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4126 {
4127   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4128   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
4129   const PetscInt *ridx, *cidx;
4130   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
4131   PetscInt        nslim_col, *ns_col;
4132   IS              ris = *rperm, cis = *cperm;
4133 
4134   PetscFunctionBegin;
4135   if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS);       /* no inodes so return */
4136   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
4137 
4138   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
4139   PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
4140   PetscCall(PetscMalloc2(m, &permr, n, &permc));
4141 
4142   PetscCall(ISGetIndices(ris, &ridx));
4143   PetscCall(ISGetIndices(cis, &cidx));
4144 
4145   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4146   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
4147 
4148   /* Construct the permutations for rows*/
4149   for (i = 0, row = 0; i < nslim_row; ++i) {
4150     indx      = ridx[i];
4151     start_val = tns[indx];
4152     end_val   = tns[indx + 1];
4153     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
4154   }
4155 
4156   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4157   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
4158 
4159   /* Construct permutations for columns */
4160   for (i = 0, col = 0; i < nslim_col; ++i) {
4161     indx      = cidx[i];
4162     start_val = tns[indx];
4163     end_val   = tns[indx + 1];
4164     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
4165   }
4166 
4167   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
4168   PetscCall(ISSetPermutation(*rperm));
4169   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
4170   PetscCall(ISSetPermutation(*cperm));
4171 
4172   PetscCall(ISRestoreIndices(ris, &ridx));
4173   PetscCall(ISRestoreIndices(cis, &cidx));
4174 
4175   PetscCall(PetscFree(ns_col));
4176   PetscCall(PetscFree2(permr, permc));
4177   PetscCall(ISDestroy(&cis));
4178   PetscCall(ISDestroy(&ris));
4179   PetscCall(PetscFree(tns));
4180   PetscFunctionReturn(PETSC_SUCCESS);
4181 }
4182 
4183 /*@C
4184   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
4185 
4186   Not Collective
4187 
4188   Input Parameter:
4189 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
4190 
4191   Output Parameters:
4192 + node_count - no of inodes present in the matrix.
4193 . sizes      - an array of size `node_count`, with the sizes of each inode.
4194 - limit      - the max size used to generate the inodes.
4195 
4196   Level: advanced
4197 
4198   Note:
4199   It should be called after the matrix is assembled.
4200   The contents of the sizes[] array should not be changed.
4201   `NULL` may be passed for information not needed
4202 
4203 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
4204 @*/
MatInodeGetInodeSizes(Mat A,PetscInt * node_count,PetscInt * sizes[],PetscInt * limit)4205 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4206 {
4207   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
4208 
4209   PetscFunctionBegin;
4210   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
4211   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
4212   if (f) PetscCall((*f)(A, node_count, sizes, limit));
4213   PetscFunctionReturn(PETSC_SUCCESS);
4214 }
4215 
MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt * node_count,PetscInt * sizes[],PetscInt * limit)4216 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4217 {
4218   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4219 
4220   PetscFunctionBegin;
4221   if (node_count) *node_count = a->inode.node_count;
4222   if (sizes) *sizes = a->inode.size_csr;
4223   if (limit) *limit = a->inode.limit;
4224   PetscFunctionReturn(PETSC_SUCCESS);
4225 }
4226