xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 76be6f4ff3bd4e251c19fc00ebbebfd58b6e7589)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9 {
10   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
11   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
12 
13   PetscFunctionBegin;
14   n = A->cmap->n;
15   m = A->rmap->n;
16   PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
17   ns_row = a->inode.size;
18 
19   min_mn = (m < n) ? m : n;
20   if (!ns) {
21     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
22     for (; count+1 < n; count++,i++) ;
23     if (count < n)  {
24       i++;
25     }
26     *size = i;
27     PetscFunctionReturn(0);
28   }
29   PetscCall(PetscMalloc1(n+1,&ns_col));
30 
31   /* Use the same row structure wherever feasible. */
32   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
33     ns_col[i] = ns_row[i];
34   }
35 
36   /* if m < n; pad up the remainder with inode_limit */
37   for (; count+1 < n; count++,i++) {
38     ns_col[i] = 1;
39   }
40   /* The last node is the odd ball. padd it up with the remaining rows; */
41   if (count < n) {
42     ns_col[i] = n - count;
43     i++;
44   } else if (count > n) {
45     /* Adjust for the over estimation */
46     ns_col[i-1] += n - count;
47   }
48   *size = i;
49   *ns   = ns_col;
50   PetscFunctionReturn(0);
51 }
52 
53 /*
54       This builds symmetric version of nonzero structure,
55 */
56 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
57 {
58   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
59   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
60   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
61   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
62 
63   PetscFunctionBegin;
64   nslim_row = a->inode.node_count;
65   m         = A->rmap->n;
66   n         = A->cmap->n;
67   PetscCheck(m == n,PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
68   PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
69 
70   /* Use the row_inode as column_inode */
71   nslim_col = nslim_row;
72   ns_col    = ns_row;
73 
74   /* allocate space for reformated inode structure */
75   PetscCall(PetscMalloc2(nslim_col+1,&tns,n+1,&tvc));
76   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
77 
78   for (i1=0,col=0; i1<nslim_col; ++i1) {
79     nsz = ns_col[i1];
80     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
81   }
82   /* allocate space for row pointers */
83   PetscCall(PetscCalloc1(nslim_row+1,&ia));
84   *iia = ia;
85   PetscCall(PetscMalloc1(nslim_row+1,&work));
86 
87   /* determine the number of columns in each row */
88   ia[0] = oshift;
89   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
90 
91     j    = aj + ai[row] + ishift;
92     jmax = aj + ai[row+1] + ishift;
93     if (j==jmax) continue; /* empty row */
94     col  = *j++ + ishift;
95     i2   = tvc[col];
96     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
97       ia[i1+1]++;
98       ia[i2+1]++;
99       i2++;                     /* Start col of next node */
100       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
101       i2 = tvc[col];
102     }
103     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
104   }
105 
106   /* shift ia[i] to point to next row */
107   for (i1=1; i1<nslim_row+1; i1++) {
108     row        = ia[i1-1];
109     ia[i1]    += row;
110     work[i1-1] = row - oshift;
111   }
112 
113   /* allocate space for column pointers */
114   nz   = ia[nslim_row] + (!ishift);
115   PetscCall(PetscMalloc1(nz,&ja));
116   *jja = ja;
117 
118   /* loop over lower triangular part putting into ja */
119   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
120     j    = aj + ai[row] + ishift;
121     jmax = aj + ai[row+1] + ishift;
122     if (j==jmax) continue; /* empty row */
123     col  = *j++ + ishift;
124     i2   = tvc[col];
125     while (i2<i1 && j<jmax) {
126       ja[work[i2]++] = i1 + oshift;
127       ja[work[i1]++] = i2 + oshift;
128       ++i2;
129       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
130       i2 = tvc[col];
131     }
132     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
133 
134   }
135   PetscCall(PetscFree(work));
136   PetscCall(PetscFree2(tns,tvc));
137   PetscFunctionReturn(0);
138 }
139 
140 /*
141       This builds nonsymmetric version of nonzero structure,
142 */
143 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
144 {
145   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
146   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
147   PetscInt       *tns,*tvc,nsz,i1,i2;
148   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
149 
150   PetscFunctionBegin;
151   PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
152   nslim_row = a->inode.node_count;
153   n         = A->cmap->n;
154 
155   /* Create The column_inode for this matrix */
156   PetscCall(MatCreateColInode_Private(A,&nslim_col,&ns_col));
157 
158   /* allocate space for reformated column_inode structure */
159   PetscCall(PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc));
160   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
161 
162   for (i1=0,col=0; i1<nslim_col; ++i1) {
163     nsz = ns_col[i1];
164     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
165   }
166   /* allocate space for row pointers */
167   PetscCall(PetscCalloc1(nslim_row+1,&ia));
168   *iia = ia;
169   PetscCall(PetscMalloc1(nslim_row+1,&work));
170 
171   /* determine the number of columns in each row */
172   ia[0] = oshift;
173   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
174     j   = aj + ai[row] + ishift;
175     nz  = ai[row+1] - ai[row];
176     if (!nz) continue; /* empty row */
177     col = *j++ + ishift;
178     i2  = tvc[col];
179     while (nz-- > 0) {           /* off-diagonal elements */
180       ia[i1+1]++;
181       i2++;                     /* Start col of next node */
182       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
183       if (nz > 0) i2 = tvc[col];
184     }
185   }
186 
187   /* shift ia[i] to point to next row */
188   for (i1=1; i1<nslim_row+1; i1++) {
189     row        = ia[i1-1];
190     ia[i1]    += row;
191     work[i1-1] = row - oshift;
192   }
193 
194   /* allocate space for column pointers */
195   nz   = ia[nslim_row] + (!ishift);
196   PetscCall(PetscMalloc1(nz,&ja));
197   *jja = ja;
198 
199   /* loop over matrix putting into ja */
200   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
201     j   = aj + ai[row] + ishift;
202     nz  = ai[row+1] - ai[row];
203     if (!nz) continue; /* empty row */
204     col = *j++ + ishift;
205     i2  = tvc[col];
206     while (nz-- > 0) {
207       ja[work[i1]++] = i2 + oshift;
208       ++i2;
209       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
210       if (nz > 0) i2 = tvc[col];
211     }
212   }
213   PetscCall(PetscFree(ns_col));
214   PetscCall(PetscFree(work));
215   PetscCall(PetscFree2(tns,tvc));
216   PetscFunctionReturn(0);
217 }
218 
219 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
220 {
221   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
222 
223   PetscFunctionBegin;
224   *n = a->inode.node_count;
225   if (!ia) PetscFunctionReturn(0);
226   if (!blockcompressed) {
227     PetscCall(MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done));
228   } else if (symmetric) {
229     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift));
230   } else {
231     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift));
232   }
233   PetscFunctionReturn(0);
234 }
235 
236 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
237 {
238   PetscFunctionBegin;
239   if (!ia) PetscFunctionReturn(0);
240 
241   if (!blockcompressed) {
242     PetscCall(MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done));
243   } else {
244     PetscCall(PetscFree(*ia));
245     PetscCall(PetscFree(*ja));
246   }
247   PetscFunctionReturn(0);
248 }
249 
250 /* ----------------------------------------------------------- */
251 
252 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
253 {
254   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
255   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
256   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
257 
258   PetscFunctionBegin;
259   PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
260   nslim_row = a->inode.node_count;
261   n         = A->cmap->n;
262 
263   /* Create The column_inode for this matrix */
264   PetscCall(MatCreateColInode_Private(A,&nslim_col,&ns_col));
265 
266   /* allocate space for reformated column_inode structure */
267   PetscCall(PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc));
268   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
269 
270   for (i1=0,col=0; i1<nslim_col; ++i1) {
271     nsz = ns_col[i1];
272     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
273   }
274   /* allocate space for column pointers */
275   PetscCall(PetscCalloc1(nslim_col+1,&ia));
276   *iia = ia;
277   PetscCall(PetscMalloc1(nslim_col+1,&work));
278 
279   /* determine the number of columns in each row */
280   ia[0] = oshift;
281   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
282     j   = aj + ai[row] + ishift;
283     col = *j++ + ishift;
284     i2  = tvc[col];
285     nz  = ai[row+1] - ai[row];
286     while (nz-- > 0) {           /* off-diagonal elements */
287       /* ia[i1+1]++; */
288       ia[i2+1]++;
289       i2++;
290       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
291       if (nz > 0) i2 = tvc[col];
292     }
293   }
294 
295   /* shift ia[i] to point to next col */
296   for (i1=1; i1<nslim_col+1; i1++) {
297     col        = ia[i1-1];
298     ia[i1]    += col;
299     work[i1-1] = col - oshift;
300   }
301 
302   /* allocate space for column pointers */
303   nz   = ia[nslim_col] + (!ishift);
304   PetscCall(PetscMalloc1(nz,&ja));
305   *jja = ja;
306 
307   /* loop over matrix putting into ja */
308   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
309     j   = aj + ai[row] + ishift;
310     col = *j++ + ishift;
311     i2  = tvc[col];
312     nz  = ai[row+1] - ai[row];
313     while (nz-- > 0) {
314       /* ja[work[i1]++] = i2 + oshift; */
315       ja[work[i2]++] = i1 + oshift;
316       i2++;
317       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
318       if (nz > 0) i2 = tvc[col];
319     }
320   }
321   PetscCall(PetscFree(ns_col));
322   PetscCall(PetscFree(work));
323   PetscCall(PetscFree2(tns,tvc));
324   PetscFunctionReturn(0);
325 }
326 
327 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
328 {
329   PetscFunctionBegin;
330   PetscCall(MatCreateColInode_Private(A,n,NULL));
331   if (!ia) PetscFunctionReturn(0);
332 
333   if (!blockcompressed) {
334     PetscCall(MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done));
335   } else if (symmetric) {
336     /* Since the indices are symmetric it doesn't matter */
337     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift));
338   } else {
339     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift));
340   }
341   PetscFunctionReturn(0);
342 }
343 
344 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
345 {
346   PetscFunctionBegin;
347   if (!ia) PetscFunctionReturn(0);
348   if (!blockcompressed) {
349     PetscCall(MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done));
350   } else {
351     PetscCall(PetscFree(*ia));
352     PetscCall(PetscFree(*ja));
353   }
354   PetscFunctionReturn(0);
355 }
356 
357 /* ----------------------------------------------------------- */
358 
359 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
360 {
361   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
362   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
363   PetscScalar       *y;
364   const PetscScalar *x;
365   const MatScalar   *v1,*v2,*v3,*v4,*v5;
366   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
367   const PetscInt    *idx,*ns,*ii;
368 
369 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
370 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
371 #endif
372 
373   PetscFunctionBegin;
374   PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
375   node_max = a->inode.node_count;
376   ns       = a->inode.size;     /* Node Size array */
377   PetscCall(VecGetArrayRead(xx,&x));
378   PetscCall(VecGetArray(yy,&y));
379   idx      = a->j;
380   v1       = a->a;
381   ii       = a->i;
382 
383   for (i = 0,row = 0; i< node_max; ++i) {
384     nsz         = ns[i];
385     n           = ii[1] - ii[0];
386     nonzerorow += (n>0)*nsz;
387     ii         += nsz;
388     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
389     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
390     sz = n;                     /* No of non zeros in this row */
391                                 /* Switch on the size of Node */
392     switch (nsz) {               /* Each loop in 'case' is unrolled */
393     case 1:
394       sum1 = 0.;
395 
396       for (n = 0; n< sz-1; n+=2) {
397         i1    = idx[0];         /* The instructions are ordered to */
398         i2    = idx[1];         /* make the compiler's job easy */
399         idx  += 2;
400         tmp0  = x[i1];
401         tmp1  = x[i2];
402         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
403       }
404 
405       if (n == sz-1) {          /* Take care of the last nonzero  */
406         tmp0  = x[*idx++];
407         sum1 += *v1++ *tmp0;
408       }
409       y[row++]=sum1;
410       break;
411     case 2:
412       sum1 = 0.;
413       sum2 = 0.;
414       v2   = v1 + n;
415 
416       for (n = 0; n< sz-1; n+=2) {
417         i1    = idx[0];
418         i2    = idx[1];
419         idx  += 2;
420         tmp0  = x[i1];
421         tmp1  = x[i2];
422         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
423         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
424       }
425       if (n == sz-1) {
426         tmp0  = x[*idx++];
427         sum1 += *v1++ * tmp0;
428         sum2 += *v2++ * tmp0;
429       }
430       y[row++]=sum1;
431       y[row++]=sum2;
432       v1      =v2;              /* Since the next block to be processed starts there*/
433       idx    +=sz;
434       break;
435     case 3:
436       sum1 = 0.;
437       sum2 = 0.;
438       sum3 = 0.;
439       v2   = v1 + n;
440       v3   = v2 + n;
441 
442       for (n = 0; n< sz-1; n+=2) {
443         i1    = idx[0];
444         i2    = idx[1];
445         idx  += 2;
446         tmp0  = x[i1];
447         tmp1  = x[i2];
448         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
449         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
450         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
451       }
452       if (n == sz-1) {
453         tmp0  = x[*idx++];
454         sum1 += *v1++ * tmp0;
455         sum2 += *v2++ * tmp0;
456         sum3 += *v3++ * tmp0;
457       }
458       y[row++]=sum1;
459       y[row++]=sum2;
460       y[row++]=sum3;
461       v1      =v3;              /* Since the next block to be processed starts there*/
462       idx    +=2*sz;
463       break;
464     case 4:
465       sum1 = 0.;
466       sum2 = 0.;
467       sum3 = 0.;
468       sum4 = 0.;
469       v2   = v1 + n;
470       v3   = v2 + n;
471       v4   = v3 + n;
472 
473       for (n = 0; n< sz-1; n+=2) {
474         i1    = idx[0];
475         i2    = idx[1];
476         idx  += 2;
477         tmp0  = x[i1];
478         tmp1  = x[i2];
479         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
480         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
481         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
482         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
483       }
484       if (n == sz-1) {
485         tmp0  = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489         sum4 += *v4++ * tmp0;
490       }
491       y[row++]=sum1;
492       y[row++]=sum2;
493       y[row++]=sum3;
494       y[row++]=sum4;
495       v1      =v4;              /* Since the next block to be processed starts there*/
496       idx    +=3*sz;
497       break;
498     case 5:
499       sum1 = 0.;
500       sum2 = 0.;
501       sum3 = 0.;
502       sum4 = 0.;
503       sum5 = 0.;
504       v2   = v1 + n;
505       v3   = v2 + n;
506       v4   = v3 + n;
507       v5   = v4 + n;
508 
509       for (n = 0; n<sz-1; n+=2) {
510         i1    = idx[0];
511         i2    = idx[1];
512         idx  += 2;
513         tmp0  = x[i1];
514         tmp1  = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
520       }
521       if (n == sz-1) {
522         tmp0  = x[*idx++];
523         sum1 += *v1++ * tmp0;
524         sum2 += *v2++ * tmp0;
525         sum3 += *v3++ * tmp0;
526         sum4 += *v4++ * tmp0;
527         sum5 += *v5++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       y[row++]=sum5;
534       v1      =v5;       /* Since the next block to be processed starts there */
535       idx    +=4*sz;
536       break;
537     default:
538       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
539     }
540   }
541   PetscCall(VecRestoreArrayRead(xx,&x));
542   PetscCall(VecRestoreArray(yy,&y));
543   PetscCall(PetscLogFlops(2.0*a->nz - nonzerorow));
544   PetscFunctionReturn(0);
545 }
546 /* ----------------------------------------------------------- */
547 /* Almost same code as the MatMult_SeqAIJ_Inode() */
548 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
549 {
550   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
551   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
552   const MatScalar   *v1,*v2,*v3,*v4,*v5;
553   const PetscScalar *x;
554   PetscScalar       *y,*z,*zt;
555   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
556   const PetscInt    *idx,*ns,*ii;
557 
558   PetscFunctionBegin;
559   PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
560   node_max = a->inode.node_count;
561   ns       = a->inode.size;     /* Node Size array */
562 
563   PetscCall(VecGetArrayRead(xx,&x));
564   PetscCall(VecGetArrayPair(zz,yy,&z,&y));
565   zt = z;
566 
567   idx = a->j;
568   v1  = a->a;
569   ii  = a->i;
570 
571   for (i = 0,row = 0; i< node_max; ++i) {
572     nsz = ns[i];
573     n   = ii[1] - ii[0];
574     ii += nsz;
575     sz  = n;                    /* No of non zeros in this row */
576                                 /* Switch on the size of Node */
577     switch (nsz) {               /* Each loop in 'case' is unrolled */
578     case 1:
579       sum1 = *zt++;
580 
581       for (n = 0; n< sz-1; n+=2) {
582         i1    = idx[0];         /* The instructions are ordered to */
583         i2    = idx[1];         /* make the compiler's job easy */
584         idx  += 2;
585         tmp0  = x[i1];
586         tmp1  = x[i2];
587         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
588       }
589 
590       if (n   == sz-1) {          /* Take care of the last nonzero  */
591         tmp0  = x[*idx++];
592         sum1 += *v1++ * tmp0;
593       }
594       y[row++]=sum1;
595       break;
596     case 2:
597       sum1 = *zt++;
598       sum2 = *zt++;
599       v2   = v1 + n;
600 
601       for (n = 0; n< sz-1; n+=2) {
602         i1    = idx[0];
603         i2    = idx[1];
604         idx  += 2;
605         tmp0  = x[i1];
606         tmp1  = x[i2];
607         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
608         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
609       }
610       if (n   == sz-1) {
611         tmp0  = x[*idx++];
612         sum1 += *v1++ * tmp0;
613         sum2 += *v2++ * tmp0;
614       }
615       y[row++]=sum1;
616       y[row++]=sum2;
617       v1      =v2;              /* Since the next block to be processed starts there*/
618       idx    +=sz;
619       break;
620     case 3:
621       sum1 = *zt++;
622       sum2 = *zt++;
623       sum3 = *zt++;
624       v2   = v1 + n;
625       v3   = v2 + n;
626 
627       for (n = 0; n< sz-1; n+=2) {
628         i1    = idx[0];
629         i2    = idx[1];
630         idx  += 2;
631         tmp0  = x[i1];
632         tmp1  = x[i2];
633         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
634         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
635         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
636       }
637       if (n == sz-1) {
638         tmp0  = x[*idx++];
639         sum1 += *v1++ * tmp0;
640         sum2 += *v2++ * tmp0;
641         sum3 += *v3++ * tmp0;
642       }
643       y[row++]=sum1;
644       y[row++]=sum2;
645       y[row++]=sum3;
646       v1      =v3;              /* Since the next block to be processed starts there*/
647       idx    +=2*sz;
648       break;
649     case 4:
650       sum1 = *zt++;
651       sum2 = *zt++;
652       sum3 = *zt++;
653       sum4 = *zt++;
654       v2   = v1 + n;
655       v3   = v2 + n;
656       v4   = v3 + n;
657 
658       for (n = 0; n< sz-1; n+=2) {
659         i1    = idx[0];
660         i2    = idx[1];
661         idx  += 2;
662         tmp0  = x[i1];
663         tmp1  = x[i2];
664         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
665         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
666         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
667         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
668       }
669       if (n == sz-1) {
670         tmp0  = x[*idx++];
671         sum1 += *v1++ * tmp0;
672         sum2 += *v2++ * tmp0;
673         sum3 += *v3++ * tmp0;
674         sum4 += *v4++ * tmp0;
675       }
676       y[row++]=sum1;
677       y[row++]=sum2;
678       y[row++]=sum3;
679       y[row++]=sum4;
680       v1      =v4;              /* Since the next block to be processed starts there*/
681       idx    +=3*sz;
682       break;
683     case 5:
684       sum1 = *zt++;
685       sum2 = *zt++;
686       sum3 = *zt++;
687       sum4 = *zt++;
688       sum5 = *zt++;
689       v2   = v1 + n;
690       v3   = v2 + n;
691       v4   = v3 + n;
692       v5   = v4 + n;
693 
694       for (n = 0; n<sz-1; n+=2) {
695         i1    = idx[0];
696         i2    = idx[1];
697         idx  += 2;
698         tmp0  = x[i1];
699         tmp1  = x[i2];
700         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
701         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
702         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
703         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
704         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
705       }
706       if (n   == sz-1) {
707         tmp0  = x[*idx++];
708         sum1 += *v1++ * tmp0;
709         sum2 += *v2++ * tmp0;
710         sum3 += *v3++ * tmp0;
711         sum4 += *v4++ * tmp0;
712         sum5 += *v5++ * tmp0;
713       }
714       y[row++]=sum1;
715       y[row++]=sum2;
716       y[row++]=sum3;
717       y[row++]=sum4;
718       y[row++]=sum5;
719       v1      =v5;       /* Since the next block to be processed starts there */
720       idx    +=4*sz;
721       break;
722     default:
723       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
724     }
725   }
726   PetscCall(VecRestoreArrayRead(xx,&x));
727   PetscCall(VecRestoreArrayPair(zz,yy,&z,&y));
728   PetscCall(PetscLogFlops(2.0*a->nz));
729   PetscFunctionReturn(0);
730 }
731 
732 /* ----------------------------------------------------------- */
733 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
734 {
735   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
736   IS                iscol = a->col,isrow = a->row;
737   const PetscInt    *r,*c,*rout,*cout;
738   PetscInt          i,j,n = A->rmap->n,nz;
739   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
740   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
741   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
742   PetscScalar       sum1,sum2,sum3,sum4,sum5;
743   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
744   const PetscScalar *b;
745 
746   PetscFunctionBegin;
747   PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
748   node_max = a->inode.node_count;
749   ns       = a->inode.size;     /* Node Size array */
750 
751   PetscCall(VecGetArrayRead(bb,&b));
752   PetscCall(VecGetArrayWrite(xx,&x));
753   tmp  = a->solve_work;
754 
755   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
756   PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1);
757 
758   /* forward solve the lower triangular */
759   tmps = tmp;
760   aa   = a_a;
761   aj   = a_j;
762   ad   = a->diag;
763 
764   for (i = 0,row = 0; i< node_max; ++i) {
765     nsz = ns[i];
766     aii = ai[row];
767     v1  = aa + aii;
768     vi  = aj + aii;
769     nz  = ad[row]- aii;
770     if (i < node_max-1) {
771       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
772       * but our indexing to determine it's size could. */
773       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
774       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
775       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
776       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
777     }
778 
779     switch (nsz) {               /* Each loop in 'case' is unrolled */
780     case 1:
781       sum1 = b[*r++];
782       for (j=0; j<nz-1; j+=2) {
783         i0    = vi[0];
784         i1    = vi[1];
785         vi   +=2;
786         tmp0  = tmps[i0];
787         tmp1  = tmps[i1];
788         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
789       }
790       if (j == nz-1) {
791         tmp0  = tmps[*vi++];
792         sum1 -= *v1++ *tmp0;
793       }
794       tmp[row++]=sum1;
795       break;
796     case 2:
797       sum1 = b[*r++];
798       sum2 = b[*r++];
799       v2   = aa + ai[row+1];
800 
801       for (j=0; j<nz-1; j+=2) {
802         i0    = vi[0];
803         i1    = vi[1];
804         vi   +=2;
805         tmp0  = tmps[i0];
806         tmp1  = tmps[i1];
807         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
808         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
809       }
810       if (j == nz-1) {
811         tmp0  = tmps[*vi++];
812         sum1 -= *v1++ *tmp0;
813         sum2 -= *v2++ *tmp0;
814       }
815       sum2     -= *v2++ *sum1;
816       tmp[row++]=sum1;
817       tmp[row++]=sum2;
818       break;
819     case 3:
820       sum1 = b[*r++];
821       sum2 = b[*r++];
822       sum3 = b[*r++];
823       v2   = aa + ai[row+1];
824       v3   = aa + ai[row+2];
825 
826       for (j=0; j<nz-1; j+=2) {
827         i0    = vi[0];
828         i1    = vi[1];
829         vi   +=2;
830         tmp0  = tmps[i0];
831         tmp1  = tmps[i1];
832         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
833         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
834         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
835       }
836       if (j == nz-1) {
837         tmp0  = tmps[*vi++];
838         sum1 -= *v1++ *tmp0;
839         sum2 -= *v2++ *tmp0;
840         sum3 -= *v3++ *tmp0;
841       }
842       sum2 -= *v2++ * sum1;
843       sum3 -= *v3++ * sum1;
844       sum3 -= *v3++ * sum2;
845 
846       tmp[row++]=sum1;
847       tmp[row++]=sum2;
848       tmp[row++]=sum3;
849       break;
850 
851     case 4:
852       sum1 = b[*r++];
853       sum2 = b[*r++];
854       sum3 = b[*r++];
855       sum4 = b[*r++];
856       v2   = aa + ai[row+1];
857       v3   = aa + ai[row+2];
858       v4   = aa + ai[row+3];
859 
860       for (j=0; j<nz-1; j+=2) {
861         i0    = vi[0];
862         i1    = vi[1];
863         vi   +=2;
864         tmp0  = tmps[i0];
865         tmp1  = tmps[i1];
866         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
867         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
868         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
869         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
870       }
871       if (j == nz-1) {
872         tmp0  = tmps[*vi++];
873         sum1 -= *v1++ *tmp0;
874         sum2 -= *v2++ *tmp0;
875         sum3 -= *v3++ *tmp0;
876         sum4 -= *v4++ *tmp0;
877       }
878       sum2 -= *v2++ * sum1;
879       sum3 -= *v3++ * sum1;
880       sum4 -= *v4++ * sum1;
881       sum3 -= *v3++ * sum2;
882       sum4 -= *v4++ * sum2;
883       sum4 -= *v4++ * sum3;
884 
885       tmp[row++]=sum1;
886       tmp[row++]=sum2;
887       tmp[row++]=sum3;
888       tmp[row++]=sum4;
889       break;
890     case 5:
891       sum1 = b[*r++];
892       sum2 = b[*r++];
893       sum3 = b[*r++];
894       sum4 = b[*r++];
895       sum5 = b[*r++];
896       v2   = aa + ai[row+1];
897       v3   = aa + ai[row+2];
898       v4   = aa + ai[row+3];
899       v5   = aa + ai[row+4];
900 
901       for (j=0; j<nz-1; j+=2) {
902         i0    = vi[0];
903         i1    = vi[1];
904         vi   +=2;
905         tmp0  = tmps[i0];
906         tmp1  = tmps[i1];
907         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
908         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
909         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
910         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
911         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
912       }
913       if (j == nz-1) {
914         tmp0  = tmps[*vi++];
915         sum1 -= *v1++ *tmp0;
916         sum2 -= *v2++ *tmp0;
917         sum3 -= *v3++ *tmp0;
918         sum4 -= *v4++ *tmp0;
919         sum5 -= *v5++ *tmp0;
920       }
921 
922       sum2 -= *v2++ * sum1;
923       sum3 -= *v3++ * sum1;
924       sum4 -= *v4++ * sum1;
925       sum5 -= *v5++ * sum1;
926       sum3 -= *v3++ * sum2;
927       sum4 -= *v4++ * sum2;
928       sum5 -= *v5++ * sum2;
929       sum4 -= *v4++ * sum3;
930       sum5 -= *v5++ * sum3;
931       sum5 -= *v5++ * sum4;
932 
933       tmp[row++]=sum1;
934       tmp[row++]=sum2;
935       tmp[row++]=sum3;
936       tmp[row++]=sum4;
937       tmp[row++]=sum5;
938       break;
939     default:
940       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
941     }
942   }
943   /* backward solve the upper triangular */
944   for (i=node_max -1,row = n-1; i>=0; i--) {
945     nsz = ns[i];
946     aii = ai[row+1] -1;
947     v1  = aa + aii;
948     vi  = aj + aii;
949     nz  = aii- ad[row];
950     switch (nsz) {               /* Each loop in 'case' is unrolled */
951     case 1:
952       sum1 = tmp[row];
953 
954       for (j=nz; j>1; j-=2) {
955         vi   -=2;
956         i0    = vi[2];
957         i1    = vi[1];
958         tmp0  = tmps[i0];
959         tmp1  = tmps[i1];
960         v1   -= 2;
961         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
962       }
963       if (j==1) {
964         tmp0  = tmps[*vi--];
965         sum1 -= *v1-- * tmp0;
966       }
967       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
968       break;
969     case 2:
970       sum1 = tmp[row];
971       sum2 = tmp[row -1];
972       v2   = aa + ai[row]-1;
973       for (j=nz; j>1; j-=2) {
974         vi   -=2;
975         i0    = vi[2];
976         i1    = vi[1];
977         tmp0  = tmps[i0];
978         tmp1  = tmps[i1];
979         v1   -= 2;
980         v2   -= 2;
981         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
982         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
983       }
984       if (j==1) {
985         tmp0  = tmps[*vi--];
986         sum1 -= *v1-- * tmp0;
987         sum2 -= *v2-- * tmp0;
988       }
989 
990       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
991       sum2   -= *v2-- * tmp0;
992       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
993       break;
994     case 3:
995       sum1 = tmp[row];
996       sum2 = tmp[row -1];
997       sum3 = tmp[row -2];
998       v2   = aa + ai[row]-1;
999       v3   = aa + ai[row -1]-1;
1000       for (j=nz; j>1; j-=2) {
1001         vi   -=2;
1002         i0    = vi[2];
1003         i1    = vi[1];
1004         tmp0  = tmps[i0];
1005         tmp1  = tmps[i1];
1006         v1   -= 2;
1007         v2   -= 2;
1008         v3   -= 2;
1009         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1010         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1011         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1012       }
1013       if (j==1) {
1014         tmp0  = tmps[*vi--];
1015         sum1 -= *v1-- * tmp0;
1016         sum2 -= *v2-- * tmp0;
1017         sum3 -= *v3-- * tmp0;
1018       }
1019       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1020       sum2   -= *v2-- * tmp0;
1021       sum3   -= *v3-- * tmp0;
1022       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1023       sum3   -= *v3-- * tmp0;
1024       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1025 
1026       break;
1027     case 4:
1028       sum1 = tmp[row];
1029       sum2 = tmp[row -1];
1030       sum3 = tmp[row -2];
1031       sum4 = tmp[row -3];
1032       v2   = aa + ai[row]-1;
1033       v3   = aa + ai[row -1]-1;
1034       v4   = aa + ai[row -2]-1;
1035 
1036       for (j=nz; j>1; j-=2) {
1037         vi   -=2;
1038         i0    = vi[2];
1039         i1    = vi[1];
1040         tmp0  = tmps[i0];
1041         tmp1  = tmps[i1];
1042         v1   -= 2;
1043         v2   -= 2;
1044         v3   -= 2;
1045         v4   -= 2;
1046         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1047         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1048         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1049         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1050       }
1051       if (j==1) {
1052         tmp0  = tmps[*vi--];
1053         sum1 -= *v1-- * tmp0;
1054         sum2 -= *v2-- * tmp0;
1055         sum3 -= *v3-- * tmp0;
1056         sum4 -= *v4-- * tmp0;
1057       }
1058 
1059       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1060       sum2   -= *v2-- * tmp0;
1061       sum3   -= *v3-- * tmp0;
1062       sum4   -= *v4-- * tmp0;
1063       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1064       sum3   -= *v3-- * tmp0;
1065       sum4   -= *v4-- * tmp0;
1066       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1067       sum4   -= *v4-- * tmp0;
1068       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1069       break;
1070     case 5:
1071       sum1 = tmp[row];
1072       sum2 = tmp[row -1];
1073       sum3 = tmp[row -2];
1074       sum4 = tmp[row -3];
1075       sum5 = tmp[row -4];
1076       v2   = aa + ai[row]-1;
1077       v3   = aa + ai[row -1]-1;
1078       v4   = aa + ai[row -2]-1;
1079       v5   = aa + ai[row -3]-1;
1080       for (j=nz; j>1; j-=2) {
1081         vi   -= 2;
1082         i0    = vi[2];
1083         i1    = vi[1];
1084         tmp0  = tmps[i0];
1085         tmp1  = tmps[i1];
1086         v1   -= 2;
1087         v2   -= 2;
1088         v3   -= 2;
1089         v4   -= 2;
1090         v5   -= 2;
1091         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1092         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1093         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1094         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1095         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1096       }
1097       if (j==1) {
1098         tmp0  = tmps[*vi--];
1099         sum1 -= *v1-- * tmp0;
1100         sum2 -= *v2-- * tmp0;
1101         sum3 -= *v3-- * tmp0;
1102         sum4 -= *v4-- * tmp0;
1103         sum5 -= *v5-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       sum5   -= *v5-- * tmp0;
1111       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1112       sum3   -= *v3-- * tmp0;
1113       sum4   -= *v4-- * tmp0;
1114       sum5   -= *v5-- * tmp0;
1115       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1116       sum4   -= *v4-- * tmp0;
1117       sum5   -= *v5-- * tmp0;
1118       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1119       sum5   -= *v5-- * tmp0;
1120       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1121       break;
1122     default:
1123       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
1124     }
1125   }
1126   PetscCall(ISRestoreIndices(isrow,&rout));
1127   PetscCall(ISRestoreIndices(iscol,&cout));
1128   PetscCall(VecRestoreArrayRead(bb,&b));
1129   PetscCall(VecRestoreArrayWrite(xx,&x));
1130   PetscCall(PetscLogFlops(2.0*a->nz - A->cmap->n));
1131   PetscFunctionReturn(0);
1132 }
1133 
1134 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1135 {
1136   Mat             C     =B;
1137   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1138   IS              isrow = b->row,isicol = b->icol;
1139   const PetscInt  *r,*ic,*ics;
1140   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1141   PetscInt        i,j,k,nz,nzL,row,*pj;
1142   const PetscInt  *ajtmp,*bjtmp;
1143   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1144   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1145   FactorShiftCtx  sctx;
1146   const PetscInt  *ddiag;
1147   PetscReal       rs;
1148   MatScalar       d;
1149   PetscInt        inod,nodesz,node_max,col;
1150   const PetscInt  *ns;
1151   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1152 
1153   PetscFunctionBegin;
1154   /* MatPivotSetUp(): initialize shift context sctx */
1155   PetscCall(PetscMemzero(&sctx,sizeof(FactorShiftCtx)));
1156 
1157   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1158     ddiag          = a->diag;
1159     sctx.shift_top = info->zeropivot;
1160     for (i=0; i<n; i++) {
1161       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1162       d  = (aa)[ddiag[i]];
1163       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1164       v  = aa+ai[i];
1165       nz = ai[i+1] - ai[i];
1166       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1167       if (rs>sctx.shift_top) sctx.shift_top = rs;
1168     }
1169     sctx.shift_top *= 1.1;
1170     sctx.nshift_max = 5;
1171     sctx.shift_lo   = 0.;
1172     sctx.shift_hi   = 1.;
1173   }
1174 
1175   PetscCall(ISGetIndices(isrow,&r));
1176   PetscCall(ISGetIndices(isicol,&ic));
1177 
1178   PetscCall(PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4));
1179   ics   = ic;
1180 
1181   node_max = a->inode.node_count;
1182   ns       = a->inode.size;
1183   PetscCheck(ns,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1184 
1185   /* If max inode size > 4, split it into two inodes.*/
1186   /* also map the inode sizes according to the ordering */
1187   PetscCall(PetscMalloc1(n+1,&tmp_vec1));
1188   for (i=0,j=0; i<node_max; ++i,++j) {
1189     if (ns[i] > 4) {
1190       tmp_vec1[j] = 4;
1191       ++j;
1192       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1193     } else {
1194       tmp_vec1[j] = ns[i];
1195     }
1196   }
1197   /* Use the correct node_max */
1198   node_max = j;
1199 
1200   /* Now reorder the inode info based on mat re-ordering info */
1201   /* First create a row -> inode_size_array_index map */
1202   PetscCall(PetscMalloc1(n+1,&nsmap));
1203   PetscCall(PetscMalloc1(node_max+1,&tmp_vec2));
1204   for (i=0,row=0; i<node_max; i++) {
1205     nodesz = tmp_vec1[i];
1206     for (j=0; j<nodesz; j++,row++) {
1207       nsmap[row] = i;
1208     }
1209   }
1210   /* Using nsmap, create a reordered ns structure */
1211   for (i=0,j=0; i< node_max; i++) {
1212     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1213     tmp_vec2[i] = nodesz;
1214     j          += nodesz;
1215   }
1216   PetscCall(PetscFree(nsmap));
1217   PetscCall(PetscFree(tmp_vec1));
1218 
1219   /* Now use the correct ns */
1220   ns = tmp_vec2;
1221 
1222   do {
1223     sctx.newshift = PETSC_FALSE;
1224     /* Now loop over each block-row, and do the factorization */
1225     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1226       nodesz = ns[inod];
1227 
1228       switch (nodesz) {
1229       case 1:
1230         /*----------*/
1231         /* zero rtmp1 */
1232         /* L part */
1233         nz    = bi[i+1] - bi[i];
1234         bjtmp = bj + bi[i];
1235         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1236 
1237         /* U part */
1238         nz    = bdiag[i]-bdiag[i+1];
1239         bjtmp = bj + bdiag[i+1]+1;
1240         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1241 
1242         /* load in initial (unfactored row) */
1243         nz    = ai[r[i]+1] - ai[r[i]];
1244         ajtmp = aj + ai[r[i]];
1245         v     = aa + ai[r[i]];
1246         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1247 
1248         /* ZeropivotApply() */
1249         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1250 
1251         /* elimination */
1252         bjtmp = bj + bi[i];
1253         row   = *bjtmp++;
1254         nzL   = bi[i+1] - bi[i];
1255         for (k=0; k < nzL; k++) {
1256           pc = rtmp1 + row;
1257           if (*pc != 0.0) {
1258             pv   = b->a + bdiag[row];
1259             mul1 = *pc * (*pv);
1260             *pc  = mul1;
1261             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1262             pv   = b->a + bdiag[row+1]+1;
1263             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1264             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1265             PetscCall(PetscLogFlops(1+2.0*nz));
1266           }
1267           row = *bjtmp++;
1268         }
1269 
1270         /* finished row so stick it into b->a */
1271         rs = 0.0;
1272         /* L part */
1273         pv = b->a + bi[i];
1274         pj = b->j + bi[i];
1275         nz = bi[i+1] - bi[i];
1276         for (j=0; j<nz; j++) {
1277           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1278         }
1279 
1280         /* U part */
1281         pv = b->a + bdiag[i+1]+1;
1282         pj = b->j + bdiag[i+1]+1;
1283         nz = bdiag[i] - bdiag[i+1]-1;
1284         for (j=0; j<nz; j++) {
1285           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1286         }
1287 
1288         /* Check zero pivot */
1289         sctx.rs = rs;
1290         sctx.pv = rtmp1[i];
1291         PetscCall(MatPivotCheck(B,A,info,&sctx,i));
1292         if (sctx.newshift) break;
1293 
1294         /* Mark diagonal and invert diagonal for simpler triangular solves */
1295         pv  = b->a + bdiag[i];
1296         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1297         break;
1298 
1299       case 2:
1300         /*----------*/
1301         /* zero rtmp1 and rtmp2 */
1302         /* L part */
1303         nz    = bi[i+1] - bi[i];
1304         bjtmp = bj + bi[i];
1305         for  (j=0; j<nz; j++) {
1306           col        = bjtmp[j];
1307           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1308         }
1309 
1310         /* U part */
1311         nz    = bdiag[i]-bdiag[i+1];
1312         bjtmp = bj + bdiag[i+1]+1;
1313         for  (j=0; j<nz; j++) {
1314           col        = bjtmp[j];
1315           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1316         }
1317 
1318         /* load in initial (unfactored row) */
1319         nz    = ai[r[i]+1] - ai[r[i]];
1320         ajtmp = aj + ai[r[i]];
1321         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1322         for (j=0; j<nz; j++) {
1323           col        = ics[ajtmp[j]];
1324           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1325         }
1326         /* ZeropivotApply(): shift the diagonal of the matrix  */
1327         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1328 
1329         /* elimination */
1330         bjtmp = bj + bi[i];
1331         row   = *bjtmp++; /* pivot row */
1332         nzL   = bi[i+1] - bi[i];
1333         for (k=0; k < nzL; k++) {
1334           pc1 = rtmp1 + row;
1335           pc2 = rtmp2 + row;
1336           if (*pc1 != 0.0 || *pc2 != 0.0) {
1337             pv   = b->a + bdiag[row];
1338             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1339             *pc1 = mul1;       *pc2 = mul2;
1340 
1341             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1342             pv = b->a + bdiag[row+1]+1;
1343             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1344             for (j=0; j<nz; j++) {
1345               col         = pj[j];
1346               rtmp1[col] -= mul1 * pv[j];
1347               rtmp2[col] -= mul2 * pv[j];
1348             }
1349             PetscCall(PetscLogFlops(2+4.0*nz));
1350           }
1351           row = *bjtmp++;
1352         }
1353 
1354         /* finished row i; check zero pivot, then stick row i into b->a */
1355         rs = 0.0;
1356         /* L part */
1357         pc1 = b->a + bi[i];
1358         pj  = b->j + bi[i];
1359         nz  = bi[i+1] - bi[i];
1360         for (j=0; j<nz; j++) {
1361           col    = pj[j];
1362           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1363         }
1364         /* U part */
1365         pc1 = b->a + bdiag[i+1]+1;
1366         pj  = b->j + bdiag[i+1]+1;
1367         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1368         for (j=0; j<nz; j++) {
1369           col    = pj[j];
1370           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1371         }
1372 
1373         sctx.rs = rs;
1374         sctx.pv = rtmp1[i];
1375         PetscCall(MatPivotCheck(B,A,info,&sctx,i));
1376         if (sctx.newshift) break;
1377         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1378         *pc1 = 1.0/sctx.pv;
1379 
1380         /* Now take care of diagonal 2x2 block. */
1381         pc2 = rtmp2 + i;
1382         if (*pc2 != 0.0) {
1383           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1384           *pc2 = mul1;          /* insert L entry */
1385           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1386           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1387           for (j=0; j<nz; j++) {
1388             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1389           }
1390           PetscCall(PetscLogFlops(1+2.0*nz));
1391         }
1392 
1393         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1394         rs = 0.0;
1395         /* L part */
1396         pc2 = b->a + bi[i+1];
1397         pj  = b->j + bi[i+1];
1398         nz  = bi[i+2] - bi[i+1];
1399         for (j=0; j<nz; j++) {
1400           col    = pj[j];
1401           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1402         }
1403         /* U part */
1404         pc2 = b->a + bdiag[i+2]+1;
1405         pj  = b->j + bdiag[i+2]+1;
1406         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1407         for (j=0; j<nz; j++) {
1408           col    = pj[j];
1409           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1410         }
1411 
1412         sctx.rs = rs;
1413         sctx.pv = rtmp2[i+1];
1414         PetscCall(MatPivotCheck(B,A,info,&sctx,i+1));
1415         if (sctx.newshift) break;
1416         pc2  = b->a + bdiag[i+1];
1417         *pc2 = 1.0/sctx.pv;
1418         break;
1419 
1420       case 3:
1421         /*----------*/
1422         /* zero rtmp */
1423         /* L part */
1424         nz    = bi[i+1] - bi[i];
1425         bjtmp = bj + bi[i];
1426         for  (j=0; j<nz; j++) {
1427           col        = bjtmp[j];
1428           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1429         }
1430 
1431         /* U part */
1432         nz    = bdiag[i]-bdiag[i+1];
1433         bjtmp = bj + bdiag[i+1]+1;
1434         for  (j=0; j<nz; j++) {
1435           col        = bjtmp[j];
1436           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1437         }
1438 
1439         /* load in initial (unfactored row) */
1440         nz    = ai[r[i]+1] - ai[r[i]];
1441         ajtmp = aj + ai[r[i]];
1442         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1443         for (j=0; j<nz; j++) {
1444           col        = ics[ajtmp[j]];
1445           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1446         }
1447         /* ZeropivotApply(): shift the diagonal of the matrix  */
1448         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1449 
1450         /* elimination */
1451         bjtmp = bj + bi[i];
1452         row   = *bjtmp++; /* pivot row */
1453         nzL   = bi[i+1] - bi[i];
1454         for (k=0; k < nzL; k++) {
1455           pc1 = rtmp1 + row;
1456           pc2 = rtmp2 + row;
1457           pc3 = rtmp3 + row;
1458           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1459             pv   = b->a + bdiag[row];
1460             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1461             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1462 
1463             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1464             pv = b->a + bdiag[row+1]+1;
1465             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1466             for (j=0; j<nz; j++) {
1467               col         = pj[j];
1468               rtmp1[col] -= mul1 * pv[j];
1469               rtmp2[col] -= mul2 * pv[j];
1470               rtmp3[col] -= mul3 * pv[j];
1471             }
1472             PetscCall(PetscLogFlops(3+6.0*nz));
1473           }
1474           row = *bjtmp++;
1475         }
1476 
1477         /* finished row i; check zero pivot, then stick row i into b->a */
1478         rs = 0.0;
1479         /* L part */
1480         pc1 = b->a + bi[i];
1481         pj  = b->j + bi[i];
1482         nz  = bi[i+1] - bi[i];
1483         for (j=0; j<nz; j++) {
1484           col    = pj[j];
1485           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1486         }
1487         /* U part */
1488         pc1 = b->a + bdiag[i+1]+1;
1489         pj  = b->j + bdiag[i+1]+1;
1490         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1491         for (j=0; j<nz; j++) {
1492           col    = pj[j];
1493           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1494         }
1495 
1496         sctx.rs = rs;
1497         sctx.pv = rtmp1[i];
1498         PetscCall(MatPivotCheck(B,A,info,&sctx,i));
1499         if (sctx.newshift) break;
1500         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1501         *pc1 = 1.0/sctx.pv;
1502 
1503         /* Now take care of 1st column of diagonal 3x3 block. */
1504         pc2 = rtmp2 + i;
1505         pc3 = rtmp3 + i;
1506         if (*pc2 != 0.0 || *pc3 != 0.0) {
1507           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1508           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1509           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1510           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1511           for (j=0; j<nz; j++) {
1512             col         = pj[j];
1513             rtmp2[col] -= mul2 * rtmp1[col];
1514             rtmp3[col] -= mul3 * rtmp1[col];
1515           }
1516           PetscCall(PetscLogFlops(2+4.0*nz));
1517         }
1518 
1519         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1520         rs = 0.0;
1521         /* L part */
1522         pc2 = b->a + bi[i+1];
1523         pj  = b->j + bi[i+1];
1524         nz  = bi[i+2] - bi[i+1];
1525         for (j=0; j<nz; j++) {
1526           col    = pj[j];
1527           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1528         }
1529         /* U part */
1530         pc2 = b->a + bdiag[i+2]+1;
1531         pj  = b->j + bdiag[i+2]+1;
1532         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1533         for (j=0; j<nz; j++) {
1534           col    = pj[j];
1535           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1536         }
1537 
1538         sctx.rs = rs;
1539         sctx.pv = rtmp2[i+1];
1540         PetscCall(MatPivotCheck(B,A,info,&sctx,i+1));
1541         if (sctx.newshift) break;
1542         pc2  = b->a + bdiag[i+1];
1543         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1544 
1545         /* Now take care of 2nd column of diagonal 3x3 block. */
1546         pc3 = rtmp3 + i+1;
1547         if (*pc3 != 0.0) {
1548           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1549           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1550           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1551           for (j=0; j<nz; j++) {
1552             col         = pj[j];
1553             rtmp3[col] -= mul3 * rtmp2[col];
1554           }
1555           PetscCall(PetscLogFlops(1+2.0*nz));
1556         }
1557 
1558         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1559         rs = 0.0;
1560         /* L part */
1561         pc3 = b->a + bi[i+2];
1562         pj  = b->j + bi[i+2];
1563         nz  = bi[i+3] - bi[i+2];
1564         for (j=0; j<nz; j++) {
1565           col    = pj[j];
1566           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1567         }
1568         /* U part */
1569         pc3 = b->a + bdiag[i+3]+1;
1570         pj  = b->j + bdiag[i+3]+1;
1571         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1572         for (j=0; j<nz; j++) {
1573           col    = pj[j];
1574           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1575         }
1576 
1577         sctx.rs = rs;
1578         sctx.pv = rtmp3[i+2];
1579         PetscCall(MatPivotCheck(B,A,info,&sctx,i+2));
1580         if (sctx.newshift) break;
1581         pc3  = b->a + bdiag[i+2];
1582         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1583         break;
1584       case 4:
1585         /*----------*/
1586         /* zero rtmp */
1587         /* L part */
1588         nz    = bi[i+1] - bi[i];
1589         bjtmp = bj + bi[i];
1590         for  (j=0; j<nz; j++) {
1591           col        = bjtmp[j];
1592           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1593         }
1594 
1595         /* U part */
1596         nz    = bdiag[i]-bdiag[i+1];
1597         bjtmp = bj + bdiag[i+1]+1;
1598         for  (j=0; j<nz; j++) {
1599           col        = bjtmp[j];
1600           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1601         }
1602 
1603         /* load in initial (unfactored row) */
1604         nz    = ai[r[i]+1] - ai[r[i]];
1605         ajtmp = aj + ai[r[i]];
1606         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1607         for (j=0; j<nz; j++) {
1608           col        = ics[ajtmp[j]];
1609           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1610         }
1611         /* ZeropivotApply(): shift the diagonal of the matrix  */
1612         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1613 
1614         /* elimination */
1615         bjtmp = bj + bi[i];
1616         row   = *bjtmp++; /* pivot row */
1617         nzL   = bi[i+1] - bi[i];
1618         for (k=0; k < nzL; k++) {
1619           pc1 = rtmp1 + row;
1620           pc2 = rtmp2 + row;
1621           pc3 = rtmp3 + row;
1622           pc4 = rtmp4 + row;
1623           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1624             pv   = b->a + bdiag[row];
1625             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1626             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1627 
1628             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1629             pv = b->a + bdiag[row+1]+1;
1630             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1631             for (j=0; j<nz; j++) {
1632               col         = pj[j];
1633               rtmp1[col] -= mul1 * pv[j];
1634               rtmp2[col] -= mul2 * pv[j];
1635               rtmp3[col] -= mul3 * pv[j];
1636               rtmp4[col] -= mul4 * pv[j];
1637             }
1638             PetscCall(PetscLogFlops(4+8.0*nz));
1639           }
1640           row = *bjtmp++;
1641         }
1642 
1643         /* finished row i; check zero pivot, then stick row i into b->a */
1644         rs = 0.0;
1645         /* L part */
1646         pc1 = b->a + bi[i];
1647         pj  = b->j + bi[i];
1648         nz  = bi[i+1] - bi[i];
1649         for (j=0; j<nz; j++) {
1650           col    = pj[j];
1651           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1652         }
1653         /* U part */
1654         pc1 = b->a + bdiag[i+1]+1;
1655         pj  = b->j + bdiag[i+1]+1;
1656         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1657         for (j=0; j<nz; j++) {
1658           col    = pj[j];
1659           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1660         }
1661 
1662         sctx.rs = rs;
1663         sctx.pv = rtmp1[i];
1664         PetscCall(MatPivotCheck(B,A,info,&sctx,i));
1665         if (sctx.newshift) break;
1666         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1667         *pc1 = 1.0/sctx.pv;
1668 
1669         /* Now take care of 1st column of diagonal 4x4 block. */
1670         pc2 = rtmp2 + i;
1671         pc3 = rtmp3 + i;
1672         pc4 = rtmp4 + i;
1673         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1674           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1675           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1676           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1677           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1678           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1679           for (j=0; j<nz; j++) {
1680             col         = pj[j];
1681             rtmp2[col] -= mul2 * rtmp1[col];
1682             rtmp3[col] -= mul3 * rtmp1[col];
1683             rtmp4[col] -= mul4 * rtmp1[col];
1684           }
1685           PetscCall(PetscLogFlops(3+6.0*nz));
1686         }
1687 
1688         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1689         rs = 0.0;
1690         /* L part */
1691         pc2 = b->a + bi[i+1];
1692         pj  = b->j + bi[i+1];
1693         nz  = bi[i+2] - bi[i+1];
1694         for (j=0; j<nz; j++) {
1695           col    = pj[j];
1696           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1697         }
1698         /* U part */
1699         pc2 = b->a + bdiag[i+2]+1;
1700         pj  = b->j + bdiag[i+2]+1;
1701         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1702         for (j=0; j<nz; j++) {
1703           col    = pj[j];
1704           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1705         }
1706 
1707         sctx.rs = rs;
1708         sctx.pv = rtmp2[i+1];
1709         PetscCall(MatPivotCheck(B,A,info,&sctx,i+1));
1710         if (sctx.newshift) break;
1711         pc2  = b->a + bdiag[i+1];
1712         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1713 
1714         /* Now take care of 2nd column of diagonal 4x4 block. */
1715         pc3 = rtmp3 + i+1;
1716         pc4 = rtmp4 + i+1;
1717         if (*pc3 != 0.0 || *pc4 != 0.0) {
1718           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1719           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1720           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1721           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1722           for (j=0; j<nz; j++) {
1723             col         = pj[j];
1724             rtmp3[col] -= mul3 * rtmp2[col];
1725             rtmp4[col] -= mul4 * rtmp2[col];
1726           }
1727           PetscCall(PetscLogFlops(4.0*nz));
1728         }
1729 
1730         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1731         rs = 0.0;
1732         /* L part */
1733         pc3 = b->a + bi[i+2];
1734         pj  = b->j + bi[i+2];
1735         nz  = bi[i+3] - bi[i+2];
1736         for (j=0; j<nz; j++) {
1737           col    = pj[j];
1738           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1739         }
1740         /* U part */
1741         pc3 = b->a + bdiag[i+3]+1;
1742         pj  = b->j + bdiag[i+3]+1;
1743         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1744         for (j=0; j<nz; j++) {
1745           col    = pj[j];
1746           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1747         }
1748 
1749         sctx.rs = rs;
1750         sctx.pv = rtmp3[i+2];
1751         PetscCall(MatPivotCheck(B,A,info,&sctx,i+2));
1752         if (sctx.newshift) break;
1753         pc3  = b->a + bdiag[i+2];
1754         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1755 
1756         /* Now take care of 3rd column of diagonal 4x4 block. */
1757         pc4 = rtmp4 + i+2;
1758         if (*pc4 != 0.0) {
1759           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1760           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1761           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1762           for (j=0; j<nz; j++) {
1763             col         = pj[j];
1764             rtmp4[col] -= mul4 * rtmp3[col];
1765           }
1766           PetscCall(PetscLogFlops(1+2.0*nz));
1767         }
1768 
1769         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1770         rs = 0.0;
1771         /* L part */
1772         pc4 = b->a + bi[i+3];
1773         pj  = b->j + bi[i+3];
1774         nz  = bi[i+4] - bi[i+3];
1775         for (j=0; j<nz; j++) {
1776           col    = pj[j];
1777           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1778         }
1779         /* U part */
1780         pc4 = b->a + bdiag[i+4]+1;
1781         pj  = b->j + bdiag[i+4]+1;
1782         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1783         for (j=0; j<nz; j++) {
1784           col    = pj[j];
1785           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1786         }
1787 
1788         sctx.rs = rs;
1789         sctx.pv = rtmp4[i+3];
1790         PetscCall(MatPivotCheck(B,A,info,&sctx,i+3));
1791         if (sctx.newshift) break;
1792         pc4  = b->a + bdiag[i+3];
1793         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1794         break;
1795 
1796       default:
1797         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported ");
1798       }
1799       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1800       i += nodesz;                 /* Update the row */
1801     }
1802 
1803     /* MatPivotRefine() */
1804     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1805       /*
1806        * if no shift in this attempt & shifting & started shifting & can refine,
1807        * then try lower shift
1808        */
1809       sctx.shift_hi       = sctx.shift_fraction;
1810       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1811       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1812       sctx.newshift       = PETSC_TRUE;
1813       sctx.nshift++;
1814     }
1815   } while (sctx.newshift);
1816 
1817   PetscCall(PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4));
1818   PetscCall(PetscFree(tmp_vec2));
1819   PetscCall(ISRestoreIndices(isicol,&ic));
1820   PetscCall(ISRestoreIndices(isrow,&r));
1821 
1822   if (b->inode.size) {
1823     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1824   } else {
1825     C->ops->solve           = MatSolve_SeqAIJ;
1826   }
1827   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1828   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1829   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1830   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1831   C->assembled              = PETSC_TRUE;
1832   C->preallocated           = PETSC_TRUE;
1833 
1834   PetscCall(PetscLogFlops(C->cmap->n));
1835 
1836   /* MatShiftView(A,info,&sctx) */
1837   if (sctx.nshift) {
1838     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1839       PetscCall(PetscInfo(A,"number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top));
1840     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1841       PetscCall(PetscInfo(A,"number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount));
1842     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1843       PetscCall(PetscInfo(A,"number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n",sctx.nshift,(double)info->shiftamount));
1844     }
1845   }
1846   PetscFunctionReturn(0);
1847 }
1848 
1849 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1850 {
1851   Mat             C     = B;
1852   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1853   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1854   const PetscInt  *r,*ic,*c,*ics;
1855   PetscInt        n   = A->rmap->n,*bi = b->i;
1856   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1857   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1858   PetscInt        *ai = a->i,*aj = a->j;
1859   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1860   PetscScalar     mul1,mul2,mul3,tmp;
1861   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1862   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1863   PetscReal       rs=0.0;
1864   FactorShiftCtx  sctx;
1865 
1866   PetscFunctionBegin;
1867   sctx.shift_top      = 0;
1868   sctx.nshift_max     = 0;
1869   sctx.shift_lo       = 0;
1870   sctx.shift_hi       = 0;
1871   sctx.shift_fraction = 0;
1872 
1873   /* if both shift schemes are chosen by user, only use info->shiftpd */
1874   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1875     sctx.shift_top = 0;
1876     for (i=0; i<n; i++) {
1877       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1878       rs    = 0.0;
1879       ajtmp = aj + ai[i];
1880       rtmp1 = aa + ai[i];
1881       nz    = ai[i+1] - ai[i];
1882       for (j=0; j<nz; j++) {
1883         if (*ajtmp != i) {
1884           rs += PetscAbsScalar(*rtmp1++);
1885         } else {
1886           rs -= PetscRealPart(*rtmp1++);
1887         }
1888         ajtmp++;
1889       }
1890       if (rs>sctx.shift_top) sctx.shift_top = rs;
1891     }
1892     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1893     sctx.shift_top *= 1.1;
1894     sctx.nshift_max = 5;
1895     sctx.shift_lo   = 0.;
1896     sctx.shift_hi   = 1.;
1897   }
1898   sctx.shift_amount = 0;
1899   sctx.nshift       = 0;
1900 
1901   PetscCall(ISGetIndices(isrow,&r));
1902   PetscCall(ISGetIndices(iscol,&c));
1903   PetscCall(ISGetIndices(isicol,&ic));
1904   PetscCall(PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33));
1905   ics    = ic;
1906 
1907   node_max = a->inode.node_count;
1908   ns       = a->inode.size;
1909   PetscCheck(ns,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1910 
1911   /* If max inode size > 3, split it into two inodes.*/
1912   /* also map the inode sizes according to the ordering */
1913   PetscCall(PetscMalloc1(n+1,&tmp_vec1));
1914   for (i=0,j=0; i<node_max; ++i,++j) {
1915     if (ns[i]>3) {
1916       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1917       ++j;
1918       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1919     } else {
1920       tmp_vec1[j] = ns[i];
1921     }
1922   }
1923   /* Use the correct node_max */
1924   node_max = j;
1925 
1926   /* Now reorder the inode info based on mat re-ordering info */
1927   /* First create a row -> inode_size_array_index map */
1928   PetscCall(PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2));
1929   for (i=0,row=0; i<node_max; i++) {
1930     nodesz = tmp_vec1[i];
1931     for (j=0; j<nodesz; j++,row++) {
1932       nsmap[row] = i;
1933     }
1934   }
1935   /* Using nsmap, create a reordered ns structure */
1936   for (i=0,j=0; i< node_max; i++) {
1937     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1938     tmp_vec2[i] = nodesz;
1939     j          += nodesz;
1940   }
1941   PetscCall(PetscFree2(nsmap,tmp_vec1));
1942   /* Now use the correct ns */
1943   ns = tmp_vec2;
1944 
1945   do {
1946     sctx.newshift = PETSC_FALSE;
1947     /* Now loop over each block-row, and do the factorization */
1948     for (i=0,row=0; i<node_max; i++) {
1949       nodesz = ns[i];
1950       nz     = bi[row+1] - bi[row];
1951       bjtmp  = bj + bi[row];
1952 
1953       switch (nodesz) {
1954       case 1:
1955         for  (j=0; j<nz; j++) {
1956           idx         = bjtmp[j];
1957           rtmp11[idx] = 0.0;
1958         }
1959 
1960         /* load in initial (unfactored row) */
1961         idx    = r[row];
1962         nz_tmp = ai[idx+1] - ai[idx];
1963         ajtmp  = aj + ai[idx];
1964         v1     = aa + ai[idx];
1965 
1966         for (j=0; j<nz_tmp; j++) {
1967           idx         = ics[ajtmp[j]];
1968           rtmp11[idx] = v1[j];
1969         }
1970         rtmp11[ics[r[row]]] += sctx.shift_amount;
1971 
1972         prow = *bjtmp++;
1973         while (prow < row) {
1974           pc1 = rtmp11 + prow;
1975           if (*pc1 != 0.0) {
1976             pv     = ba + bd[prow];
1977             pj     = nbj + bd[prow];
1978             mul1   = *pc1 * *pv++;
1979             *pc1   = mul1;
1980             nz_tmp = bi[prow+1] - bd[prow] - 1;
1981             PetscCall(PetscLogFlops(1+2.0*nz_tmp));
1982             for (j=0; j<nz_tmp; j++) {
1983               tmp          = pv[j];
1984               idx          = pj[j];
1985               rtmp11[idx] -= mul1 * tmp;
1986             }
1987           }
1988           prow = *bjtmp++;
1989         }
1990         pj  = bj + bi[row];
1991         pc1 = ba + bi[row];
1992 
1993         sctx.pv     = rtmp11[row];
1994         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1995         rs          = 0.0;
1996         for (j=0; j<nz; j++) {
1997           idx    = pj[j];
1998           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1999           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2000         }
2001         sctx.rs = rs;
2002         PetscCall(MatPivotCheck(B,A,info,&sctx,row));
2003         if (sctx.newshift) goto endofwhile;
2004         break;
2005 
2006       case 2:
2007         for (j=0; j<nz; j++) {
2008           idx         = bjtmp[j];
2009           rtmp11[idx] = 0.0;
2010           rtmp22[idx] = 0.0;
2011         }
2012 
2013         /* load in initial (unfactored row) */
2014         idx    = r[row];
2015         nz_tmp = ai[idx+1] - ai[idx];
2016         ajtmp  = aj + ai[idx];
2017         v1     = aa + ai[idx];
2018         v2     = aa + ai[idx+1];
2019         for (j=0; j<nz_tmp; j++) {
2020           idx         = ics[ajtmp[j]];
2021           rtmp11[idx] = v1[j];
2022           rtmp22[idx] = v2[j];
2023         }
2024         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2025         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2026 
2027         prow = *bjtmp++;
2028         while (prow < row) {
2029           pc1 = rtmp11 + prow;
2030           pc2 = rtmp22 + prow;
2031           if (*pc1 != 0.0 || *pc2 != 0.0) {
2032             pv   = ba + bd[prow];
2033             pj   = nbj + bd[prow];
2034             mul1 = *pc1 * *pv;
2035             mul2 = *pc2 * *pv;
2036             ++pv;
2037             *pc1 = mul1;
2038             *pc2 = mul2;
2039 
2040             nz_tmp = bi[prow+1] - bd[prow] - 1;
2041             for (j=0; j<nz_tmp; j++) {
2042               tmp          = pv[j];
2043               idx          = pj[j];
2044               rtmp11[idx] -= mul1 * tmp;
2045               rtmp22[idx] -= mul2 * tmp;
2046             }
2047             PetscCall(PetscLogFlops(2+4.0*nz_tmp));
2048           }
2049           prow = *bjtmp++;
2050         }
2051 
2052         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2053         pc1 = rtmp11 + prow;
2054         pc2 = rtmp22 + prow;
2055 
2056         sctx.pv = *pc1;
2057         pj      = bj + bi[prow];
2058         rs      = 0.0;
2059         for (j=0; j<nz; j++) {
2060           idx = pj[j];
2061           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2062         }
2063         sctx.rs = rs;
2064         PetscCall(MatPivotCheck(B,A,info,&sctx,row));
2065         if (sctx.newshift) goto endofwhile;
2066 
2067         if (*pc2 != 0.0) {
2068           pj     = nbj + bd[prow];
2069           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2070           *pc2   = mul2;
2071           nz_tmp = bi[prow+1] - bd[prow] - 1;
2072           for (j=0; j<nz_tmp; j++) {
2073             idx          = pj[j];
2074             tmp          = rtmp11[idx];
2075             rtmp22[idx] -= mul2 * tmp;
2076           }
2077           PetscCall(PetscLogFlops(1+2.0*nz_tmp));
2078         }
2079 
2080         pj  = bj + bi[row];
2081         pc1 = ba + bi[row];
2082         pc2 = ba + bi[row+1];
2083 
2084         sctx.pv       = rtmp22[row+1];
2085         rs            = 0.0;
2086         rtmp11[row]   = 1.0/rtmp11[row];
2087         rtmp22[row+1] = 1.0/rtmp22[row+1];
2088         /* copy row entries from dense representation to sparse */
2089         for (j=0; j<nz; j++) {
2090           idx    = pj[j];
2091           pc1[j] = rtmp11[idx];
2092           pc2[j] = rtmp22[idx];
2093           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2094         }
2095         sctx.rs = rs;
2096         PetscCall(MatPivotCheck(B,A,info,&sctx,row+1));
2097         if (sctx.newshift) goto endofwhile;
2098         break;
2099 
2100       case 3:
2101         for  (j=0; j<nz; j++) {
2102           idx         = bjtmp[j];
2103           rtmp11[idx] = 0.0;
2104           rtmp22[idx] = 0.0;
2105           rtmp33[idx] = 0.0;
2106         }
2107         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2108         idx    = r[row];
2109         nz_tmp = ai[idx+1] - ai[idx];
2110         ajtmp  = aj + ai[idx];
2111         v1     = aa + ai[idx];
2112         v2     = aa + ai[idx+1];
2113         v3     = aa + ai[idx+2];
2114         for (j=0; j<nz_tmp; j++) {
2115           idx         = ics[ajtmp[j]];
2116           rtmp11[idx] = v1[j];
2117           rtmp22[idx] = v2[j];
2118           rtmp33[idx] = v3[j];
2119         }
2120         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2121         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2122         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2123 
2124         /* loop over all pivot row blocks above this row block */
2125         prow = *bjtmp++;
2126         while (prow < row) {
2127           pc1 = rtmp11 + prow;
2128           pc2 = rtmp22 + prow;
2129           pc3 = rtmp33 + prow;
2130           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2131             pv   = ba  + bd[prow];
2132             pj   = nbj + bd[prow];
2133             mul1 = *pc1 * *pv;
2134             mul2 = *pc2 * *pv;
2135             mul3 = *pc3 * *pv;
2136             ++pv;
2137             *pc1 = mul1;
2138             *pc2 = mul2;
2139             *pc3 = mul3;
2140 
2141             nz_tmp = bi[prow+1] - bd[prow] - 1;
2142             /* update this row based on pivot row */
2143             for (j=0; j<nz_tmp; j++) {
2144               tmp          = pv[j];
2145               idx          = pj[j];
2146               rtmp11[idx] -= mul1 * tmp;
2147               rtmp22[idx] -= mul2 * tmp;
2148               rtmp33[idx] -= mul3 * tmp;
2149             }
2150             PetscCall(PetscLogFlops(3+6.0*nz_tmp));
2151           }
2152           prow = *bjtmp++;
2153         }
2154 
2155         /* Now take care of diagonal 3x3 block in this set of rows */
2156         /* note: prow = row here */
2157         pc1 = rtmp11 + prow;
2158         pc2 = rtmp22 + prow;
2159         pc3 = rtmp33 + prow;
2160 
2161         sctx.pv = *pc1;
2162         pj      = bj + bi[prow];
2163         rs      = 0.0;
2164         for (j=0; j<nz; j++) {
2165           idx = pj[j];
2166           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2167         }
2168         sctx.rs = rs;
2169         PetscCall(MatPivotCheck(B,A,info,&sctx,row));
2170         if (sctx.newshift) goto endofwhile;
2171 
2172         if (*pc2 != 0.0 || *pc3 != 0.0) {
2173           mul2   = (*pc2)/(*pc1);
2174           mul3   = (*pc3)/(*pc1);
2175           *pc2   = mul2;
2176           *pc3   = mul3;
2177           nz_tmp = bi[prow+1] - bd[prow] - 1;
2178           pj     = nbj + bd[prow];
2179           for (j=0; j<nz_tmp; j++) {
2180             idx          = pj[j];
2181             tmp          = rtmp11[idx];
2182             rtmp22[idx] -= mul2 * tmp;
2183             rtmp33[idx] -= mul3 * tmp;
2184           }
2185           PetscCall(PetscLogFlops(2+4.0*nz_tmp));
2186         }
2187         ++prow;
2188 
2189         pc2     = rtmp22 + prow;
2190         pc3     = rtmp33 + prow;
2191         sctx.pv = *pc2;
2192         pj      = bj + bi[prow];
2193         rs      = 0.0;
2194         for (j=0; j<nz; j++) {
2195           idx = pj[j];
2196           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2197         }
2198         sctx.rs = rs;
2199         PetscCall(MatPivotCheck(B,A,info,&sctx,row+1));
2200         if (sctx.newshift) goto endofwhile;
2201 
2202         if (*pc3 != 0.0) {
2203           mul3   = (*pc3)/(*pc2);
2204           *pc3   = mul3;
2205           pj     = nbj + bd[prow];
2206           nz_tmp = bi[prow+1] - bd[prow] - 1;
2207           for (j=0; j<nz_tmp; j++) {
2208             idx          = pj[j];
2209             tmp          = rtmp22[idx];
2210             rtmp33[idx] -= mul3 * tmp;
2211           }
2212           PetscCall(PetscLogFlops(1+2.0*nz_tmp));
2213         }
2214 
2215         pj  = bj + bi[row];
2216         pc1 = ba + bi[row];
2217         pc2 = ba + bi[row+1];
2218         pc3 = ba + bi[row+2];
2219 
2220         sctx.pv       = rtmp33[row+2];
2221         rs            = 0.0;
2222         rtmp11[row]   = 1.0/rtmp11[row];
2223         rtmp22[row+1] = 1.0/rtmp22[row+1];
2224         rtmp33[row+2] = 1.0/rtmp33[row+2];
2225         /* copy row entries from dense representation to sparse */
2226         for (j=0; j<nz; j++) {
2227           idx    = pj[j];
2228           pc1[j] = rtmp11[idx];
2229           pc2[j] = rtmp22[idx];
2230           pc3[j] = rtmp33[idx];
2231           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2232         }
2233 
2234         sctx.rs = rs;
2235         PetscCall(MatPivotCheck(B,A,info,&sctx,row+2));
2236         if (sctx.newshift) goto endofwhile;
2237         break;
2238 
2239       default:
2240         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported ");
2241       }
2242       row += nodesz;                 /* Update the row */
2243     }
2244 endofwhile:;
2245   } while (sctx.newshift);
2246   PetscCall(PetscFree3(rtmp11,rtmp22,rtmp33));
2247   PetscCall(PetscFree(tmp_vec2));
2248   PetscCall(ISRestoreIndices(isicol,&ic));
2249   PetscCall(ISRestoreIndices(isrow,&r));
2250   PetscCall(ISRestoreIndices(iscol,&c));
2251 
2252   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2253   /* do not set solve add, since MatSolve_Inode + Add is faster */
2254   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2255   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2256   C->assembled              = PETSC_TRUE;
2257   C->preallocated           = PETSC_TRUE;
2258   if (sctx.nshift) {
2259     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2260       PetscCall(PetscInfo(A,"number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top));
2261     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2262       PetscCall(PetscInfo(A,"number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount));
2263     }
2264   }
2265   PetscCall(PetscLogFlops(C->cmap->n));
2266   PetscCall(MatSeqAIJCheckInode(C));
2267   PetscFunctionReturn(0);
2268 }
2269 
2270 /* ----------------------------------------------------------- */
2271 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2272 {
2273   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2274   IS                iscol = a->col,isrow = a->row;
2275   const PetscInt    *r,*c,*rout,*cout;
2276   PetscInt          i,j,n = A->rmap->n;
2277   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2278   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2279   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2280   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2281   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2282   const PetscScalar *b;
2283 
2284   PetscFunctionBegin;
2285   PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2286   node_max = a->inode.node_count;
2287   ns       = a->inode.size;     /* Node Size array */
2288 
2289   PetscCall(VecGetArrayRead(bb,&b));
2290   PetscCall(VecGetArrayWrite(xx,&x));
2291   tmp  = a->solve_work;
2292 
2293   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
2294   PetscCall(ISGetIndices(iscol,&cout)); c = cout;
2295 
2296   /* forward solve the lower triangular */
2297   tmps = tmp;
2298   aa   = a_a;
2299   aj   = a_j;
2300   ad   = a->diag;
2301 
2302   for (i = 0,row = 0; i< node_max; ++i) {
2303     nsz = ns[i];
2304     aii = ai[row];
2305     v1  = aa + aii;
2306     vi  = aj + aii;
2307     nz  = ai[row+1]- ai[row];
2308 
2309     if (i < node_max-1) {
2310       /* Prefetch the indices for the next block */
2311       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2312       /* Prefetch the data for the next block */
2313       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2314     }
2315 
2316     switch (nsz) {               /* Each loop in 'case' is unrolled */
2317     case 1:
2318       sum1 = b[r[row]];
2319       for (j=0; j<nz-1; j+=2) {
2320         i0    = vi[j];
2321         i1    = vi[j+1];
2322         tmp0  = tmps[i0];
2323         tmp1  = tmps[i1];
2324         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2325       }
2326       if (j == nz-1) {
2327         tmp0  = tmps[vi[j]];
2328         sum1 -= v1[j]*tmp0;
2329       }
2330       tmp[row++]=sum1;
2331       break;
2332     case 2:
2333       sum1 = b[r[row]];
2334       sum2 = b[r[row+1]];
2335       v2   = aa + ai[row+1];
2336 
2337       for (j=0; j<nz-1; j+=2) {
2338         i0    = vi[j];
2339         i1    = vi[j+1];
2340         tmp0  = tmps[i0];
2341         tmp1  = tmps[i1];
2342         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2343         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2344       }
2345       if (j == nz-1) {
2346         tmp0  = tmps[vi[j]];
2347         sum1 -= v1[j] *tmp0;
2348         sum2 -= v2[j] *tmp0;
2349       }
2350       sum2     -= v2[nz] * sum1;
2351       tmp[row++]=sum1;
2352       tmp[row++]=sum2;
2353       break;
2354     case 3:
2355       sum1 = b[r[row]];
2356       sum2 = b[r[row+1]];
2357       sum3 = b[r[row+2]];
2358       v2   = aa + ai[row+1];
2359       v3   = aa + ai[row+2];
2360 
2361       for (j=0; j<nz-1; j+=2) {
2362         i0    = vi[j];
2363         i1    = vi[j+1];
2364         tmp0  = tmps[i0];
2365         tmp1  = tmps[i1];
2366         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2367         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2368         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2369       }
2370       if (j == nz-1) {
2371         tmp0  = tmps[vi[j]];
2372         sum1 -= v1[j] *tmp0;
2373         sum2 -= v2[j] *tmp0;
2374         sum3 -= v3[j] *tmp0;
2375       }
2376       sum2     -= v2[nz] * sum1;
2377       sum3     -= v3[nz] * sum1;
2378       sum3     -= v3[nz+1] * sum2;
2379       tmp[row++]=sum1;
2380       tmp[row++]=sum2;
2381       tmp[row++]=sum3;
2382       break;
2383 
2384     case 4:
2385       sum1 = b[r[row]];
2386       sum2 = b[r[row+1]];
2387       sum3 = b[r[row+2]];
2388       sum4 = b[r[row+3]];
2389       v2   = aa + ai[row+1];
2390       v3   = aa + ai[row+2];
2391       v4   = aa + ai[row+3];
2392 
2393       for (j=0; j<nz-1; j+=2) {
2394         i0    = vi[j];
2395         i1    = vi[j+1];
2396         tmp0  = tmps[i0];
2397         tmp1  = tmps[i1];
2398         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2399         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2400         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2401         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2402       }
2403       if (j == nz-1) {
2404         tmp0  = tmps[vi[j]];
2405         sum1 -= v1[j] *tmp0;
2406         sum2 -= v2[j] *tmp0;
2407         sum3 -= v3[j] *tmp0;
2408         sum4 -= v4[j] *tmp0;
2409       }
2410       sum2 -= v2[nz] * sum1;
2411       sum3 -= v3[nz] * sum1;
2412       sum4 -= v4[nz] * sum1;
2413       sum3 -= v3[nz+1] * sum2;
2414       sum4 -= v4[nz+1] * sum2;
2415       sum4 -= v4[nz+2] * sum3;
2416 
2417       tmp[row++]=sum1;
2418       tmp[row++]=sum2;
2419       tmp[row++]=sum3;
2420       tmp[row++]=sum4;
2421       break;
2422     case 5:
2423       sum1 = b[r[row]];
2424       sum2 = b[r[row+1]];
2425       sum3 = b[r[row+2]];
2426       sum4 = b[r[row+3]];
2427       sum5 = b[r[row+4]];
2428       v2   = aa + ai[row+1];
2429       v3   = aa + ai[row+2];
2430       v4   = aa + ai[row+3];
2431       v5   = aa + ai[row+4];
2432 
2433       for (j=0; j<nz-1; j+=2) {
2434         i0    = vi[j];
2435         i1    = vi[j+1];
2436         tmp0  = tmps[i0];
2437         tmp1  = tmps[i1];
2438         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2439         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2440         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2441         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2442         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2443       }
2444       if (j == nz-1) {
2445         tmp0  = tmps[vi[j]];
2446         sum1 -= v1[j] *tmp0;
2447         sum2 -= v2[j] *tmp0;
2448         sum3 -= v3[j] *tmp0;
2449         sum4 -= v4[j] *tmp0;
2450         sum5 -= v5[j] *tmp0;
2451       }
2452 
2453       sum2 -= v2[nz] * sum1;
2454       sum3 -= v3[nz] * sum1;
2455       sum4 -= v4[nz] * sum1;
2456       sum5 -= v5[nz] * sum1;
2457       sum3 -= v3[nz+1] * sum2;
2458       sum4 -= v4[nz+1] * sum2;
2459       sum5 -= v5[nz+1] * sum2;
2460       sum4 -= v4[nz+2] * sum3;
2461       sum5 -= v5[nz+2] * sum3;
2462       sum5 -= v5[nz+3] * sum4;
2463 
2464       tmp[row++]=sum1;
2465       tmp[row++]=sum2;
2466       tmp[row++]=sum3;
2467       tmp[row++]=sum4;
2468       tmp[row++]=sum5;
2469       break;
2470     default:
2471       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
2472     }
2473   }
2474   /* backward solve the upper triangular */
2475   for (i=node_max -1,row = n-1; i>=0; i--) {
2476     nsz = ns[i];
2477     aii = ad[row+1] + 1;
2478     v1  = aa + aii;
2479     vi  = aj + aii;
2480     nz  = ad[row]- ad[row+1] - 1;
2481 
2482     if (i > 0) {
2483       /* Prefetch the indices for the next block */
2484       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2485       /* Prefetch the data for the next block */
2486       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2487     }
2488 
2489     switch (nsz) {               /* Each loop in 'case' is unrolled */
2490     case 1:
2491       sum1 = tmp[row];
2492 
2493       for (j=0; j<nz-1; j+=2) {
2494         i0    = vi[j];
2495         i1    = vi[j+1];
2496         tmp0  = tmps[i0];
2497         tmp1  = tmps[i1];
2498         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2499       }
2500       if (j == nz-1) {
2501         tmp0  = tmps[vi[j]];
2502         sum1 -= v1[j]*tmp0;
2503       }
2504       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2505       break;
2506     case 2:
2507       sum1 = tmp[row];
2508       sum2 = tmp[row-1];
2509       v2   = aa + ad[row] + 1;
2510       for (j=0; j<nz-1; j+=2) {
2511         i0    = vi[j];
2512         i1    = vi[j+1];
2513         tmp0  = tmps[i0];
2514         tmp1  = tmps[i1];
2515         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2516         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2517       }
2518       if (j == nz-1) {
2519         tmp0  = tmps[vi[j]];
2520         sum1 -= v1[j]* tmp0;
2521         sum2 -= v2[j+1]* tmp0;
2522       }
2523 
2524       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2525       sum2     -= v2[0] * tmp0;
2526       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2527       break;
2528     case 3:
2529       sum1 = tmp[row];
2530       sum2 = tmp[row -1];
2531       sum3 = tmp[row -2];
2532       v2   = aa + ad[row] + 1;
2533       v3   = aa + ad[row -1] + 1;
2534       for (j=0; j<nz-1; j+=2) {
2535         i0    = vi[j];
2536         i1    = vi[j+1];
2537         tmp0  = tmps[i0];
2538         tmp1  = tmps[i1];
2539         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2540         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2541         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2542       }
2543       if (j== nz-1) {
2544         tmp0  = tmps[vi[j]];
2545         sum1 -= v1[j] * tmp0;
2546         sum2 -= v2[j+1] * tmp0;
2547         sum3 -= v3[j+2] * tmp0;
2548       }
2549       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2550       sum2     -= v2[0]* tmp0;
2551       sum3     -= v3[1] * tmp0;
2552       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2553       sum3     -= v3[0]* tmp0;
2554       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2555 
2556       break;
2557     case 4:
2558       sum1 = tmp[row];
2559       sum2 = tmp[row -1];
2560       sum3 = tmp[row -2];
2561       sum4 = tmp[row -3];
2562       v2   = aa + ad[row]+1;
2563       v3   = aa + ad[row -1]+1;
2564       v4   = aa + ad[row -2]+1;
2565 
2566       for (j=0; j<nz-1; j+=2) {
2567         i0    = vi[j];
2568         i1    = vi[j+1];
2569         tmp0  = tmps[i0];
2570         tmp1  = tmps[i1];
2571         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2572         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2573         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2574         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2575       }
2576       if (j== nz-1) {
2577         tmp0  = tmps[vi[j]];
2578         sum1 -= v1[j] * tmp0;
2579         sum2 -= v2[j+1] * tmp0;
2580         sum3 -= v3[j+2] * tmp0;
2581         sum4 -= v4[j+3] * tmp0;
2582       }
2583 
2584       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2585       sum2     -= v2[0] * tmp0;
2586       sum3     -= v3[1] * tmp0;
2587       sum4     -= v4[2] * tmp0;
2588       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2589       sum3     -= v3[0] * tmp0;
2590       sum4     -= v4[1] * tmp0;
2591       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2592       sum4     -= v4[0] * tmp0;
2593       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2594       break;
2595     case 5:
2596       sum1 = tmp[row];
2597       sum2 = tmp[row -1];
2598       sum3 = tmp[row -2];
2599       sum4 = tmp[row -3];
2600       sum5 = tmp[row -4];
2601       v2   = aa + ad[row]+1;
2602       v3   = aa + ad[row -1]+1;
2603       v4   = aa + ad[row -2]+1;
2604       v5   = aa + ad[row -3]+1;
2605       for (j=0; j<nz-1; j+=2) {
2606         i0    = vi[j];
2607         i1    = vi[j+1];
2608         tmp0  = tmps[i0];
2609         tmp1  = tmps[i1];
2610         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2611         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2612         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2613         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2614         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2615       }
2616       if (j==nz-1) {
2617         tmp0  = tmps[vi[j]];
2618         sum1 -= v1[j] * tmp0;
2619         sum2 -= v2[j+1] * tmp0;
2620         sum3 -= v3[j+2] * tmp0;
2621         sum4 -= v4[j+3] * tmp0;
2622         sum5 -= v5[j+4] * tmp0;
2623       }
2624 
2625       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2626       sum2     -= v2[0] * tmp0;
2627       sum3     -= v3[1] * tmp0;
2628       sum4     -= v4[2] * tmp0;
2629       sum5     -= v5[3] * tmp0;
2630       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2631       sum3     -= v3[0] * tmp0;
2632       sum4     -= v4[1] * tmp0;
2633       sum5     -= v5[2] * tmp0;
2634       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2635       sum4     -= v4[0] * tmp0;
2636       sum5     -= v5[1] * tmp0;
2637       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2638       sum5     -= v5[0] * tmp0;
2639       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2640       break;
2641     default:
2642       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported ");
2643     }
2644   }
2645   PetscCall(ISRestoreIndices(isrow,&rout));
2646   PetscCall(ISRestoreIndices(iscol,&cout));
2647   PetscCall(VecRestoreArrayRead(bb,&b));
2648   PetscCall(VecRestoreArrayWrite(xx,&x));
2649   PetscCall(PetscLogFlops(2.0*a->nz - A->cmap->n));
2650   PetscFunctionReturn(0);
2651 }
2652 
2653 /*
2654      Makes a longer coloring[] array and calls the usual code with that
2655 */
2656 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2657 {
2658   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2659   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2660   PetscInt        *colorused,i;
2661   ISColoringValue *newcolor;
2662 
2663   PetscFunctionBegin;
2664   PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2665   PetscCall(PetscMalloc1(n+1,&newcolor));
2666   /* loop over inodes, marking a color for each column*/
2667   row = 0;
2668   for (i=0; i<m; i++) {
2669     for (j=0; j<ns[i]; j++) {
2670       newcolor[row++] = coloring[i] + j*ncolors;
2671     }
2672   }
2673 
2674   /* eliminate unneeded colors */
2675   PetscCall(PetscCalloc1(5*ncolors,&colorused));
2676   for (i=0; i<n; i++) {
2677     colorused[newcolor[i]] = 1;
2678   }
2679 
2680   for (i=1; i<5*ncolors; i++) {
2681     colorused[i] += colorused[i-1];
2682   }
2683   ncolors = colorused[5*ncolors-1];
2684   for (i=0; i<n; i++) {
2685     newcolor[i] = colorused[newcolor[i]]-1;
2686   }
2687   PetscCall(PetscFree(colorused));
2688   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring));
2689   PetscCall(PetscFree(coloring));
2690   PetscFunctionReturn(0);
2691 }
2692 
2693 #include <petsc/private/kernels/blockinvert.h>
2694 
2695 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2696 {
2697   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2698   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2699   MatScalar         *ibdiag,*bdiag,work[25],*t;
2700   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2701   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2702   const PetscScalar *xb, *b;
2703   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2704   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2705   PetscInt          sz,k,ipvt[5];
2706   PetscBool         allowzeropivot,zeropivotdetected;
2707   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2708 
2709   PetscFunctionBegin;
2710   allowzeropivot = PetscNot(A->erroriffailure);
2711   PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2712   PetscCheck(omega == 1.0,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2713   PetscCheck(fshift == 0.0,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2714 
2715   if (!a->inode.ibdiagvalid) {
2716     if (!a->inode.ibdiag) {
2717       /* calculate space needed for diagonal blocks */
2718       for (i=0; i<m; i++) {
2719         cnt += sizes[i]*sizes[i];
2720       }
2721       a->inode.bdiagsize = cnt;
2722 
2723       PetscCall(PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work));
2724     }
2725 
2726     /* copy over the diagonal blocks and invert them */
2727     ibdiag = a->inode.ibdiag;
2728     bdiag  = a->inode.bdiag;
2729     cnt    = 0;
2730     for (i=0, row = 0; i<m; i++) {
2731       for (j=0; j<sizes[i]; j++) {
2732         for (k=0; k<sizes[i]; k++) {
2733           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2734         }
2735       }
2736       PetscCall(PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]));
2737 
2738       switch (sizes[i]) {
2739       case 1:
2740         /* Create matrix data structure */
2741         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2742           if (allowzeropivot) {
2743             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2744             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2745             A->factorerror_zeropivot_row   = row;
2746             PetscCall(PetscInfo(A,"Zero pivot, row %" PetscInt_FMT "\n",row));
2747           } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %" PetscInt_FMT,row);
2748         }
2749         ibdiag[cnt] = 1.0/ibdiag[cnt];
2750         break;
2751       case 2:
2752         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected));
2753         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2754         break;
2755       case 3:
2756         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected));
2757         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2758         break;
2759       case 4:
2760         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected));
2761         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2762         break;
2763       case 5:
2764         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected));
2765         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2766         break;
2767       default:
2768         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
2769       }
2770       cnt += sizes[i]*sizes[i];
2771       row += sizes[i];
2772     }
2773     a->inode.ibdiagvalid = PETSC_TRUE;
2774   }
2775   ibdiag = a->inode.ibdiag;
2776   bdiag  = a->inode.bdiag;
2777   t      = a->inode.ssor_work;
2778 
2779   PetscCall(VecGetArray(xx,&x));
2780   PetscCall(VecGetArrayRead(bb,&b));
2781   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2782   if (flag & SOR_ZERO_INITIAL_GUESS) {
2783     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2784 
2785       for (i=0, row=0; i<m; i++) {
2786         sz  = diag[row] - ii[row];
2787         v1  = a->a + ii[row];
2788         idx = a->j + ii[row];
2789 
2790         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2791         switch (sizes[i]) {
2792         case 1:
2793 
2794           sum1 = b[row];
2795           for (n = 0; n<sz-1; n+=2) {
2796             i1    = idx[0];
2797             i2    = idx[1];
2798             idx  += 2;
2799             tmp0  = x[i1];
2800             tmp1  = x[i2];
2801             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2802           }
2803 
2804           if (n == sz-1) {
2805             tmp0  = x[*idx];
2806             sum1 -= *v1 * tmp0;
2807           }
2808           t[row]   = sum1;
2809           x[row++] = sum1*(*ibdiag++);
2810           break;
2811         case 2:
2812           v2   = a->a + ii[row+1];
2813           sum1 = b[row];
2814           sum2 = b[row+1];
2815           for (n = 0; n<sz-1; n+=2) {
2816             i1    = idx[0];
2817             i2    = idx[1];
2818             idx  += 2;
2819             tmp0  = x[i1];
2820             tmp1  = x[i2];
2821             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2822             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2823           }
2824 
2825           if (n == sz-1) {
2826             tmp0  = x[*idx];
2827             sum1 -= v1[0] * tmp0;
2828             sum2 -= v2[0] * tmp0;
2829           }
2830           t[row]   = sum1;
2831           t[row+1] = sum2;
2832           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2833           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2834           ibdiag  += 4;
2835           break;
2836         case 3:
2837           v2   = a->a + ii[row+1];
2838           v3   = a->a + ii[row+2];
2839           sum1 = b[row];
2840           sum2 = b[row+1];
2841           sum3 = b[row+2];
2842           for (n = 0; n<sz-1; n+=2) {
2843             i1    = idx[0];
2844             i2    = idx[1];
2845             idx  += 2;
2846             tmp0  = x[i1];
2847             tmp1  = x[i2];
2848             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2849             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2850             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2851           }
2852 
2853           if (n == sz-1) {
2854             tmp0  = x[*idx];
2855             sum1 -= v1[0] * tmp0;
2856             sum2 -= v2[0] * tmp0;
2857             sum3 -= v3[0] * tmp0;
2858           }
2859           t[row]   = sum1;
2860           t[row+1] = sum2;
2861           t[row+2] = sum3;
2862           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2863           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2864           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2865           ibdiag  += 9;
2866           break;
2867         case 4:
2868           v2   = a->a + ii[row+1];
2869           v3   = a->a + ii[row+2];
2870           v4   = a->a + ii[row+3];
2871           sum1 = b[row];
2872           sum2 = b[row+1];
2873           sum3 = b[row+2];
2874           sum4 = b[row+3];
2875           for (n = 0; n<sz-1; n+=2) {
2876             i1    = idx[0];
2877             i2    = idx[1];
2878             idx  += 2;
2879             tmp0  = x[i1];
2880             tmp1  = x[i2];
2881             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2882             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2883             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2884             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2885           }
2886 
2887           if (n == sz-1) {
2888             tmp0  = x[*idx];
2889             sum1 -= v1[0] * tmp0;
2890             sum2 -= v2[0] * tmp0;
2891             sum3 -= v3[0] * tmp0;
2892             sum4 -= v4[0] * tmp0;
2893           }
2894           t[row]   = sum1;
2895           t[row+1] = sum2;
2896           t[row+2] = sum3;
2897           t[row+3] = sum4;
2898           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2899           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2900           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2901           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2902           ibdiag  += 16;
2903           break;
2904         case 5:
2905           v2   = a->a + ii[row+1];
2906           v3   = a->a + ii[row+2];
2907           v4   = a->a + ii[row+3];
2908           v5   = a->a + ii[row+4];
2909           sum1 = b[row];
2910           sum2 = b[row+1];
2911           sum3 = b[row+2];
2912           sum4 = b[row+3];
2913           sum5 = b[row+4];
2914           for (n = 0; n<sz-1; n+=2) {
2915             i1    = idx[0];
2916             i2    = idx[1];
2917             idx  += 2;
2918             tmp0  = x[i1];
2919             tmp1  = x[i2];
2920             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2921             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2922             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2923             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2924             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2925           }
2926 
2927           if (n == sz-1) {
2928             tmp0  = x[*idx];
2929             sum1 -= v1[0] * tmp0;
2930             sum2 -= v2[0] * tmp0;
2931             sum3 -= v3[0] * tmp0;
2932             sum4 -= v4[0] * tmp0;
2933             sum5 -= v5[0] * tmp0;
2934           }
2935           t[row]   = sum1;
2936           t[row+1] = sum2;
2937           t[row+2] = sum3;
2938           t[row+3] = sum4;
2939           t[row+4] = sum5;
2940           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2941           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2942           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2943           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2944           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2945           ibdiag  += 25;
2946           break;
2947         default:
2948           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
2949         }
2950       }
2951 
2952       xb   = t;
2953       PetscCall(PetscLogFlops(a->nz));
2954     } else xb = b;
2955     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2956 
2957       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2958       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2959         ibdiag -= sizes[i]*sizes[i];
2960         sz      = ii[row+1] - diag[row] - 1;
2961         v1      = a->a + diag[row] + 1;
2962         idx     = a->j + diag[row] + 1;
2963 
2964         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2965         switch (sizes[i]) {
2966         case 1:
2967 
2968           sum1 = xb[row];
2969           for (n = 0; n<sz-1; n+=2) {
2970             i1    = idx[0];
2971             i2    = idx[1];
2972             idx  += 2;
2973             tmp0  = x[i1];
2974             tmp1  = x[i2];
2975             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2976           }
2977 
2978           if (n == sz-1) {
2979             tmp0  = x[*idx];
2980             sum1 -= *v1*tmp0;
2981           }
2982           x[row--] = sum1*(*ibdiag);
2983           break;
2984 
2985         case 2:
2986 
2987           sum1 = xb[row];
2988           sum2 = xb[row-1];
2989           /* note that sum1 is associated with the second of the two rows */
2990           v2 = a->a + diag[row-1] + 2;
2991           for (n = 0; n<sz-1; n+=2) {
2992             i1    = idx[0];
2993             i2    = idx[1];
2994             idx  += 2;
2995             tmp0  = x[i1];
2996             tmp1  = x[i2];
2997             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2998             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2999           }
3000 
3001           if (n == sz-1) {
3002             tmp0  = x[*idx];
3003             sum1 -= *v1*tmp0;
3004             sum2 -= *v2*tmp0;
3005           }
3006           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3007           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3008           break;
3009         case 3:
3010 
3011           sum1 = xb[row];
3012           sum2 = xb[row-1];
3013           sum3 = xb[row-2];
3014           v2   = a->a + diag[row-1] + 2;
3015           v3   = a->a + diag[row-2] + 3;
3016           for (n = 0; n<sz-1; n+=2) {
3017             i1    = idx[0];
3018             i2    = idx[1];
3019             idx  += 2;
3020             tmp0  = x[i1];
3021             tmp1  = x[i2];
3022             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3023             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3024             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3025           }
3026 
3027           if (n == sz-1) {
3028             tmp0  = x[*idx];
3029             sum1 -= *v1*tmp0;
3030             sum2 -= *v2*tmp0;
3031             sum3 -= *v3*tmp0;
3032           }
3033           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3034           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3035           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3036           break;
3037         case 4:
3038 
3039           sum1 = xb[row];
3040           sum2 = xb[row-1];
3041           sum3 = xb[row-2];
3042           sum4 = xb[row-3];
3043           v2   = a->a + diag[row-1] + 2;
3044           v3   = a->a + diag[row-2] + 3;
3045           v4   = a->a + diag[row-3] + 4;
3046           for (n = 0; n<sz-1; n+=2) {
3047             i1    = idx[0];
3048             i2    = idx[1];
3049             idx  += 2;
3050             tmp0  = x[i1];
3051             tmp1  = x[i2];
3052             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3053             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3054             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3055             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3056           }
3057 
3058           if (n == sz-1) {
3059             tmp0  = x[*idx];
3060             sum1 -= *v1*tmp0;
3061             sum2 -= *v2*tmp0;
3062             sum3 -= *v3*tmp0;
3063             sum4 -= *v4*tmp0;
3064           }
3065           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3066           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3067           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3068           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3069           break;
3070         case 5:
3071 
3072           sum1 = xb[row];
3073           sum2 = xb[row-1];
3074           sum3 = xb[row-2];
3075           sum4 = xb[row-3];
3076           sum5 = xb[row-4];
3077           v2   = a->a + diag[row-1] + 2;
3078           v3   = a->a + diag[row-2] + 3;
3079           v4   = a->a + diag[row-3] + 4;
3080           v5   = a->a + diag[row-4] + 5;
3081           for (n = 0; n<sz-1; n+=2) {
3082             i1    = idx[0];
3083             i2    = idx[1];
3084             idx  += 2;
3085             tmp0  = x[i1];
3086             tmp1  = x[i2];
3087             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3088             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3089             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3090             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3091             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3092           }
3093 
3094           if (n == sz-1) {
3095             tmp0  = x[*idx];
3096             sum1 -= *v1*tmp0;
3097             sum2 -= *v2*tmp0;
3098             sum3 -= *v3*tmp0;
3099             sum4 -= *v4*tmp0;
3100             sum5 -= *v5*tmp0;
3101           }
3102           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3103           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3104           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3105           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3106           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3107           break;
3108         default:
3109           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3110         }
3111       }
3112 
3113       PetscCall(PetscLogFlops(a->nz));
3114     }
3115     its--;
3116   }
3117   while (its--) {
3118 
3119     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3120       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3121            i<m;
3122            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3123 
3124         sz  = diag[row] - ii[row];
3125         v1  = a->a + ii[row];
3126         idx = a->j + ii[row];
3127         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3128         switch (sizes[i]) {
3129         case 1:
3130           sum1 = b[row];
3131           for (n = 0; n<sz-1; n+=2) {
3132             i1    = idx[0];
3133             i2    = idx[1];
3134             idx  += 2;
3135             tmp0  = x[i1];
3136             tmp1  = x[i2];
3137             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3138           }
3139           if (n == sz-1) {
3140             tmp0  = x[*idx++];
3141             sum1 -= *v1 * tmp0;
3142             v1++;
3143           }
3144           t[row]   = sum1;
3145           sz      = ii[row+1] - diag[row] - 1;
3146           idx     = a->j + diag[row] + 1;
3147           v1 += 1;
3148           for (n = 0; n<sz-1; n+=2) {
3149             i1    = idx[0];
3150             i2    = idx[1];
3151             idx  += 2;
3152             tmp0  = x[i1];
3153             tmp1  = x[i2];
3154             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3155           }
3156           if (n == sz-1) {
3157             tmp0  = x[*idx++];
3158             sum1 -= *v1 * tmp0;
3159           }
3160           /* in MatSOR_SeqAIJ this line would be
3161            *
3162            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3163            *
3164            * but omega == 1, so this becomes
3165            *
3166            * x[row] = sum1*(*ibdiag++);
3167            *
3168            */
3169           x[row] = sum1*(*ibdiag);
3170           break;
3171         case 2:
3172           v2   = a->a + ii[row+1];
3173           sum1 = b[row];
3174           sum2 = b[row+1];
3175           for (n = 0; n<sz-1; n+=2) {
3176             i1    = idx[0];
3177             i2    = idx[1];
3178             idx  += 2;
3179             tmp0  = x[i1];
3180             tmp1  = x[i2];
3181             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3182             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3183           }
3184           if (n == sz-1) {
3185             tmp0  = x[*idx++];
3186             sum1 -= v1[0] * tmp0;
3187             sum2 -= v2[0] * tmp0;
3188             v1++; v2++;
3189           }
3190           t[row]   = sum1;
3191           t[row+1] = sum2;
3192           sz      = ii[row+1] - diag[row] - 2;
3193           idx     = a->j + diag[row] + 2;
3194           v1 += 2;
3195           v2 += 2;
3196           for (n = 0; n<sz-1; n+=2) {
3197             i1    = idx[0];
3198             i2    = idx[1];
3199             idx  += 2;
3200             tmp0  = x[i1];
3201             tmp1  = x[i2];
3202             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3203             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3204           }
3205           if (n == sz-1) {
3206             tmp0  = x[*idx];
3207             sum1 -= v1[0] * tmp0;
3208             sum2 -= v2[0] * tmp0;
3209           }
3210           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3211           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3212           break;
3213         case 3:
3214           v2   = a->a + ii[row+1];
3215           v3   = a->a + ii[row+2];
3216           sum1 = b[row];
3217           sum2 = b[row+1];
3218           sum3 = b[row+2];
3219           for (n = 0; n<sz-1; n+=2) {
3220             i1    = idx[0];
3221             i2    = idx[1];
3222             idx  += 2;
3223             tmp0  = x[i1];
3224             tmp1  = x[i2];
3225             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3226             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3227             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3228           }
3229           if (n == sz-1) {
3230             tmp0  = x[*idx++];
3231             sum1 -= v1[0] * tmp0;
3232             sum2 -= v2[0] * tmp0;
3233             sum3 -= v3[0] * tmp0;
3234             v1++; v2++; v3++;
3235           }
3236           t[row]   = sum1;
3237           t[row+1] = sum2;
3238           t[row+2] = sum3;
3239           sz      = ii[row+1] - diag[row] - 3;
3240           idx     = a->j + diag[row] + 3;
3241           v1 += 3;
3242           v2 += 3;
3243           v3 += 3;
3244           for (n = 0; n<sz-1; n+=2) {
3245             i1    = idx[0];
3246             i2    = idx[1];
3247             idx  += 2;
3248             tmp0  = x[i1];
3249             tmp1  = x[i2];
3250             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3251             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3252             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3253           }
3254           if (n == sz-1) {
3255             tmp0  = x[*idx];
3256             sum1 -= v1[0] * tmp0;
3257             sum2 -= v2[0] * tmp0;
3258             sum3 -= v3[0] * tmp0;
3259           }
3260           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3261           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3262           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3263           break;
3264         case 4:
3265           v2   = a->a + ii[row+1];
3266           v3   = a->a + ii[row+2];
3267           v4   = a->a + ii[row+3];
3268           sum1 = b[row];
3269           sum2 = b[row+1];
3270           sum3 = b[row+2];
3271           sum4 = b[row+3];
3272           for (n = 0; n<sz-1; n+=2) {
3273             i1    = idx[0];
3274             i2    = idx[1];
3275             idx  += 2;
3276             tmp0  = x[i1];
3277             tmp1  = x[i2];
3278             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3279             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3280             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3281             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3282           }
3283           if (n == sz-1) {
3284             tmp0  = x[*idx++];
3285             sum1 -= v1[0] * tmp0;
3286             sum2 -= v2[0] * tmp0;
3287             sum3 -= v3[0] * tmp0;
3288             sum4 -= v4[0] * tmp0;
3289             v1++; v2++; v3++; v4++;
3290           }
3291           t[row]   = sum1;
3292           t[row+1] = sum2;
3293           t[row+2] = sum3;
3294           t[row+3] = sum4;
3295           sz      = ii[row+1] - diag[row] - 4;
3296           idx     = a->j + diag[row] + 4;
3297           v1 += 4;
3298           v2 += 4;
3299           v3 += 4;
3300           v4 += 4;
3301           for (n = 0; n<sz-1; n+=2) {
3302             i1    = idx[0];
3303             i2    = idx[1];
3304             idx  += 2;
3305             tmp0  = x[i1];
3306             tmp1  = x[i2];
3307             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3308             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3309             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3310             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3311           }
3312           if (n == sz-1) {
3313             tmp0  = x[*idx];
3314             sum1 -= v1[0] * tmp0;
3315             sum2 -= v2[0] * tmp0;
3316             sum3 -= v3[0] * tmp0;
3317             sum4 -= v4[0] * tmp0;
3318           }
3319           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3320           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3321           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3322           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3323           break;
3324         case 5:
3325           v2   = a->a + ii[row+1];
3326           v3   = a->a + ii[row+2];
3327           v4   = a->a + ii[row+3];
3328           v5   = a->a + ii[row+4];
3329           sum1 = b[row];
3330           sum2 = b[row+1];
3331           sum3 = b[row+2];
3332           sum4 = b[row+3];
3333           sum5 = b[row+4];
3334           for (n = 0; n<sz-1; n+=2) {
3335             i1    = idx[0];
3336             i2    = idx[1];
3337             idx  += 2;
3338             tmp0  = x[i1];
3339             tmp1  = x[i2];
3340             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3341             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3342             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3343             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3344             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3345           }
3346           if (n == sz-1) {
3347             tmp0  = x[*idx++];
3348             sum1 -= v1[0] * tmp0;
3349             sum2 -= v2[0] * tmp0;
3350             sum3 -= v3[0] * tmp0;
3351             sum4 -= v4[0] * tmp0;
3352             sum5 -= v5[0] * tmp0;
3353             v1++; v2++; v3++; v4++; v5++;
3354           }
3355           t[row]   = sum1;
3356           t[row+1] = sum2;
3357           t[row+2] = sum3;
3358           t[row+3] = sum4;
3359           t[row+4] = sum5;
3360           sz      = ii[row+1] - diag[row] - 5;
3361           idx     = a->j + diag[row] + 5;
3362           v1 += 5;
3363           v2 += 5;
3364           v3 += 5;
3365           v4 += 5;
3366           v5 += 5;
3367           for (n = 0; n<sz-1; n+=2) {
3368             i1    = idx[0];
3369             i2    = idx[1];
3370             idx  += 2;
3371             tmp0  = x[i1];
3372             tmp1  = x[i2];
3373             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3374             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3375             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3376             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3377             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3378           }
3379           if (n == sz-1) {
3380             tmp0  = x[*idx];
3381             sum1 -= v1[0] * tmp0;
3382             sum2 -= v2[0] * tmp0;
3383             sum3 -= v3[0] * tmp0;
3384             sum4 -= v4[0] * tmp0;
3385             sum5 -= v5[0] * tmp0;
3386           }
3387           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3388           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3389           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3390           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3391           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3392           break;
3393         default:
3394           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3395         }
3396       }
3397       xb   = t;
3398       PetscCall(PetscLogFlops(2.0*a->nz));  /* undercounts diag inverse */
3399     } else xb = b;
3400 
3401     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3402 
3403       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3404       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3405         ibdiag -= sizes[i]*sizes[i];
3406 
3407         /* set RHS */
3408         if (xb == b) {
3409           /* whole (old way) */
3410           sz      = ii[row+1] - ii[row];
3411           idx     = a->j + ii[row];
3412           switch (sizes[i]) {
3413           case 5:
3414             v5      = a->a + ii[row-4];
3415           case 4: /* fall through */
3416             v4      = a->a + ii[row-3];
3417           case 3:
3418             v3      = a->a + ii[row-2];
3419           case 2:
3420             v2      = a->a + ii[row-1];
3421           case 1:
3422             v1      = a->a + ii[row];
3423             break;
3424           default:
3425             SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3426           }
3427         } else {
3428           /* upper, no diag */
3429           sz      = ii[row+1] - diag[row] - 1;
3430           idx     = a->j + diag[row] + 1;
3431           switch (sizes[i]) {
3432           case 5:
3433             v5      = a->a + diag[row-4] + 5;
3434           case 4: /* fall through */
3435             v4      = a->a + diag[row-3] + 4;
3436           case 3:
3437             v3      = a->a + diag[row-2] + 3;
3438           case 2:
3439             v2      = a->a + diag[row-1] + 2;
3440           case 1:
3441             v1      = a->a + diag[row] + 1;
3442           }
3443         }
3444         /* set sum */
3445         switch (sizes[i]) {
3446         case 5:
3447           sum5 = xb[row-4];
3448         case 4: /* fall through */
3449           sum4 = xb[row-3];
3450         case 3:
3451           sum3 = xb[row-2];
3452         case 2:
3453           sum2 = xb[row-1];
3454         case 1:
3455           /* note that sum1 is associated with the last row */
3456           sum1 = xb[row];
3457         }
3458         /* do sums */
3459         for (n = 0; n<sz-1; n+=2) {
3460             i1    = idx[0];
3461             i2    = idx[1];
3462             idx  += 2;
3463             tmp0  = x[i1];
3464             tmp1  = x[i2];
3465             switch (sizes[i]) {
3466             case 5:
3467               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3468             case 4: /* fall through */
3469               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3470             case 3:
3471               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3472             case 2:
3473               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3474             case 1:
3475               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3476             }
3477         }
3478         /* ragged edge */
3479         if (n == sz-1) {
3480           tmp0  = x[*idx];
3481           switch (sizes[i]) {
3482           case 5:
3483             sum5 -= *v5*tmp0;
3484           case 4: /* fall through */
3485             sum4 -= *v4*tmp0;
3486           case 3:
3487             sum3 -= *v3*tmp0;
3488           case 2:
3489             sum2 -= *v2*tmp0;
3490           case 1:
3491             sum1 -= *v1*tmp0;
3492           }
3493         }
3494         /* update */
3495         if (xb == b) {
3496           /* whole (old way) w/ diag */
3497           switch (sizes[i]) {
3498           case 5:
3499             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3500             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3501             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3502             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3503             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3504             break;
3505           case 4:
3506             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3507             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3508             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3509             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3510             break;
3511           case 3:
3512             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3513             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3514             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3515             break;
3516           case 2:
3517             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3518             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3519             break;
3520           case 1:
3521             x[row--] += sum1*(*ibdiag);
3522             break;
3523           }
3524         } else {
3525           /* no diag so set =  */
3526           switch (sizes[i]) {
3527           case 5:
3528             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3529             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3530             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3531             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3532             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3533             break;
3534           case 4:
3535             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3536             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3537             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3538             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3539             break;
3540           case 3:
3541             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3542             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3543             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3544             break;
3545           case 2:
3546             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3547             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3548             break;
3549           case 1:
3550             x[row--] = sum1*(*ibdiag);
3551             break;
3552           }
3553         }
3554       }
3555       if (xb == b) {
3556         PetscCall(PetscLogFlops(2.0*a->nz));
3557       } else {
3558         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3559       }
3560     }
3561   }
3562   if (flag & SOR_EISENSTAT) {
3563     /*
3564           Apply  (U + D)^-1  where D is now the block diagonal
3565     */
3566     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3567     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3568       ibdiag -= sizes[i]*sizes[i];
3569       sz      = ii[row+1] - diag[row] - 1;
3570       v1      = a->a + diag[row] + 1;
3571       idx     = a->j + diag[row] + 1;
3572       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3573       switch (sizes[i]) {
3574       case 1:
3575 
3576         sum1 = b[row];
3577         for (n = 0; n<sz-1; n+=2) {
3578           i1    = idx[0];
3579           i2    = idx[1];
3580           idx  += 2;
3581           tmp0  = x[i1];
3582           tmp1  = x[i2];
3583           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3584         }
3585 
3586         if (n == sz-1) {
3587           tmp0  = x[*idx];
3588           sum1 -= *v1*tmp0;
3589         }
3590         x[row] = sum1*(*ibdiag);row--;
3591         break;
3592 
3593       case 2:
3594 
3595         sum1 = b[row];
3596         sum2 = b[row-1];
3597         /* note that sum1 is associated with the second of the two rows */
3598         v2 = a->a + diag[row-1] + 2;
3599         for (n = 0; n<sz-1; n+=2) {
3600           i1    = idx[0];
3601           i2    = idx[1];
3602           idx  += 2;
3603           tmp0  = x[i1];
3604           tmp1  = x[i2];
3605           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3606           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3607         }
3608 
3609         if (n == sz-1) {
3610           tmp0  = x[*idx];
3611           sum1 -= *v1*tmp0;
3612           sum2 -= *v2*tmp0;
3613         }
3614         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3615         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3616         row     -= 2;
3617         break;
3618       case 3:
3619 
3620         sum1 = b[row];
3621         sum2 = b[row-1];
3622         sum3 = b[row-2];
3623         v2   = a->a + diag[row-1] + 2;
3624         v3   = a->a + diag[row-2] + 3;
3625         for (n = 0; n<sz-1; n+=2) {
3626           i1    = idx[0];
3627           i2    = idx[1];
3628           idx  += 2;
3629           tmp0  = x[i1];
3630           tmp1  = x[i2];
3631           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3632           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3633           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3634         }
3635 
3636         if (n == sz-1) {
3637           tmp0  = x[*idx];
3638           sum1 -= *v1*tmp0;
3639           sum2 -= *v2*tmp0;
3640           sum3 -= *v3*tmp0;
3641         }
3642         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3643         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3644         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3645         row     -= 3;
3646         break;
3647       case 4:
3648 
3649         sum1 = b[row];
3650         sum2 = b[row-1];
3651         sum3 = b[row-2];
3652         sum4 = b[row-3];
3653         v2   = a->a + diag[row-1] + 2;
3654         v3   = a->a + diag[row-2] + 3;
3655         v4   = a->a + diag[row-3] + 4;
3656         for (n = 0; n<sz-1; n+=2) {
3657           i1    = idx[0];
3658           i2    = idx[1];
3659           idx  += 2;
3660           tmp0  = x[i1];
3661           tmp1  = x[i2];
3662           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3663           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3664           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3665           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3666         }
3667 
3668         if (n == sz-1) {
3669           tmp0  = x[*idx];
3670           sum1 -= *v1*tmp0;
3671           sum2 -= *v2*tmp0;
3672           sum3 -= *v3*tmp0;
3673           sum4 -= *v4*tmp0;
3674         }
3675         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3676         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3677         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3678         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3679         row     -= 4;
3680         break;
3681       case 5:
3682 
3683         sum1 = b[row];
3684         sum2 = b[row-1];
3685         sum3 = b[row-2];
3686         sum4 = b[row-3];
3687         sum5 = b[row-4];
3688         v2   = a->a + diag[row-1] + 2;
3689         v3   = a->a + diag[row-2] + 3;
3690         v4   = a->a + diag[row-3] + 4;
3691         v5   = a->a + diag[row-4] + 5;
3692         for (n = 0; n<sz-1; n+=2) {
3693           i1    = idx[0];
3694           i2    = idx[1];
3695           idx  += 2;
3696           tmp0  = x[i1];
3697           tmp1  = x[i2];
3698           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3699           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3700           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3701           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3702           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3703         }
3704 
3705         if (n == sz-1) {
3706           tmp0  = x[*idx];
3707           sum1 -= *v1*tmp0;
3708           sum2 -= *v2*tmp0;
3709           sum3 -= *v3*tmp0;
3710           sum4 -= *v4*tmp0;
3711           sum5 -= *v5*tmp0;
3712         }
3713         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3714         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3715         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3716         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3717         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3718         row     -= 5;
3719         break;
3720       default:
3721         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3722       }
3723     }
3724     PetscCall(PetscLogFlops(a->nz));
3725 
3726     /*
3727            t = b - D x    where D is the block diagonal
3728     */
3729     cnt = 0;
3730     for (i=0, row=0; i<m; i++) {
3731       switch (sizes[i]) {
3732       case 1:
3733         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3734         break;
3735       case 2:
3736         x1       = x[row]; x2 = x[row+1];
3737         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3738         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3739         t[row]   = b[row] - tmp1;
3740         t[row+1] = b[row+1] - tmp2; row += 2;
3741         cnt     += 4;
3742         break;
3743       case 3:
3744         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3745         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3746         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3747         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3748         t[row]   = b[row] - tmp1;
3749         t[row+1] = b[row+1] - tmp2;
3750         t[row+2] = b[row+2] - tmp3; row += 3;
3751         cnt     += 9;
3752         break;
3753       case 4:
3754         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3755         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3756         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3757         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3758         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3759         t[row]   = b[row] - tmp1;
3760         t[row+1] = b[row+1] - tmp2;
3761         t[row+2] = b[row+2] - tmp3;
3762         t[row+3] = b[row+3] - tmp4; row += 4;
3763         cnt     += 16;
3764         break;
3765       case 5:
3766         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3767         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3768         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3769         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3770         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3771         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3772         t[row]   = b[row] - tmp1;
3773         t[row+1] = b[row+1] - tmp2;
3774         t[row+2] = b[row+2] - tmp3;
3775         t[row+3] = b[row+3] - tmp4;
3776         t[row+4] = b[row+4] - tmp5;row += 5;
3777         cnt     += 25;
3778         break;
3779       default:
3780         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3781       }
3782     }
3783     PetscCall(PetscLogFlops(m));
3784 
3785     /*
3786           Apply (L + D)^-1 where D is the block diagonal
3787     */
3788     for (i=0, row=0; i<m; i++) {
3789       sz  = diag[row] - ii[row];
3790       v1  = a->a + ii[row];
3791       idx = a->j + ii[row];
3792       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3793       switch (sizes[i]) {
3794       case 1:
3795 
3796         sum1 = t[row];
3797         for (n = 0; n<sz-1; n+=2) {
3798           i1    = idx[0];
3799           i2    = idx[1];
3800           idx  += 2;
3801           tmp0  = t[i1];
3802           tmp1  = t[i2];
3803           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3804         }
3805 
3806         if (n == sz-1) {
3807           tmp0  = t[*idx];
3808           sum1 -= *v1 * tmp0;
3809         }
3810         x[row] += t[row] = sum1*(*ibdiag++); row++;
3811         break;
3812       case 2:
3813         v2   = a->a + ii[row+1];
3814         sum1 = t[row];
3815         sum2 = t[row+1];
3816         for (n = 0; n<sz-1; n+=2) {
3817           i1    = idx[0];
3818           i2    = idx[1];
3819           idx  += 2;
3820           tmp0  = t[i1];
3821           tmp1  = t[i2];
3822           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3823           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3824         }
3825 
3826         if (n == sz-1) {
3827           tmp0  = t[*idx];
3828           sum1 -= v1[0] * tmp0;
3829           sum2 -= v2[0] * tmp0;
3830         }
3831         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3832         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3833         ibdiag   += 4; row += 2;
3834         break;
3835       case 3:
3836         v2   = a->a + ii[row+1];
3837         v3   = a->a + ii[row+2];
3838         sum1 = t[row];
3839         sum2 = t[row+1];
3840         sum3 = t[row+2];
3841         for (n = 0; n<sz-1; n+=2) {
3842           i1    = idx[0];
3843           i2    = idx[1];
3844           idx  += 2;
3845           tmp0  = t[i1];
3846           tmp1  = t[i2];
3847           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3848           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3849           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3850         }
3851 
3852         if (n == sz-1) {
3853           tmp0  = t[*idx];
3854           sum1 -= v1[0] * tmp0;
3855           sum2 -= v2[0] * tmp0;
3856           sum3 -= v3[0] * tmp0;
3857         }
3858         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3859         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3860         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3861         ibdiag   += 9; row += 3;
3862         break;
3863       case 4:
3864         v2   = a->a + ii[row+1];
3865         v3   = a->a + ii[row+2];
3866         v4   = a->a + ii[row+3];
3867         sum1 = t[row];
3868         sum2 = t[row+1];
3869         sum3 = t[row+2];
3870         sum4 = t[row+3];
3871         for (n = 0; n<sz-1; n+=2) {
3872           i1    = idx[0];
3873           i2    = idx[1];
3874           idx  += 2;
3875           tmp0  = t[i1];
3876           tmp1  = t[i2];
3877           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3878           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3879           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3880           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3881         }
3882 
3883         if (n == sz-1) {
3884           tmp0  = t[*idx];
3885           sum1 -= v1[0] * tmp0;
3886           sum2 -= v2[0] * tmp0;
3887           sum3 -= v3[0] * tmp0;
3888           sum4 -= v4[0] * tmp0;
3889         }
3890         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3891         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3892         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3893         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3894         ibdiag   += 16; row += 4;
3895         break;
3896       case 5:
3897         v2   = a->a + ii[row+1];
3898         v3   = a->a + ii[row+2];
3899         v4   = a->a + ii[row+3];
3900         v5   = a->a + ii[row+4];
3901         sum1 = t[row];
3902         sum2 = t[row+1];
3903         sum3 = t[row+2];
3904         sum4 = t[row+3];
3905         sum5 = t[row+4];
3906         for (n = 0; n<sz-1; n+=2) {
3907           i1    = idx[0];
3908           i2    = idx[1];
3909           idx  += 2;
3910           tmp0  = t[i1];
3911           tmp1  = t[i2];
3912           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3913           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3914           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3915           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3916           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3917         }
3918 
3919         if (n == sz-1) {
3920           tmp0  = t[*idx];
3921           sum1 -= v1[0] * tmp0;
3922           sum2 -= v2[0] * tmp0;
3923           sum3 -= v3[0] * tmp0;
3924           sum4 -= v4[0] * tmp0;
3925           sum5 -= v5[0] * tmp0;
3926         }
3927         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3928         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3929         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3930         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3931         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3932         ibdiag   += 25; row += 5;
3933         break;
3934       default:
3935         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
3936       }
3937     }
3938     PetscCall(PetscLogFlops(a->nz));
3939   }
3940   PetscCall(VecRestoreArray(xx,&x));
3941   PetscCall(VecRestoreArrayRead(bb,&b));
3942   PetscFunctionReturn(0);
3943 }
3944 
3945 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3946 {
3947   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3948   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3949   const MatScalar   *bdiag = a->inode.bdiag;
3950   const PetscScalar *b;
3951   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3952   const PetscInt    *sizes = a->inode.size;
3953 
3954   PetscFunctionBegin;
3955   PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
3956   PetscCall(VecGetArray(xx,&x));
3957   PetscCall(VecGetArrayRead(bb,&b));
3958   cnt  = 0;
3959   for (i=0, row=0; i<m; i++) {
3960     switch (sizes[i]) {
3961     case 1:
3962       x[row] = b[row]*bdiag[cnt++];row++;
3963       break;
3964     case 2:
3965       x1       = b[row]; x2 = b[row+1];
3966       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3967       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3968       x[row++] = tmp1;
3969       x[row++] = tmp2;
3970       cnt     += 4;
3971       break;
3972     case 3:
3973       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3974       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3975       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3976       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3977       x[row++] = tmp1;
3978       x[row++] = tmp2;
3979       x[row++] = tmp3;
3980       cnt     += 9;
3981       break;
3982     case 4:
3983       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3984       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3985       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3986       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3987       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3988       x[row++] = tmp1;
3989       x[row++] = tmp2;
3990       x[row++] = tmp3;
3991       x[row++] = tmp4;
3992       cnt     += 16;
3993       break;
3994     case 5:
3995       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3996       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3997       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3998       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3999       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4000       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4001       x[row++] = tmp1;
4002       x[row++] = tmp2;
4003       x[row++] = tmp3;
4004       x[row++] = tmp4;
4005       x[row++] = tmp5;
4006       cnt     += 25;
4007       break;
4008     default:
4009       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]);
4010     }
4011   }
4012   PetscCall(PetscLogFlops(2.0*cnt));
4013   PetscCall(VecRestoreArray(xx,&x));
4014   PetscCall(VecRestoreArrayRead(bb,&b));
4015   PetscFunctionReturn(0);
4016 }
4017 
4018 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4019 {
4020   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4021 
4022   PetscFunctionBegin;
4023   a->inode.node_count       = 0;
4024   a->inode.use              = PETSC_FALSE;
4025   a->inode.checked          = PETSC_FALSE;
4026   a->inode.mat_nonzerostate = -1;
4027   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4028   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4029   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4030   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4031   A->ops->coloringpatch     = NULL;
4032   A->ops->multdiagonalblock = NULL;
4033   if (A->factortype) {
4034     A->ops->solve = MatSolve_SeqAIJ_inplace;
4035   }
4036   PetscFunctionReturn(0);
4037 }
4038 
4039 /*
4040     samestructure indicates that the matrix has not changed its nonzero structure so we
4041     do not need to recompute the inodes
4042 */
4043 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4044 {
4045   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4046   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4047   PetscBool      flag;
4048   const PetscInt *idx,*idy,*ii;
4049 
4050   PetscFunctionBegin;
4051   if (!a->inode.use) {
4052     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4053     PetscCall(PetscFree(a->inode.size));
4054     PetscFunctionReturn(0);
4055   }
4056   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4057 
4058   m = A->rmap->n;
4059   if (!a->inode.size) PetscCall(PetscMalloc1(m+1,&a->inode.size));
4060   ns = a->inode.size;
4061 
4062   i          = 0;
4063   node_count = 0;
4064   idx        = a->j;
4065   ii         = a->i;
4066   while (i < m) {                /* For each row */
4067     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4068     /* Limits the number of elements in a node to 'a->inode.limit' */
4069     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4070       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4071       if (nzy != nzx) break;
4072       idy += nzx;              /* Same nonzero pattern */
4073       PetscCall(PetscArraycmp(idx,idy,nzx,&flag));
4074       if (!flag) break;
4075     }
4076     ns[node_count++] = blk_size;
4077     idx             += blk_size*nzx;
4078     i                = j;
4079   }
4080 
4081   /* If not enough inodes found,, do not use inode version of the routines */
4082   if (!m || node_count > .8*m) {
4083     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4084     PetscCall(PetscFree(a->inode.size));
4085     PetscCall(PetscInfo(A,"Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n",node_count,m));
4086   } else {
4087     if (!A->factortype) {
4088       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4089       if (A->rmap->n == A->cmap->n) {
4090         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4091         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4092         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4093         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4094         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4095       }
4096     } else {
4097       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4098     }
4099     a->inode.node_count = node_count;
4100     PetscCall(PetscInfo(A,"Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n",node_count,m,a->inode.limit));
4101   }
4102   a->inode.checked          = PETSC_TRUE;
4103   a->inode.mat_nonzerostate = A->nonzerostate;
4104   PetscFunctionReturn(0);
4105 }
4106 
4107 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4108 {
4109   Mat            B =*C;
4110   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4111   PetscInt       m=A->rmap->n;
4112 
4113   PetscFunctionBegin;
4114   c->inode.use              = a->inode.use;
4115   c->inode.limit            = a->inode.limit;
4116   c->inode.max_limit        = a->inode.max_limit;
4117   c->inode.checked          = PETSC_FALSE;
4118   c->inode.size             = NULL;
4119   c->inode.node_count       = 0;
4120   c->inode.ibdiagvalid      = PETSC_FALSE;
4121   c->inode.ibdiag           = NULL;
4122   c->inode.bdiag            = NULL;
4123   c->inode.mat_nonzerostate = -1;
4124   if (a->inode.use) {
4125     if (a->inode.checked && a->inode.size) {
4126       PetscCall(PetscMalloc1(m+1,&c->inode.size));
4127       PetscCall(PetscArraycpy(c->inode.size,a->inode.size,m+1));
4128 
4129       c->inode.checked          = PETSC_TRUE;
4130       c->inode.node_count       = a->inode.node_count;
4131       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4132     }
4133     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4134     if (!B->factortype) {
4135       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4136       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4137       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4138       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4139       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4140       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4141     } else {
4142       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4143     }
4144   }
4145   PetscFunctionReturn(0);
4146 }
4147 
4148 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4149 {
4150   PetscInt       k;
4151   const PetscInt *vi;
4152 
4153   PetscFunctionBegin;
4154   vi = aj + ai[row];
4155   for (k=0; k<nzl; k++) cols[k] = vi[k];
4156   vi        = aj + adiag[row];
4157   cols[nzl] = vi[0];
4158   vi        = aj + adiag[row+1]+1;
4159   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4160   PetscFunctionReturn(0);
4161 }
4162 /*
4163    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4164    Modified from MatSeqAIJCheckInode().
4165 
4166    Input Parameters:
4167 .  Mat A - ILU or LU matrix factor
4168 
4169 */
4170 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4171 {
4172   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4173   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4174   PetscInt       *cols1,*cols2,*ns;
4175   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4176   PetscBool      flag;
4177 
4178   PetscFunctionBegin;
4179   if (!a->inode.use)    PetscFunctionReturn(0);
4180   if (a->inode.checked) PetscFunctionReturn(0);
4181 
4182   m = A->rmap->n;
4183   if (a->inode.size) ns = a->inode.size;
4184   else {
4185     PetscCall(PetscMalloc1(m+1,&ns));
4186   }
4187 
4188   i          = 0;
4189   node_count = 0;
4190   PetscCall(PetscMalloc2(m,&cols1,m,&cols2));
4191   while (i < m) {                /* For each row */
4192     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4193     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4194     nzx  = nzl1 + nzu1 + 1;
4195     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4196 
4197     /* Limits the number of elements in a node to 'a->inode.limit' */
4198     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4199       nzl2 = ai[j+1] - ai[j];
4200       nzu2 = adiag[j] - adiag[j+1] - 1;
4201       nzy  = nzl2 + nzu2 + 1;
4202       if (nzy != nzx) break;
4203       PetscCall(MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j));
4204       PetscCall(PetscArraycmp(cols1,cols2,nzx,&flag));
4205       if (!flag) break;
4206     }
4207     ns[node_count++] = blk_size;
4208     i                = j;
4209   }
4210   PetscCall(PetscFree2(cols1,cols2));
4211   /* If not enough inodes found,, do not use inode version of the routines */
4212   if (!m || node_count > .8*m) {
4213     PetscCall(PetscFree(ns));
4214 
4215     a->inode.node_count = 0;
4216     a->inode.size       = NULL;
4217     a->inode.use        = PETSC_FALSE;
4218 
4219     PetscCall(PetscInfo(A,"Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n",node_count,m));
4220   } else {
4221     A->ops->mult              = NULL;
4222     A->ops->sor               = NULL;
4223     A->ops->multadd           = NULL;
4224     A->ops->getrowij          = NULL;
4225     A->ops->restorerowij      = NULL;
4226     A->ops->getcolumnij       = NULL;
4227     A->ops->restorecolumnij   = NULL;
4228     A->ops->coloringpatch     = NULL;
4229     A->ops->multdiagonalblock = NULL;
4230     a->inode.node_count       = node_count;
4231     a->inode.size             = ns;
4232 
4233     PetscCall(PetscInfo(A,"Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n",node_count,m,a->inode.limit));
4234   }
4235   a->inode.checked = PETSC_TRUE;
4236   PetscFunctionReturn(0);
4237 }
4238 
4239 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4240 {
4241   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4242 
4243   PetscFunctionBegin;
4244   a->inode.ibdiagvalid = PETSC_FALSE;
4245   PetscFunctionReturn(0);
4246 }
4247 
4248 /*
4249      This is really ugly. if inodes are used this replaces the
4250   permutations with ones that correspond to rows/cols of the matrix
4251   rather then inode blocks
4252 */
4253 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4254 {
4255   PetscFunctionBegin;
4256   PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));
4257   PetscFunctionReturn(0);
4258 }
4259 
4260 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4261 {
4262   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4263   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4264   const PetscInt *ridx,*cidx;
4265   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4266   PetscInt       nslim_col,*ns_col;
4267   IS             ris = *rperm,cis = *cperm;
4268 
4269   PetscFunctionBegin;
4270   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4271   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4272 
4273   PetscCall(MatCreateColInode_Private(A,&nslim_col,&ns_col));
4274   PetscCall(PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns));
4275   PetscCall(PetscMalloc2(m,&permr,n,&permc));
4276 
4277   PetscCall(ISGetIndices(ris,&ridx));
4278   PetscCall(ISGetIndices(cis,&cidx));
4279 
4280   /* Form the inode structure for the rows of permuted matric using inv perm*/
4281   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4282 
4283   /* Construct the permutations for rows*/
4284   for (i=0,row = 0; i<nslim_row; ++i) {
4285     indx      = ridx[i];
4286     start_val = tns[indx];
4287     end_val   = tns[indx + 1];
4288     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4289   }
4290 
4291   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4292   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4293 
4294   /* Construct permutations for columns */
4295   for (i=0,col=0; i<nslim_col; ++i) {
4296     indx      = cidx[i];
4297     start_val = tns[indx];
4298     end_val   = tns[indx + 1];
4299     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4300   }
4301 
4302   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm));
4303   PetscCall(ISSetPermutation(*rperm));
4304   PetscCall(ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm));
4305   PetscCall(ISSetPermutation(*cperm));
4306 
4307   PetscCall(ISRestoreIndices(ris,&ridx));
4308   PetscCall(ISRestoreIndices(cis,&cidx));
4309 
4310   PetscCall(PetscFree(ns_col));
4311   PetscCall(PetscFree2(permr,permc));
4312   PetscCall(ISDestroy(&cis));
4313   PetscCall(ISDestroy(&ris));
4314   PetscCall(PetscFree(tns));
4315   PetscFunctionReturn(0);
4316 }
4317 
4318 /*@C
4319    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4320 
4321    Not Collective
4322 
4323    Input Parameter:
4324 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4325 
4326    Output Parameters:
4327 +  node_count - no of inodes present in the matrix.
4328 .  sizes      - an array of size node_count,with sizes of each inode.
4329 -  limit      - the max size used to generate the inodes.
4330 
4331    Level: advanced
4332 
4333    Notes:
4334     This routine returns some internal storage information
4335    of the matrix, it is intended to be used by advanced users.
4336    It should be called after the matrix is assembled.
4337    The contents of the sizes[] array should not be changed.
4338    NULL may be passed for information not requested.
4339 
4340 .seealso: `MatGetInfo()`
4341 @*/
4342 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4343 {
4344   PetscErrorCode (*f)(Mat,PetscInt*,PetscInt**,PetscInt*);
4345 
4346   PetscFunctionBegin;
4347   PetscCheck(A->assembled,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4348   PetscCall(PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f));
4349   if (f) PetscCall((*f)(A,node_count,sizes,limit));
4350   PetscFunctionReturn(0);
4351 }
4352 
4353 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4354 {
4355   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4356 
4357   PetscFunctionBegin;
4358   if (node_count) *node_count = a->inode.node_count;
4359   if (sizes)      *sizes      = a->inode.size;
4360   if (limit)      *limit      = a->inode.limit;
4361   PetscFunctionReturn(0);
4362 }
4363