xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 5b6bfdb9644f185dbf5e5a09b808ec241507e1e7)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9 {
10   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
11   PetscErrorCode ierr;
12   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
13 
14   PetscFunctionBegin;
15   n      = A->cmap->n;
16   m      = A->rmap->n;
17   ns_row = a->inode.size;
18 
19   min_mn = (m < n) ? m : n;
20   if (!ns) {
21     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
22     for (; count+1 < n; count++,i++) ;
23     if (count < n)  {
24       i++;
25     }
26     *size = i;
27     PetscFunctionReturn(0);
28   }
29   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
30 
31   /* Use the same row structure wherever feasible. */
32   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
33     ns_col[i] = ns_row[i];
34   }
35 
36   /* if m < n; pad up the remainder with inode_limit */
37   for (; count+1 < n; count++,i++) {
38     ns_col[i] = 1;
39   }
40   /* The last node is the odd ball. padd it up with the remaining rows; */
41   if (count < n) {
42     ns_col[i] = n - count;
43     i++;
44   } else if (count > n) {
45     /* Adjust for the over estimation */
46     ns_col[i-1] += n - count;
47   }
48   *size = i;
49   *ns   = ns_col;
50   PetscFunctionReturn(0);
51 }
52 
53 
54 /*
55       This builds symmetric version of nonzero structure,
56 */
57 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
58 {
59   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
60   PetscErrorCode ierr;
61   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
62   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
63   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
64 
65   PetscFunctionBegin;
66   nslim_row = a->inode.node_count;
67   m         = A->rmap->n;
68   n         = A->cmap->n;
69   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
70 
71   /* Use the row_inode as column_inode */
72   nslim_col = nslim_row;
73   ns_col    = ns_row;
74 
75   /* allocate space for reformated inode structure */
76   ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr);
77   ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr);
78   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
79 
80   for (i1=0,col=0; i1<nslim_col; ++i1) {
81     nsz = ns_col[i1];
82     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
83   }
84   /* allocate space for row pointers */
85   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
86   *iia = ia;
87   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
88   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
89 
90   /* determine the number of columns in each row */
91   ia[0] = oshift;
92   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
93 
94     j    = aj + ai[row] + ishift;
95     jmax = aj + ai[row+1] + ishift;
96     if (j==jmax) continue; /* empty row */
97     col  = *j++ + ishift;
98     i2   = tvc[col];
99     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
100       ia[i1+1]++;
101       ia[i2+1]++;
102       i2++;                     /* Start col of next node */
103       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
104       i2 = tvc[col];
105     }
106     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
107   }
108 
109   /* shift ia[i] to point to next row */
110   for (i1=1; i1<nslim_row+1; i1++) {
111     row        = ia[i1-1];
112     ia[i1]    += row;
113     work[i1-1] = row - oshift;
114   }
115 
116   /* allocate space for column pointers */
117   nz   = ia[nslim_row] + (!ishift);
118   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
119   *jja = ja;
120 
121   /* loop over lower triangular part putting into ja */
122   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
123     j    = aj + ai[row] + ishift;
124     jmax = aj + ai[row+1] + ishift;
125     if (j==jmax) continue; /* empty row */
126     col  = *j++ + ishift;
127     i2   = tvc[col];
128     while (i2<i1 && j<jmax) {
129       ja[work[i2]++] = i1 + oshift;
130       ja[work[i1]++] = i2 + oshift;
131       ++i2;
132       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
133       i2 = tvc[col];
134     }
135     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
136 
137   }
138   ierr = PetscFree(work);CHKERRQ(ierr);
139   ierr = PetscFree(tns);CHKERRQ(ierr);
140   ierr = PetscFree(tvc);CHKERRQ(ierr);
141   PetscFunctionReturn(0);
142 }
143 
144 /*
145       This builds nonsymmetric version of nonzero structure,
146 */
147 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
148 {
149   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
150   PetscErrorCode ierr;
151   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
152   PetscInt       *tns,*tvc,nsz,i1,i2;
153   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
154 
155   PetscFunctionBegin;
156   nslim_row = a->inode.node_count;
157   n         = A->cmap->n;
158 
159   /* Create The column_inode for this matrix */
160   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
161 
162   /* allocate space for reformated column_inode structure */
163   ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr);
164   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
165   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
166 
167   for (i1=0,col=0; i1<nslim_col; ++i1) {
168     nsz = ns_col[i1];
169     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
170   }
171   /* allocate space for row pointers */
172   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
173   *iia = ia;
174   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
175   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
176 
177   /* determine the number of columns in each row */
178   ia[0] = oshift;
179   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
180     j   = aj + ai[row] + ishift;
181     nz  = ai[row+1] - ai[row];
182     if (!nz) continue; /* empty row */
183     col = *j++ + ishift;
184     i2  = tvc[col];
185     while (nz-- > 0) {           /* off-diagonal elemets */
186       ia[i1+1]++;
187       i2++;                     /* Start col of next node */
188       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
189       if (nz > 0) i2 = tvc[col];
190     }
191   }
192 
193   /* shift ia[i] to point to next row */
194   for (i1=1; i1<nslim_row+1; i1++) {
195     row        = ia[i1-1];
196     ia[i1]    += row;
197     work[i1-1] = row - oshift;
198   }
199 
200   /* allocate space for column pointers */
201   nz   = ia[nslim_row] + (!ishift);
202   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
203   *jja = ja;
204 
205   /* loop over matrix putting into ja */
206   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
207     j   = aj + ai[row] + ishift;
208     nz  = ai[row+1] - ai[row];
209     if (!nz) continue; /* empty row */
210     col = *j++ + ishift;
211     i2  = tvc[col];
212     while (nz-- > 0) {
213       ja[work[i1]++] = i2 + oshift;
214       ++i2;
215       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
216       if (nz > 0) i2 = tvc[col];
217     }
218   }
219   ierr = PetscFree(ns_col);CHKERRQ(ierr);
220   ierr = PetscFree(work);CHKERRQ(ierr);
221   ierr = PetscFree(tns);CHKERRQ(ierr);
222   ierr = PetscFree(tvc);CHKERRQ(ierr);
223   PetscFunctionReturn(0);
224 }
225 
226 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
227 {
228   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
229   PetscErrorCode ierr;
230 
231   PetscFunctionBegin;
232   *n = a->inode.node_count;
233   if (!ia) PetscFunctionReturn(0);
234   if (!blockcompressed) {
235     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
236   } else if (symmetric) {
237     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
238   } else {
239     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
240   }
241   PetscFunctionReturn(0);
242 }
243 
244 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
245 {
246   PetscErrorCode ierr;
247 
248   PetscFunctionBegin;
249   if (!ia) PetscFunctionReturn(0);
250 
251   if (!blockcompressed) {
252     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
253   } else {
254     ierr = PetscFree(*ia);CHKERRQ(ierr);
255     ierr = PetscFree(*ja);CHKERRQ(ierr);
256   }
257   PetscFunctionReturn(0);
258 }
259 
260 /* ----------------------------------------------------------- */
261 
262 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
263 {
264   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
265   PetscErrorCode ierr;
266   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
267   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
268 
269   PetscFunctionBegin;
270   nslim_row = a->inode.node_count;
271   n         = A->cmap->n;
272 
273   /* Create The column_inode for this matrix */
274   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
275 
276   /* allocate space for reformated column_inode structure */
277   ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr);
278   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
279   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
280 
281   for (i1=0,col=0; i1<nslim_col; ++i1) {
282     nsz = ns_col[i1];
283     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
284   }
285   /* allocate space for column pointers */
286   ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
287   *iia = ia;
288   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
289   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
290 
291   /* determine the number of columns in each row */
292   ia[0] = oshift;
293   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
294     j   = aj + ai[row] + ishift;
295     col = *j++ + ishift;
296     i2  = tvc[col];
297     nz  = ai[row+1] - ai[row];
298     while (nz-- > 0) {           /* off-diagonal elemets */
299       /* ia[i1+1]++; */
300       ia[i2+1]++;
301       i2++;
302       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
303       if (nz > 0) i2 = tvc[col];
304     }
305   }
306 
307   /* shift ia[i] to point to next col */
308   for (i1=1; i1<nslim_col+1; i1++) {
309     col        = ia[i1-1];
310     ia[i1]    += col;
311     work[i1-1] = col - oshift;
312   }
313 
314   /* allocate space for column pointers */
315   nz   = ia[nslim_col] + (!ishift);
316   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
317   *jja = ja;
318 
319   /* loop over matrix putting into ja */
320   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
321     j   = aj + ai[row] + ishift;
322     col = *j++ + ishift;
323     i2  = tvc[col];
324     nz  = ai[row+1] - ai[row];
325     while (nz-- > 0) {
326       /* ja[work[i1]++] = i2 + oshift; */
327       ja[work[i2]++] = i1 + oshift;
328       i2++;
329       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
330       if (nz > 0) i2 = tvc[col];
331     }
332   }
333   ierr = PetscFree(ns_col);CHKERRQ(ierr);
334   ierr = PetscFree(work);CHKERRQ(ierr);
335   ierr = PetscFree(tns);CHKERRQ(ierr);
336   ierr = PetscFree(tvc);CHKERRQ(ierr);
337   PetscFunctionReturn(0);
338 }
339 
340 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
341 {
342   PetscErrorCode ierr;
343 
344   PetscFunctionBegin;
345   ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr);
346   if (!ia) PetscFunctionReturn(0);
347 
348   if (!blockcompressed) {
349     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
350   } else if (symmetric) {
351     /* Since the indices are symmetric it does'nt matter */
352     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
353   } else {
354     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
355   }
356   PetscFunctionReturn(0);
357 }
358 
359 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
360 {
361   PetscErrorCode ierr;
362 
363   PetscFunctionBegin;
364   if (!ia) PetscFunctionReturn(0);
365   if (!blockcompressed) {
366     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else {
368     ierr = PetscFree(*ia);CHKERRQ(ierr);
369     ierr = PetscFree(*ja);CHKERRQ(ierr);
370   }
371   PetscFunctionReturn(0);
372 }
373 
374 /* ----------------------------------------------------------- */
375 
376 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
377 {
378   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
379   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
380   PetscScalar       *y;
381   const PetscScalar *x;
382   const MatScalar   *v1,*v2,*v3,*v4,*v5;
383   PetscErrorCode    ierr;
384   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
385   const PetscInt    *idx,*ns,*ii;
386 
387 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
388 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
389 #endif
390 
391   PetscFunctionBegin;
392   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
393   node_max = a->inode.node_count;
394   ns       = a->inode.size;     /* Node Size array */
395   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
396   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
397   idx      = a->j;
398   v1       = a->a;
399   ii       = a->i;
400 
401   for (i = 0,row = 0; i< node_max; ++i) {
402     nsz         = ns[i];
403     n           = ii[1] - ii[0];
404     nonzerorow += (n>0)*nsz;
405     ii         += nsz;
406     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
407     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
408     sz = n;                     /* No of non zeros in this row */
409                                 /* Switch on the size of Node */
410     switch (nsz) {               /* Each loop in 'case' is unrolled */
411     case 1:
412       sum1 = 0.;
413 
414       for (n = 0; n< sz-1; n+=2) {
415         i1    = idx[0];         /* The instructions are ordered to */
416         i2    = idx[1];         /* make the compiler's job easy */
417         idx  += 2;
418         tmp0  = x[i1];
419         tmp1  = x[i2];
420         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
421       }
422 
423       if (n == sz-1) {          /* Take care of the last nonzero  */
424         tmp0  = x[*idx++];
425         sum1 += *v1++ *tmp0;
426       }
427       y[row++]=sum1;
428       break;
429     case 2:
430       sum1 = 0.;
431       sum2 = 0.;
432       v2   = v1 + n;
433 
434       for (n = 0; n< sz-1; n+=2) {
435         i1    = idx[0];
436         i2    = idx[1];
437         idx  += 2;
438         tmp0  = x[i1];
439         tmp1  = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
442       }
443       if (n == sz-1) {
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446         sum2 += *v2++ * tmp0;
447       }
448       y[row++]=sum1;
449       y[row++]=sum2;
450       v1      =v2;              /* Since the next block to be processed starts there*/
451       idx    +=sz;
452       break;
453     case 3:
454       sum1 = 0.;
455       sum2 = 0.;
456       sum3 = 0.;
457       v2   = v1 + n;
458       v3   = v2 + n;
459 
460       for (n = 0; n< sz-1; n+=2) {
461         i1    = idx[0];
462         i2    = idx[1];
463         idx  += 2;
464         tmp0  = x[i1];
465         tmp1  = x[i2];
466         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
467         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
468         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
469       }
470       if (n == sz-1) {
471         tmp0  = x[*idx++];
472         sum1 += *v1++ * tmp0;
473         sum2 += *v2++ * tmp0;
474         sum3 += *v3++ * tmp0;
475       }
476       y[row++]=sum1;
477       y[row++]=sum2;
478       y[row++]=sum3;
479       v1      =v3;              /* Since the next block to be processed starts there*/
480       idx    +=2*sz;
481       break;
482     case 4:
483       sum1 = 0.;
484       sum2 = 0.;
485       sum3 = 0.;
486       sum4 = 0.;
487       v2   = v1 + n;
488       v3   = v2 + n;
489       v4   = v3 + n;
490 
491       for (n = 0; n< sz-1; n+=2) {
492         i1    = idx[0];
493         i2    = idx[1];
494         idx  += 2;
495         tmp0  = x[i1];
496         tmp1  = x[i2];
497         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
498         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
499         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
500         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
501       }
502       if (n == sz-1) {
503         tmp0  = x[*idx++];
504         sum1 += *v1++ * tmp0;
505         sum2 += *v2++ * tmp0;
506         sum3 += *v3++ * tmp0;
507         sum4 += *v4++ * tmp0;
508       }
509       y[row++]=sum1;
510       y[row++]=sum2;
511       y[row++]=sum3;
512       y[row++]=sum4;
513       v1      =v4;              /* Since the next block to be processed starts there*/
514       idx    +=3*sz;
515       break;
516     case 5:
517       sum1 = 0.;
518       sum2 = 0.;
519       sum3 = 0.;
520       sum4 = 0.;
521       sum5 = 0.;
522       v2   = v1 + n;
523       v3   = v2 + n;
524       v4   = v3 + n;
525       v5   = v4 + n;
526 
527       for (n = 0; n<sz-1; n+=2) {
528         i1    = idx[0];
529         i2    = idx[1];
530         idx  += 2;
531         tmp0  = x[i1];
532         tmp1  = x[i2];
533         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
534         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
535         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
536         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
537         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
538       }
539       if (n == sz-1) {
540         tmp0  = x[*idx++];
541         sum1 += *v1++ * tmp0;
542         sum2 += *v2++ * tmp0;
543         sum3 += *v3++ * tmp0;
544         sum4 += *v4++ * tmp0;
545         sum5 += *v5++ * tmp0;
546       }
547       y[row++]=sum1;
548       y[row++]=sum2;
549       y[row++]=sum3;
550       y[row++]=sum4;
551       y[row++]=sum5;
552       v1      =v5;       /* Since the next block to be processed starts there */
553       idx    +=4*sz;
554       break;
555     default:
556       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
557     }
558   }
559   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
560   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
561   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
562   PetscFunctionReturn(0);
563 }
564 /* ----------------------------------------------------------- */
565 /* Almost same code as the MatMult_SeqAIJ_Inode() */
566 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
567 {
568   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
569   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
570   const MatScalar   *v1,*v2,*v3,*v4,*v5;
571   const PetscScalar *x;
572   PetscScalar       *y,*z,*zt;
573   PetscErrorCode    ierr;
574   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
575   const PetscInt    *idx,*ns,*ii;
576 
577   PetscFunctionBegin;
578   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
579   node_max = a->inode.node_count;
580   ns       = a->inode.size;     /* Node Size array */
581 
582   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
583   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
584   zt = z;
585 
586   idx = a->j;
587   v1  = a->a;
588   ii  = a->i;
589 
590   for (i = 0,row = 0; i< node_max; ++i) {
591     nsz = ns[i];
592     n   = ii[1] - ii[0];
593     ii += nsz;
594     sz  = n;                    /* No of non zeros in this row */
595                                 /* Switch on the size of Node */
596     switch (nsz) {               /* Each loop in 'case' is unrolled */
597     case 1:
598       sum1 = *zt++;
599 
600       for (n = 0; n< sz-1; n+=2) {
601         i1    = idx[0];         /* The instructions are ordered to */
602         i2    = idx[1];         /* make the compiler's job easy */
603         idx  += 2;
604         tmp0  = x[i1];
605         tmp1  = x[i2];
606         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
607       }
608 
609       if (n   == sz-1) {          /* Take care of the last nonzero  */
610         tmp0  = x[*idx++];
611         sum1 += *v1++ * tmp0;
612       }
613       y[row++]=sum1;
614       break;
615     case 2:
616       sum1 = *zt++;
617       sum2 = *zt++;
618       v2   = v1 + n;
619 
620       for (n = 0; n< sz-1; n+=2) {
621         i1    = idx[0];
622         i2    = idx[1];
623         idx  += 2;
624         tmp0  = x[i1];
625         tmp1  = x[i2];
626         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
628       }
629       if (n   == sz-1) {
630         tmp0  = x[*idx++];
631         sum1 += *v1++ * tmp0;
632         sum2 += *v2++ * tmp0;
633       }
634       y[row++]=sum1;
635       y[row++]=sum2;
636       v1      =v2;              /* Since the next block to be processed starts there*/
637       idx    +=sz;
638       break;
639     case 3:
640       sum1 = *zt++;
641       sum2 = *zt++;
642       sum3 = *zt++;
643       v2   = v1 + n;
644       v3   = v2 + n;
645 
646       for (n = 0; n< sz-1; n+=2) {
647         i1    = idx[0];
648         i2    = idx[1];
649         idx  += 2;
650         tmp0  = x[i1];
651         tmp1  = x[i2];
652         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
653         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
654         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
655       }
656       if (n == sz-1) {
657         tmp0  = x[*idx++];
658         sum1 += *v1++ * tmp0;
659         sum2 += *v2++ * tmp0;
660         sum3 += *v3++ * tmp0;
661       }
662       y[row++]=sum1;
663       y[row++]=sum2;
664       y[row++]=sum3;
665       v1      =v3;              /* Since the next block to be processed starts there*/
666       idx    +=2*sz;
667       break;
668     case 4:
669       sum1 = *zt++;
670       sum2 = *zt++;
671       sum3 = *zt++;
672       sum4 = *zt++;
673       v2   = v1 + n;
674       v3   = v2 + n;
675       v4   = v3 + n;
676 
677       for (n = 0; n< sz-1; n+=2) {
678         i1    = idx[0];
679         i2    = idx[1];
680         idx  += 2;
681         tmp0  = x[i1];
682         tmp1  = x[i2];
683         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
684         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
685         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
686         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
687       }
688       if (n == sz-1) {
689         tmp0  = x[*idx++];
690         sum1 += *v1++ * tmp0;
691         sum2 += *v2++ * tmp0;
692         sum3 += *v3++ * tmp0;
693         sum4 += *v4++ * tmp0;
694       }
695       y[row++]=sum1;
696       y[row++]=sum2;
697       y[row++]=sum3;
698       y[row++]=sum4;
699       v1      =v4;              /* Since the next block to be processed starts there*/
700       idx    +=3*sz;
701       break;
702     case 5:
703       sum1 = *zt++;
704       sum2 = *zt++;
705       sum3 = *zt++;
706       sum4 = *zt++;
707       sum5 = *zt++;
708       v2   = v1 + n;
709       v3   = v2 + n;
710       v4   = v3 + n;
711       v5   = v4 + n;
712 
713       for (n = 0; n<sz-1; n+=2) {
714         i1    = idx[0];
715         i2    = idx[1];
716         idx  += 2;
717         tmp0  = x[i1];
718         tmp1  = x[i2];
719         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
720         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
721         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
722         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
723         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
724       }
725       if (n   == sz-1) {
726         tmp0  = x[*idx++];
727         sum1 += *v1++ * tmp0;
728         sum2 += *v2++ * tmp0;
729         sum3 += *v3++ * tmp0;
730         sum4 += *v4++ * tmp0;
731         sum5 += *v5++ * tmp0;
732       }
733       y[row++]=sum1;
734       y[row++]=sum2;
735       y[row++]=sum3;
736       y[row++]=sum4;
737       y[row++]=sum5;
738       v1      =v5;       /* Since the next block to be processed starts there */
739       idx    +=4*sz;
740       break;
741     default:
742       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
743     }
744   }
745   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
746   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
747   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
748   PetscFunctionReturn(0);
749 }
750 
751 /* ----------------------------------------------------------- */
752 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
753 {
754   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
755   IS                iscol = a->col,isrow = a->row;
756   PetscErrorCode    ierr;
757   const PetscInt    *r,*c,*rout,*cout;
758   PetscInt          i,j,n = A->rmap->n,nz;
759   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
760   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
761   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
762   PetscScalar       sum1,sum2,sum3,sum4,sum5;
763   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
764   const PetscScalar *b;
765 
766   PetscFunctionBegin;
767   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
768   node_max = a->inode.node_count;
769   ns       = a->inode.size;     /* Node Size array */
770 
771   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
772   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
773   tmp  = a->solve_work;
774 
775   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
776   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
777 
778   /* forward solve the lower triangular */
779   tmps = tmp;
780   aa   = a_a;
781   aj   = a_j;
782   ad   = a->diag;
783 
784   for (i = 0,row = 0; i< node_max; ++i) {
785     nsz = ns[i];
786     aii = ai[row];
787     v1  = aa + aii;
788     vi  = aj + aii;
789     nz  = ad[row]- aii;
790     if (i < node_max-1) {
791       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
792       * but our indexing to determine it's size could. */
793       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
794       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
795       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
796       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
797     }
798 
799     switch (nsz) {               /* Each loop in 'case' is unrolled */
800     case 1:
801       sum1 = b[*r++];
802       for (j=0; j<nz-1; j+=2) {
803         i0    = vi[0];
804         i1    = vi[1];
805         vi   +=2;
806         tmp0  = tmps[i0];
807         tmp1  = tmps[i1];
808         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
809       }
810       if (j == nz-1) {
811         tmp0  = tmps[*vi++];
812         sum1 -= *v1++ *tmp0;
813       }
814       tmp[row++]=sum1;
815       break;
816     case 2:
817       sum1 = b[*r++];
818       sum2 = b[*r++];
819       v2   = aa + ai[row+1];
820 
821       for (j=0; j<nz-1; j+=2) {
822         i0    = vi[0];
823         i1    = vi[1];
824         vi   +=2;
825         tmp0  = tmps[i0];
826         tmp1  = tmps[i1];
827         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
829       }
830       if (j == nz-1) {
831         tmp0  = tmps[*vi++];
832         sum1 -= *v1++ *tmp0;
833         sum2 -= *v2++ *tmp0;
834       }
835       sum2     -= *v2++ *sum1;
836       tmp[row++]=sum1;
837       tmp[row++]=sum2;
838       break;
839     case 3:
840       sum1 = b[*r++];
841       sum2 = b[*r++];
842       sum3 = b[*r++];
843       v2   = aa + ai[row+1];
844       v3   = aa + ai[row+2];
845 
846       for (j=0; j<nz-1; j+=2) {
847         i0    = vi[0];
848         i1    = vi[1];
849         vi   +=2;
850         tmp0  = tmps[i0];
851         tmp1  = tmps[i1];
852         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
853         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
854         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
855       }
856       if (j == nz-1) {
857         tmp0  = tmps[*vi++];
858         sum1 -= *v1++ *tmp0;
859         sum2 -= *v2++ *tmp0;
860         sum3 -= *v3++ *tmp0;
861       }
862       sum2 -= *v2++ * sum1;
863       sum3 -= *v3++ * sum1;
864       sum3 -= *v3++ * sum2;
865 
866       tmp[row++]=sum1;
867       tmp[row++]=sum2;
868       tmp[row++]=sum3;
869       break;
870 
871     case 4:
872       sum1 = b[*r++];
873       sum2 = b[*r++];
874       sum3 = b[*r++];
875       sum4 = b[*r++];
876       v2   = aa + ai[row+1];
877       v3   = aa + ai[row+2];
878       v4   = aa + ai[row+3];
879 
880       for (j=0; j<nz-1; j+=2) {
881         i0    = vi[0];
882         i1    = vi[1];
883         vi   +=2;
884         tmp0  = tmps[i0];
885         tmp1  = tmps[i1];
886         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
887         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
888         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
889         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
890       }
891       if (j == nz-1) {
892         tmp0  = tmps[*vi++];
893         sum1 -= *v1++ *tmp0;
894         sum2 -= *v2++ *tmp0;
895         sum3 -= *v3++ *tmp0;
896         sum4 -= *v4++ *tmp0;
897       }
898       sum2 -= *v2++ * sum1;
899       sum3 -= *v3++ * sum1;
900       sum4 -= *v4++ * sum1;
901       sum3 -= *v3++ * sum2;
902       sum4 -= *v4++ * sum2;
903       sum4 -= *v4++ * sum3;
904 
905       tmp[row++]=sum1;
906       tmp[row++]=sum2;
907       tmp[row++]=sum3;
908       tmp[row++]=sum4;
909       break;
910     case 5:
911       sum1 = b[*r++];
912       sum2 = b[*r++];
913       sum3 = b[*r++];
914       sum4 = b[*r++];
915       sum5 = b[*r++];
916       v2   = aa + ai[row+1];
917       v3   = aa + ai[row+2];
918       v4   = aa + ai[row+3];
919       v5   = aa + ai[row+4];
920 
921       for (j=0; j<nz-1; j+=2) {
922         i0    = vi[0];
923         i1    = vi[1];
924         vi   +=2;
925         tmp0  = tmps[i0];
926         tmp1  = tmps[i1];
927         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
928         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
929         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
930         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
931         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
932       }
933       if (j == nz-1) {
934         tmp0  = tmps[*vi++];
935         sum1 -= *v1++ *tmp0;
936         sum2 -= *v2++ *tmp0;
937         sum3 -= *v3++ *tmp0;
938         sum4 -= *v4++ *tmp0;
939         sum5 -= *v5++ *tmp0;
940       }
941 
942       sum2 -= *v2++ * sum1;
943       sum3 -= *v3++ * sum1;
944       sum4 -= *v4++ * sum1;
945       sum5 -= *v5++ * sum1;
946       sum3 -= *v3++ * sum2;
947       sum4 -= *v4++ * sum2;
948       sum5 -= *v5++ * sum2;
949       sum4 -= *v4++ * sum3;
950       sum5 -= *v5++ * sum3;
951       sum5 -= *v5++ * sum4;
952 
953       tmp[row++]=sum1;
954       tmp[row++]=sum2;
955       tmp[row++]=sum3;
956       tmp[row++]=sum4;
957       tmp[row++]=sum5;
958       break;
959     default:
960       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
961     }
962   }
963   /* backward solve the upper triangular */
964   for (i=node_max -1,row = n-1; i>=0; i--) {
965     nsz = ns[i];
966     aii = ai[row+1] -1;
967     v1  = aa + aii;
968     vi  = aj + aii;
969     nz  = aii- ad[row];
970     switch (nsz) {               /* Each loop in 'case' is unrolled */
971     case 1:
972       sum1 = tmp[row];
973 
974       for (j=nz; j>1; j-=2) {
975         vi   -=2;
976         i0    = vi[2];
977         i1    = vi[1];
978         tmp0  = tmps[i0];
979         tmp1  = tmps[i1];
980         v1   -= 2;
981         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
982       }
983       if (j==1) {
984         tmp0  = tmps[*vi--];
985         sum1 -= *v1-- * tmp0;
986       }
987       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
988       break;
989     case 2:
990       sum1 = tmp[row];
991       sum2 = tmp[row -1];
992       v2   = aa + ai[row]-1;
993       for (j=nz; j>1; j-=2) {
994         vi   -=2;
995         i0    = vi[2];
996         i1    = vi[1];
997         tmp0  = tmps[i0];
998         tmp1  = tmps[i1];
999         v1   -= 2;
1000         v2   -= 2;
1001         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1002         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1003       }
1004       if (j==1) {
1005         tmp0  = tmps[*vi--];
1006         sum1 -= *v1-- * tmp0;
1007         sum2 -= *v2-- * tmp0;
1008       }
1009 
1010       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1011       sum2   -= *v2-- * tmp0;
1012       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1013       break;
1014     case 3:
1015       sum1 = tmp[row];
1016       sum2 = tmp[row -1];
1017       sum3 = tmp[row -2];
1018       v2   = aa + ai[row]-1;
1019       v3   = aa + ai[row -1]-1;
1020       for (j=nz; j>1; j-=2) {
1021         vi   -=2;
1022         i0    = vi[2];
1023         i1    = vi[1];
1024         tmp0  = tmps[i0];
1025         tmp1  = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         v3   -= 2;
1029         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1030         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1031         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1032       }
1033       if (j==1) {
1034         tmp0  = tmps[*vi--];
1035         sum1 -= *v1-- * tmp0;
1036         sum2 -= *v2-- * tmp0;
1037         sum3 -= *v3-- * tmp0;
1038       }
1039       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1040       sum2   -= *v2-- * tmp0;
1041       sum3   -= *v3-- * tmp0;
1042       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1043       sum3   -= *v3-- * tmp0;
1044       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1045 
1046       break;
1047     case 4:
1048       sum1 = tmp[row];
1049       sum2 = tmp[row -1];
1050       sum3 = tmp[row -2];
1051       sum4 = tmp[row -3];
1052       v2   = aa + ai[row]-1;
1053       v3   = aa + ai[row -1]-1;
1054       v4   = aa + ai[row -2]-1;
1055 
1056       for (j=nz; j>1; j-=2) {
1057         vi   -=2;
1058         i0    = vi[2];
1059         i1    = vi[1];
1060         tmp0  = tmps[i0];
1061         tmp1  = tmps[i1];
1062         v1   -= 2;
1063         v2   -= 2;
1064         v3   -= 2;
1065         v4   -= 2;
1066         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1067         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1068         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1069         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1070       }
1071       if (j==1) {
1072         tmp0  = tmps[*vi--];
1073         sum1 -= *v1-- * tmp0;
1074         sum2 -= *v2-- * tmp0;
1075         sum3 -= *v3-- * tmp0;
1076         sum4 -= *v4-- * tmp0;
1077       }
1078 
1079       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1080       sum2   -= *v2-- * tmp0;
1081       sum3   -= *v3-- * tmp0;
1082       sum4   -= *v4-- * tmp0;
1083       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1084       sum3   -= *v3-- * tmp0;
1085       sum4   -= *v4-- * tmp0;
1086       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1087       sum4   -= *v4-- * tmp0;
1088       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1089       break;
1090     case 5:
1091       sum1 = tmp[row];
1092       sum2 = tmp[row -1];
1093       sum3 = tmp[row -2];
1094       sum4 = tmp[row -3];
1095       sum5 = tmp[row -4];
1096       v2   = aa + ai[row]-1;
1097       v3   = aa + ai[row -1]-1;
1098       v4   = aa + ai[row -2]-1;
1099       v5   = aa + ai[row -3]-1;
1100       for (j=nz; j>1; j-=2) {
1101         vi   -= 2;
1102         i0    = vi[2];
1103         i1    = vi[1];
1104         tmp0  = tmps[i0];
1105         tmp1  = tmps[i1];
1106         v1   -= 2;
1107         v2   -= 2;
1108         v3   -= 2;
1109         v4   -= 2;
1110         v5   -= 2;
1111         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1112         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1113         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1114         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1115         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1116       }
1117       if (j==1) {
1118         tmp0  = tmps[*vi--];
1119         sum1 -= *v1-- * tmp0;
1120         sum2 -= *v2-- * tmp0;
1121         sum3 -= *v3-- * tmp0;
1122         sum4 -= *v4-- * tmp0;
1123         sum5 -= *v5-- * tmp0;
1124       }
1125 
1126       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1127       sum2   -= *v2-- * tmp0;
1128       sum3   -= *v3-- * tmp0;
1129       sum4   -= *v4-- * tmp0;
1130       sum5   -= *v5-- * tmp0;
1131       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1132       sum3   -= *v3-- * tmp0;
1133       sum4   -= *v4-- * tmp0;
1134       sum5   -= *v5-- * tmp0;
1135       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1136       sum4   -= *v4-- * tmp0;
1137       sum5   -= *v5-- * tmp0;
1138       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1139       sum5   -= *v5-- * tmp0;
1140       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1141       break;
1142     default:
1143       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1144     }
1145   }
1146   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1147   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1148   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1149   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1150   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1151   PetscFunctionReturn(0);
1152 }
1153 
1154 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1155 {
1156   Mat             C     =B;
1157   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1158   IS              isrow = b->row,isicol = b->icol;
1159   PetscErrorCode  ierr;
1160   const PetscInt  *r,*ic,*ics;
1161   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1162   PetscInt        i,j,k,nz,nzL,row,*pj;
1163   const PetscInt  *ajtmp,*bjtmp;
1164   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1165   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1166   FactorShiftCtx  sctx;
1167   const PetscInt  *ddiag;
1168   PetscReal       rs;
1169   MatScalar       d;
1170   PetscInt        inod,nodesz,node_max,col;
1171   const PetscInt  *ns;
1172   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1173 
1174   PetscFunctionBegin;
1175   /* MatPivotSetUp(): initialize shift context sctx */
1176   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1177 
1178   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1179     ddiag          = a->diag;
1180     sctx.shift_top = info->zeropivot;
1181     for (i=0; i<n; i++) {
1182       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1183       d  = (aa)[ddiag[i]];
1184       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1185       v  = aa+ai[i];
1186       nz = ai[i+1] - ai[i];
1187       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1188       if (rs>sctx.shift_top) sctx.shift_top = rs;
1189     }
1190     sctx.shift_top *= 1.1;
1191     sctx.nshift_max = 5;
1192     sctx.shift_lo   = 0.;
1193     sctx.shift_hi   = 1.;
1194   }
1195 
1196   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1197   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1198 
1199   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1200   ics   = ic;
1201 
1202   node_max = a->inode.node_count;
1203   ns       = a->inode.size;
1204   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1205 
1206   /* If max inode size > 4, split it into two inodes.*/
1207   /* also map the inode sizes according to the ordering */
1208   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1209   for (i=0,j=0; i<node_max; ++i,++j) {
1210     if (ns[i] > 4) {
1211       tmp_vec1[j] = 4;
1212       ++j;
1213       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1214     } else {
1215       tmp_vec1[j] = ns[i];
1216     }
1217   }
1218   /* Use the correct node_max */
1219   node_max = j;
1220 
1221   /* Now reorder the inode info based on mat re-ordering info */
1222   /* First create a row -> inode_size_array_index map */
1223   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1224   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1225   for (i=0,row=0; i<node_max; i++) {
1226     nodesz = tmp_vec1[i];
1227     for (j=0; j<nodesz; j++,row++) {
1228       nsmap[row] = i;
1229     }
1230   }
1231   /* Using nsmap, create a reordered ns structure */
1232   for (i=0,j=0; i< node_max; i++) {
1233     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1234     tmp_vec2[i] = nodesz;
1235     j          += nodesz;
1236   }
1237   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1238   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1239 
1240   /* Now use the correct ns */
1241   ns = tmp_vec2;
1242 
1243   do {
1244     sctx.newshift = PETSC_FALSE;
1245     /* Now loop over each block-row, and do the factorization */
1246     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1247       nodesz = ns[inod];
1248 
1249       switch (nodesz) {
1250       case 1:
1251         /*----------*/
1252         /* zero rtmp1 */
1253         /* L part */
1254         nz    = bi[i+1] - bi[i];
1255         bjtmp = bj + bi[i];
1256         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1257 
1258         /* U part */
1259         nz    = bdiag[i]-bdiag[i+1];
1260         bjtmp = bj + bdiag[i+1]+1;
1261         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1262 
1263         /* load in initial (unfactored row) */
1264         nz    = ai[r[i]+1] - ai[r[i]];
1265         ajtmp = aj + ai[r[i]];
1266         v     = aa + ai[r[i]];
1267         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1268 
1269         /* ZeropivotApply() */
1270         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1271 
1272         /* elimination */
1273         bjtmp = bj + bi[i];
1274         row   = *bjtmp++;
1275         nzL   = bi[i+1] - bi[i];
1276         for (k=0; k < nzL; k++) {
1277           pc = rtmp1 + row;
1278           if (*pc != 0.0) {
1279             pv   = b->a + bdiag[row];
1280             mul1 = *pc * (*pv);
1281             *pc  = mul1;
1282             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1283             pv   = b->a + bdiag[row+1]+1;
1284             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1285             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1286             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1287           }
1288           row = *bjtmp++;
1289         }
1290 
1291         /* finished row so stick it into b->a */
1292         rs = 0.0;
1293         /* L part */
1294         pv = b->a + bi[i];
1295         pj = b->j + bi[i];
1296         nz = bi[i+1] - bi[i];
1297         for (j=0; j<nz; j++) {
1298           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1299         }
1300 
1301         /* U part */
1302         pv = b->a + bdiag[i+1]+1;
1303         pj = b->j + bdiag[i+1]+1;
1304         nz = bdiag[i] - bdiag[i+1]-1;
1305         for (j=0; j<nz; j++) {
1306           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1307         }
1308 
1309         /* Check zero pivot */
1310         sctx.rs = rs;
1311         sctx.pv = rtmp1[i];
1312         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1313         if (sctx.newshift) break;
1314 
1315         /* Mark diagonal and invert diagonal for simplier triangular solves */
1316         pv  = b->a + bdiag[i];
1317         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1318         break;
1319 
1320       case 2:
1321         /*----------*/
1322         /* zero rtmp1 and rtmp2 */
1323         /* L part */
1324         nz    = bi[i+1] - bi[i];
1325         bjtmp = bj + bi[i];
1326         for  (j=0; j<nz; j++) {
1327           col        = bjtmp[j];
1328           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1329         }
1330 
1331         /* U part */
1332         nz    = bdiag[i]-bdiag[i+1];
1333         bjtmp = bj + bdiag[i+1]+1;
1334         for  (j=0; j<nz; j++) {
1335           col        = bjtmp[j];
1336           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1337         }
1338 
1339         /* load in initial (unfactored row) */
1340         nz    = ai[r[i]+1] - ai[r[i]];
1341         ajtmp = aj + ai[r[i]];
1342         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1343         for (j=0; j<nz; j++) {
1344           col        = ics[ajtmp[j]];
1345           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1346         }
1347         /* ZeropivotApply(): shift the diagonal of the matrix  */
1348         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1349 
1350         /* elimination */
1351         bjtmp = bj + bi[i];
1352         row   = *bjtmp++; /* pivot row */
1353         nzL   = bi[i+1] - bi[i];
1354         for (k=0; k < nzL; k++) {
1355           pc1 = rtmp1 + row;
1356           pc2 = rtmp2 + row;
1357           if (*pc1 != 0.0 || *pc2 != 0.0) {
1358             pv   = b->a + bdiag[row];
1359             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1360             *pc1 = mul1;       *pc2 = mul2;
1361 
1362             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1363             pv = b->a + bdiag[row+1]+1;
1364             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1365             for (j=0; j<nz; j++) {
1366               col         = pj[j];
1367               rtmp1[col] -= mul1 * pv[j];
1368               rtmp2[col] -= mul2 * pv[j];
1369             }
1370             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1371           }
1372           row = *bjtmp++;
1373         }
1374 
1375         /* finished row i; check zero pivot, then stick row i into b->a */
1376         rs = 0.0;
1377         /* L part */
1378         pc1 = b->a + bi[i];
1379         pj  = b->j + bi[i];
1380         nz  = bi[i+1] - bi[i];
1381         for (j=0; j<nz; j++) {
1382           col    = pj[j];
1383           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1384         }
1385         /* U part */
1386         pc1 = b->a + bdiag[i+1]+1;
1387         pj  = b->j + bdiag[i+1]+1;
1388         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1389         for (j=0; j<nz; j++) {
1390           col    = pj[j];
1391           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1392         }
1393 
1394         sctx.rs = rs;
1395         sctx.pv = rtmp1[i];
1396         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1397         if (sctx.newshift) break;
1398         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1399         *pc1 = 1.0/sctx.pv;
1400 
1401         /* Now take care of diagonal 2x2 block. */
1402         pc2 = rtmp2 + i;
1403         if (*pc2 != 0.0) {
1404           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1405           *pc2 = mul1;          /* insert L entry */
1406           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1407           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1408           for (j=0; j<nz; j++) {
1409             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1410           }
1411           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1412         }
1413 
1414         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1415         rs = 0.0;
1416         /* L part */
1417         pc2 = b->a + bi[i+1];
1418         pj  = b->j + bi[i+1];
1419         nz  = bi[i+2] - bi[i+1];
1420         for (j=0; j<nz; j++) {
1421           col    = pj[j];
1422           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1423         }
1424         /* U part */
1425         pc2 = b->a + bdiag[i+2]+1;
1426         pj  = b->j + bdiag[i+2]+1;
1427         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1428         for (j=0; j<nz; j++) {
1429           col    = pj[j];
1430           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1431         }
1432 
1433         sctx.rs = rs;
1434         sctx.pv = rtmp2[i+1];
1435         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1436         if (sctx.newshift) break;
1437         pc2  = b->a + bdiag[i+1];
1438         *pc2 = 1.0/sctx.pv;
1439         break;
1440 
1441       case 3:
1442         /*----------*/
1443         /* zero rtmp */
1444         /* L part */
1445         nz    = bi[i+1] - bi[i];
1446         bjtmp = bj + bi[i];
1447         for  (j=0; j<nz; j++) {
1448           col        = bjtmp[j];
1449           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1450         }
1451 
1452         /* U part */
1453         nz    = bdiag[i]-bdiag[i+1];
1454         bjtmp = bj + bdiag[i+1]+1;
1455         for  (j=0; j<nz; j++) {
1456           col        = bjtmp[j];
1457           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1458         }
1459 
1460         /* load in initial (unfactored row) */
1461         nz    = ai[r[i]+1] - ai[r[i]];
1462         ajtmp = aj + ai[r[i]];
1463         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1464         for (j=0; j<nz; j++) {
1465           col        = ics[ajtmp[j]];
1466           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1467         }
1468         /* ZeropivotApply(): shift the diagonal of the matrix  */
1469         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1470 
1471         /* elimination */
1472         bjtmp = bj + bi[i];
1473         row   = *bjtmp++; /* pivot row */
1474         nzL   = bi[i+1] - bi[i];
1475         for (k=0; k < nzL; k++) {
1476           pc1 = rtmp1 + row;
1477           pc2 = rtmp2 + row;
1478           pc3 = rtmp3 + row;
1479           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1480             pv   = b->a + bdiag[row];
1481             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1482             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1483 
1484             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1485             pv = b->a + bdiag[row+1]+1;
1486             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1487             for (j=0; j<nz; j++) {
1488               col         = pj[j];
1489               rtmp1[col] -= mul1 * pv[j];
1490               rtmp2[col] -= mul2 * pv[j];
1491               rtmp3[col] -= mul3 * pv[j];
1492             }
1493             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1494           }
1495           row = *bjtmp++;
1496         }
1497 
1498         /* finished row i; check zero pivot, then stick row i into b->a */
1499         rs = 0.0;
1500         /* L part */
1501         pc1 = b->a + bi[i];
1502         pj  = b->j + bi[i];
1503         nz  = bi[i+1] - bi[i];
1504         for (j=0; j<nz; j++) {
1505           col    = pj[j];
1506           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1507         }
1508         /* U part */
1509         pc1 = b->a + bdiag[i+1]+1;
1510         pj  = b->j + bdiag[i+1]+1;
1511         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1512         for (j=0; j<nz; j++) {
1513           col    = pj[j];
1514           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1515         }
1516 
1517         sctx.rs = rs;
1518         sctx.pv = rtmp1[i];
1519         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1520         if (sctx.newshift) break;
1521         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1522         *pc1 = 1.0/sctx.pv;
1523 
1524         /* Now take care of 1st column of diagonal 3x3 block. */
1525         pc2 = rtmp2 + i;
1526         pc3 = rtmp3 + i;
1527         if (*pc2 != 0.0 || *pc3 != 0.0) {
1528           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1529           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1530           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1531           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1532           for (j=0; j<nz; j++) {
1533             col         = pj[j];
1534             rtmp2[col] -= mul2 * rtmp1[col];
1535             rtmp3[col] -= mul3 * rtmp1[col];
1536           }
1537           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1538         }
1539 
1540         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1541         rs = 0.0;
1542         /* L part */
1543         pc2 = b->a + bi[i+1];
1544         pj  = b->j + bi[i+1];
1545         nz  = bi[i+2] - bi[i+1];
1546         for (j=0; j<nz; j++) {
1547           col    = pj[j];
1548           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1549         }
1550         /* U part */
1551         pc2 = b->a + bdiag[i+2]+1;
1552         pj  = b->j + bdiag[i+2]+1;
1553         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1554         for (j=0; j<nz; j++) {
1555           col    = pj[j];
1556           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1557         }
1558 
1559         sctx.rs = rs;
1560         sctx.pv = rtmp2[i+1];
1561         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1562         if (sctx.newshift) break;
1563         pc2  = b->a + bdiag[i+1];
1564         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1565 
1566         /* Now take care of 2nd column of diagonal 3x3 block. */
1567         pc3 = rtmp3 + i+1;
1568         if (*pc3 != 0.0) {
1569           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1570           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1571           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1572           for (j=0; j<nz; j++) {
1573             col         = pj[j];
1574             rtmp3[col] -= mul3 * rtmp2[col];
1575           }
1576           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1577         }
1578 
1579         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1580         rs = 0.0;
1581         /* L part */
1582         pc3 = b->a + bi[i+2];
1583         pj  = b->j + bi[i+2];
1584         nz  = bi[i+3] - bi[i+2];
1585         for (j=0; j<nz; j++) {
1586           col    = pj[j];
1587           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1588         }
1589         /* U part */
1590         pc3 = b->a + bdiag[i+3]+1;
1591         pj  = b->j + bdiag[i+3]+1;
1592         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1593         for (j=0; j<nz; j++) {
1594           col    = pj[j];
1595           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1596         }
1597 
1598         sctx.rs = rs;
1599         sctx.pv = rtmp3[i+2];
1600         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1601         if (sctx.newshift) break;
1602         pc3  = b->a + bdiag[i+2];
1603         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1604         break;
1605       case 4:
1606         /*----------*/
1607         /* zero rtmp */
1608         /* L part */
1609         nz    = bi[i+1] - bi[i];
1610         bjtmp = bj + bi[i];
1611         for  (j=0; j<nz; j++) {
1612           col        = bjtmp[j];
1613           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1614         }
1615 
1616         /* U part */
1617         nz    = bdiag[i]-bdiag[i+1];
1618         bjtmp = bj + bdiag[i+1]+1;
1619         for  (j=0; j<nz; j++) {
1620           col        = bjtmp[j];
1621           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1622         }
1623 
1624         /* load in initial (unfactored row) */
1625         nz    = ai[r[i]+1] - ai[r[i]];
1626         ajtmp = aj + ai[r[i]];
1627         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1628         for (j=0; j<nz; j++) {
1629           col        = ics[ajtmp[j]];
1630           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1631         }
1632         /* ZeropivotApply(): shift the diagonal of the matrix  */
1633         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1634 
1635         /* elimination */
1636         bjtmp = bj + bi[i];
1637         row   = *bjtmp++; /* pivot row */
1638         nzL   = bi[i+1] - bi[i];
1639         for (k=0; k < nzL; k++) {
1640           pc1 = rtmp1 + row;
1641           pc2 = rtmp2 + row;
1642           pc3 = rtmp3 + row;
1643           pc4 = rtmp4 + row;
1644           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1645             pv   = b->a + bdiag[row];
1646             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1647             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1648 
1649             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1650             pv = b->a + bdiag[row+1]+1;
1651             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1652             for (j=0; j<nz; j++) {
1653               col         = pj[j];
1654               rtmp1[col] -= mul1 * pv[j];
1655               rtmp2[col] -= mul2 * pv[j];
1656               rtmp3[col] -= mul3 * pv[j];
1657               rtmp4[col] -= mul4 * pv[j];
1658             }
1659             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1660           }
1661           row = *bjtmp++;
1662         }
1663 
1664         /* finished row i; check zero pivot, then stick row i into b->a */
1665         rs = 0.0;
1666         /* L part */
1667         pc1 = b->a + bi[i];
1668         pj  = b->j + bi[i];
1669         nz  = bi[i+1] - bi[i];
1670         for (j=0; j<nz; j++) {
1671           col    = pj[j];
1672           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1673         }
1674         /* U part */
1675         pc1 = b->a + bdiag[i+1]+1;
1676         pj  = b->j + bdiag[i+1]+1;
1677         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1678         for (j=0; j<nz; j++) {
1679           col    = pj[j];
1680           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1681         }
1682 
1683         sctx.rs = rs;
1684         sctx.pv = rtmp1[i];
1685         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1686         if (sctx.newshift) break;
1687         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1688         *pc1 = 1.0/sctx.pv;
1689 
1690         /* Now take care of 1st column of diagonal 4x4 block. */
1691         pc2 = rtmp2 + i;
1692         pc3 = rtmp3 + i;
1693         pc4 = rtmp4 + i;
1694         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1695           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1696           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1697           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1698           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1699           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1700           for (j=0; j<nz; j++) {
1701             col         = pj[j];
1702             rtmp2[col] -= mul2 * rtmp1[col];
1703             rtmp3[col] -= mul3 * rtmp1[col];
1704             rtmp4[col] -= mul4 * rtmp1[col];
1705           }
1706           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1707         }
1708 
1709         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1710         rs = 0.0;
1711         /* L part */
1712         pc2 = b->a + bi[i+1];
1713         pj  = b->j + bi[i+1];
1714         nz  = bi[i+2] - bi[i+1];
1715         for (j=0; j<nz; j++) {
1716           col    = pj[j];
1717           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1718         }
1719         /* U part */
1720         pc2 = b->a + bdiag[i+2]+1;
1721         pj  = b->j + bdiag[i+2]+1;
1722         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1723         for (j=0; j<nz; j++) {
1724           col    = pj[j];
1725           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1726         }
1727 
1728         sctx.rs = rs;
1729         sctx.pv = rtmp2[i+1];
1730         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1731         if (sctx.newshift) break;
1732         pc2  = b->a + bdiag[i+1];
1733         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1734 
1735         /* Now take care of 2nd column of diagonal 4x4 block. */
1736         pc3 = rtmp3 + i+1;
1737         pc4 = rtmp4 + i+1;
1738         if (*pc3 != 0.0 || *pc4 != 0.0) {
1739           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1740           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1741           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1742           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1743           for (j=0; j<nz; j++) {
1744             col         = pj[j];
1745             rtmp3[col] -= mul3 * rtmp2[col];
1746             rtmp4[col] -= mul4 * rtmp2[col];
1747           }
1748           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1749         }
1750 
1751         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1752         rs = 0.0;
1753         /* L part */
1754         pc3 = b->a + bi[i+2];
1755         pj  = b->j + bi[i+2];
1756         nz  = bi[i+3] - bi[i+2];
1757         for (j=0; j<nz; j++) {
1758           col    = pj[j];
1759           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1760         }
1761         /* U part */
1762         pc3 = b->a + bdiag[i+3]+1;
1763         pj  = b->j + bdiag[i+3]+1;
1764         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1765         for (j=0; j<nz; j++) {
1766           col    = pj[j];
1767           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1768         }
1769 
1770         sctx.rs = rs;
1771         sctx.pv = rtmp3[i+2];
1772         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1773         if (sctx.newshift) break;
1774         pc3  = b->a + bdiag[i+2];
1775         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1776 
1777         /* Now take care of 3rd column of diagonal 4x4 block. */
1778         pc4 = rtmp4 + i+2;
1779         if (*pc4 != 0.0) {
1780           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1781           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1782           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1783           for (j=0; j<nz; j++) {
1784             col         = pj[j];
1785             rtmp4[col] -= mul4 * rtmp3[col];
1786           }
1787           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1788         }
1789 
1790         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1791         rs = 0.0;
1792         /* L part */
1793         pc4 = b->a + bi[i+3];
1794         pj  = b->j + bi[i+3];
1795         nz  = bi[i+4] - bi[i+3];
1796         for (j=0; j<nz; j++) {
1797           col    = pj[j];
1798           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1799         }
1800         /* U part */
1801         pc4 = b->a + bdiag[i+4]+1;
1802         pj  = b->j + bdiag[i+4]+1;
1803         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1804         for (j=0; j<nz; j++) {
1805           col    = pj[j];
1806           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1807         }
1808 
1809         sctx.rs = rs;
1810         sctx.pv = rtmp4[i+3];
1811         ierr    = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1812         if (sctx.newshift) break;
1813         pc4  = b->a + bdiag[i+3];
1814         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1815         break;
1816 
1817       default:
1818         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1819       }
1820       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1821       i += nodesz;                 /* Update the row */
1822     }
1823 
1824     /* MatPivotRefine() */
1825     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1826       /*
1827        * if no shift in this attempt & shifting & started shifting & can refine,
1828        * then try lower shift
1829        */
1830       sctx.shift_hi       = sctx.shift_fraction;
1831       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1832       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1833       sctx.newshift       = PETSC_TRUE;
1834       sctx.nshift++;
1835     }
1836   } while (sctx.newshift);
1837 
1838   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1839   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1840   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1841   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1842 
1843   if (b->inode.size) {
1844     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1845   } else {
1846     C->ops->solve           = MatSolve_SeqAIJ;
1847   }
1848   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1849   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1850   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1851   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1852   C->assembled              = PETSC_TRUE;
1853   C->preallocated           = PETSC_TRUE;
1854 
1855   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1856 
1857   /* MatShiftView(A,info,&sctx) */
1858   if (sctx.nshift) {
1859     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1860       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1861     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1862       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1863     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1864       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1865     }
1866   }
1867   PetscFunctionReturn(0);
1868 }
1869 
1870 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1871 {
1872   Mat             C     = B;
1873   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1874   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1875   PetscErrorCode  ierr;
1876   const PetscInt  *r,*ic,*c,*ics;
1877   PetscInt        n   = A->rmap->n,*bi = b->i;
1878   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1879   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1880   PetscInt        *ai = a->i,*aj = a->j;
1881   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1882   PetscScalar     mul1,mul2,mul3,tmp;
1883   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1884   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1885   PetscReal       rs=0.0;
1886   FactorShiftCtx  sctx;
1887 
1888   PetscFunctionBegin;
1889   sctx.shift_top      = 0;
1890   sctx.nshift_max     = 0;
1891   sctx.shift_lo       = 0;
1892   sctx.shift_hi       = 0;
1893   sctx.shift_fraction = 0;
1894 
1895   /* if both shift schemes are chosen by user, only use info->shiftpd */
1896   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1897     sctx.shift_top = 0;
1898     for (i=0; i<n; i++) {
1899       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1900       rs    = 0.0;
1901       ajtmp = aj + ai[i];
1902       rtmp1 = aa + ai[i];
1903       nz    = ai[i+1] - ai[i];
1904       for (j=0; j<nz; j++) {
1905         if (*ajtmp != i) {
1906           rs += PetscAbsScalar(*rtmp1++);
1907         } else {
1908           rs -= PetscRealPart(*rtmp1++);
1909         }
1910         ajtmp++;
1911       }
1912       if (rs>sctx.shift_top) sctx.shift_top = rs;
1913     }
1914     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1915     sctx.shift_top *= 1.1;
1916     sctx.nshift_max = 5;
1917     sctx.shift_lo   = 0.;
1918     sctx.shift_hi   = 1.;
1919   }
1920   sctx.shift_amount = 0;
1921   sctx.nshift       = 0;
1922 
1923   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1924   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1925   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1926   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1927   ics    = ic;
1928 
1929   node_max = a->inode.node_count;
1930   ns       = a->inode.size;
1931   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1932 
1933   /* If max inode size > 3, split it into two inodes.*/
1934   /* also map the inode sizes according to the ordering */
1935   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1936   for (i=0,j=0; i<node_max; ++i,++j) {
1937     if (ns[i]>3) {
1938       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1939       ++j;
1940       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1941     } else {
1942       tmp_vec1[j] = ns[i];
1943     }
1944   }
1945   /* Use the correct node_max */
1946   node_max = j;
1947 
1948   /* Now reorder the inode info based on mat re-ordering info */
1949   /* First create a row -> inode_size_array_index map */
1950   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1951   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1952   for (i=0,row=0; i<node_max; i++) {
1953     nodesz = tmp_vec1[i];
1954     for (j=0; j<nodesz; j++,row++) {
1955       nsmap[row] = i;
1956     }
1957   }
1958   /* Using nsmap, create a reordered ns structure */
1959   for (i=0,j=0; i< node_max; i++) {
1960     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1961     tmp_vec2[i] = nodesz;
1962     j          += nodesz;
1963   }
1964   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1965   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1966   /* Now use the correct ns */
1967   ns = tmp_vec2;
1968 
1969   do {
1970     sctx.newshift = PETSC_FALSE;
1971     /* Now loop over each block-row, and do the factorization */
1972     for (i=0,row=0; i<node_max; i++) {
1973       nodesz = ns[i];
1974       nz     = bi[row+1] - bi[row];
1975       bjtmp  = bj + bi[row];
1976 
1977       switch (nodesz) {
1978       case 1:
1979         for  (j=0; j<nz; j++) {
1980           idx         = bjtmp[j];
1981           rtmp11[idx] = 0.0;
1982         }
1983 
1984         /* load in initial (unfactored row) */
1985         idx    = r[row];
1986         nz_tmp = ai[idx+1] - ai[idx];
1987         ajtmp  = aj + ai[idx];
1988         v1     = aa + ai[idx];
1989 
1990         for (j=0; j<nz_tmp; j++) {
1991           idx         = ics[ajtmp[j]];
1992           rtmp11[idx] = v1[j];
1993         }
1994         rtmp11[ics[r[row]]] += sctx.shift_amount;
1995 
1996         prow = *bjtmp++;
1997         while (prow < row) {
1998           pc1 = rtmp11 + prow;
1999           if (*pc1 != 0.0) {
2000             pv     = ba + bd[prow];
2001             pj     = nbj + bd[prow];
2002             mul1   = *pc1 * *pv++;
2003             *pc1   = mul1;
2004             nz_tmp = bi[prow+1] - bd[prow] - 1;
2005             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2006             for (j=0; j<nz_tmp; j++) {
2007               tmp          = pv[j];
2008               idx          = pj[j];
2009               rtmp11[idx] -= mul1 * tmp;
2010             }
2011           }
2012           prow = *bjtmp++;
2013         }
2014         pj  = bj + bi[row];
2015         pc1 = ba + bi[row];
2016 
2017         sctx.pv     = rtmp11[row];
2018         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2019         rs          = 0.0;
2020         for (j=0; j<nz; j++) {
2021           idx    = pj[j];
2022           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2023           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2024         }
2025         sctx.rs = rs;
2026         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2027         if (sctx.newshift) goto endofwhile;
2028         break;
2029 
2030       case 2:
2031         for (j=0; j<nz; j++) {
2032           idx         = bjtmp[j];
2033           rtmp11[idx] = 0.0;
2034           rtmp22[idx] = 0.0;
2035         }
2036 
2037         /* load in initial (unfactored row) */
2038         idx    = r[row];
2039         nz_tmp = ai[idx+1] - ai[idx];
2040         ajtmp  = aj + ai[idx];
2041         v1     = aa + ai[idx];
2042         v2     = aa + ai[idx+1];
2043         for (j=0; j<nz_tmp; j++) {
2044           idx         = ics[ajtmp[j]];
2045           rtmp11[idx] = v1[j];
2046           rtmp22[idx] = v2[j];
2047         }
2048         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2049         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2050 
2051         prow = *bjtmp++;
2052         while (prow < row) {
2053           pc1 = rtmp11 + prow;
2054           pc2 = rtmp22 + prow;
2055           if (*pc1 != 0.0 || *pc2 != 0.0) {
2056             pv   = ba + bd[prow];
2057             pj   = nbj + bd[prow];
2058             mul1 = *pc1 * *pv;
2059             mul2 = *pc2 * *pv;
2060             ++pv;
2061             *pc1 = mul1;
2062             *pc2 = mul2;
2063 
2064             nz_tmp = bi[prow+1] - bd[prow] - 1;
2065             for (j=0; j<nz_tmp; j++) {
2066               tmp          = pv[j];
2067               idx          = pj[j];
2068               rtmp11[idx] -= mul1 * tmp;
2069               rtmp22[idx] -= mul2 * tmp;
2070             }
2071             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2072           }
2073           prow = *bjtmp++;
2074         }
2075 
2076         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2077         pc1 = rtmp11 + prow;
2078         pc2 = rtmp22 + prow;
2079 
2080         sctx.pv = *pc1;
2081         pj      = bj + bi[prow];
2082         rs      = 0.0;
2083         for (j=0; j<nz; j++) {
2084           idx = pj[j];
2085           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2086         }
2087         sctx.rs = rs;
2088         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2089         if (sctx.newshift) goto endofwhile;
2090 
2091         if (*pc2 != 0.0) {
2092           pj     = nbj + bd[prow];
2093           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2094           *pc2   = mul2;
2095           nz_tmp = bi[prow+1] - bd[prow] - 1;
2096           for (j=0; j<nz_tmp; j++) {
2097             idx          = pj[j];
2098             tmp          = rtmp11[idx];
2099             rtmp22[idx] -= mul2 * tmp;
2100           }
2101           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2102         }
2103 
2104         pj  = bj + bi[row];
2105         pc1 = ba + bi[row];
2106         pc2 = ba + bi[row+1];
2107 
2108         sctx.pv       = rtmp22[row+1];
2109         rs            = 0.0;
2110         rtmp11[row]   = 1.0/rtmp11[row];
2111         rtmp22[row+1] = 1.0/rtmp22[row+1];
2112         /* copy row entries from dense representation to sparse */
2113         for (j=0; j<nz; j++) {
2114           idx    = pj[j];
2115           pc1[j] = rtmp11[idx];
2116           pc2[j] = rtmp22[idx];
2117           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2118         }
2119         sctx.rs = rs;
2120         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2121         if (sctx.newshift) goto endofwhile;
2122         break;
2123 
2124       case 3:
2125         for  (j=0; j<nz; j++) {
2126           idx         = bjtmp[j];
2127           rtmp11[idx] = 0.0;
2128           rtmp22[idx] = 0.0;
2129           rtmp33[idx] = 0.0;
2130         }
2131         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2132         idx    = r[row];
2133         nz_tmp = ai[idx+1] - ai[idx];
2134         ajtmp  = aj + ai[idx];
2135         v1     = aa + ai[idx];
2136         v2     = aa + ai[idx+1];
2137         v3     = aa + ai[idx+2];
2138         for (j=0; j<nz_tmp; j++) {
2139           idx         = ics[ajtmp[j]];
2140           rtmp11[idx] = v1[j];
2141           rtmp22[idx] = v2[j];
2142           rtmp33[idx] = v3[j];
2143         }
2144         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2145         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2146         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2147 
2148         /* loop over all pivot row blocks above this row block */
2149         prow = *bjtmp++;
2150         while (prow < row) {
2151           pc1 = rtmp11 + prow;
2152           pc2 = rtmp22 + prow;
2153           pc3 = rtmp33 + prow;
2154           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2155             pv   = ba  + bd[prow];
2156             pj   = nbj + bd[prow];
2157             mul1 = *pc1 * *pv;
2158             mul2 = *pc2 * *pv;
2159             mul3 = *pc3 * *pv;
2160             ++pv;
2161             *pc1 = mul1;
2162             *pc2 = mul2;
2163             *pc3 = mul3;
2164 
2165             nz_tmp = bi[prow+1] - bd[prow] - 1;
2166             /* update this row based on pivot row */
2167             for (j=0; j<nz_tmp; j++) {
2168               tmp          = pv[j];
2169               idx          = pj[j];
2170               rtmp11[idx] -= mul1 * tmp;
2171               rtmp22[idx] -= mul2 * tmp;
2172               rtmp33[idx] -= mul3 * tmp;
2173             }
2174             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2175           }
2176           prow = *bjtmp++;
2177         }
2178 
2179         /* Now take care of diagonal 3x3 block in this set of rows */
2180         /* note: prow = row here */
2181         pc1 = rtmp11 + prow;
2182         pc2 = rtmp22 + prow;
2183         pc3 = rtmp33 + prow;
2184 
2185         sctx.pv = *pc1;
2186         pj      = bj + bi[prow];
2187         rs      = 0.0;
2188         for (j=0; j<nz; j++) {
2189           idx = pj[j];
2190           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2191         }
2192         sctx.rs = rs;
2193         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2194         if (sctx.newshift) goto endofwhile;
2195 
2196         if (*pc2 != 0.0 || *pc3 != 0.0) {
2197           mul2   = (*pc2)/(*pc1);
2198           mul3   = (*pc3)/(*pc1);
2199           *pc2   = mul2;
2200           *pc3   = mul3;
2201           nz_tmp = bi[prow+1] - bd[prow] - 1;
2202           pj     = nbj + bd[prow];
2203           for (j=0; j<nz_tmp; j++) {
2204             idx          = pj[j];
2205             tmp          = rtmp11[idx];
2206             rtmp22[idx] -= mul2 * tmp;
2207             rtmp33[idx] -= mul3 * tmp;
2208           }
2209           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2210         }
2211         ++prow;
2212 
2213         pc2     = rtmp22 + prow;
2214         pc3     = rtmp33 + prow;
2215         sctx.pv = *pc2;
2216         pj      = bj + bi[prow];
2217         rs      = 0.0;
2218         for (j=0; j<nz; j++) {
2219           idx = pj[j];
2220           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2221         }
2222         sctx.rs = rs;
2223         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2224         if (sctx.newshift) goto endofwhile;
2225 
2226         if (*pc3 != 0.0) {
2227           mul3   = (*pc3)/(*pc2);
2228           *pc3   = mul3;
2229           pj     = nbj + bd[prow];
2230           nz_tmp = bi[prow+1] - bd[prow] - 1;
2231           for (j=0; j<nz_tmp; j++) {
2232             idx          = pj[j];
2233             tmp          = rtmp22[idx];
2234             rtmp33[idx] -= mul3 * tmp;
2235           }
2236           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2237         }
2238 
2239         pj  = bj + bi[row];
2240         pc1 = ba + bi[row];
2241         pc2 = ba + bi[row+1];
2242         pc3 = ba + bi[row+2];
2243 
2244         sctx.pv       = rtmp33[row+2];
2245         rs            = 0.0;
2246         rtmp11[row]   = 1.0/rtmp11[row];
2247         rtmp22[row+1] = 1.0/rtmp22[row+1];
2248         rtmp33[row+2] = 1.0/rtmp33[row+2];
2249         /* copy row entries from dense representation to sparse */
2250         for (j=0; j<nz; j++) {
2251           idx    = pj[j];
2252           pc1[j] = rtmp11[idx];
2253           pc2[j] = rtmp22[idx];
2254           pc3[j] = rtmp33[idx];
2255           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2256         }
2257 
2258         sctx.rs = rs;
2259         ierr    = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2260         if (sctx.newshift) goto endofwhile;
2261         break;
2262 
2263       default:
2264         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2265       }
2266       row += nodesz;                 /* Update the row */
2267     }
2268 endofwhile:;
2269   } while (sctx.newshift);
2270   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2271   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2272   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2273   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2274   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2275 
2276   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2277   /* do not set solve add, since MatSolve_Inode + Add is faster */
2278   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2279   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2280   C->assembled              = PETSC_TRUE;
2281   C->preallocated           = PETSC_TRUE;
2282   if (sctx.nshift) {
2283     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2284       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2285     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2286       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2287     }
2288   }
2289   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2290   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2291   PetscFunctionReturn(0);
2292 }
2293 
2294 
2295 /* ----------------------------------------------------------- */
2296 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2297 {
2298   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2299   IS                iscol = a->col,isrow = a->row;
2300   PetscErrorCode    ierr;
2301   const PetscInt    *r,*c,*rout,*cout;
2302   PetscInt          i,j,n = A->rmap->n;
2303   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2304   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2305   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2306   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2307   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2308   const PetscScalar *b;
2309 
2310   PetscFunctionBegin;
2311   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2312   node_max = a->inode.node_count;
2313   ns       = a->inode.size;     /* Node Size array */
2314 
2315   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2316   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2317   tmp  = a->solve_work;
2318 
2319   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2320   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2321 
2322   /* forward solve the lower triangular */
2323   tmps = tmp;
2324   aa   = a_a;
2325   aj   = a_j;
2326   ad   = a->diag;
2327 
2328   for (i = 0,row = 0; i< node_max; ++i) {
2329     nsz = ns[i];
2330     aii = ai[row];
2331     v1  = aa + aii;
2332     vi  = aj + aii;
2333     nz  = ai[row+1]- ai[row];
2334 
2335     if (i < node_max-1) {
2336       /* Prefetch the indices for the next block */
2337       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2338       /* Prefetch the data for the next block */
2339       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2340     }
2341 
2342     switch (nsz) {               /* Each loop in 'case' is unrolled */
2343     case 1:
2344       sum1 = b[r[row]];
2345       for (j=0; j<nz-1; j+=2) {
2346         i0    = vi[j];
2347         i1    = vi[j+1];
2348         tmp0  = tmps[i0];
2349         tmp1  = tmps[i1];
2350         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2351       }
2352       if (j == nz-1) {
2353         tmp0  = tmps[vi[j]];
2354         sum1 -= v1[j]*tmp0;
2355       }
2356       tmp[row++]=sum1;
2357       break;
2358     case 2:
2359       sum1 = b[r[row]];
2360       sum2 = b[r[row+1]];
2361       v2   = aa + ai[row+1];
2362 
2363       for (j=0; j<nz-1; j+=2) {
2364         i0    = vi[j];
2365         i1    = vi[j+1];
2366         tmp0  = tmps[i0];
2367         tmp1  = tmps[i1];
2368         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2369         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2370       }
2371       if (j == nz-1) {
2372         tmp0  = tmps[vi[j]];
2373         sum1 -= v1[j] *tmp0;
2374         sum2 -= v2[j] *tmp0;
2375       }
2376       sum2     -= v2[nz] * sum1;
2377       tmp[row++]=sum1;
2378       tmp[row++]=sum2;
2379       break;
2380     case 3:
2381       sum1 = b[r[row]];
2382       sum2 = b[r[row+1]];
2383       sum3 = b[r[row+2]];
2384       v2   = aa + ai[row+1];
2385       v3   = aa + ai[row+2];
2386 
2387       for (j=0; j<nz-1; j+=2) {
2388         i0    = vi[j];
2389         i1    = vi[j+1];
2390         tmp0  = tmps[i0];
2391         tmp1  = tmps[i1];
2392         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2395       }
2396       if (j == nz-1) {
2397         tmp0  = tmps[vi[j]];
2398         sum1 -= v1[j] *tmp0;
2399         sum2 -= v2[j] *tmp0;
2400         sum3 -= v3[j] *tmp0;
2401       }
2402       sum2     -= v2[nz] * sum1;
2403       sum3     -= v3[nz] * sum1;
2404       sum3     -= v3[nz+1] * sum2;
2405       tmp[row++]=sum1;
2406       tmp[row++]=sum2;
2407       tmp[row++]=sum3;
2408       break;
2409 
2410     case 4:
2411       sum1 = b[r[row]];
2412       sum2 = b[r[row+1]];
2413       sum3 = b[r[row+2]];
2414       sum4 = b[r[row+3]];
2415       v2   = aa + ai[row+1];
2416       v3   = aa + ai[row+2];
2417       v4   = aa + ai[row+3];
2418 
2419       for (j=0; j<nz-1; j+=2) {
2420         i0    = vi[j];
2421         i1    = vi[j+1];
2422         tmp0  = tmps[i0];
2423         tmp1  = tmps[i1];
2424         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2428       }
2429       if (j == nz-1) {
2430         tmp0  = tmps[vi[j]];
2431         sum1 -= v1[j] *tmp0;
2432         sum2 -= v2[j] *tmp0;
2433         sum3 -= v3[j] *tmp0;
2434         sum4 -= v4[j] *tmp0;
2435       }
2436       sum2 -= v2[nz] * sum1;
2437       sum3 -= v3[nz] * sum1;
2438       sum4 -= v4[nz] * sum1;
2439       sum3 -= v3[nz+1] * sum2;
2440       sum4 -= v4[nz+1] * sum2;
2441       sum4 -= v4[nz+2] * sum3;
2442 
2443       tmp[row++]=sum1;
2444       tmp[row++]=sum2;
2445       tmp[row++]=sum3;
2446       tmp[row++]=sum4;
2447       break;
2448     case 5:
2449       sum1 = b[r[row]];
2450       sum2 = b[r[row+1]];
2451       sum3 = b[r[row+2]];
2452       sum4 = b[r[row+3]];
2453       sum5 = b[r[row+4]];
2454       v2   = aa + ai[row+1];
2455       v3   = aa + ai[row+2];
2456       v4   = aa + ai[row+3];
2457       v5   = aa + ai[row+4];
2458 
2459       for (j=0; j<nz-1; j+=2) {
2460         i0    = vi[j];
2461         i1    = vi[j+1];
2462         tmp0  = tmps[i0];
2463         tmp1  = tmps[i1];
2464         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2465         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2466         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2467         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2468         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2469       }
2470       if (j == nz-1) {
2471         tmp0  = tmps[vi[j]];
2472         sum1 -= v1[j] *tmp0;
2473         sum2 -= v2[j] *tmp0;
2474         sum3 -= v3[j] *tmp0;
2475         sum4 -= v4[j] *tmp0;
2476         sum5 -= v5[j] *tmp0;
2477       }
2478 
2479       sum2 -= v2[nz] * sum1;
2480       sum3 -= v3[nz] * sum1;
2481       sum4 -= v4[nz] * sum1;
2482       sum5 -= v5[nz] * sum1;
2483       sum3 -= v3[nz+1] * sum2;
2484       sum4 -= v4[nz+1] * sum2;
2485       sum5 -= v5[nz+1] * sum2;
2486       sum4 -= v4[nz+2] * sum3;
2487       sum5 -= v5[nz+2] * sum3;
2488       sum5 -= v5[nz+3] * sum4;
2489 
2490       tmp[row++]=sum1;
2491       tmp[row++]=sum2;
2492       tmp[row++]=sum3;
2493       tmp[row++]=sum4;
2494       tmp[row++]=sum5;
2495       break;
2496     default:
2497       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2498     }
2499   }
2500   /* backward solve the upper triangular */
2501   for (i=node_max -1,row = n-1; i>=0; i--) {
2502     nsz = ns[i];
2503     aii = ad[row+1] + 1;
2504     v1  = aa + aii;
2505     vi  = aj + aii;
2506     nz  = ad[row]- ad[row+1] - 1;
2507 
2508     if (i > 0) {
2509       /* Prefetch the indices for the next block */
2510       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2511       /* Prefetch the data for the next block */
2512       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2513     }
2514 
2515     switch (nsz) {               /* Each loop in 'case' is unrolled */
2516     case 1:
2517       sum1 = tmp[row];
2518 
2519       for (j=0; j<nz-1; j+=2) {
2520         i0    = vi[j];
2521         i1    = vi[j+1];
2522         tmp0  = tmps[i0];
2523         tmp1  = tmps[i1];
2524         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2525       }
2526       if (j == nz-1) {
2527         tmp0  = tmps[vi[j]];
2528         sum1 -= v1[j]*tmp0;
2529       }
2530       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2531       break;
2532     case 2:
2533       sum1 = tmp[row];
2534       sum2 = tmp[row-1];
2535       v2   = aa + ad[row] + 1;
2536       for (j=0; j<nz-1; j+=2) {
2537         i0    = vi[j];
2538         i1    = vi[j+1];
2539         tmp0  = tmps[i0];
2540         tmp1  = tmps[i1];
2541         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2542         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2543       }
2544       if (j == nz-1) {
2545         tmp0  = tmps[vi[j]];
2546         sum1 -= v1[j]* tmp0;
2547         sum2 -= v2[j+1]* tmp0;
2548       }
2549 
2550       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2551       sum2     -= v2[0] * tmp0;
2552       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2553       break;
2554     case 3:
2555       sum1 = tmp[row];
2556       sum2 = tmp[row -1];
2557       sum3 = tmp[row -2];
2558       v2   = aa + ad[row] + 1;
2559       v3   = aa + ad[row -1] + 1;
2560       for (j=0; j<nz-1; j+=2) {
2561         i0    = vi[j];
2562         i1    = vi[j+1];
2563         tmp0  = tmps[i0];
2564         tmp1  = tmps[i1];
2565         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2568       }
2569       if (j== nz-1) {
2570         tmp0  = tmps[vi[j]];
2571         sum1 -= v1[j] * tmp0;
2572         sum2 -= v2[j+1] * tmp0;
2573         sum3 -= v3[j+2] * tmp0;
2574       }
2575       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2576       sum2     -= v2[0]* tmp0;
2577       sum3     -= v3[1] * tmp0;
2578       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2579       sum3     -= v3[0]* tmp0;
2580       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2581 
2582       break;
2583     case 4:
2584       sum1 = tmp[row];
2585       sum2 = tmp[row -1];
2586       sum3 = tmp[row -2];
2587       sum4 = tmp[row -3];
2588       v2   = aa + ad[row]+1;
2589       v3   = aa + ad[row -1]+1;
2590       v4   = aa + ad[row -2]+1;
2591 
2592       for (j=0; j<nz-1; j+=2) {
2593         i0    = vi[j];
2594         i1    = vi[j+1];
2595         tmp0  = tmps[i0];
2596         tmp1  = tmps[i1];
2597         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2598         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2599         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2600         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2601       }
2602       if (j== nz-1) {
2603         tmp0  = tmps[vi[j]];
2604         sum1 -= v1[j] * tmp0;
2605         sum2 -= v2[j+1] * tmp0;
2606         sum3 -= v3[j+2] * tmp0;
2607         sum4 -= v4[j+3] * tmp0;
2608       }
2609 
2610       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2611       sum2     -= v2[0] * tmp0;
2612       sum3     -= v3[1] * tmp0;
2613       sum4     -= v4[2] * tmp0;
2614       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2615       sum3     -= v3[0] * tmp0;
2616       sum4     -= v4[1] * tmp0;
2617       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2618       sum4     -= v4[0] * tmp0;
2619       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2620       break;
2621     case 5:
2622       sum1 = tmp[row];
2623       sum2 = tmp[row -1];
2624       sum3 = tmp[row -2];
2625       sum4 = tmp[row -3];
2626       sum5 = tmp[row -4];
2627       v2   = aa + ad[row]+1;
2628       v3   = aa + ad[row -1]+1;
2629       v4   = aa + ad[row -2]+1;
2630       v5   = aa + ad[row -3]+1;
2631       for (j=0; j<nz-1; j+=2) {
2632         i0    = vi[j];
2633         i1    = vi[j+1];
2634         tmp0  = tmps[i0];
2635         tmp1  = tmps[i1];
2636         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2637         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2638         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2639         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2640         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2641       }
2642       if (j==nz-1) {
2643         tmp0  = tmps[vi[j]];
2644         sum1 -= v1[j] * tmp0;
2645         sum2 -= v2[j+1] * tmp0;
2646         sum3 -= v3[j+2] * tmp0;
2647         sum4 -= v4[j+3] * tmp0;
2648         sum5 -= v5[j+4] * tmp0;
2649       }
2650 
2651       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2652       sum2     -= v2[0] * tmp0;
2653       sum3     -= v3[1] * tmp0;
2654       sum4     -= v4[2] * tmp0;
2655       sum5     -= v5[3] * tmp0;
2656       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2657       sum3     -= v3[0] * tmp0;
2658       sum4     -= v4[1] * tmp0;
2659       sum5     -= v5[2] * tmp0;
2660       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2661       sum4     -= v4[0] * tmp0;
2662       sum5     -= v5[1] * tmp0;
2663       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2664       sum5     -= v5[0] * tmp0;
2665       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2666       break;
2667     default:
2668       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2669     }
2670   }
2671   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2672   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2673   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2674   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2675   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2676   PetscFunctionReturn(0);
2677 }
2678 
2679 
2680 /*
2681      Makes a longer coloring[] array and calls the usual code with that
2682 */
2683 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2684 {
2685   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2686   PetscErrorCode  ierr;
2687   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2688   PetscInt        *colorused,i;
2689   ISColoringValue *newcolor;
2690 
2691   PetscFunctionBegin;
2692   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2693   /* loop over inodes, marking a color for each column*/
2694   row = 0;
2695   for (i=0; i<m; i++) {
2696     for (j=0; j<ns[i]; j++) {
2697       newcolor[row++] = coloring[i] + j*ncolors;
2698     }
2699   }
2700 
2701   /* eliminate unneeded colors */
2702   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2703   for (i=0; i<n; i++) {
2704     colorused[newcolor[i]] = 1;
2705   }
2706 
2707   for (i=1; i<5*ncolors; i++) {
2708     colorused[i] += colorused[i-1];
2709   }
2710   ncolors = colorused[5*ncolors-1];
2711   for (i=0; i<n; i++) {
2712     newcolor[i] = colorused[newcolor[i]]-1;
2713   }
2714   ierr = PetscFree(colorused);CHKERRQ(ierr);
2715   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2716   ierr = PetscFree(coloring);CHKERRQ(ierr);
2717   PetscFunctionReturn(0);
2718 }
2719 
2720 #include <petsc/private/kernels/blockinvert.h>
2721 
2722 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2723 {
2724   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2725   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2726   MatScalar         *ibdiag,*bdiag,work[25],*t;
2727   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2728   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2729   const PetscScalar *xb, *b;
2730   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2731   PetscErrorCode    ierr;
2732   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2733   PetscInt          sz,k,ipvt[5];
2734   PetscBool         allowzeropivot,zeropivotdetected;
2735   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2736 
2737   PetscFunctionBegin;
2738   allowzeropivot = PetscNot(A->erroriffailure);
2739   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2740   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2741   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2742 
2743   if (!a->inode.ibdiagvalid) {
2744     if (!a->inode.ibdiag) {
2745       /* calculate space needed for diagonal blocks */
2746       for (i=0; i<m; i++) {
2747         cnt += sizes[i]*sizes[i];
2748       }
2749       a->inode.bdiagsize = cnt;
2750 
2751       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2752     }
2753 
2754     /* copy over the diagonal blocks and invert them */
2755     ibdiag = a->inode.ibdiag;
2756     bdiag  = a->inode.bdiag;
2757     cnt    = 0;
2758     for (i=0, row = 0; i<m; i++) {
2759       for (j=0; j<sizes[i]; j++) {
2760         for (k=0; k<sizes[i]; k++) {
2761           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2762         }
2763       }
2764       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2765 
2766       switch (sizes[i]) {
2767       case 1:
2768         /* Create matrix data structure */
2769         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2770           if (allowzeropivot) {
2771             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2772             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2773             A->factorerror_zeropivot_row   = row;
2774             ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr);
2775           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2776         }
2777         ibdiag[cnt] = 1.0/ibdiag[cnt];
2778         break;
2779       case 2:
2780         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2781         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2782         break;
2783       case 3:
2784         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2785         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2786         break;
2787       case 4:
2788         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2789         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2790         break;
2791       case 5:
2792         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2793         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2794         break;
2795       default:
2796         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2797       }
2798       cnt += sizes[i]*sizes[i];
2799       row += sizes[i];
2800     }
2801     a->inode.ibdiagvalid = PETSC_TRUE;
2802   }
2803   ibdiag = a->inode.ibdiag;
2804   bdiag  = a->inode.bdiag;
2805   t      = a->inode.ssor_work;
2806 
2807   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2808   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2809   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2810   if (flag & SOR_ZERO_INITIAL_GUESS) {
2811     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2812 
2813       for (i=0, row=0; i<m; i++) {
2814         sz  = diag[row] - ii[row];
2815         v1  = a->a + ii[row];
2816         idx = a->j + ii[row];
2817 
2818         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2819         switch (sizes[i]) {
2820         case 1:
2821 
2822           sum1 = b[row];
2823           for (n = 0; n<sz-1; n+=2) {
2824             i1    = idx[0];
2825             i2    = idx[1];
2826             idx  += 2;
2827             tmp0  = x[i1];
2828             tmp1  = x[i2];
2829             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2830           }
2831 
2832           if (n == sz-1) {
2833             tmp0  = x[*idx];
2834             sum1 -= *v1 * tmp0;
2835           }
2836           t[row]   = sum1;
2837           x[row++] = sum1*(*ibdiag++);
2838           break;
2839         case 2:
2840           v2   = a->a + ii[row+1];
2841           sum1 = b[row];
2842           sum2 = b[row+1];
2843           for (n = 0; n<sz-1; n+=2) {
2844             i1    = idx[0];
2845             i2    = idx[1];
2846             idx  += 2;
2847             tmp0  = x[i1];
2848             tmp1  = x[i2];
2849             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2850             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2851           }
2852 
2853           if (n == sz-1) {
2854             tmp0  = x[*idx];
2855             sum1 -= v1[0] * tmp0;
2856             sum2 -= v2[0] * tmp0;
2857           }
2858           t[row]   = sum1;
2859           t[row+1] = sum2;
2860           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2861           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2862           ibdiag  += 4;
2863           break;
2864         case 3:
2865           v2   = a->a + ii[row+1];
2866           v3   = a->a + ii[row+2];
2867           sum1 = b[row];
2868           sum2 = b[row+1];
2869           sum3 = b[row+2];
2870           for (n = 0; n<sz-1; n+=2) {
2871             i1    = idx[0];
2872             i2    = idx[1];
2873             idx  += 2;
2874             tmp0  = x[i1];
2875             tmp1  = x[i2];
2876             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2877             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2878             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2879           }
2880 
2881           if (n == sz-1) {
2882             tmp0  = x[*idx];
2883             sum1 -= v1[0] * tmp0;
2884             sum2 -= v2[0] * tmp0;
2885             sum3 -= v3[0] * tmp0;
2886           }
2887           t[row]   = sum1;
2888           t[row+1] = sum2;
2889           t[row+2] = sum3;
2890           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2891           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2892           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2893           ibdiag  += 9;
2894           break;
2895         case 4:
2896           v2   = a->a + ii[row+1];
2897           v3   = a->a + ii[row+2];
2898           v4   = a->a + ii[row+3];
2899           sum1 = b[row];
2900           sum2 = b[row+1];
2901           sum3 = b[row+2];
2902           sum4 = b[row+3];
2903           for (n = 0; n<sz-1; n+=2) {
2904             i1    = idx[0];
2905             i2    = idx[1];
2906             idx  += 2;
2907             tmp0  = x[i1];
2908             tmp1  = x[i2];
2909             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2910             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2911             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2912             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2913           }
2914 
2915           if (n == sz-1) {
2916             tmp0  = x[*idx];
2917             sum1 -= v1[0] * tmp0;
2918             sum2 -= v2[0] * tmp0;
2919             sum3 -= v3[0] * tmp0;
2920             sum4 -= v4[0] * tmp0;
2921           }
2922           t[row]   = sum1;
2923           t[row+1] = sum2;
2924           t[row+2] = sum3;
2925           t[row+3] = sum4;
2926           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2927           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2928           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2929           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2930           ibdiag  += 16;
2931           break;
2932         case 5:
2933           v2   = a->a + ii[row+1];
2934           v3   = a->a + ii[row+2];
2935           v4   = a->a + ii[row+3];
2936           v5   = a->a + ii[row+4];
2937           sum1 = b[row];
2938           sum2 = b[row+1];
2939           sum3 = b[row+2];
2940           sum4 = b[row+3];
2941           sum5 = b[row+4];
2942           for (n = 0; n<sz-1; n+=2) {
2943             i1    = idx[0];
2944             i2    = idx[1];
2945             idx  += 2;
2946             tmp0  = x[i1];
2947             tmp1  = x[i2];
2948             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2949             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2950             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2951             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2952             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2953           }
2954 
2955           if (n == sz-1) {
2956             tmp0  = x[*idx];
2957             sum1 -= v1[0] * tmp0;
2958             sum2 -= v2[0] * tmp0;
2959             sum3 -= v3[0] * tmp0;
2960             sum4 -= v4[0] * tmp0;
2961             sum5 -= v5[0] * tmp0;
2962           }
2963           t[row]   = sum1;
2964           t[row+1] = sum2;
2965           t[row+2] = sum3;
2966           t[row+3] = sum4;
2967           t[row+4] = sum5;
2968           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2969           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2970           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2971           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2972           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2973           ibdiag  += 25;
2974           break;
2975         default:
2976           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2977         }
2978       }
2979 
2980       xb   = t;
2981       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2982     } else xb = b;
2983     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2984 
2985       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2986       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2987         ibdiag -= sizes[i]*sizes[i];
2988         sz      = ii[row+1] - diag[row] - 1;
2989         v1      = a->a + diag[row] + 1;
2990         idx     = a->j + diag[row] + 1;
2991 
2992         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2993         switch (sizes[i]) {
2994         case 1:
2995 
2996           sum1 = xb[row];
2997           for (n = 0; n<sz-1; n+=2) {
2998             i1    = idx[0];
2999             i2    = idx[1];
3000             idx  += 2;
3001             tmp0  = x[i1];
3002             tmp1  = x[i2];
3003             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3004           }
3005 
3006           if (n == sz-1) {
3007             tmp0  = x[*idx];
3008             sum1 -= *v1*tmp0;
3009           }
3010           x[row--] = sum1*(*ibdiag);
3011           break;
3012 
3013         case 2:
3014 
3015           sum1 = xb[row];
3016           sum2 = xb[row-1];
3017           /* note that sum1 is associated with the second of the two rows */
3018           v2 = a->a + diag[row-1] + 2;
3019           for (n = 0; n<sz-1; n+=2) {
3020             i1    = idx[0];
3021             i2    = idx[1];
3022             idx  += 2;
3023             tmp0  = x[i1];
3024             tmp1  = x[i2];
3025             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3026             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3027           }
3028 
3029           if (n == sz-1) {
3030             tmp0  = x[*idx];
3031             sum1 -= *v1*tmp0;
3032             sum2 -= *v2*tmp0;
3033           }
3034           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3035           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3036           break;
3037         case 3:
3038 
3039           sum1 = xb[row];
3040           sum2 = xb[row-1];
3041           sum3 = xb[row-2];
3042           v2   = a->a + diag[row-1] + 2;
3043           v3   = a->a + diag[row-2] + 3;
3044           for (n = 0; n<sz-1; n+=2) {
3045             i1    = idx[0];
3046             i2    = idx[1];
3047             idx  += 2;
3048             tmp0  = x[i1];
3049             tmp1  = x[i2];
3050             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3051             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3052             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3053           }
3054 
3055           if (n == sz-1) {
3056             tmp0  = x[*idx];
3057             sum1 -= *v1*tmp0;
3058             sum2 -= *v2*tmp0;
3059             sum3 -= *v3*tmp0;
3060           }
3061           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3062           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3063           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3064           break;
3065         case 4:
3066 
3067           sum1 = xb[row];
3068           sum2 = xb[row-1];
3069           sum3 = xb[row-2];
3070           sum4 = xb[row-3];
3071           v2   = a->a + diag[row-1] + 2;
3072           v3   = a->a + diag[row-2] + 3;
3073           v4   = a->a + diag[row-3] + 4;
3074           for (n = 0; n<sz-1; n+=2) {
3075             i1    = idx[0];
3076             i2    = idx[1];
3077             idx  += 2;
3078             tmp0  = x[i1];
3079             tmp1  = x[i2];
3080             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3081             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3082             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3083             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3084           }
3085 
3086           if (n == sz-1) {
3087             tmp0  = x[*idx];
3088             sum1 -= *v1*tmp0;
3089             sum2 -= *v2*tmp0;
3090             sum3 -= *v3*tmp0;
3091             sum4 -= *v4*tmp0;
3092           }
3093           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3094           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3095           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3096           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3097           break;
3098         case 5:
3099 
3100           sum1 = xb[row];
3101           sum2 = xb[row-1];
3102           sum3 = xb[row-2];
3103           sum4 = xb[row-3];
3104           sum5 = xb[row-4];
3105           v2   = a->a + diag[row-1] + 2;
3106           v3   = a->a + diag[row-2] + 3;
3107           v4   = a->a + diag[row-3] + 4;
3108           v5   = a->a + diag[row-4] + 5;
3109           for (n = 0; n<sz-1; n+=2) {
3110             i1    = idx[0];
3111             i2    = idx[1];
3112             idx  += 2;
3113             tmp0  = x[i1];
3114             tmp1  = x[i2];
3115             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3116             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3117             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3118             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3119             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3120           }
3121 
3122           if (n == sz-1) {
3123             tmp0  = x[*idx];
3124             sum1 -= *v1*tmp0;
3125             sum2 -= *v2*tmp0;
3126             sum3 -= *v3*tmp0;
3127             sum4 -= *v4*tmp0;
3128             sum5 -= *v5*tmp0;
3129           }
3130           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3131           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3132           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3133           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3134           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3135           break;
3136         default:
3137           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3138         }
3139       }
3140 
3141       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3142     }
3143     its--;
3144   }
3145   while (its--) {
3146 
3147     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3148       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3149            i<m;
3150            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3151 
3152         sz  = diag[row] - ii[row];
3153         v1  = a->a + ii[row];
3154         idx = a->j + ii[row];
3155         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3156         switch (sizes[i]) {
3157         case 1:
3158           sum1 = b[row];
3159           for (n = 0; n<sz-1; n+=2) {
3160             i1    = idx[0];
3161             i2    = idx[1];
3162             idx  += 2;
3163             tmp0  = x[i1];
3164             tmp1  = x[i2];
3165             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3166           }
3167           if (n == sz-1) {
3168             tmp0  = x[*idx++];
3169             sum1 -= *v1 * tmp0;
3170             v1++;
3171           }
3172           t[row]   = sum1;
3173           sz      = ii[row+1] - diag[row] - 1;
3174           idx     = a->j + diag[row] + 1;
3175           v1 += 1;
3176           for (n = 0; n<sz-1; n+=2) {
3177             i1    = idx[0];
3178             i2    = idx[1];
3179             idx  += 2;
3180             tmp0  = x[i1];
3181             tmp1  = x[i2];
3182             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3183           }
3184           if (n == sz-1) {
3185             tmp0  = x[*idx++];
3186             sum1 -= *v1 * tmp0;
3187           }
3188           /* in MatSOR_SeqAIJ this line would be
3189            *
3190            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3191            *
3192            * but omega == 1, so this becomes
3193            *
3194            * x[row] = sum1*(*ibdiag++);
3195            *
3196            */
3197           x[row] = sum1*(*ibdiag);
3198           break;
3199         case 2:
3200           v2   = a->a + ii[row+1];
3201           sum1 = b[row];
3202           sum2 = b[row+1];
3203           for (n = 0; n<sz-1; n+=2) {
3204             i1    = idx[0];
3205             i2    = idx[1];
3206             idx  += 2;
3207             tmp0  = x[i1];
3208             tmp1  = x[i2];
3209             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3210             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3211           }
3212           if (n == sz-1) {
3213             tmp0  = x[*idx++];
3214             sum1 -= v1[0] * tmp0;
3215             sum2 -= v2[0] * tmp0;
3216             v1++; v2++;
3217           }
3218           t[row]   = sum1;
3219           t[row+1] = sum2;
3220           sz      = ii[row+1] - diag[row] - 2;
3221           idx     = a->j + diag[row] + 2;
3222           v1 += 2;
3223           v2 += 2;
3224           for (n = 0; n<sz-1; n+=2) {
3225             i1    = idx[0];
3226             i2    = idx[1];
3227             idx  += 2;
3228             tmp0  = x[i1];
3229             tmp1  = x[i2];
3230             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3231             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3232           }
3233           if (n == sz-1) {
3234             tmp0  = x[*idx];
3235             sum1 -= v1[0] * tmp0;
3236             sum2 -= v2[0] * tmp0;
3237           }
3238           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3239           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3240           break;
3241         case 3:
3242           v2   = a->a + ii[row+1];
3243           v3   = a->a + ii[row+2];
3244           sum1 = b[row];
3245           sum2 = b[row+1];
3246           sum3 = b[row+2];
3247           for (n = 0; n<sz-1; n+=2) {
3248             i1    = idx[0];
3249             i2    = idx[1];
3250             idx  += 2;
3251             tmp0  = x[i1];
3252             tmp1  = x[i2];
3253             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3254             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3255             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3256           }
3257           if (n == sz-1) {
3258             tmp0  = x[*idx++];
3259             sum1 -= v1[0] * tmp0;
3260             sum2 -= v2[0] * tmp0;
3261             sum3 -= v3[0] * tmp0;
3262             v1++; v2++; v3++;
3263           }
3264           t[row]   = sum1;
3265           t[row+1] = sum2;
3266           t[row+2] = sum3;
3267           sz      = ii[row+1] - diag[row] - 3;
3268           idx     = a->j + diag[row] + 3;
3269           v1 += 3;
3270           v2 += 3;
3271           v3 += 3;
3272           for (n = 0; n<sz-1; n+=2) {
3273             i1    = idx[0];
3274             i2    = idx[1];
3275             idx  += 2;
3276             tmp0  = x[i1];
3277             tmp1  = x[i2];
3278             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3279             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3280             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3281           }
3282           if (n == sz-1) {
3283             tmp0  = x[*idx];
3284             sum1 -= v1[0] * tmp0;
3285             sum2 -= v2[0] * tmp0;
3286             sum3 -= v3[0] * tmp0;
3287           }
3288           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3289           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3290           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3291           break;
3292         case 4:
3293           v2   = a->a + ii[row+1];
3294           v3   = a->a + ii[row+2];
3295           v4   = a->a + ii[row+3];
3296           sum1 = b[row];
3297           sum2 = b[row+1];
3298           sum3 = b[row+2];
3299           sum4 = b[row+3];
3300           for (n = 0; n<sz-1; n+=2) {
3301             i1    = idx[0];
3302             i2    = idx[1];
3303             idx  += 2;
3304             tmp0  = x[i1];
3305             tmp1  = x[i2];
3306             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3307             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3308             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3309             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3310           }
3311           if (n == sz-1) {
3312             tmp0  = x[*idx++];
3313             sum1 -= v1[0] * tmp0;
3314             sum2 -= v2[0] * tmp0;
3315             sum3 -= v3[0] * tmp0;
3316             sum4 -= v4[0] * tmp0;
3317             v1++; v2++; v3++; v4++;
3318           }
3319           t[row]   = sum1;
3320           t[row+1] = sum2;
3321           t[row+2] = sum3;
3322           t[row+3] = sum4;
3323           sz      = ii[row+1] - diag[row] - 4;
3324           idx     = a->j + diag[row] + 4;
3325           v1 += 4;
3326           v2 += 4;
3327           v3 += 4;
3328           v4 += 4;
3329           for (n = 0; n<sz-1; n+=2) {
3330             i1    = idx[0];
3331             i2    = idx[1];
3332             idx  += 2;
3333             tmp0  = x[i1];
3334             tmp1  = x[i2];
3335             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3336             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3337             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3338             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3339           }
3340           if (n == sz-1) {
3341             tmp0  = x[*idx];
3342             sum1 -= v1[0] * tmp0;
3343             sum2 -= v2[0] * tmp0;
3344             sum3 -= v3[0] * tmp0;
3345             sum4 -= v4[0] * tmp0;
3346           }
3347           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3348           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3349           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3350           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3351           break;
3352         case 5:
3353           v2   = a->a + ii[row+1];
3354           v3   = a->a + ii[row+2];
3355           v4   = a->a + ii[row+3];
3356           v5   = a->a + ii[row+4];
3357           sum1 = b[row];
3358           sum2 = b[row+1];
3359           sum3 = b[row+2];
3360           sum4 = b[row+3];
3361           sum5 = b[row+4];
3362           for (n = 0; n<sz-1; n+=2) {
3363             i1    = idx[0];
3364             i2    = idx[1];
3365             idx  += 2;
3366             tmp0  = x[i1];
3367             tmp1  = x[i2];
3368             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3369             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3370             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3371             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3372             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3373           }
3374           if (n == sz-1) {
3375             tmp0  = x[*idx++];
3376             sum1 -= v1[0] * tmp0;
3377             sum2 -= v2[0] * tmp0;
3378             sum3 -= v3[0] * tmp0;
3379             sum4 -= v4[0] * tmp0;
3380             sum5 -= v5[0] * tmp0;
3381             v1++; v2++; v3++; v4++; v5++;
3382           }
3383           t[row]   = sum1;
3384           t[row+1] = sum2;
3385           t[row+2] = sum3;
3386           t[row+3] = sum4;
3387           t[row+4] = sum5;
3388           sz      = ii[row+1] - diag[row] - 5;
3389           idx     = a->j + diag[row] + 5;
3390           v1 += 5;
3391           v2 += 5;
3392           v3 += 5;
3393           v4 += 5;
3394           v5 += 5;
3395           for (n = 0; n<sz-1; n+=2) {
3396             i1    = idx[0];
3397             i2    = idx[1];
3398             idx  += 2;
3399             tmp0  = x[i1];
3400             tmp1  = x[i2];
3401             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3402             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3403             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3404             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3405             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3406           }
3407           if (n == sz-1) {
3408             tmp0  = x[*idx];
3409             sum1 -= v1[0] * tmp0;
3410             sum2 -= v2[0] * tmp0;
3411             sum3 -= v3[0] * tmp0;
3412             sum4 -= v4[0] * tmp0;
3413             sum5 -= v5[0] * tmp0;
3414           }
3415           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3416           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3417           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3418           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3419           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3420           break;
3421         default:
3422           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3423         }
3424       }
3425       xb   = t;
3426       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3427     } else xb = b;
3428 
3429     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3430 
3431       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3432       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3433         ibdiag -= sizes[i]*sizes[i];
3434 
3435         /* set RHS */
3436         if (xb == b) {
3437           /* whole (old way) */
3438           sz      = ii[row+1] - ii[row];
3439           idx     = a->j + ii[row];
3440           switch (sizes[i]) {
3441           case 5:
3442             v5      = a->a + ii[row-4];
3443           case 4: /* fall through */
3444             v4      = a->a + ii[row-3];
3445           case 3:
3446             v3      = a->a + ii[row-2];
3447           case 2:
3448             v2      = a->a + ii[row-1];
3449           case 1:
3450             v1      = a->a + ii[row];
3451             break;
3452           default:
3453             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3454           }
3455         } else {
3456           /* upper, no diag */
3457           sz      = ii[row+1] - diag[row] - 1;
3458           idx     = a->j + diag[row] + 1;
3459           switch (sizes[i]) {
3460           case 5:
3461             v5      = a->a + diag[row-4] + 5;
3462           case 4: /* fall through */
3463             v4      = a->a + diag[row-3] + 4;
3464           case 3:
3465             v3      = a->a + diag[row-2] + 3;
3466           case 2:
3467             v2      = a->a + diag[row-1] + 2;
3468           case 1:
3469             v1      = a->a + diag[row] + 1;
3470           }
3471         }
3472         /* set sum */
3473         switch (sizes[i]) {
3474         case 5:
3475           sum5 = xb[row-4];
3476         case 4: /* fall through */
3477           sum4 = xb[row-3];
3478         case 3:
3479           sum3 = xb[row-2];
3480         case 2:
3481           sum2 = xb[row-1];
3482         case 1:
3483           /* note that sum1 is associated with the last row */
3484           sum1 = xb[row];
3485         }
3486         /* do sums */
3487         for (n = 0; n<sz-1; n+=2) {
3488             i1    = idx[0];
3489             i2    = idx[1];
3490             idx  += 2;
3491             tmp0  = x[i1];
3492             tmp1  = x[i2];
3493             switch (sizes[i]) {
3494             case 5:
3495               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3496             case 4: /* fall through */
3497               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3498             case 3:
3499               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3500             case 2:
3501               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3502             case 1:
3503               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3504             }
3505         }
3506         /* ragged edge */
3507         if (n == sz-1) {
3508           tmp0  = x[*idx];
3509           switch (sizes[i]) {
3510           case 5:
3511             sum5 -= *v5*tmp0;
3512           case 4: /* fall through */
3513             sum4 -= *v4*tmp0;
3514           case 3:
3515             sum3 -= *v3*tmp0;
3516           case 2:
3517             sum2 -= *v2*tmp0;
3518           case 1:
3519             sum1 -= *v1*tmp0;
3520           }
3521         }
3522         /* update */
3523         if (xb == b) {
3524           /* whole (old way) w/ diag */
3525           switch (sizes[i]) {
3526           case 5:
3527             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3528             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3529             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3530             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3531             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3532             break;
3533           case 4:
3534             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3535             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3536             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3537             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3538             break;
3539           case 3:
3540             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3541             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3542             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3543             break;
3544           case 2:
3545             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3546             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3547             break;
3548           case 1:
3549             x[row--] += sum1*(*ibdiag);
3550             break;
3551           }
3552         } else {
3553           /* no diag so set =  */
3554           switch (sizes[i]) {
3555           case 5:
3556             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3557             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3558             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3559             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3560             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3561             break;
3562           case 4:
3563             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3564             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3565             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3566             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3567             break;
3568           case 3:
3569             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3570             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3571             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3572             break;
3573           case 2:
3574             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3575             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3576             break;
3577           case 1:
3578             x[row--] = sum1*(*ibdiag);
3579             break;
3580           }
3581         }
3582       }
3583       if (xb == b) {
3584         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3585       } else {
3586         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3587       }
3588     }
3589   }
3590   if (flag & SOR_EISENSTAT) {
3591     /*
3592           Apply  (U + D)^-1  where D is now the block diagonal
3593     */
3594     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3595     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3596       ibdiag -= sizes[i]*sizes[i];
3597       sz      = ii[row+1] - diag[row] - 1;
3598       v1      = a->a + diag[row] + 1;
3599       idx     = a->j + diag[row] + 1;
3600       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3601       switch (sizes[i]) {
3602       case 1:
3603 
3604         sum1 = b[row];
3605         for (n = 0; n<sz-1; n+=2) {
3606           i1    = idx[0];
3607           i2    = idx[1];
3608           idx  += 2;
3609           tmp0  = x[i1];
3610           tmp1  = x[i2];
3611           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3612         }
3613 
3614         if (n == sz-1) {
3615           tmp0  = x[*idx];
3616           sum1 -= *v1*tmp0;
3617         }
3618         x[row] = sum1*(*ibdiag);row--;
3619         break;
3620 
3621       case 2:
3622 
3623         sum1 = b[row];
3624         sum2 = b[row-1];
3625         /* note that sum1 is associated with the second of the two rows */
3626         v2 = a->a + diag[row-1] + 2;
3627         for (n = 0; n<sz-1; n+=2) {
3628           i1    = idx[0];
3629           i2    = idx[1];
3630           idx  += 2;
3631           tmp0  = x[i1];
3632           tmp1  = x[i2];
3633           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3634           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3635         }
3636 
3637         if (n == sz-1) {
3638           tmp0  = x[*idx];
3639           sum1 -= *v1*tmp0;
3640           sum2 -= *v2*tmp0;
3641         }
3642         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3643         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3644         row     -= 2;
3645         break;
3646       case 3:
3647 
3648         sum1 = b[row];
3649         sum2 = b[row-1];
3650         sum3 = b[row-2];
3651         v2   = a->a + diag[row-1] + 2;
3652         v3   = a->a + diag[row-2] + 3;
3653         for (n = 0; n<sz-1; n+=2) {
3654           i1    = idx[0];
3655           i2    = idx[1];
3656           idx  += 2;
3657           tmp0  = x[i1];
3658           tmp1  = x[i2];
3659           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3660           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3661           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3662         }
3663 
3664         if (n == sz-1) {
3665           tmp0  = x[*idx];
3666           sum1 -= *v1*tmp0;
3667           sum2 -= *v2*tmp0;
3668           sum3 -= *v3*tmp0;
3669         }
3670         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3671         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3672         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3673         row     -= 3;
3674         break;
3675       case 4:
3676 
3677         sum1 = b[row];
3678         sum2 = b[row-1];
3679         sum3 = b[row-2];
3680         sum4 = b[row-3];
3681         v2   = a->a + diag[row-1] + 2;
3682         v3   = a->a + diag[row-2] + 3;
3683         v4   = a->a + diag[row-3] + 4;
3684         for (n = 0; n<sz-1; n+=2) {
3685           i1    = idx[0];
3686           i2    = idx[1];
3687           idx  += 2;
3688           tmp0  = x[i1];
3689           tmp1  = x[i2];
3690           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3691           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3692           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3693           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3694         }
3695 
3696         if (n == sz-1) {
3697           tmp0  = x[*idx];
3698           sum1 -= *v1*tmp0;
3699           sum2 -= *v2*tmp0;
3700           sum3 -= *v3*tmp0;
3701           sum4 -= *v4*tmp0;
3702         }
3703         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3704         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3705         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3706         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3707         row     -= 4;
3708         break;
3709       case 5:
3710 
3711         sum1 = b[row];
3712         sum2 = b[row-1];
3713         sum3 = b[row-2];
3714         sum4 = b[row-3];
3715         sum5 = b[row-4];
3716         v2   = a->a + diag[row-1] + 2;
3717         v3   = a->a + diag[row-2] + 3;
3718         v4   = a->a + diag[row-3] + 4;
3719         v5   = a->a + diag[row-4] + 5;
3720         for (n = 0; n<sz-1; n+=2) {
3721           i1    = idx[0];
3722           i2    = idx[1];
3723           idx  += 2;
3724           tmp0  = x[i1];
3725           tmp1  = x[i2];
3726           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3727           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3728           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3729           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3730           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3731         }
3732 
3733         if (n == sz-1) {
3734           tmp0  = x[*idx];
3735           sum1 -= *v1*tmp0;
3736           sum2 -= *v2*tmp0;
3737           sum3 -= *v3*tmp0;
3738           sum4 -= *v4*tmp0;
3739           sum5 -= *v5*tmp0;
3740         }
3741         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3742         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3743         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3744         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3745         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3746         row     -= 5;
3747         break;
3748       default:
3749         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3750       }
3751     }
3752     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3753 
3754     /*
3755            t = b - D x    where D is the block diagonal
3756     */
3757     cnt = 0;
3758     for (i=0, row=0; i<m; i++) {
3759       switch (sizes[i]) {
3760       case 1:
3761         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3762         break;
3763       case 2:
3764         x1       = x[row]; x2 = x[row+1];
3765         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3766         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3767         t[row]   = b[row] - tmp1;
3768         t[row+1] = b[row+1] - tmp2; row += 2;
3769         cnt     += 4;
3770         break;
3771       case 3:
3772         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3773         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3774         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3775         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3776         t[row]   = b[row] - tmp1;
3777         t[row+1] = b[row+1] - tmp2;
3778         t[row+2] = b[row+2] - tmp3; row += 3;
3779         cnt     += 9;
3780         break;
3781       case 4:
3782         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3783         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3784         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3785         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3786         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3787         t[row]   = b[row] - tmp1;
3788         t[row+1] = b[row+1] - tmp2;
3789         t[row+2] = b[row+2] - tmp3;
3790         t[row+3] = b[row+3] - tmp4; row += 4;
3791         cnt     += 16;
3792         break;
3793       case 5:
3794         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3795         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3796         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3797         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3798         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3799         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3800         t[row]   = b[row] - tmp1;
3801         t[row+1] = b[row+1] - tmp2;
3802         t[row+2] = b[row+2] - tmp3;
3803         t[row+3] = b[row+3] - tmp4;
3804         t[row+4] = b[row+4] - tmp5;row += 5;
3805         cnt     += 25;
3806         break;
3807       default:
3808         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3809       }
3810     }
3811     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3812 
3813 
3814 
3815     /*
3816           Apply (L + D)^-1 where D is the block diagonal
3817     */
3818     for (i=0, row=0; i<m; i++) {
3819       sz  = diag[row] - ii[row];
3820       v1  = a->a + ii[row];
3821       idx = a->j + ii[row];
3822       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3823       switch (sizes[i]) {
3824       case 1:
3825 
3826         sum1 = t[row];
3827         for (n = 0; n<sz-1; n+=2) {
3828           i1    = idx[0];
3829           i2    = idx[1];
3830           idx  += 2;
3831           tmp0  = t[i1];
3832           tmp1  = t[i2];
3833           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3834         }
3835 
3836         if (n == sz-1) {
3837           tmp0  = t[*idx];
3838           sum1 -= *v1 * tmp0;
3839         }
3840         x[row] += t[row] = sum1*(*ibdiag++); row++;
3841         break;
3842       case 2:
3843         v2   = a->a + ii[row+1];
3844         sum1 = t[row];
3845         sum2 = t[row+1];
3846         for (n = 0; n<sz-1; n+=2) {
3847           i1    = idx[0];
3848           i2    = idx[1];
3849           idx  += 2;
3850           tmp0  = t[i1];
3851           tmp1  = t[i2];
3852           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3853           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3854         }
3855 
3856         if (n == sz-1) {
3857           tmp0  = t[*idx];
3858           sum1 -= v1[0] * tmp0;
3859           sum2 -= v2[0] * tmp0;
3860         }
3861         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3862         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3863         ibdiag   += 4; row += 2;
3864         break;
3865       case 3:
3866         v2   = a->a + ii[row+1];
3867         v3   = a->a + ii[row+2];
3868         sum1 = t[row];
3869         sum2 = t[row+1];
3870         sum3 = t[row+2];
3871         for (n = 0; n<sz-1; n+=2) {
3872           i1    = idx[0];
3873           i2    = idx[1];
3874           idx  += 2;
3875           tmp0  = t[i1];
3876           tmp1  = t[i2];
3877           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3878           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3879           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3880         }
3881 
3882         if (n == sz-1) {
3883           tmp0  = t[*idx];
3884           sum1 -= v1[0] * tmp0;
3885           sum2 -= v2[0] * tmp0;
3886           sum3 -= v3[0] * tmp0;
3887         }
3888         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3889         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3890         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3891         ibdiag   += 9; row += 3;
3892         break;
3893       case 4:
3894         v2   = a->a + ii[row+1];
3895         v3   = a->a + ii[row+2];
3896         v4   = a->a + ii[row+3];
3897         sum1 = t[row];
3898         sum2 = t[row+1];
3899         sum3 = t[row+2];
3900         sum4 = t[row+3];
3901         for (n = 0; n<sz-1; n+=2) {
3902           i1    = idx[0];
3903           i2    = idx[1];
3904           idx  += 2;
3905           tmp0  = t[i1];
3906           tmp1  = t[i2];
3907           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3908           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3909           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3910           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3911         }
3912 
3913         if (n == sz-1) {
3914           tmp0  = t[*idx];
3915           sum1 -= v1[0] * tmp0;
3916           sum2 -= v2[0] * tmp0;
3917           sum3 -= v3[0] * tmp0;
3918           sum4 -= v4[0] * tmp0;
3919         }
3920         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3921         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3922         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3923         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3924         ibdiag   += 16; row += 4;
3925         break;
3926       case 5:
3927         v2   = a->a + ii[row+1];
3928         v3   = a->a + ii[row+2];
3929         v4   = a->a + ii[row+3];
3930         v5   = a->a + ii[row+4];
3931         sum1 = t[row];
3932         sum2 = t[row+1];
3933         sum3 = t[row+2];
3934         sum4 = t[row+3];
3935         sum5 = t[row+4];
3936         for (n = 0; n<sz-1; n+=2) {
3937           i1    = idx[0];
3938           i2    = idx[1];
3939           idx  += 2;
3940           tmp0  = t[i1];
3941           tmp1  = t[i2];
3942           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3943           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3944           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3945           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3946           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3947         }
3948 
3949         if (n == sz-1) {
3950           tmp0  = t[*idx];
3951           sum1 -= v1[0] * tmp0;
3952           sum2 -= v2[0] * tmp0;
3953           sum3 -= v3[0] * tmp0;
3954           sum4 -= v4[0] * tmp0;
3955           sum5 -= v5[0] * tmp0;
3956         }
3957         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3958         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3959         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3960         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3961         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3962         ibdiag   += 25; row += 5;
3963         break;
3964       default:
3965         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3966       }
3967     }
3968     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3969   }
3970   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3971   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3972   PetscFunctionReturn(0);
3973 }
3974 
3975 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3976 {
3977   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3978   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3979   const MatScalar   *bdiag = a->inode.bdiag;
3980   const PetscScalar *b;
3981   PetscErrorCode    ierr;
3982   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3983   const PetscInt    *sizes = a->inode.size;
3984 
3985   PetscFunctionBegin;
3986   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3987   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3988   cnt  = 0;
3989   for (i=0, row=0; i<m; i++) {
3990     switch (sizes[i]) {
3991     case 1:
3992       x[row] = b[row]*bdiag[cnt++];row++;
3993       break;
3994     case 2:
3995       x1       = b[row]; x2 = b[row+1];
3996       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3997       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3998       x[row++] = tmp1;
3999       x[row++] = tmp2;
4000       cnt     += 4;
4001       break;
4002     case 3:
4003       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4004       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4005       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4006       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4007       x[row++] = tmp1;
4008       x[row++] = tmp2;
4009       x[row++] = tmp3;
4010       cnt     += 9;
4011       break;
4012     case 4:
4013       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4014       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4015       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4016       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4017       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4018       x[row++] = tmp1;
4019       x[row++] = tmp2;
4020       x[row++] = tmp3;
4021       x[row++] = tmp4;
4022       cnt     += 16;
4023       break;
4024     case 5:
4025       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4026       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4027       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4028       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4029       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4030       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4031       x[row++] = tmp1;
4032       x[row++] = tmp2;
4033       x[row++] = tmp3;
4034       x[row++] = tmp4;
4035       x[row++] = tmp5;
4036       cnt     += 25;
4037       break;
4038     default:
4039       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4040     }
4041   }
4042   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4043   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4044   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4045   PetscFunctionReturn(0);
4046 }
4047 
4048 /*
4049     samestructure indicates that the matrix has not changed its nonzero structure so we
4050     do not need to recompute the inodes
4051 */
4052 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4053 {
4054   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4055   PetscErrorCode ierr;
4056   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4057   PetscBool      flag;
4058   const PetscInt *idx,*idy,*ii;
4059 
4060   PetscFunctionBegin;
4061   if (!a->inode.use) PetscFunctionReturn(0);
4062   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4063 
4064   m = A->rmap->n;
4065   if (a->inode.size) ns = a->inode.size;
4066   else {
4067     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4068   }
4069 
4070   i          = 0;
4071   node_count = 0;
4072   idx        = a->j;
4073   ii         = a->i;
4074   while (i < m) {                /* For each row */
4075     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4076     /* Limits the number of elements in a node to 'a->inode.limit' */
4077     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4078       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4079       if (nzy != nzx) break;
4080       idy += nzx;              /* Same nonzero pattern */
4081       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4082       if (!flag) break;
4083     }
4084     ns[node_count++] = blk_size;
4085     idx             += blk_size*nzx;
4086     i                = j;
4087   }
4088   /* If not enough inodes found,, do not use inode version of the routines */
4089   if (!m || node_count > .8*m) {
4090     ierr = PetscFree(ns);CHKERRQ(ierr);
4091 
4092     a->inode.node_count       = 0;
4093     a->inode.size             = NULL;
4094     a->inode.use              = PETSC_FALSE;
4095     A->ops->mult              = MatMult_SeqAIJ;
4096     A->ops->sor               = MatSOR_SeqAIJ;
4097     A->ops->multadd           = MatMultAdd_SeqAIJ;
4098     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4099     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4100     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4101     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4102     A->ops->coloringpatch     = 0;
4103     A->ops->multdiagonalblock = 0;
4104 
4105     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4106   } else {
4107     if (!A->factortype) {
4108       A->ops->mult              = MatMult_SeqAIJ_Inode;
4109       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4110       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4111       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4112       if (A->rmap->n == A->cmap->n) {
4113         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4114         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4115         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4116         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4117         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4118       }
4119     } else {
4120       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4121     }
4122     a->inode.node_count = node_count;
4123     a->inode.size       = ns;
4124     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4125   }
4126   a->inode.checked          = PETSC_TRUE;
4127   a->inode.mat_nonzerostate = A->nonzerostate;
4128   PetscFunctionReturn(0);
4129 }
4130 
4131 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4132 {
4133   Mat            B =*C;
4134   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4135   PetscErrorCode ierr;
4136   PetscInt       m=A->rmap->n;
4137 
4138   PetscFunctionBegin;
4139   c->inode.use       = a->inode.use;
4140   c->inode.limit     = a->inode.limit;
4141   c->inode.max_limit = a->inode.max_limit;
4142   if (a->inode.size) {
4143     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4144     c->inode.node_count = a->inode.node_count;
4145     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4146     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4147     if (!B->factortype) {
4148       B->ops->mult              = MatMult_SeqAIJ_Inode;
4149       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4150       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4151       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4152       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4153       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4154       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4155       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4156       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4157     } else {
4158       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4159     }
4160   } else {
4161     c->inode.size       = 0;
4162     c->inode.node_count = 0;
4163   }
4164   c->inode.ibdiagvalid = PETSC_FALSE;
4165   c->inode.ibdiag      = 0;
4166   c->inode.bdiag       = 0;
4167   PetscFunctionReturn(0);
4168 }
4169 
4170 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4171 {
4172   PetscInt       k;
4173   const PetscInt *vi;
4174 
4175   PetscFunctionBegin;
4176   vi = aj + ai[row];
4177   for (k=0; k<nzl; k++) cols[k] = vi[k];
4178   vi        = aj + adiag[row];
4179   cols[nzl] = vi[0];
4180   vi        = aj + adiag[row+1]+1;
4181   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4182   PetscFunctionReturn(0);
4183 }
4184 /*
4185    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4186    Modified from MatSeqAIJCheckInode().
4187 
4188    Input Parameters:
4189 .  Mat A - ILU or LU matrix factor
4190 
4191 */
4192 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4193 {
4194   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4195   PetscErrorCode ierr;
4196   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4197   PetscInt       *cols1,*cols2,*ns;
4198   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4199   PetscBool      flag;
4200 
4201   PetscFunctionBegin;
4202   if (!a->inode.use)    PetscFunctionReturn(0);
4203   if (a->inode.checked) PetscFunctionReturn(0);
4204 
4205   m = A->rmap->n;
4206   if (a->inode.size) ns = a->inode.size;
4207   else {
4208     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4209   }
4210 
4211   i          = 0;
4212   node_count = 0;
4213   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4214   while (i < m) {                /* For each row */
4215     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4216     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4217     nzx  = nzl1 + nzu1 + 1;
4218     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4219 
4220     /* Limits the number of elements in a node to 'a->inode.limit' */
4221     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4222       nzl2 = ai[j+1] - ai[j];
4223       nzu2 = adiag[j] - adiag[j+1] - 1;
4224       nzy  = nzl2 + nzu2 + 1;
4225       if (nzy != nzx) break;
4226       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4227       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4228       if (!flag) break;
4229     }
4230     ns[node_count++] = blk_size;
4231     i                = j;
4232   }
4233   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4234   /* If not enough inodes found,, do not use inode version of the routines */
4235   if (!m || node_count > .8*m) {
4236     ierr = PetscFree(ns);CHKERRQ(ierr);
4237 
4238     a->inode.node_count = 0;
4239     a->inode.size       = NULL;
4240     a->inode.use        = PETSC_FALSE;
4241 
4242     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4243   } else {
4244     A->ops->mult              = 0;
4245     A->ops->sor               = 0;
4246     A->ops->multadd           = 0;
4247     A->ops->getrowij          = 0;
4248     A->ops->restorerowij      = 0;
4249     A->ops->getcolumnij       = 0;
4250     A->ops->restorecolumnij   = 0;
4251     A->ops->coloringpatch     = 0;
4252     A->ops->multdiagonalblock = 0;
4253     a->inode.node_count       = node_count;
4254     a->inode.size             = ns;
4255 
4256     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4257   }
4258   a->inode.checked = PETSC_TRUE;
4259   PetscFunctionReturn(0);
4260 }
4261 
4262 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4263 {
4264   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4265 
4266   PetscFunctionBegin;
4267   a->inode.ibdiagvalid = PETSC_FALSE;
4268   PetscFunctionReturn(0);
4269 }
4270 
4271 /*
4272      This is really ugly. if inodes are used this replaces the
4273   permutations with ones that correspond to rows/cols of the matrix
4274   rather then inode blocks
4275 */
4276 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4277 {
4278   PetscErrorCode ierr;
4279 
4280   PetscFunctionBegin;
4281   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4282   PetscFunctionReturn(0);
4283 }
4284 
4285 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4286 {
4287   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4288   PetscErrorCode ierr;
4289   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4290   const PetscInt *ridx,*cidx;
4291   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4292   PetscInt       nslim_col,*ns_col;
4293   IS             ris = *rperm,cis = *cperm;
4294 
4295   PetscFunctionBegin;
4296   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4297   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4298 
4299   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4300   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4301   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4302 
4303   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4304   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4305 
4306   /* Form the inode structure for the rows of permuted matric using inv perm*/
4307   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4308 
4309   /* Construct the permutations for rows*/
4310   for (i=0,row = 0; i<nslim_row; ++i) {
4311     indx      = ridx[i];
4312     start_val = tns[indx];
4313     end_val   = tns[indx + 1];
4314     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4315   }
4316 
4317   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4318   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4319 
4320   /* Construct permutations for columns */
4321   for (i=0,col=0; i<nslim_col; ++i) {
4322     indx      = cidx[i];
4323     start_val = tns[indx];
4324     end_val   = tns[indx + 1];
4325     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4326   }
4327 
4328   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4329   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4330   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4331   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4332 
4333   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4334   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4335 
4336   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4337   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4338   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4339   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4340   ierr = PetscFree(tns);CHKERRQ(ierr);
4341   PetscFunctionReturn(0);
4342 }
4343 
4344 /*@C
4345    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4346 
4347    Not Collective
4348 
4349    Input Parameter:
4350 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4351 
4352    Output Parameter:
4353 +  node_count - no of inodes present in the matrix.
4354 .  sizes      - an array of size node_count,with sizes of each inode.
4355 -  limit      - the max size used to generate the inodes.
4356 
4357    Level: advanced
4358 
4359    Notes: This routine returns some internal storage information
4360    of the matrix, it is intended to be used by advanced users.
4361    It should be called after the matrix is assembled.
4362    The contents of the sizes[] array should not be changed.
4363    NULL may be passed for information not requested.
4364 
4365 .keywords: matrix, seqaij, get, inode
4366 
4367 .seealso: MatGetInfo()
4368 @*/
4369 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4370 {
4371   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4372 
4373   PetscFunctionBegin;
4374   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4375   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4376   if (f) {
4377     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4378   }
4379   PetscFunctionReturn(0);
4380 }
4381 
4382 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4383 {
4384   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4385 
4386   PetscFunctionBegin;
4387   if (node_count) *node_count = a->inode.node_count;
4388   if (sizes)      *sizes      = a->inode.size;
4389   if (limit)      *limit      = a->inode.limit;
4390   PetscFunctionReturn(0);
4391 }
4392