xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 487a658c8b32ba712a1dc8280daad2fd70c1dcd9)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9 {
10   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
11   PetscErrorCode ierr;
12   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
13 
14   PetscFunctionBegin;
15   n      = A->cmap->n;
16   m      = A->rmap->n;
17   ns_row = a->inode.size;
18 
19   min_mn = (m < n) ? m : n;
20   if (!ns) {
21     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
22     for (; count+1 < n; count++,i++) ;
23     if (count < n)  {
24       i++;
25     }
26     *size = i;
27     PetscFunctionReturn(0);
28   }
29   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
30 
31   /* Use the same row structure wherever feasible. */
32   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
33     ns_col[i] = ns_row[i];
34   }
35 
36   /* if m < n; pad up the remainder with inode_limit */
37   for (; count+1 < n; count++,i++) {
38     ns_col[i] = 1;
39   }
40   /* The last node is the odd ball. padd it up with the remaining rows; */
41   if (count < n) {
42     ns_col[i] = n - count;
43     i++;
44   } else if (count > n) {
45     /* Adjust for the over estimation */
46     ns_col[i-1] += n - count;
47   }
48   *size = i;
49   *ns   = ns_col;
50   PetscFunctionReturn(0);
51 }
52 
53 
54 /*
55       This builds symmetric version of nonzero structure,
56 */
57 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
58 {
59   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
60   PetscErrorCode ierr;
61   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
62   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
63   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
64 
65   PetscFunctionBegin;
66   nslim_row = a->inode.node_count;
67   m         = A->rmap->n;
68   n         = A->cmap->n;
69   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
70 
71   /* Use the row_inode as column_inode */
72   nslim_col = nslim_row;
73   ns_col    = ns_row;
74 
75   /* allocate space for reformated inode structure */
76   ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr);
77   ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr);
78   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
79 
80   for (i1=0,col=0; i1<nslim_col; ++i1) {
81     nsz = ns_col[i1];
82     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
83   }
84   /* allocate space for row pointers */
85   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
86   *iia = ia;
87   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
88   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
89 
90   /* determine the number of columns in each row */
91   ia[0] = oshift;
92   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
93 
94     j    = aj + ai[row] + ishift;
95     jmax = aj + ai[row+1] + ishift;
96     if (j==jmax) continue; /* empty row */
97     col  = *j++ + ishift;
98     i2   = tvc[col];
99     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
100       ia[i1+1]++;
101       ia[i2+1]++;
102       i2++;                     /* Start col of next node */
103       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
104       i2 = tvc[col];
105     }
106     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
107   }
108 
109   /* shift ia[i] to point to next row */
110   for (i1=1; i1<nslim_row+1; i1++) {
111     row        = ia[i1-1];
112     ia[i1]    += row;
113     work[i1-1] = row - oshift;
114   }
115 
116   /* allocate space for column pointers */
117   nz   = ia[nslim_row] + (!ishift);
118   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
119   *jja = ja;
120 
121   /* loop over lower triangular part putting into ja */
122   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
123     j    = aj + ai[row] + ishift;
124     jmax = aj + ai[row+1] + ishift;
125     if (j==jmax) continue; /* empty row */
126     col  = *j++ + ishift;
127     i2   = tvc[col];
128     while (i2<i1 && j<jmax) {
129       ja[work[i2]++] = i1 + oshift;
130       ja[work[i1]++] = i2 + oshift;
131       ++i2;
132       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
133       i2 = tvc[col];
134     }
135     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
136 
137   }
138   ierr = PetscFree(work);CHKERRQ(ierr);
139   ierr = PetscFree(tns);CHKERRQ(ierr);
140   ierr = PetscFree(tvc);CHKERRQ(ierr);
141   PetscFunctionReturn(0);
142 }
143 
144 /*
145       This builds nonsymmetric version of nonzero structure,
146 */
147 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
148 {
149   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
150   PetscErrorCode ierr;
151   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
152   PetscInt       *tns,*tvc,nsz,i1,i2;
153   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
154 
155   PetscFunctionBegin;
156   nslim_row = a->inode.node_count;
157   n         = A->cmap->n;
158 
159   /* Create The column_inode for this matrix */
160   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
161 
162   /* allocate space for reformated column_inode structure */
163   ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr);
164   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
165   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
166 
167   for (i1=0,col=0; i1<nslim_col; ++i1) {
168     nsz = ns_col[i1];
169     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
170   }
171   /* allocate space for row pointers */
172   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
173   *iia = ia;
174   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
175   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
176 
177   /* determine the number of columns in each row */
178   ia[0] = oshift;
179   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
180     j   = aj + ai[row] + ishift;
181     nz  = ai[row+1] - ai[row];
182     if (!nz) continue; /* empty row */
183     col = *j++ + ishift;
184     i2  = tvc[col];
185     while (nz-- > 0) {           /* off-diagonal elemets */
186       ia[i1+1]++;
187       i2++;                     /* Start col of next node */
188       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
189       if (nz > 0) i2 = tvc[col];
190     }
191   }
192 
193   /* shift ia[i] to point to next row */
194   for (i1=1; i1<nslim_row+1; i1++) {
195     row        = ia[i1-1];
196     ia[i1]    += row;
197     work[i1-1] = row - oshift;
198   }
199 
200   /* allocate space for column pointers */
201   nz   = ia[nslim_row] + (!ishift);
202   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
203   *jja = ja;
204 
205   /* loop over matrix putting into ja */
206   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
207     j   = aj + ai[row] + ishift;
208     nz  = ai[row+1] - ai[row];
209     if (!nz) continue; /* empty row */
210     col = *j++ + ishift;
211     i2  = tvc[col];
212     while (nz-- > 0) {
213       ja[work[i1]++] = i2 + oshift;
214       ++i2;
215       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
216       if (nz > 0) i2 = tvc[col];
217     }
218   }
219   ierr = PetscFree(ns_col);CHKERRQ(ierr);
220   ierr = PetscFree(work);CHKERRQ(ierr);
221   ierr = PetscFree(tns);CHKERRQ(ierr);
222   ierr = PetscFree(tvc);CHKERRQ(ierr);
223   PetscFunctionReturn(0);
224 }
225 
226 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
227 {
228   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
229   PetscErrorCode ierr;
230 
231   PetscFunctionBegin;
232   *n = a->inode.node_count;
233   if (!ia) PetscFunctionReturn(0);
234   if (!blockcompressed) {
235     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
236   } else if (symmetric) {
237     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
238   } else {
239     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
240   }
241   PetscFunctionReturn(0);
242 }
243 
244 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
245 {
246   PetscErrorCode ierr;
247 
248   PetscFunctionBegin;
249   if (!ia) PetscFunctionReturn(0);
250 
251   if (!blockcompressed) {
252     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
253   } else {
254     ierr = PetscFree(*ia);CHKERRQ(ierr);
255     ierr = PetscFree(*ja);CHKERRQ(ierr);
256   }
257   PetscFunctionReturn(0);
258 }
259 
260 /* ----------------------------------------------------------- */
261 
262 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
263 {
264   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
265   PetscErrorCode ierr;
266   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
267   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
268 
269   PetscFunctionBegin;
270   nslim_row = a->inode.node_count;
271   n         = A->cmap->n;
272 
273   /* Create The column_inode for this matrix */
274   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
275 
276   /* allocate space for reformated column_inode structure */
277   ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr);
278   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
279   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
280 
281   for (i1=0,col=0; i1<nslim_col; ++i1) {
282     nsz = ns_col[i1];
283     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
284   }
285   /* allocate space for column pointers */
286   ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
287   *iia = ia;
288   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
289   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
290 
291   /* determine the number of columns in each row */
292   ia[0] = oshift;
293   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
294     j   = aj + ai[row] + ishift;
295     col = *j++ + ishift;
296     i2  = tvc[col];
297     nz  = ai[row+1] - ai[row];
298     while (nz-- > 0) {           /* off-diagonal elemets */
299       /* ia[i1+1]++; */
300       ia[i2+1]++;
301       i2++;
302       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
303       if (nz > 0) i2 = tvc[col];
304     }
305   }
306 
307   /* shift ia[i] to point to next col */
308   for (i1=1; i1<nslim_col+1; i1++) {
309     col        = ia[i1-1];
310     ia[i1]    += col;
311     work[i1-1] = col - oshift;
312   }
313 
314   /* allocate space for column pointers */
315   nz   = ia[nslim_col] + (!ishift);
316   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
317   *jja = ja;
318 
319   /* loop over matrix putting into ja */
320   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
321     j   = aj + ai[row] + ishift;
322     col = *j++ + ishift;
323     i2  = tvc[col];
324     nz  = ai[row+1] - ai[row];
325     while (nz-- > 0) {
326       /* ja[work[i1]++] = i2 + oshift; */
327       ja[work[i2]++] = i1 + oshift;
328       i2++;
329       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
330       if (nz > 0) i2 = tvc[col];
331     }
332   }
333   ierr = PetscFree(ns_col);CHKERRQ(ierr);
334   ierr = PetscFree(work);CHKERRQ(ierr);
335   ierr = PetscFree(tns);CHKERRQ(ierr);
336   ierr = PetscFree(tvc);CHKERRQ(ierr);
337   PetscFunctionReturn(0);
338 }
339 
340 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
341 {
342   PetscErrorCode ierr;
343 
344   PetscFunctionBegin;
345   ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr);
346   if (!ia) PetscFunctionReturn(0);
347 
348   if (!blockcompressed) {
349     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
350   } else if (symmetric) {
351     /* Since the indices are symmetric it does'nt matter */
352     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
353   } else {
354     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
355   }
356   PetscFunctionReturn(0);
357 }
358 
359 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
360 {
361   PetscErrorCode ierr;
362 
363   PetscFunctionBegin;
364   if (!ia) PetscFunctionReturn(0);
365   if (!blockcompressed) {
366     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else {
368     ierr = PetscFree(*ia);CHKERRQ(ierr);
369     ierr = PetscFree(*ja);CHKERRQ(ierr);
370   }
371   PetscFunctionReturn(0);
372 }
373 
374 /* ----------------------------------------------------------- */
375 
376 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
377 {
378   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
379   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
380   PetscScalar       *y;
381   const PetscScalar *x;
382   const MatScalar   *v1,*v2,*v3,*v4,*v5;
383   PetscErrorCode    ierr;
384   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
385   const PetscInt    *idx,*ns,*ii;
386 
387 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
388 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
389 #endif
390 
391   PetscFunctionBegin;
392   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
393   node_max = a->inode.node_count;
394   ns       = a->inode.size;     /* Node Size array */
395   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
396   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
397   idx      = a->j;
398   v1       = a->a;
399   ii       = a->i;
400 
401   for (i = 0,row = 0; i< node_max; ++i) {
402     nsz         = ns[i];
403     n           = ii[1] - ii[0];
404     nonzerorow += (n>0)*nsz;
405     ii         += nsz;
406     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
407     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
408     sz = n;                     /* No of non zeros in this row */
409                                 /* Switch on the size of Node */
410     switch (nsz) {               /* Each loop in 'case' is unrolled */
411     case 1:
412       sum1 = 0.;
413 
414       for (n = 0; n< sz-1; n+=2) {
415         i1    = idx[0];         /* The instructions are ordered to */
416         i2    = idx[1];         /* make the compiler's job easy */
417         idx  += 2;
418         tmp0  = x[i1];
419         tmp1  = x[i2];
420         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
421       }
422 
423       if (n == sz-1) {          /* Take care of the last nonzero  */
424         tmp0  = x[*idx++];
425         sum1 += *v1++ *tmp0;
426       }
427       y[row++]=sum1;
428       break;
429     case 2:
430       sum1 = 0.;
431       sum2 = 0.;
432       v2   = v1 + n;
433 
434       for (n = 0; n< sz-1; n+=2) {
435         i1    = idx[0];
436         i2    = idx[1];
437         idx  += 2;
438         tmp0  = x[i1];
439         tmp1  = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
442       }
443       if (n == sz-1) {
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446         sum2 += *v2++ * tmp0;
447       }
448       y[row++]=sum1;
449       y[row++]=sum2;
450       v1      =v2;              /* Since the next block to be processed starts there*/
451       idx    +=sz;
452       break;
453     case 3:
454       sum1 = 0.;
455       sum2 = 0.;
456       sum3 = 0.;
457       v2   = v1 + n;
458       v3   = v2 + n;
459 
460       for (n = 0; n< sz-1; n+=2) {
461         i1    = idx[0];
462         i2    = idx[1];
463         idx  += 2;
464         tmp0  = x[i1];
465         tmp1  = x[i2];
466         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
467         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
468         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
469       }
470       if (n == sz-1) {
471         tmp0  = x[*idx++];
472         sum1 += *v1++ * tmp0;
473         sum2 += *v2++ * tmp0;
474         sum3 += *v3++ * tmp0;
475       }
476       y[row++]=sum1;
477       y[row++]=sum2;
478       y[row++]=sum3;
479       v1      =v3;              /* Since the next block to be processed starts there*/
480       idx    +=2*sz;
481       break;
482     case 4:
483       sum1 = 0.;
484       sum2 = 0.;
485       sum3 = 0.;
486       sum4 = 0.;
487       v2   = v1 + n;
488       v3   = v2 + n;
489       v4   = v3 + n;
490 
491       for (n = 0; n< sz-1; n+=2) {
492         i1    = idx[0];
493         i2    = idx[1];
494         idx  += 2;
495         tmp0  = x[i1];
496         tmp1  = x[i2];
497         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
498         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
499         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
500         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
501       }
502       if (n == sz-1) {
503         tmp0  = x[*idx++];
504         sum1 += *v1++ * tmp0;
505         sum2 += *v2++ * tmp0;
506         sum3 += *v3++ * tmp0;
507         sum4 += *v4++ * tmp0;
508       }
509       y[row++]=sum1;
510       y[row++]=sum2;
511       y[row++]=sum3;
512       y[row++]=sum4;
513       v1      =v4;              /* Since the next block to be processed starts there*/
514       idx    +=3*sz;
515       break;
516     case 5:
517       sum1 = 0.;
518       sum2 = 0.;
519       sum3 = 0.;
520       sum4 = 0.;
521       sum5 = 0.;
522       v2   = v1 + n;
523       v3   = v2 + n;
524       v4   = v3 + n;
525       v5   = v4 + n;
526 
527       for (n = 0; n<sz-1; n+=2) {
528         i1    = idx[0];
529         i2    = idx[1];
530         idx  += 2;
531         tmp0  = x[i1];
532         tmp1  = x[i2];
533         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
534         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
535         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
536         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
537         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
538       }
539       if (n == sz-1) {
540         tmp0  = x[*idx++];
541         sum1 += *v1++ * tmp0;
542         sum2 += *v2++ * tmp0;
543         sum3 += *v3++ * tmp0;
544         sum4 += *v4++ * tmp0;
545         sum5 += *v5++ * tmp0;
546       }
547       y[row++]=sum1;
548       y[row++]=sum2;
549       y[row++]=sum3;
550       y[row++]=sum4;
551       y[row++]=sum5;
552       v1      =v5;       /* Since the next block to be processed starts there */
553       idx    +=4*sz;
554       break;
555     default:
556       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
557     }
558   }
559   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
560   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
561   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
562   PetscFunctionReturn(0);
563 }
564 /* ----------------------------------------------------------- */
565 /* Almost same code as the MatMult_SeqAIJ_Inode() */
566 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
567 {
568   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
569   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
570   const MatScalar   *v1,*v2,*v3,*v4,*v5;
571   const PetscScalar *x;
572   PetscScalar       *y,*z,*zt;
573   PetscErrorCode    ierr;
574   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
575   const PetscInt    *idx,*ns,*ii;
576 
577   PetscFunctionBegin;
578   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
579   node_max = a->inode.node_count;
580   ns       = a->inode.size;     /* Node Size array */
581 
582   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
583   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
584   zt = z;
585 
586   idx = a->j;
587   v1  = a->a;
588   ii  = a->i;
589 
590   for (i = 0,row = 0; i< node_max; ++i) {
591     nsz = ns[i];
592     n   = ii[1] - ii[0];
593     ii += nsz;
594     sz  = n;                    /* No of non zeros in this row */
595                                 /* Switch on the size of Node */
596     switch (nsz) {               /* Each loop in 'case' is unrolled */
597     case 1:
598       sum1 = *zt++;
599 
600       for (n = 0; n< sz-1; n+=2) {
601         i1    = idx[0];         /* The instructions are ordered to */
602         i2    = idx[1];         /* make the compiler's job easy */
603         idx  += 2;
604         tmp0  = x[i1];
605         tmp1  = x[i2];
606         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
607       }
608 
609       if (n   == sz-1) {          /* Take care of the last nonzero  */
610         tmp0  = x[*idx++];
611         sum1 += *v1++ * tmp0;
612       }
613       y[row++]=sum1;
614       break;
615     case 2:
616       sum1 = *zt++;
617       sum2 = *zt++;
618       v2   = v1 + n;
619 
620       for (n = 0; n< sz-1; n+=2) {
621         i1    = idx[0];
622         i2    = idx[1];
623         idx  += 2;
624         tmp0  = x[i1];
625         tmp1  = x[i2];
626         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
627         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
628       }
629       if (n   == sz-1) {
630         tmp0  = x[*idx++];
631         sum1 += *v1++ * tmp0;
632         sum2 += *v2++ * tmp0;
633       }
634       y[row++]=sum1;
635       y[row++]=sum2;
636       v1      =v2;              /* Since the next block to be processed starts there*/
637       idx    +=sz;
638       break;
639     case 3:
640       sum1 = *zt++;
641       sum2 = *zt++;
642       sum3 = *zt++;
643       v2   = v1 + n;
644       v3   = v2 + n;
645 
646       for (n = 0; n< sz-1; n+=2) {
647         i1    = idx[0];
648         i2    = idx[1];
649         idx  += 2;
650         tmp0  = x[i1];
651         tmp1  = x[i2];
652         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
653         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
654         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
655       }
656       if (n == sz-1) {
657         tmp0  = x[*idx++];
658         sum1 += *v1++ * tmp0;
659         sum2 += *v2++ * tmp0;
660         sum3 += *v3++ * tmp0;
661       }
662       y[row++]=sum1;
663       y[row++]=sum2;
664       y[row++]=sum3;
665       v1      =v3;              /* Since the next block to be processed starts there*/
666       idx    +=2*sz;
667       break;
668     case 4:
669       sum1 = *zt++;
670       sum2 = *zt++;
671       sum3 = *zt++;
672       sum4 = *zt++;
673       v2   = v1 + n;
674       v3   = v2 + n;
675       v4   = v3 + n;
676 
677       for (n = 0; n< sz-1; n+=2) {
678         i1    = idx[0];
679         i2    = idx[1];
680         idx  += 2;
681         tmp0  = x[i1];
682         tmp1  = x[i2];
683         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
684         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
685         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
686         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
687       }
688       if (n == sz-1) {
689         tmp0  = x[*idx++];
690         sum1 += *v1++ * tmp0;
691         sum2 += *v2++ * tmp0;
692         sum3 += *v3++ * tmp0;
693         sum4 += *v4++ * tmp0;
694       }
695       y[row++]=sum1;
696       y[row++]=sum2;
697       y[row++]=sum3;
698       y[row++]=sum4;
699       v1      =v4;              /* Since the next block to be processed starts there*/
700       idx    +=3*sz;
701       break;
702     case 5:
703       sum1 = *zt++;
704       sum2 = *zt++;
705       sum3 = *zt++;
706       sum4 = *zt++;
707       sum5 = *zt++;
708       v2   = v1 + n;
709       v3   = v2 + n;
710       v4   = v3 + n;
711       v5   = v4 + n;
712 
713       for (n = 0; n<sz-1; n+=2) {
714         i1    = idx[0];
715         i2    = idx[1];
716         idx  += 2;
717         tmp0  = x[i1];
718         tmp1  = x[i2];
719         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
720         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
721         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
722         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
723         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
724       }
725       if (n   == sz-1) {
726         tmp0  = x[*idx++];
727         sum1 += *v1++ * tmp0;
728         sum2 += *v2++ * tmp0;
729         sum3 += *v3++ * tmp0;
730         sum4 += *v4++ * tmp0;
731         sum5 += *v5++ * tmp0;
732       }
733       y[row++]=sum1;
734       y[row++]=sum2;
735       y[row++]=sum3;
736       y[row++]=sum4;
737       y[row++]=sum5;
738       v1      =v5;       /* Since the next block to be processed starts there */
739       idx    +=4*sz;
740       break;
741     default:
742       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
743     }
744   }
745   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
746   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
747   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
748   PetscFunctionReturn(0);
749 }
750 
751 /* ----------------------------------------------------------- */
752 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
753 {
754   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
755   IS                iscol = a->col,isrow = a->row;
756   PetscErrorCode    ierr;
757   const PetscInt    *r,*c,*rout,*cout;
758   PetscInt          i,j,n = A->rmap->n,nz;
759   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
760   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
761   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
762   PetscScalar       sum1,sum2,sum3,sum4,sum5;
763   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
764   const PetscScalar *b;
765 
766   PetscFunctionBegin;
767   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
768   node_max = a->inode.node_count;
769   ns       = a->inode.size;     /* Node Size array */
770 
771   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
772   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
773   tmp  = a->solve_work;
774 
775   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
776   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
777 
778   /* forward solve the lower triangular */
779   tmps = tmp;
780   aa   = a_a;
781   aj   = a_j;
782   ad   = a->diag;
783 
784   for (i = 0,row = 0; i< node_max; ++i) {
785     nsz = ns[i];
786     aii = ai[row];
787     v1  = aa + aii;
788     vi  = aj + aii;
789     nz  = ad[row]- aii;
790     if (i < node_max-1) {
791       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
792       * but our indexing to determine it's size could. */
793       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
794       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
795       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
796       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
797     }
798 
799     switch (nsz) {               /* Each loop in 'case' is unrolled */
800     case 1:
801       sum1 = b[*r++];
802       for (j=0; j<nz-1; j+=2) {
803         i0    = vi[0];
804         i1    = vi[1];
805         vi   +=2;
806         tmp0  = tmps[i0];
807         tmp1  = tmps[i1];
808         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
809       }
810       if (j == nz-1) {
811         tmp0  = tmps[*vi++];
812         sum1 -= *v1++ *tmp0;
813       }
814       tmp[row++]=sum1;
815       break;
816     case 2:
817       sum1 = b[*r++];
818       sum2 = b[*r++];
819       v2   = aa + ai[row+1];
820 
821       for (j=0; j<nz-1; j+=2) {
822         i0    = vi[0];
823         i1    = vi[1];
824         vi   +=2;
825         tmp0  = tmps[i0];
826         tmp1  = tmps[i1];
827         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
829       }
830       if (j == nz-1) {
831         tmp0  = tmps[*vi++];
832         sum1 -= *v1++ *tmp0;
833         sum2 -= *v2++ *tmp0;
834       }
835       sum2     -= *v2++ *sum1;
836       tmp[row++]=sum1;
837       tmp[row++]=sum2;
838       break;
839     case 3:
840       sum1 = b[*r++];
841       sum2 = b[*r++];
842       sum3 = b[*r++];
843       v2   = aa + ai[row+1];
844       v3   = aa + ai[row+2];
845 
846       for (j=0; j<nz-1; j+=2) {
847         i0    = vi[0];
848         i1    = vi[1];
849         vi   +=2;
850         tmp0  = tmps[i0];
851         tmp1  = tmps[i1];
852         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
853         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
854         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
855       }
856       if (j == nz-1) {
857         tmp0  = tmps[*vi++];
858         sum1 -= *v1++ *tmp0;
859         sum2 -= *v2++ *tmp0;
860         sum3 -= *v3++ *tmp0;
861       }
862       sum2 -= *v2++ * sum1;
863       sum3 -= *v3++ * sum1;
864       sum3 -= *v3++ * sum2;
865 
866       tmp[row++]=sum1;
867       tmp[row++]=sum2;
868       tmp[row++]=sum3;
869       break;
870 
871     case 4:
872       sum1 = b[*r++];
873       sum2 = b[*r++];
874       sum3 = b[*r++];
875       sum4 = b[*r++];
876       v2   = aa + ai[row+1];
877       v3   = aa + ai[row+2];
878       v4   = aa + ai[row+3];
879 
880       for (j=0; j<nz-1; j+=2) {
881         i0    = vi[0];
882         i1    = vi[1];
883         vi   +=2;
884         tmp0  = tmps[i0];
885         tmp1  = tmps[i1];
886         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
887         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
888         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
889         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
890       }
891       if (j == nz-1) {
892         tmp0  = tmps[*vi++];
893         sum1 -= *v1++ *tmp0;
894         sum2 -= *v2++ *tmp0;
895         sum3 -= *v3++ *tmp0;
896         sum4 -= *v4++ *tmp0;
897       }
898       sum2 -= *v2++ * sum1;
899       sum3 -= *v3++ * sum1;
900       sum4 -= *v4++ * sum1;
901       sum3 -= *v3++ * sum2;
902       sum4 -= *v4++ * sum2;
903       sum4 -= *v4++ * sum3;
904 
905       tmp[row++]=sum1;
906       tmp[row++]=sum2;
907       tmp[row++]=sum3;
908       tmp[row++]=sum4;
909       break;
910     case 5:
911       sum1 = b[*r++];
912       sum2 = b[*r++];
913       sum3 = b[*r++];
914       sum4 = b[*r++];
915       sum5 = b[*r++];
916       v2   = aa + ai[row+1];
917       v3   = aa + ai[row+2];
918       v4   = aa + ai[row+3];
919       v5   = aa + ai[row+4];
920 
921       for (j=0; j<nz-1; j+=2) {
922         i0    = vi[0];
923         i1    = vi[1];
924         vi   +=2;
925         tmp0  = tmps[i0];
926         tmp1  = tmps[i1];
927         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
928         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
929         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
930         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
931         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
932       }
933       if (j == nz-1) {
934         tmp0  = tmps[*vi++];
935         sum1 -= *v1++ *tmp0;
936         sum2 -= *v2++ *tmp0;
937         sum3 -= *v3++ *tmp0;
938         sum4 -= *v4++ *tmp0;
939         sum5 -= *v5++ *tmp0;
940       }
941 
942       sum2 -= *v2++ * sum1;
943       sum3 -= *v3++ * sum1;
944       sum4 -= *v4++ * sum1;
945       sum5 -= *v5++ * sum1;
946       sum3 -= *v3++ * sum2;
947       sum4 -= *v4++ * sum2;
948       sum5 -= *v5++ * sum2;
949       sum4 -= *v4++ * sum3;
950       sum5 -= *v5++ * sum3;
951       sum5 -= *v5++ * sum4;
952 
953       tmp[row++]=sum1;
954       tmp[row++]=sum2;
955       tmp[row++]=sum3;
956       tmp[row++]=sum4;
957       tmp[row++]=sum5;
958       break;
959     default:
960       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
961     }
962   }
963   /* backward solve the upper triangular */
964   for (i=node_max -1,row = n-1; i>=0; i--) {
965     nsz = ns[i];
966     aii = ai[row+1] -1;
967     v1  = aa + aii;
968     vi  = aj + aii;
969     nz  = aii- ad[row];
970     switch (nsz) {               /* Each loop in 'case' is unrolled */
971     case 1:
972       sum1 = tmp[row];
973 
974       for (j=nz; j>1; j-=2) {
975         vi   -=2;
976         i0    = vi[2];
977         i1    = vi[1];
978         tmp0  = tmps[i0];
979         tmp1  = tmps[i1];
980         v1   -= 2;
981         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
982       }
983       if (j==1) {
984         tmp0  = tmps[*vi--];
985         sum1 -= *v1-- * tmp0;
986       }
987       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
988       break;
989     case 2:
990       sum1 = tmp[row];
991       sum2 = tmp[row -1];
992       v2   = aa + ai[row]-1;
993       for (j=nz; j>1; j-=2) {
994         vi   -=2;
995         i0    = vi[2];
996         i1    = vi[1];
997         tmp0  = tmps[i0];
998         tmp1  = tmps[i1];
999         v1   -= 2;
1000         v2   -= 2;
1001         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1002         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1003       }
1004       if (j==1) {
1005         tmp0  = tmps[*vi--];
1006         sum1 -= *v1-- * tmp0;
1007         sum2 -= *v2-- * tmp0;
1008       }
1009 
1010       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1011       sum2   -= *v2-- * tmp0;
1012       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1013       break;
1014     case 3:
1015       sum1 = tmp[row];
1016       sum2 = tmp[row -1];
1017       sum3 = tmp[row -2];
1018       v2   = aa + ai[row]-1;
1019       v3   = aa + ai[row -1]-1;
1020       for (j=nz; j>1; j-=2) {
1021         vi   -=2;
1022         i0    = vi[2];
1023         i1    = vi[1];
1024         tmp0  = tmps[i0];
1025         tmp1  = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         v3   -= 2;
1029         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1030         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1031         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1032       }
1033       if (j==1) {
1034         tmp0  = tmps[*vi--];
1035         sum1 -= *v1-- * tmp0;
1036         sum2 -= *v2-- * tmp0;
1037         sum3 -= *v3-- * tmp0;
1038       }
1039       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1040       sum2   -= *v2-- * tmp0;
1041       sum3   -= *v3-- * tmp0;
1042       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1043       sum3   -= *v3-- * tmp0;
1044       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1045 
1046       break;
1047     case 4:
1048       sum1 = tmp[row];
1049       sum2 = tmp[row -1];
1050       sum3 = tmp[row -2];
1051       sum4 = tmp[row -3];
1052       v2   = aa + ai[row]-1;
1053       v3   = aa + ai[row -1]-1;
1054       v4   = aa + ai[row -2]-1;
1055 
1056       for (j=nz; j>1; j-=2) {
1057         vi   -=2;
1058         i0    = vi[2];
1059         i1    = vi[1];
1060         tmp0  = tmps[i0];
1061         tmp1  = tmps[i1];
1062         v1   -= 2;
1063         v2   -= 2;
1064         v3   -= 2;
1065         v4   -= 2;
1066         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1067         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1068         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1069         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1070       }
1071       if (j==1) {
1072         tmp0  = tmps[*vi--];
1073         sum1 -= *v1-- * tmp0;
1074         sum2 -= *v2-- * tmp0;
1075         sum3 -= *v3-- * tmp0;
1076         sum4 -= *v4-- * tmp0;
1077       }
1078 
1079       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1080       sum2   -= *v2-- * tmp0;
1081       sum3   -= *v3-- * tmp0;
1082       sum4   -= *v4-- * tmp0;
1083       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1084       sum3   -= *v3-- * tmp0;
1085       sum4   -= *v4-- * tmp0;
1086       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1087       sum4   -= *v4-- * tmp0;
1088       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1089       break;
1090     case 5:
1091       sum1 = tmp[row];
1092       sum2 = tmp[row -1];
1093       sum3 = tmp[row -2];
1094       sum4 = tmp[row -3];
1095       sum5 = tmp[row -4];
1096       v2   = aa + ai[row]-1;
1097       v3   = aa + ai[row -1]-1;
1098       v4   = aa + ai[row -2]-1;
1099       v5   = aa + ai[row -3]-1;
1100       for (j=nz; j>1; j-=2) {
1101         vi   -= 2;
1102         i0    = vi[2];
1103         i1    = vi[1];
1104         tmp0  = tmps[i0];
1105         tmp1  = tmps[i1];
1106         v1   -= 2;
1107         v2   -= 2;
1108         v3   -= 2;
1109         v4   -= 2;
1110         v5   -= 2;
1111         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1112         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1113         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1114         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1115         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1116       }
1117       if (j==1) {
1118         tmp0  = tmps[*vi--];
1119         sum1 -= *v1-- * tmp0;
1120         sum2 -= *v2-- * tmp0;
1121         sum3 -= *v3-- * tmp0;
1122         sum4 -= *v4-- * tmp0;
1123         sum5 -= *v5-- * tmp0;
1124       }
1125 
1126       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1127       sum2   -= *v2-- * tmp0;
1128       sum3   -= *v3-- * tmp0;
1129       sum4   -= *v4-- * tmp0;
1130       sum5   -= *v5-- * tmp0;
1131       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1132       sum3   -= *v3-- * tmp0;
1133       sum4   -= *v4-- * tmp0;
1134       sum5   -= *v5-- * tmp0;
1135       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1136       sum4   -= *v4-- * tmp0;
1137       sum5   -= *v5-- * tmp0;
1138       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1139       sum5   -= *v5-- * tmp0;
1140       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1141       break;
1142     default:
1143       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1144     }
1145   }
1146   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1147   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1148   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1149   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1150   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1151   PetscFunctionReturn(0);
1152 }
1153 
1154 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1155 {
1156   Mat             C     =B;
1157   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1158   IS              isrow = b->row,isicol = b->icol;
1159   PetscErrorCode  ierr;
1160   const PetscInt  *r,*ic,*ics;
1161   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1162   PetscInt        i,j,k,nz,nzL,row,*pj;
1163   const PetscInt  *ajtmp,*bjtmp;
1164   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1165   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1166   FactorShiftCtx  sctx;
1167   const PetscInt  *ddiag;
1168   PetscReal       rs;
1169   MatScalar       d;
1170   PetscInt        inod,nodesz,node_max,col;
1171   const PetscInt  *ns;
1172   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1173 
1174   PetscFunctionBegin;
1175   /* MatPivotSetUp(): initialize shift context sctx */
1176   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1177 
1178   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1179     ddiag          = a->diag;
1180     sctx.shift_top = info->zeropivot;
1181     for (i=0; i<n; i++) {
1182       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1183       d  = (aa)[ddiag[i]];
1184       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1185       v  = aa+ai[i];
1186       nz = ai[i+1] - ai[i];
1187       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1188       if (rs>sctx.shift_top) sctx.shift_top = rs;
1189     }
1190     sctx.shift_top *= 1.1;
1191     sctx.nshift_max = 5;
1192     sctx.shift_lo   = 0.;
1193     sctx.shift_hi   = 1.;
1194   }
1195 
1196   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1197   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1198 
1199   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1200   ics   = ic;
1201 
1202   node_max = a->inode.node_count;
1203   ns       = a->inode.size;
1204   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1205 
1206   /* If max inode size > 4, split it into two inodes.*/
1207   /* also map the inode sizes according to the ordering */
1208   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1209   for (i=0,j=0; i<node_max; ++i,++j) {
1210     if (ns[i] > 4) {
1211       tmp_vec1[j] = 4;
1212       ++j;
1213       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1214     } else {
1215       tmp_vec1[j] = ns[i];
1216     }
1217   }
1218   /* Use the correct node_max */
1219   node_max = j;
1220 
1221   /* Now reorder the inode info based on mat re-ordering info */
1222   /* First create a row -> inode_size_array_index map */
1223   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1224   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1225   for (i=0,row=0; i<node_max; i++) {
1226     nodesz = tmp_vec1[i];
1227     for (j=0; j<nodesz; j++,row++) {
1228       nsmap[row] = i;
1229     }
1230   }
1231   /* Using nsmap, create a reordered ns structure */
1232   for (i=0,j=0; i< node_max; i++) {
1233     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1234     tmp_vec2[i] = nodesz;
1235     j          += nodesz;
1236   }
1237   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1238   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1239 
1240   /* Now use the correct ns */
1241   ns = tmp_vec2;
1242 
1243   do {
1244     sctx.newshift = PETSC_FALSE;
1245     /* Now loop over each block-row, and do the factorization */
1246     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1247       nodesz = ns[inod];
1248 
1249       switch (nodesz) {
1250       case 1:
1251         /*----------*/
1252         /* zero rtmp1 */
1253         /* L part */
1254         nz    = bi[i+1] - bi[i];
1255         bjtmp = bj + bi[i];
1256         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1257 
1258         /* U part */
1259         nz    = bdiag[i]-bdiag[i+1];
1260         bjtmp = bj + bdiag[i+1]+1;
1261         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1262 
1263         /* load in initial (unfactored row) */
1264         nz    = ai[r[i]+1] - ai[r[i]];
1265         ajtmp = aj + ai[r[i]];
1266         v     = aa + ai[r[i]];
1267         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1268 
1269         /* ZeropivotApply() */
1270         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1271 
1272         /* elimination */
1273         bjtmp = bj + bi[i];
1274         row   = *bjtmp++;
1275         nzL   = bi[i+1] - bi[i];
1276         for (k=0; k < nzL; k++) {
1277           pc = rtmp1 + row;
1278           if (*pc != 0.0) {
1279             pv   = b->a + bdiag[row];
1280             mul1 = *pc * (*pv);
1281             *pc  = mul1;
1282             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1283             pv   = b->a + bdiag[row+1]+1;
1284             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1285             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1286             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1287           }
1288           row = *bjtmp++;
1289         }
1290 
1291         /* finished row so stick it into b->a */
1292         rs = 0.0;
1293         /* L part */
1294         pv = b->a + bi[i];
1295         pj = b->j + bi[i];
1296         nz = bi[i+1] - bi[i];
1297         for (j=0; j<nz; j++) {
1298           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1299         }
1300 
1301         /* U part */
1302         pv = b->a + bdiag[i+1]+1;
1303         pj = b->j + bdiag[i+1]+1;
1304         nz = bdiag[i] - bdiag[i+1]-1;
1305         for (j=0; j<nz; j++) {
1306           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1307         }
1308 
1309         /* Check zero pivot */
1310         sctx.rs = rs;
1311         sctx.pv = rtmp1[i];
1312         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1313         if (sctx.newshift) break;
1314 
1315         /* Mark diagonal and invert diagonal for simplier triangular solves */
1316         pv  = b->a + bdiag[i];
1317         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1318         break;
1319 
1320       case 2:
1321         /*----------*/
1322         /* zero rtmp1 and rtmp2 */
1323         /* L part */
1324         nz    = bi[i+1] - bi[i];
1325         bjtmp = bj + bi[i];
1326         for  (j=0; j<nz; j++) {
1327           col        = bjtmp[j];
1328           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1329         }
1330 
1331         /* U part */
1332         nz    = bdiag[i]-bdiag[i+1];
1333         bjtmp = bj + bdiag[i+1]+1;
1334         for  (j=0; j<nz; j++) {
1335           col        = bjtmp[j];
1336           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1337         }
1338 
1339         /* load in initial (unfactored row) */
1340         nz    = ai[r[i]+1] - ai[r[i]];
1341         ajtmp = aj + ai[r[i]];
1342         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1343         for (j=0; j<nz; j++) {
1344           col        = ics[ajtmp[j]];
1345           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1346         }
1347         /* ZeropivotApply(): shift the diagonal of the matrix  */
1348         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1349 
1350         /* elimination */
1351         bjtmp = bj + bi[i];
1352         row   = *bjtmp++; /* pivot row */
1353         nzL   = bi[i+1] - bi[i];
1354         for (k=0; k < nzL; k++) {
1355           pc1 = rtmp1 + row;
1356           pc2 = rtmp2 + row;
1357           if (*pc1 != 0.0 || *pc2 != 0.0) {
1358             pv   = b->a + bdiag[row];
1359             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1360             *pc1 = mul1;       *pc2 = mul2;
1361 
1362             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1363             pv = b->a + bdiag[row+1]+1;
1364             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1365             for (j=0; j<nz; j++) {
1366               col         = pj[j];
1367               rtmp1[col] -= mul1 * pv[j];
1368               rtmp2[col] -= mul2 * pv[j];
1369             }
1370             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1371           }
1372           row = *bjtmp++;
1373         }
1374 
1375         /* finished row i; check zero pivot, then stick row i into b->a */
1376         rs = 0.0;
1377         /* L part */
1378         pc1 = b->a + bi[i];
1379         pj  = b->j + bi[i];
1380         nz  = bi[i+1] - bi[i];
1381         for (j=0; j<nz; j++) {
1382           col    = pj[j];
1383           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1384         }
1385         /* U part */
1386         pc1 = b->a + bdiag[i+1]+1;
1387         pj  = b->j + bdiag[i+1]+1;
1388         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1389         for (j=0; j<nz; j++) {
1390           col    = pj[j];
1391           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1392         }
1393 
1394         sctx.rs = rs;
1395         sctx.pv = rtmp1[i];
1396         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1397         if (sctx.newshift) break;
1398         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1399         *pc1 = 1.0/sctx.pv;
1400 
1401         /* Now take care of diagonal 2x2 block. */
1402         pc2 = rtmp2 + i;
1403         if (*pc2 != 0.0) {
1404           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1405           *pc2 = mul1;          /* insert L entry */
1406           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1407           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1408           for (j=0; j<nz; j++) {
1409             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1410           }
1411           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1412         }
1413 
1414         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1415         rs = 0.0;
1416         /* L part */
1417         pc2 = b->a + bi[i+1];
1418         pj  = b->j + bi[i+1];
1419         nz  = bi[i+2] - bi[i+1];
1420         for (j=0; j<nz; j++) {
1421           col    = pj[j];
1422           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1423         }
1424         /* U part */
1425         pc2 = b->a + bdiag[i+2]+1;
1426         pj  = b->j + bdiag[i+2]+1;
1427         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1428         for (j=0; j<nz; j++) {
1429           col    = pj[j];
1430           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1431         }
1432 
1433         sctx.rs = rs;
1434         sctx.pv = rtmp2[i+1];
1435         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1436         if (sctx.newshift) break;
1437         pc2  = b->a + bdiag[i+1];
1438         *pc2 = 1.0/sctx.pv;
1439         break;
1440 
1441       case 3:
1442         /*----------*/
1443         /* zero rtmp */
1444         /* L part */
1445         nz    = bi[i+1] - bi[i];
1446         bjtmp = bj + bi[i];
1447         for  (j=0; j<nz; j++) {
1448           col        = bjtmp[j];
1449           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1450         }
1451 
1452         /* U part */
1453         nz    = bdiag[i]-bdiag[i+1];
1454         bjtmp = bj + bdiag[i+1]+1;
1455         for  (j=0; j<nz; j++) {
1456           col        = bjtmp[j];
1457           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1458         }
1459 
1460         /* load in initial (unfactored row) */
1461         nz    = ai[r[i]+1] - ai[r[i]];
1462         ajtmp = aj + ai[r[i]];
1463         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1464         for (j=0; j<nz; j++) {
1465           col        = ics[ajtmp[j]];
1466           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1467         }
1468         /* ZeropivotApply(): shift the diagonal of the matrix  */
1469         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1470 
1471         /* elimination */
1472         bjtmp = bj + bi[i];
1473         row   = *bjtmp++; /* pivot row */
1474         nzL   = bi[i+1] - bi[i];
1475         for (k=0; k < nzL; k++) {
1476           pc1 = rtmp1 + row;
1477           pc2 = rtmp2 + row;
1478           pc3 = rtmp3 + row;
1479           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1480             pv   = b->a + bdiag[row];
1481             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1482             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1483 
1484             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1485             pv = b->a + bdiag[row+1]+1;
1486             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1487             for (j=0; j<nz; j++) {
1488               col         = pj[j];
1489               rtmp1[col] -= mul1 * pv[j];
1490               rtmp2[col] -= mul2 * pv[j];
1491               rtmp3[col] -= mul3 * pv[j];
1492             }
1493             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1494           }
1495           row = *bjtmp++;
1496         }
1497 
1498         /* finished row i; check zero pivot, then stick row i into b->a */
1499         rs = 0.0;
1500         /* L part */
1501         pc1 = b->a + bi[i];
1502         pj  = b->j + bi[i];
1503         nz  = bi[i+1] - bi[i];
1504         for (j=0; j<nz; j++) {
1505           col    = pj[j];
1506           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1507         }
1508         /* U part */
1509         pc1 = b->a + bdiag[i+1]+1;
1510         pj  = b->j + bdiag[i+1]+1;
1511         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1512         for (j=0; j<nz; j++) {
1513           col    = pj[j];
1514           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1515         }
1516 
1517         sctx.rs = rs;
1518         sctx.pv = rtmp1[i];
1519         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1520         if (sctx.newshift) break;
1521         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1522         *pc1 = 1.0/sctx.pv;
1523 
1524         /* Now take care of 1st column of diagonal 3x3 block. */
1525         pc2 = rtmp2 + i;
1526         pc3 = rtmp3 + i;
1527         if (*pc2 != 0.0 || *pc3 != 0.0) {
1528           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1529           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1530           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1531           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1532           for (j=0; j<nz; j++) {
1533             col         = pj[j];
1534             rtmp2[col] -= mul2 * rtmp1[col];
1535             rtmp3[col] -= mul3 * rtmp1[col];
1536           }
1537           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1538         }
1539 
1540         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1541         rs = 0.0;
1542         /* L part */
1543         pc2 = b->a + bi[i+1];
1544         pj  = b->j + bi[i+1];
1545         nz  = bi[i+2] - bi[i+1];
1546         for (j=0; j<nz; j++) {
1547           col    = pj[j];
1548           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1549         }
1550         /* U part */
1551         pc2 = b->a + bdiag[i+2]+1;
1552         pj  = b->j + bdiag[i+2]+1;
1553         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1554         for (j=0; j<nz; j++) {
1555           col    = pj[j];
1556           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1557         }
1558 
1559         sctx.rs = rs;
1560         sctx.pv = rtmp2[i+1];
1561         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1562         if (sctx.newshift) break;
1563         pc2  = b->a + bdiag[i+1];
1564         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1565 
1566         /* Now take care of 2nd column of diagonal 3x3 block. */
1567         pc3 = rtmp3 + i+1;
1568         if (*pc3 != 0.0) {
1569           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1570           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1571           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1572           for (j=0; j<nz; j++) {
1573             col         = pj[j];
1574             rtmp3[col] -= mul3 * rtmp2[col];
1575           }
1576           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1577         }
1578 
1579         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1580         rs = 0.0;
1581         /* L part */
1582         pc3 = b->a + bi[i+2];
1583         pj  = b->j + bi[i+2];
1584         nz  = bi[i+3] - bi[i+2];
1585         for (j=0; j<nz; j++) {
1586           col    = pj[j];
1587           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1588         }
1589         /* U part */
1590         pc3 = b->a + bdiag[i+3]+1;
1591         pj  = b->j + bdiag[i+3]+1;
1592         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1593         for (j=0; j<nz; j++) {
1594           col    = pj[j];
1595           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1596         }
1597 
1598         sctx.rs = rs;
1599         sctx.pv = rtmp3[i+2];
1600         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1601         if (sctx.newshift) break;
1602         pc3  = b->a + bdiag[i+2];
1603         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1604         break;
1605       case 4:
1606         /*----------*/
1607         /* zero rtmp */
1608         /* L part */
1609         nz    = bi[i+1] - bi[i];
1610         bjtmp = bj + bi[i];
1611         for  (j=0; j<nz; j++) {
1612           col        = bjtmp[j];
1613           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1614         }
1615 
1616         /* U part */
1617         nz    = bdiag[i]-bdiag[i+1];
1618         bjtmp = bj + bdiag[i+1]+1;
1619         for  (j=0; j<nz; j++) {
1620           col        = bjtmp[j];
1621           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1622         }
1623 
1624         /* load in initial (unfactored row) */
1625         nz    = ai[r[i]+1] - ai[r[i]];
1626         ajtmp = aj + ai[r[i]];
1627         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1628         for (j=0; j<nz; j++) {
1629           col        = ics[ajtmp[j]];
1630           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1631         }
1632         /* ZeropivotApply(): shift the diagonal of the matrix  */
1633         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1634 
1635         /* elimination */
1636         bjtmp = bj + bi[i];
1637         row   = *bjtmp++; /* pivot row */
1638         nzL   = bi[i+1] - bi[i];
1639         for (k=0; k < nzL; k++) {
1640           pc1 = rtmp1 + row;
1641           pc2 = rtmp2 + row;
1642           pc3 = rtmp3 + row;
1643           pc4 = rtmp4 + row;
1644           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1645             pv   = b->a + bdiag[row];
1646             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1647             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1648 
1649             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1650             pv = b->a + bdiag[row+1]+1;
1651             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1652             for (j=0; j<nz; j++) {
1653               col         = pj[j];
1654               rtmp1[col] -= mul1 * pv[j];
1655               rtmp2[col] -= mul2 * pv[j];
1656               rtmp3[col] -= mul3 * pv[j];
1657               rtmp4[col] -= mul4 * pv[j];
1658             }
1659             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1660           }
1661           row = *bjtmp++;
1662         }
1663 
1664         /* finished row i; check zero pivot, then stick row i into b->a */
1665         rs = 0.0;
1666         /* L part */
1667         pc1 = b->a + bi[i];
1668         pj  = b->j + bi[i];
1669         nz  = bi[i+1] - bi[i];
1670         for (j=0; j<nz; j++) {
1671           col    = pj[j];
1672           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1673         }
1674         /* U part */
1675         pc1 = b->a + bdiag[i+1]+1;
1676         pj  = b->j + bdiag[i+1]+1;
1677         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1678         for (j=0; j<nz; j++) {
1679           col    = pj[j];
1680           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1681         }
1682 
1683         sctx.rs = rs;
1684         sctx.pv = rtmp1[i];
1685         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1686         if (sctx.newshift) break;
1687         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1688         *pc1 = 1.0/sctx.pv;
1689 
1690         /* Now take care of 1st column of diagonal 4x4 block. */
1691         pc2 = rtmp2 + i;
1692         pc3 = rtmp3 + i;
1693         pc4 = rtmp4 + i;
1694         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1695           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1696           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1697           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1698           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1699           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1700           for (j=0; j<nz; j++) {
1701             col         = pj[j];
1702             rtmp2[col] -= mul2 * rtmp1[col];
1703             rtmp3[col] -= mul3 * rtmp1[col];
1704             rtmp4[col] -= mul4 * rtmp1[col];
1705           }
1706           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1707         }
1708 
1709         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1710         rs = 0.0;
1711         /* L part */
1712         pc2 = b->a + bi[i+1];
1713         pj  = b->j + bi[i+1];
1714         nz  = bi[i+2] - bi[i+1];
1715         for (j=0; j<nz; j++) {
1716           col    = pj[j];
1717           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1718         }
1719         /* U part */
1720         pc2 = b->a + bdiag[i+2]+1;
1721         pj  = b->j + bdiag[i+2]+1;
1722         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1723         for (j=0; j<nz; j++) {
1724           col    = pj[j];
1725           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1726         }
1727 
1728         sctx.rs = rs;
1729         sctx.pv = rtmp2[i+1];
1730         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1731         if (sctx.newshift) break;
1732         pc2  = b->a + bdiag[i+1];
1733         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1734 
1735         /* Now take care of 2nd column of diagonal 4x4 block. */
1736         pc3 = rtmp3 + i+1;
1737         pc4 = rtmp4 + i+1;
1738         if (*pc3 != 0.0 || *pc4 != 0.0) {
1739           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1740           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1741           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1742           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1743           for (j=0; j<nz; j++) {
1744             col         = pj[j];
1745             rtmp3[col] -= mul3 * rtmp2[col];
1746             rtmp4[col] -= mul4 * rtmp2[col];
1747           }
1748           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1749         }
1750 
1751         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1752         rs = 0.0;
1753         /* L part */
1754         pc3 = b->a + bi[i+2];
1755         pj  = b->j + bi[i+2];
1756         nz  = bi[i+3] - bi[i+2];
1757         for (j=0; j<nz; j++) {
1758           col    = pj[j];
1759           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1760         }
1761         /* U part */
1762         pc3 = b->a + bdiag[i+3]+1;
1763         pj  = b->j + bdiag[i+3]+1;
1764         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1765         for (j=0; j<nz; j++) {
1766           col    = pj[j];
1767           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1768         }
1769 
1770         sctx.rs = rs;
1771         sctx.pv = rtmp3[i+2];
1772         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1773         if (sctx.newshift) break;
1774         pc3  = b->a + bdiag[i+2];
1775         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1776 
1777         /* Now take care of 3rd column of diagonal 4x4 block. */
1778         pc4 = rtmp4 + i+2;
1779         if (*pc4 != 0.0) {
1780           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1781           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1782           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1783           for (j=0; j<nz; j++) {
1784             col         = pj[j];
1785             rtmp4[col] -= mul4 * rtmp3[col];
1786           }
1787           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1788         }
1789 
1790         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1791         rs = 0.0;
1792         /* L part */
1793         pc4 = b->a + bi[i+3];
1794         pj  = b->j + bi[i+3];
1795         nz  = bi[i+4] - bi[i+3];
1796         for (j=0; j<nz; j++) {
1797           col    = pj[j];
1798           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1799         }
1800         /* U part */
1801         pc4 = b->a + bdiag[i+4]+1;
1802         pj  = b->j + bdiag[i+4]+1;
1803         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1804         for (j=0; j<nz; j++) {
1805           col    = pj[j];
1806           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1807         }
1808 
1809         sctx.rs = rs;
1810         sctx.pv = rtmp4[i+3];
1811         ierr    = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1812         if (sctx.newshift) break;
1813         pc4  = b->a + bdiag[i+3];
1814         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1815         break;
1816 
1817       default:
1818         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1819       }
1820       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1821       i += nodesz;                 /* Update the row */
1822     }
1823 
1824     /* MatPivotRefine() */
1825     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1826       /*
1827        * if no shift in this attempt & shifting & started shifting & can refine,
1828        * then try lower shift
1829        */
1830       sctx.shift_hi       = sctx.shift_fraction;
1831       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1832       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1833       sctx.newshift       = PETSC_TRUE;
1834       sctx.nshift++;
1835     }
1836   } while (sctx.newshift);
1837 
1838   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1839   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1840   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1841   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1842 
1843   if (b->inode.size) {
1844     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1845   } else {
1846     C->ops->solve           = MatSolve_SeqAIJ;
1847   }
1848   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1849   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1850   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1851   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1852   C->assembled              = PETSC_TRUE;
1853   C->preallocated           = PETSC_TRUE;
1854 
1855   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1856 
1857   /* MatShiftView(A,info,&sctx) */
1858   if (sctx.nshift) {
1859     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1860       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1861     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1862       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1863     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1864       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1865     }
1866   }
1867   PetscFunctionReturn(0);
1868 }
1869 
1870 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1871 {
1872   Mat             C     = B;
1873   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1874   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1875   PetscErrorCode  ierr;
1876   const PetscInt  *r,*ic,*c,*ics;
1877   PetscInt        n   = A->rmap->n,*bi = b->i;
1878   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1879   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1880   PetscInt        *ai = a->i,*aj = a->j;
1881   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1882   PetscScalar     mul1,mul2,mul3,tmp;
1883   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1884   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1885   PetscReal       rs=0.0;
1886   FactorShiftCtx  sctx;
1887 
1888   PetscFunctionBegin;
1889   sctx.shift_top      = 0;
1890   sctx.nshift_max     = 0;
1891   sctx.shift_lo       = 0;
1892   sctx.shift_hi       = 0;
1893   sctx.shift_fraction = 0;
1894 
1895   /* if both shift schemes are chosen by user, only use info->shiftpd */
1896   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1897     sctx.shift_top = 0;
1898     for (i=0; i<n; i++) {
1899       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1900       rs    = 0.0;
1901       ajtmp = aj + ai[i];
1902       rtmp1 = aa + ai[i];
1903       nz    = ai[i+1] - ai[i];
1904       for (j=0; j<nz; j++) {
1905         if (*ajtmp != i) {
1906           rs += PetscAbsScalar(*rtmp1++);
1907         } else {
1908           rs -= PetscRealPart(*rtmp1++);
1909         }
1910         ajtmp++;
1911       }
1912       if (rs>sctx.shift_top) sctx.shift_top = rs;
1913     }
1914     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1915     sctx.shift_top *= 1.1;
1916     sctx.nshift_max = 5;
1917     sctx.shift_lo   = 0.;
1918     sctx.shift_hi   = 1.;
1919   }
1920   sctx.shift_amount = 0;
1921   sctx.nshift       = 0;
1922 
1923   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1924   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1925   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1926   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1927   ics    = ic;
1928 
1929   node_max = a->inode.node_count;
1930   ns       = a->inode.size;
1931   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1932 
1933   /* If max inode size > 3, split it into two inodes.*/
1934   /* also map the inode sizes according to the ordering */
1935   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1936   for (i=0,j=0; i<node_max; ++i,++j) {
1937     if (ns[i]>3) {
1938       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1939       ++j;
1940       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1941     } else {
1942       tmp_vec1[j] = ns[i];
1943     }
1944   }
1945   /* Use the correct node_max */
1946   node_max = j;
1947 
1948   /* Now reorder the inode info based on mat re-ordering info */
1949   /* First create a row -> inode_size_array_index map */
1950   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1951   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1952   for (i=0,row=0; i<node_max; i++) {
1953     nodesz = tmp_vec1[i];
1954     for (j=0; j<nodesz; j++,row++) {
1955       nsmap[row] = i;
1956     }
1957   }
1958   /* Using nsmap, create a reordered ns structure */
1959   for (i=0,j=0; i< node_max; i++) {
1960     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1961     tmp_vec2[i] = nodesz;
1962     j          += nodesz;
1963   }
1964   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1965   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1966   /* Now use the correct ns */
1967   ns = tmp_vec2;
1968 
1969   do {
1970     sctx.newshift = PETSC_FALSE;
1971     /* Now loop over each block-row, and do the factorization */
1972     for (i=0,row=0; i<node_max; i++) {
1973       nodesz = ns[i];
1974       nz     = bi[row+1] - bi[row];
1975       bjtmp  = bj + bi[row];
1976 
1977       switch (nodesz) {
1978       case 1:
1979         for  (j=0; j<nz; j++) {
1980           idx         = bjtmp[j];
1981           rtmp11[idx] = 0.0;
1982         }
1983 
1984         /* load in initial (unfactored row) */
1985         idx    = r[row];
1986         nz_tmp = ai[idx+1] - ai[idx];
1987         ajtmp  = aj + ai[idx];
1988         v1     = aa + ai[idx];
1989 
1990         for (j=0; j<nz_tmp; j++) {
1991           idx         = ics[ajtmp[j]];
1992           rtmp11[idx] = v1[j];
1993         }
1994         rtmp11[ics[r[row]]] += sctx.shift_amount;
1995 
1996         prow = *bjtmp++;
1997         while (prow < row) {
1998           pc1 = rtmp11 + prow;
1999           if (*pc1 != 0.0) {
2000             pv     = ba + bd[prow];
2001             pj     = nbj + bd[prow];
2002             mul1   = *pc1 * *pv++;
2003             *pc1   = mul1;
2004             nz_tmp = bi[prow+1] - bd[prow] - 1;
2005             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2006             for (j=0; j<nz_tmp; j++) {
2007               tmp          = pv[j];
2008               idx          = pj[j];
2009               rtmp11[idx] -= mul1 * tmp;
2010             }
2011           }
2012           prow = *bjtmp++;
2013         }
2014         pj  = bj + bi[row];
2015         pc1 = ba + bi[row];
2016 
2017         sctx.pv     = rtmp11[row];
2018         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2019         rs          = 0.0;
2020         for (j=0; j<nz; j++) {
2021           idx    = pj[j];
2022           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2023           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2024         }
2025         sctx.rs = rs;
2026         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2027         if (sctx.newshift) goto endofwhile;
2028         break;
2029 
2030       case 2:
2031         for (j=0; j<nz; j++) {
2032           idx         = bjtmp[j];
2033           rtmp11[idx] = 0.0;
2034           rtmp22[idx] = 0.0;
2035         }
2036 
2037         /* load in initial (unfactored row) */
2038         idx    = r[row];
2039         nz_tmp = ai[idx+1] - ai[idx];
2040         ajtmp  = aj + ai[idx];
2041         v1     = aa + ai[idx];
2042         v2     = aa + ai[idx+1];
2043         for (j=0; j<nz_tmp; j++) {
2044           idx         = ics[ajtmp[j]];
2045           rtmp11[idx] = v1[j];
2046           rtmp22[idx] = v2[j];
2047         }
2048         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2049         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2050 
2051         prow = *bjtmp++;
2052         while (prow < row) {
2053           pc1 = rtmp11 + prow;
2054           pc2 = rtmp22 + prow;
2055           if (*pc1 != 0.0 || *pc2 != 0.0) {
2056             pv   = ba + bd[prow];
2057             pj   = nbj + bd[prow];
2058             mul1 = *pc1 * *pv;
2059             mul2 = *pc2 * *pv;
2060             ++pv;
2061             *pc1 = mul1;
2062             *pc2 = mul2;
2063 
2064             nz_tmp = bi[prow+1] - bd[prow] - 1;
2065             for (j=0; j<nz_tmp; j++) {
2066               tmp          = pv[j];
2067               idx          = pj[j];
2068               rtmp11[idx] -= mul1 * tmp;
2069               rtmp22[idx] -= mul2 * tmp;
2070             }
2071             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2072           }
2073           prow = *bjtmp++;
2074         }
2075 
2076         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2077         pc1 = rtmp11 + prow;
2078         pc2 = rtmp22 + prow;
2079 
2080         sctx.pv = *pc1;
2081         pj      = bj + bi[prow];
2082         rs      = 0.0;
2083         for (j=0; j<nz; j++) {
2084           idx = pj[j];
2085           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2086         }
2087         sctx.rs = rs;
2088         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2089         if (sctx.newshift) goto endofwhile;
2090 
2091         if (*pc2 != 0.0) {
2092           pj     = nbj + bd[prow];
2093           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2094           *pc2   = mul2;
2095           nz_tmp = bi[prow+1] - bd[prow] - 1;
2096           for (j=0; j<nz_tmp; j++) {
2097             idx          = pj[j];
2098             tmp          = rtmp11[idx];
2099             rtmp22[idx] -= mul2 * tmp;
2100           }
2101           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2102         }
2103 
2104         pj  = bj + bi[row];
2105         pc1 = ba + bi[row];
2106         pc2 = ba + bi[row+1];
2107 
2108         sctx.pv       = rtmp22[row+1];
2109         rs            = 0.0;
2110         rtmp11[row]   = 1.0/rtmp11[row];
2111         rtmp22[row+1] = 1.0/rtmp22[row+1];
2112         /* copy row entries from dense representation to sparse */
2113         for (j=0; j<nz; j++) {
2114           idx    = pj[j];
2115           pc1[j] = rtmp11[idx];
2116           pc2[j] = rtmp22[idx];
2117           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2118         }
2119         sctx.rs = rs;
2120         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2121         if (sctx.newshift) goto endofwhile;
2122         break;
2123 
2124       case 3:
2125         for  (j=0; j<nz; j++) {
2126           idx         = bjtmp[j];
2127           rtmp11[idx] = 0.0;
2128           rtmp22[idx] = 0.0;
2129           rtmp33[idx] = 0.0;
2130         }
2131         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2132         idx    = r[row];
2133         nz_tmp = ai[idx+1] - ai[idx];
2134         ajtmp  = aj + ai[idx];
2135         v1     = aa + ai[idx];
2136         v2     = aa + ai[idx+1];
2137         v3     = aa + ai[idx+2];
2138         for (j=0; j<nz_tmp; j++) {
2139           idx         = ics[ajtmp[j]];
2140           rtmp11[idx] = v1[j];
2141           rtmp22[idx] = v2[j];
2142           rtmp33[idx] = v3[j];
2143         }
2144         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2145         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2146         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2147 
2148         /* loop over all pivot row blocks above this row block */
2149         prow = *bjtmp++;
2150         while (prow < row) {
2151           pc1 = rtmp11 + prow;
2152           pc2 = rtmp22 + prow;
2153           pc3 = rtmp33 + prow;
2154           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2155             pv   = ba  + bd[prow];
2156             pj   = nbj + bd[prow];
2157             mul1 = *pc1 * *pv;
2158             mul2 = *pc2 * *pv;
2159             mul3 = *pc3 * *pv;
2160             ++pv;
2161             *pc1 = mul1;
2162             *pc2 = mul2;
2163             *pc3 = mul3;
2164 
2165             nz_tmp = bi[prow+1] - bd[prow] - 1;
2166             /* update this row based on pivot row */
2167             for (j=0; j<nz_tmp; j++) {
2168               tmp          = pv[j];
2169               idx          = pj[j];
2170               rtmp11[idx] -= mul1 * tmp;
2171               rtmp22[idx] -= mul2 * tmp;
2172               rtmp33[idx] -= mul3 * tmp;
2173             }
2174             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2175           }
2176           prow = *bjtmp++;
2177         }
2178 
2179         /* Now take care of diagonal 3x3 block in this set of rows */
2180         /* note: prow = row here */
2181         pc1 = rtmp11 + prow;
2182         pc2 = rtmp22 + prow;
2183         pc3 = rtmp33 + prow;
2184 
2185         sctx.pv = *pc1;
2186         pj      = bj + bi[prow];
2187         rs      = 0.0;
2188         for (j=0; j<nz; j++) {
2189           idx = pj[j];
2190           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2191         }
2192         sctx.rs = rs;
2193         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2194         if (sctx.newshift) goto endofwhile;
2195 
2196         if (*pc2 != 0.0 || *pc3 != 0.0) {
2197           mul2   = (*pc2)/(*pc1);
2198           mul3   = (*pc3)/(*pc1);
2199           *pc2   = mul2;
2200           *pc3   = mul3;
2201           nz_tmp = bi[prow+1] - bd[prow] - 1;
2202           pj     = nbj + bd[prow];
2203           for (j=0; j<nz_tmp; j++) {
2204             idx          = pj[j];
2205             tmp          = rtmp11[idx];
2206             rtmp22[idx] -= mul2 * tmp;
2207             rtmp33[idx] -= mul3 * tmp;
2208           }
2209           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2210         }
2211         ++prow;
2212 
2213         pc2     = rtmp22 + prow;
2214         pc3     = rtmp33 + prow;
2215         sctx.pv = *pc2;
2216         pj      = bj + bi[prow];
2217         rs      = 0.0;
2218         for (j=0; j<nz; j++) {
2219           idx = pj[j];
2220           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2221         }
2222         sctx.rs = rs;
2223         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2224         if (sctx.newshift) goto endofwhile;
2225 
2226         if (*pc3 != 0.0) {
2227           mul3   = (*pc3)/(*pc2);
2228           *pc3   = mul3;
2229           pj     = nbj + bd[prow];
2230           nz_tmp = bi[prow+1] - bd[prow] - 1;
2231           for (j=0; j<nz_tmp; j++) {
2232             idx          = pj[j];
2233             tmp          = rtmp22[idx];
2234             rtmp33[idx] -= mul3 * tmp;
2235           }
2236           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2237         }
2238 
2239         pj  = bj + bi[row];
2240         pc1 = ba + bi[row];
2241         pc2 = ba + bi[row+1];
2242         pc3 = ba + bi[row+2];
2243 
2244         sctx.pv       = rtmp33[row+2];
2245         rs            = 0.0;
2246         rtmp11[row]   = 1.0/rtmp11[row];
2247         rtmp22[row+1] = 1.0/rtmp22[row+1];
2248         rtmp33[row+2] = 1.0/rtmp33[row+2];
2249         /* copy row entries from dense representation to sparse */
2250         for (j=0; j<nz; j++) {
2251           idx    = pj[j];
2252           pc1[j] = rtmp11[idx];
2253           pc2[j] = rtmp22[idx];
2254           pc3[j] = rtmp33[idx];
2255           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2256         }
2257 
2258         sctx.rs = rs;
2259         ierr    = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2260         if (sctx.newshift) goto endofwhile;
2261         break;
2262 
2263       default:
2264         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2265       }
2266       row += nodesz;                 /* Update the row */
2267     }
2268 endofwhile:;
2269   } while (sctx.newshift);
2270   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2271   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2272   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2273   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2274   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2275 
2276   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2277   /* do not set solve add, since MatSolve_Inode + Add is faster */
2278   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2279   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2280   C->assembled              = PETSC_TRUE;
2281   C->preallocated           = PETSC_TRUE;
2282   if (sctx.nshift) {
2283     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2284       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2285     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2286       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2287     }
2288   }
2289   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2290   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2291   PetscFunctionReturn(0);
2292 }
2293 
2294 
2295 /* ----------------------------------------------------------- */
2296 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2297 {
2298   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2299   IS                iscol = a->col,isrow = a->row;
2300   PetscErrorCode    ierr;
2301   const PetscInt    *r,*c,*rout,*cout;
2302   PetscInt          i,j,n = A->rmap->n;
2303   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2304   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2305   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2306   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2307   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2308   const PetscScalar *b;
2309 
2310   PetscFunctionBegin;
2311   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2312   node_max = a->inode.node_count;
2313   ns       = a->inode.size;     /* Node Size array */
2314 
2315   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2316   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2317   tmp  = a->solve_work;
2318 
2319   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2320   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2321 
2322   /* forward solve the lower triangular */
2323   tmps = tmp;
2324   aa   = a_a;
2325   aj   = a_j;
2326   ad   = a->diag;
2327 
2328   for (i = 0,row = 0; i< node_max; ++i) {
2329     nsz = ns[i];
2330     aii = ai[row];
2331     v1  = aa + aii;
2332     vi  = aj + aii;
2333     nz  = ai[row+1]- ai[row];
2334 
2335     if (i < node_max-1) {
2336       /* Prefetch the indices for the next block */
2337       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2338       /* Prefetch the data for the next block */
2339       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2340     }
2341 
2342     switch (nsz) {               /* Each loop in 'case' is unrolled */
2343     case 1:
2344       sum1 = b[r[row]];
2345       for (j=0; j<nz-1; j+=2) {
2346         i0    = vi[j];
2347         i1    = vi[j+1];
2348         tmp0  = tmps[i0];
2349         tmp1  = tmps[i1];
2350         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2351       }
2352       if (j == nz-1) {
2353         tmp0  = tmps[vi[j]];
2354         sum1 -= v1[j]*tmp0;
2355       }
2356       tmp[row++]=sum1;
2357       break;
2358     case 2:
2359       sum1 = b[r[row]];
2360       sum2 = b[r[row+1]];
2361       v2   = aa + ai[row+1];
2362 
2363       for (j=0; j<nz-1; j+=2) {
2364         i0    = vi[j];
2365         i1    = vi[j+1];
2366         tmp0  = tmps[i0];
2367         tmp1  = tmps[i1];
2368         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2369         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2370       }
2371       if (j == nz-1) {
2372         tmp0  = tmps[vi[j]];
2373         sum1 -= v1[j] *tmp0;
2374         sum2 -= v2[j] *tmp0;
2375       }
2376       sum2     -= v2[nz] * sum1;
2377       tmp[row++]=sum1;
2378       tmp[row++]=sum2;
2379       break;
2380     case 3:
2381       sum1 = b[r[row]];
2382       sum2 = b[r[row+1]];
2383       sum3 = b[r[row+2]];
2384       v2   = aa + ai[row+1];
2385       v3   = aa + ai[row+2];
2386 
2387       for (j=0; j<nz-1; j+=2) {
2388         i0    = vi[j];
2389         i1    = vi[j+1];
2390         tmp0  = tmps[i0];
2391         tmp1  = tmps[i1];
2392         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2395       }
2396       if (j == nz-1) {
2397         tmp0  = tmps[vi[j]];
2398         sum1 -= v1[j] *tmp0;
2399         sum2 -= v2[j] *tmp0;
2400         sum3 -= v3[j] *tmp0;
2401       }
2402       sum2     -= v2[nz] * sum1;
2403       sum3     -= v3[nz] * sum1;
2404       sum3     -= v3[nz+1] * sum2;
2405       tmp[row++]=sum1;
2406       tmp[row++]=sum2;
2407       tmp[row++]=sum3;
2408       break;
2409 
2410     case 4:
2411       sum1 = b[r[row]];
2412       sum2 = b[r[row+1]];
2413       sum3 = b[r[row+2]];
2414       sum4 = b[r[row+3]];
2415       v2   = aa + ai[row+1];
2416       v3   = aa + ai[row+2];
2417       v4   = aa + ai[row+3];
2418 
2419       for (j=0; j<nz-1; j+=2) {
2420         i0    = vi[j];
2421         i1    = vi[j+1];
2422         tmp0  = tmps[i0];
2423         tmp1  = tmps[i1];
2424         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2428       }
2429       if (j == nz-1) {
2430         tmp0  = tmps[vi[j]];
2431         sum1 -= v1[j] *tmp0;
2432         sum2 -= v2[j] *tmp0;
2433         sum3 -= v3[j] *tmp0;
2434         sum4 -= v4[j] *tmp0;
2435       }
2436       sum2 -= v2[nz] * sum1;
2437       sum3 -= v3[nz] * sum1;
2438       sum4 -= v4[nz] * sum1;
2439       sum3 -= v3[nz+1] * sum2;
2440       sum4 -= v4[nz+1] * sum2;
2441       sum4 -= v4[nz+2] * sum3;
2442 
2443       tmp[row++]=sum1;
2444       tmp[row++]=sum2;
2445       tmp[row++]=sum3;
2446       tmp[row++]=sum4;
2447       break;
2448     case 5:
2449       sum1 = b[r[row]];
2450       sum2 = b[r[row+1]];
2451       sum3 = b[r[row+2]];
2452       sum4 = b[r[row+3]];
2453       sum5 = b[r[row+4]];
2454       v2   = aa + ai[row+1];
2455       v3   = aa + ai[row+2];
2456       v4   = aa + ai[row+3];
2457       v5   = aa + ai[row+4];
2458 
2459       for (j=0; j<nz-1; j+=2) {
2460         i0    = vi[j];
2461         i1    = vi[j+1];
2462         tmp0  = tmps[i0];
2463         tmp1  = tmps[i1];
2464         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2465         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2466         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2467         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2468         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2469       }
2470       if (j == nz-1) {
2471         tmp0  = tmps[vi[j]];
2472         sum1 -= v1[j] *tmp0;
2473         sum2 -= v2[j] *tmp0;
2474         sum3 -= v3[j] *tmp0;
2475         sum4 -= v4[j] *tmp0;
2476         sum5 -= v5[j] *tmp0;
2477       }
2478 
2479       sum2 -= v2[nz] * sum1;
2480       sum3 -= v3[nz] * sum1;
2481       sum4 -= v4[nz] * sum1;
2482       sum5 -= v5[nz] * sum1;
2483       sum3 -= v3[nz+1] * sum2;
2484       sum4 -= v4[nz+1] * sum2;
2485       sum5 -= v5[nz+1] * sum2;
2486       sum4 -= v4[nz+2] * sum3;
2487       sum5 -= v5[nz+2] * sum3;
2488       sum5 -= v5[nz+3] * sum4;
2489 
2490       tmp[row++]=sum1;
2491       tmp[row++]=sum2;
2492       tmp[row++]=sum3;
2493       tmp[row++]=sum4;
2494       tmp[row++]=sum5;
2495       break;
2496     default:
2497       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2498     }
2499   }
2500   /* backward solve the upper triangular */
2501   for (i=node_max -1,row = n-1; i>=0; i--) {
2502     nsz = ns[i];
2503     aii = ad[row+1] + 1;
2504     v1  = aa + aii;
2505     vi  = aj + aii;
2506     nz  = ad[row]- ad[row+1] - 1;
2507 
2508     if (i > 0) {
2509       /* Prefetch the indices for the next block */
2510       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2511       /* Prefetch the data for the next block */
2512       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2513     }
2514 
2515     switch (nsz) {               /* Each loop in 'case' is unrolled */
2516     case 1:
2517       sum1 = tmp[row];
2518 
2519       for (j=0; j<nz-1; j+=2) {
2520         i0    = vi[j];
2521         i1    = vi[j+1];
2522         tmp0  = tmps[i0];
2523         tmp1  = tmps[i1];
2524         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2525       }
2526       if (j == nz-1) {
2527         tmp0  = tmps[vi[j]];
2528         sum1 -= v1[j]*tmp0;
2529       }
2530       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2531       break;
2532     case 2:
2533       sum1 = tmp[row];
2534       sum2 = tmp[row-1];
2535       v2   = aa + ad[row] + 1;
2536       for (j=0; j<nz-1; j+=2) {
2537         i0    = vi[j];
2538         i1    = vi[j+1];
2539         tmp0  = tmps[i0];
2540         tmp1  = tmps[i1];
2541         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2542         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2543       }
2544       if (j == nz-1) {
2545         tmp0  = tmps[vi[j]];
2546         sum1 -= v1[j]* tmp0;
2547         sum2 -= v2[j+1]* tmp0;
2548       }
2549 
2550       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2551       sum2     -= v2[0] * tmp0;
2552       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2553       break;
2554     case 3:
2555       sum1 = tmp[row];
2556       sum2 = tmp[row -1];
2557       sum3 = tmp[row -2];
2558       v2   = aa + ad[row] + 1;
2559       v3   = aa + ad[row -1] + 1;
2560       for (j=0; j<nz-1; j+=2) {
2561         i0    = vi[j];
2562         i1    = vi[j+1];
2563         tmp0  = tmps[i0];
2564         tmp1  = tmps[i1];
2565         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2568       }
2569       if (j== nz-1) {
2570         tmp0  = tmps[vi[j]];
2571         sum1 -= v1[j] * tmp0;
2572         sum2 -= v2[j+1] * tmp0;
2573         sum3 -= v3[j+2] * tmp0;
2574       }
2575       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2576       sum2     -= v2[0]* tmp0;
2577       sum3     -= v3[1] * tmp0;
2578       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2579       sum3     -= v3[0]* tmp0;
2580       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2581 
2582       break;
2583     case 4:
2584       sum1 = tmp[row];
2585       sum2 = tmp[row -1];
2586       sum3 = tmp[row -2];
2587       sum4 = tmp[row -3];
2588       v2   = aa + ad[row]+1;
2589       v3   = aa + ad[row -1]+1;
2590       v4   = aa + ad[row -2]+1;
2591 
2592       for (j=0; j<nz-1; j+=2) {
2593         i0    = vi[j];
2594         i1    = vi[j+1];
2595         tmp0  = tmps[i0];
2596         tmp1  = tmps[i1];
2597         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2598         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2599         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2600         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2601       }
2602       if (j== nz-1) {
2603         tmp0  = tmps[vi[j]];
2604         sum1 -= v1[j] * tmp0;
2605         sum2 -= v2[j+1] * tmp0;
2606         sum3 -= v3[j+2] * tmp0;
2607         sum4 -= v4[j+3] * tmp0;
2608       }
2609 
2610       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2611       sum2     -= v2[0] * tmp0;
2612       sum3     -= v3[1] * tmp0;
2613       sum4     -= v4[2] * tmp0;
2614       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2615       sum3     -= v3[0] * tmp0;
2616       sum4     -= v4[1] * tmp0;
2617       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2618       sum4     -= v4[0] * tmp0;
2619       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2620       break;
2621     case 5:
2622       sum1 = tmp[row];
2623       sum2 = tmp[row -1];
2624       sum3 = tmp[row -2];
2625       sum4 = tmp[row -3];
2626       sum5 = tmp[row -4];
2627       v2   = aa + ad[row]+1;
2628       v3   = aa + ad[row -1]+1;
2629       v4   = aa + ad[row -2]+1;
2630       v5   = aa + ad[row -3]+1;
2631       for (j=0; j<nz-1; j+=2) {
2632         i0    = vi[j];
2633         i1    = vi[j+1];
2634         tmp0  = tmps[i0];
2635         tmp1  = tmps[i1];
2636         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2637         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2638         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2639         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2640         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2641       }
2642       if (j==nz-1) {
2643         tmp0  = tmps[vi[j]];
2644         sum1 -= v1[j] * tmp0;
2645         sum2 -= v2[j+1] * tmp0;
2646         sum3 -= v3[j+2] * tmp0;
2647         sum4 -= v4[j+3] * tmp0;
2648         sum5 -= v5[j+4] * tmp0;
2649       }
2650 
2651       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2652       sum2     -= v2[0] * tmp0;
2653       sum3     -= v3[1] * tmp0;
2654       sum4     -= v4[2] * tmp0;
2655       sum5     -= v5[3] * tmp0;
2656       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2657       sum3     -= v3[0] * tmp0;
2658       sum4     -= v4[1] * tmp0;
2659       sum5     -= v5[2] * tmp0;
2660       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2661       sum4     -= v4[0] * tmp0;
2662       sum5     -= v5[1] * tmp0;
2663       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2664       sum5     -= v5[0] * tmp0;
2665       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2666       break;
2667     default:
2668       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2669     }
2670   }
2671   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2672   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2673   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2674   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2675   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2676   PetscFunctionReturn(0);
2677 }
2678 
2679 
2680 /*
2681      Makes a longer coloring[] array and calls the usual code with that
2682 */
2683 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2684 {
2685   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2686   PetscErrorCode  ierr;
2687   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2688   PetscInt        *colorused,i;
2689   ISColoringValue *newcolor;
2690 
2691   PetscFunctionBegin;
2692   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2693   /* loop over inodes, marking a color for each column*/
2694   row = 0;
2695   for (i=0; i<m; i++) {
2696     for (j=0; j<ns[i]; j++) {
2697       newcolor[row++] = coloring[i] + j*ncolors;
2698     }
2699   }
2700 
2701   /* eliminate unneeded colors */
2702   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2703   for (i=0; i<n; i++) {
2704     colorused[newcolor[i]] = 1;
2705   }
2706 
2707   for (i=1; i<5*ncolors; i++) {
2708     colorused[i] += colorused[i-1];
2709   }
2710   ncolors = colorused[5*ncolors-1];
2711   for (i=0; i<n; i++) {
2712     newcolor[i] = colorused[newcolor[i]]-1;
2713   }
2714   ierr = PetscFree(colorused);CHKERRQ(ierr);
2715   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2716   ierr = PetscFree(coloring);CHKERRQ(ierr);
2717   PetscFunctionReturn(0);
2718 }
2719 
2720 #include <petsc/private/kernels/blockinvert.h>
2721 
2722 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2723 {
2724   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2725   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2726   MatScalar         *ibdiag,*bdiag,work[25],*t;
2727   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2728   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2729   const PetscScalar *xb, *b;
2730   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2731   PetscErrorCode    ierr;
2732   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2733   PetscInt          sz,k,ipvt[5];
2734   PetscBool         allowzeropivot,zeropivotdetected;
2735   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2736 
2737   PetscFunctionBegin;
2738   allowzeropivot = PetscNot(A->erroriffailure);
2739   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2740   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2741 
2742   if (!a->inode.ibdiagvalid) {
2743     if (!a->inode.ibdiag) {
2744       /* calculate space needed for diagonal blocks */
2745       for (i=0; i<m; i++) {
2746         cnt += sizes[i]*sizes[i];
2747       }
2748       a->inode.bdiagsize = cnt;
2749 
2750       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2751     }
2752 
2753     /* copy over the diagonal blocks and invert them */
2754     ibdiag = a->inode.ibdiag;
2755     bdiag  = a->inode.bdiag;
2756     cnt    = 0;
2757     for (i=0, row = 0; i<m; i++) {
2758       for (j=0; j<sizes[i]; j++) {
2759         for (k=0; k<sizes[i]; k++) {
2760           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2761         }
2762       }
2763       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2764 
2765       switch (sizes[i]) {
2766       case 1:
2767         /* Create matrix data structure */
2768         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2769           if (allowzeropivot) {
2770             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2771             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2772             A->factorerror_zeropivot_row   = row;
2773             ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr);
2774           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2775         }
2776         ibdiag[cnt] = 1.0/ibdiag[cnt];
2777         break;
2778       case 2:
2779         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2780         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2781         break;
2782       case 3:
2783         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2784         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2785         break;
2786       case 4:
2787         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2788         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2789         break;
2790       case 5:
2791         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2792         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2793         break;
2794       default:
2795         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2796       }
2797       cnt += sizes[i]*sizes[i];
2798       row += sizes[i];
2799     }
2800     a->inode.ibdiagvalid = PETSC_TRUE;
2801   }
2802   ibdiag = a->inode.ibdiag;
2803   bdiag  = a->inode.bdiag;
2804   t      = a->inode.ssor_work;
2805 
2806   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2807   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2808   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2809   if (flag & SOR_ZERO_INITIAL_GUESS) {
2810     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2811 
2812       for (i=0, row=0; i<m; i++) {
2813         sz  = diag[row] - ii[row];
2814         v1  = a->a + ii[row];
2815         idx = a->j + ii[row];
2816 
2817         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2818         switch (sizes[i]) {
2819         case 1:
2820 
2821           sum1 = b[row];
2822           for (n = 0; n<sz-1; n+=2) {
2823             i1    = idx[0];
2824             i2    = idx[1];
2825             idx  += 2;
2826             tmp0  = x[i1];
2827             tmp1  = x[i2];
2828             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2829           }
2830 
2831           if (n == sz-1) {
2832             tmp0  = x[*idx];
2833             sum1 -= *v1 * tmp0;
2834           }
2835           t[row]   = sum1;
2836           x[row++] = sum1*(*ibdiag++);
2837           break;
2838         case 2:
2839           v2   = a->a + ii[row+1];
2840           sum1 = b[row];
2841           sum2 = b[row+1];
2842           for (n = 0; n<sz-1; n+=2) {
2843             i1    = idx[0];
2844             i2    = idx[1];
2845             idx  += 2;
2846             tmp0  = x[i1];
2847             tmp1  = x[i2];
2848             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2849             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2850           }
2851 
2852           if (n == sz-1) {
2853             tmp0  = x[*idx];
2854             sum1 -= v1[0] * tmp0;
2855             sum2 -= v2[0] * tmp0;
2856           }
2857           t[row]   = sum1;
2858           t[row+1] = sum2;
2859           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2860           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2861           ibdiag  += 4;
2862           break;
2863         case 3:
2864           v2   = a->a + ii[row+1];
2865           v3   = a->a + ii[row+2];
2866           sum1 = b[row];
2867           sum2 = b[row+1];
2868           sum3 = b[row+2];
2869           for (n = 0; n<sz-1; n+=2) {
2870             i1    = idx[0];
2871             i2    = idx[1];
2872             idx  += 2;
2873             tmp0  = x[i1];
2874             tmp1  = x[i2];
2875             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2876             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2877             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2878           }
2879 
2880           if (n == sz-1) {
2881             tmp0  = x[*idx];
2882             sum1 -= v1[0] * tmp0;
2883             sum2 -= v2[0] * tmp0;
2884             sum3 -= v3[0] * tmp0;
2885           }
2886           t[row]   = sum1;
2887           t[row+1] = sum2;
2888           t[row+2] = sum3;
2889           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2890           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2891           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2892           ibdiag  += 9;
2893           break;
2894         case 4:
2895           v2   = a->a + ii[row+1];
2896           v3   = a->a + ii[row+2];
2897           v4   = a->a + ii[row+3];
2898           sum1 = b[row];
2899           sum2 = b[row+1];
2900           sum3 = b[row+2];
2901           sum4 = b[row+3];
2902           for (n = 0; n<sz-1; n+=2) {
2903             i1    = idx[0];
2904             i2    = idx[1];
2905             idx  += 2;
2906             tmp0  = x[i1];
2907             tmp1  = x[i2];
2908             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2909             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2910             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2911             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2912           }
2913 
2914           if (n == sz-1) {
2915             tmp0  = x[*idx];
2916             sum1 -= v1[0] * tmp0;
2917             sum2 -= v2[0] * tmp0;
2918             sum3 -= v3[0] * tmp0;
2919             sum4 -= v4[0] * tmp0;
2920           }
2921           t[row]   = sum1;
2922           t[row+1] = sum2;
2923           t[row+2] = sum3;
2924           t[row+3] = sum4;
2925           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2926           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2927           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2928           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2929           ibdiag  += 16;
2930           break;
2931         case 5:
2932           v2   = a->a + ii[row+1];
2933           v3   = a->a + ii[row+2];
2934           v4   = a->a + ii[row+3];
2935           v5   = a->a + ii[row+4];
2936           sum1 = b[row];
2937           sum2 = b[row+1];
2938           sum3 = b[row+2];
2939           sum4 = b[row+3];
2940           sum5 = b[row+4];
2941           for (n = 0; n<sz-1; n+=2) {
2942             i1    = idx[0];
2943             i2    = idx[1];
2944             idx  += 2;
2945             tmp0  = x[i1];
2946             tmp1  = x[i2];
2947             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2948             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2949             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2950             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2951             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2952           }
2953 
2954           if (n == sz-1) {
2955             tmp0  = x[*idx];
2956             sum1 -= v1[0] * tmp0;
2957             sum2 -= v2[0] * tmp0;
2958             sum3 -= v3[0] * tmp0;
2959             sum4 -= v4[0] * tmp0;
2960             sum5 -= v5[0] * tmp0;
2961           }
2962           t[row]   = sum1;
2963           t[row+1] = sum2;
2964           t[row+2] = sum3;
2965           t[row+3] = sum4;
2966           t[row+4] = sum5;
2967           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2968           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2969           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2970           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2971           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2972           ibdiag  += 25;
2973           break;
2974         default:
2975           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2976         }
2977       }
2978 
2979       xb   = t;
2980       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2981     } else xb = b;
2982     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2983 
2984       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2985       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2986         ibdiag -= sizes[i]*sizes[i];
2987         sz      = ii[row+1] - diag[row] - 1;
2988         v1      = a->a + diag[row] + 1;
2989         idx     = a->j + diag[row] + 1;
2990 
2991         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2992         switch (sizes[i]) {
2993         case 1:
2994 
2995           sum1 = xb[row];
2996           for (n = 0; n<sz-1; n+=2) {
2997             i1    = idx[0];
2998             i2    = idx[1];
2999             idx  += 2;
3000             tmp0  = x[i1];
3001             tmp1  = x[i2];
3002             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3003           }
3004 
3005           if (n == sz-1) {
3006             tmp0  = x[*idx];
3007             sum1 -= *v1*tmp0;
3008           }
3009           x[row--] = sum1*(*ibdiag);
3010           break;
3011 
3012         case 2:
3013 
3014           sum1 = xb[row];
3015           sum2 = xb[row-1];
3016           /* note that sum1 is associated with the second of the two rows */
3017           v2 = a->a + diag[row-1] + 2;
3018           for (n = 0; n<sz-1; n+=2) {
3019             i1    = idx[0];
3020             i2    = idx[1];
3021             idx  += 2;
3022             tmp0  = x[i1];
3023             tmp1  = x[i2];
3024             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3025             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3026           }
3027 
3028           if (n == sz-1) {
3029             tmp0  = x[*idx];
3030             sum1 -= *v1*tmp0;
3031             sum2 -= *v2*tmp0;
3032           }
3033           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3034           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3035           break;
3036         case 3:
3037 
3038           sum1 = xb[row];
3039           sum2 = xb[row-1];
3040           sum3 = xb[row-2];
3041           v2   = a->a + diag[row-1] + 2;
3042           v3   = a->a + diag[row-2] + 3;
3043           for (n = 0; n<sz-1; n+=2) {
3044             i1    = idx[0];
3045             i2    = idx[1];
3046             idx  += 2;
3047             tmp0  = x[i1];
3048             tmp1  = x[i2];
3049             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3050             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3051             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3052           }
3053 
3054           if (n == sz-1) {
3055             tmp0  = x[*idx];
3056             sum1 -= *v1*tmp0;
3057             sum2 -= *v2*tmp0;
3058             sum3 -= *v3*tmp0;
3059           }
3060           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3061           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3062           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3063           break;
3064         case 4:
3065 
3066           sum1 = xb[row];
3067           sum2 = xb[row-1];
3068           sum3 = xb[row-2];
3069           sum4 = xb[row-3];
3070           v2   = a->a + diag[row-1] + 2;
3071           v3   = a->a + diag[row-2] + 3;
3072           v4   = a->a + diag[row-3] + 4;
3073           for (n = 0; n<sz-1; n+=2) {
3074             i1    = idx[0];
3075             i2    = idx[1];
3076             idx  += 2;
3077             tmp0  = x[i1];
3078             tmp1  = x[i2];
3079             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3080             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3081             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3082             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3083           }
3084 
3085           if (n == sz-1) {
3086             tmp0  = x[*idx];
3087             sum1 -= *v1*tmp0;
3088             sum2 -= *v2*tmp0;
3089             sum3 -= *v3*tmp0;
3090             sum4 -= *v4*tmp0;
3091           }
3092           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3093           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3094           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3095           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3096           break;
3097         case 5:
3098 
3099           sum1 = xb[row];
3100           sum2 = xb[row-1];
3101           sum3 = xb[row-2];
3102           sum4 = xb[row-3];
3103           sum5 = xb[row-4];
3104           v2   = a->a + diag[row-1] + 2;
3105           v3   = a->a + diag[row-2] + 3;
3106           v4   = a->a + diag[row-3] + 4;
3107           v5   = a->a + diag[row-4] + 5;
3108           for (n = 0; n<sz-1; n+=2) {
3109             i1    = idx[0];
3110             i2    = idx[1];
3111             idx  += 2;
3112             tmp0  = x[i1];
3113             tmp1  = x[i2];
3114             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3115             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3116             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3117             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3118             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3119           }
3120 
3121           if (n == sz-1) {
3122             tmp0  = x[*idx];
3123             sum1 -= *v1*tmp0;
3124             sum2 -= *v2*tmp0;
3125             sum3 -= *v3*tmp0;
3126             sum4 -= *v4*tmp0;
3127             sum5 -= *v5*tmp0;
3128           }
3129           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3130           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3131           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3132           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3133           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3134           break;
3135         default:
3136           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3137         }
3138       }
3139 
3140       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3141     }
3142     its--;
3143   }
3144   while (its--) {
3145 
3146     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3147       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3148            i<m;
3149            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3150 
3151         sz  = diag[row] - ii[row];
3152         v1  = a->a + ii[row];
3153         idx = a->j + ii[row];
3154         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3155         switch (sizes[i]) {
3156         case 1:
3157           sum1 = b[row];
3158           for (n = 0; n<sz-1; n+=2) {
3159             i1    = idx[0];
3160             i2    = idx[1];
3161             idx  += 2;
3162             tmp0  = x[i1];
3163             tmp1  = x[i2];
3164             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3165           }
3166           if (n == sz-1) {
3167             tmp0  = x[*idx++];
3168             sum1 -= *v1 * tmp0;
3169             v1++;
3170           }
3171           t[row]   = sum1;
3172           sz      = ii[row+1] - diag[row] - 1;
3173           idx     = a->j + diag[row] + 1;
3174           v1 += 1;
3175           for (n = 0; n<sz-1; n+=2) {
3176             i1    = idx[0];
3177             i2    = idx[1];
3178             idx  += 2;
3179             tmp0  = x[i1];
3180             tmp1  = x[i2];
3181             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3182           }
3183           if (n == sz-1) {
3184             tmp0  = x[*idx++];
3185             sum1 -= *v1 * tmp0;
3186           }
3187           /* in MatSOR_SeqAIJ this line would be
3188            *
3189            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3190            *
3191            * but omega == 1, so this becomes
3192            *
3193            * x[row] = sum1*(*ibdiag++);
3194            *
3195            */
3196           x[row] = sum1*(*ibdiag);
3197           break;
3198         case 2:
3199           v2   = a->a + ii[row+1];
3200           sum1 = b[row];
3201           sum2 = b[row+1];
3202           for (n = 0; n<sz-1; n+=2) {
3203             i1    = idx[0];
3204             i2    = idx[1];
3205             idx  += 2;
3206             tmp0  = x[i1];
3207             tmp1  = x[i2];
3208             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3209             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3210           }
3211           if (n == sz-1) {
3212             tmp0  = x[*idx++];
3213             sum1 -= v1[0] * tmp0;
3214             sum2 -= v2[0] * tmp0;
3215             v1++; v2++;
3216           }
3217           t[row]   = sum1;
3218           t[row+1] = sum2;
3219           sz      = ii[row+1] - diag[row] - 2;
3220           idx     = a->j + diag[row] + 2;
3221           v1 += 2;
3222           v2 += 2;
3223           for (n = 0; n<sz-1; n+=2) {
3224             i1    = idx[0];
3225             i2    = idx[1];
3226             idx  += 2;
3227             tmp0  = x[i1];
3228             tmp1  = x[i2];
3229             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3230             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3231           }
3232           if (n == sz-1) {
3233             tmp0  = x[*idx];
3234             sum1 -= v1[0] * tmp0;
3235             sum2 -= v2[0] * tmp0;
3236           }
3237           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3238           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3239           break;
3240         case 3:
3241           v2   = a->a + ii[row+1];
3242           v3   = a->a + ii[row+2];
3243           sum1 = b[row];
3244           sum2 = b[row+1];
3245           sum3 = b[row+2];
3246           for (n = 0; n<sz-1; n+=2) {
3247             i1    = idx[0];
3248             i2    = idx[1];
3249             idx  += 2;
3250             tmp0  = x[i1];
3251             tmp1  = x[i2];
3252             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3253             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3254             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3255           }
3256           if (n == sz-1) {
3257             tmp0  = x[*idx++];
3258             sum1 -= v1[0] * tmp0;
3259             sum2 -= v2[0] * tmp0;
3260             sum3 -= v3[0] * tmp0;
3261             v1++; v2++; v3++;
3262           }
3263           t[row]   = sum1;
3264           t[row+1] = sum2;
3265           t[row+2] = sum3;
3266           sz      = ii[row+1] - diag[row] - 3;
3267           idx     = a->j + diag[row] + 3;
3268           v1 += 3;
3269           v2 += 3;
3270           v3 += 3;
3271           for (n = 0; n<sz-1; n+=2) {
3272             i1    = idx[0];
3273             i2    = idx[1];
3274             idx  += 2;
3275             tmp0  = x[i1];
3276             tmp1  = x[i2];
3277             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3278             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3279             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3280           }
3281           if (n == sz-1) {
3282             tmp0  = x[*idx];
3283             sum1 -= v1[0] * tmp0;
3284             sum2 -= v2[0] * tmp0;
3285             sum3 -= v3[0] * tmp0;
3286           }
3287           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3288           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3289           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3290           break;
3291         case 4:
3292           v2   = a->a + ii[row+1];
3293           v3   = a->a + ii[row+2];
3294           v4   = a->a + ii[row+3];
3295           sum1 = b[row];
3296           sum2 = b[row+1];
3297           sum3 = b[row+2];
3298           sum4 = b[row+3];
3299           for (n = 0; n<sz-1; n+=2) {
3300             i1    = idx[0];
3301             i2    = idx[1];
3302             idx  += 2;
3303             tmp0  = x[i1];
3304             tmp1  = x[i2];
3305             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3306             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3307             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3308             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3309           }
3310           if (n == sz-1) {
3311             tmp0  = x[*idx++];
3312             sum1 -= v1[0] * tmp0;
3313             sum2 -= v2[0] * tmp0;
3314             sum3 -= v3[0] * tmp0;
3315             sum4 -= v4[0] * tmp0;
3316             v1++; v2++; v3++; v4++;
3317           }
3318           t[row]   = sum1;
3319           t[row+1] = sum2;
3320           t[row+2] = sum3;
3321           t[row+3] = sum4;
3322           sz      = ii[row+1] - diag[row] - 4;
3323           idx     = a->j + diag[row] + 4;
3324           v1 += 4;
3325           v2 += 4;
3326           v3 += 4;
3327           v4 += 4;
3328           for (n = 0; n<sz-1; n+=2) {
3329             i1    = idx[0];
3330             i2    = idx[1];
3331             idx  += 2;
3332             tmp0  = x[i1];
3333             tmp1  = x[i2];
3334             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3335             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3336             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3337             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3338           }
3339           if (n == sz-1) {
3340             tmp0  = x[*idx];
3341             sum1 -= v1[0] * tmp0;
3342             sum2 -= v2[0] * tmp0;
3343             sum3 -= v3[0] * tmp0;
3344             sum4 -= v4[0] * tmp0;
3345           }
3346           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3347           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3348           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3349           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3350           break;
3351         case 5:
3352           v2   = a->a + ii[row+1];
3353           v3   = a->a + ii[row+2];
3354           v4   = a->a + ii[row+3];
3355           v5   = a->a + ii[row+4];
3356           sum1 = b[row];
3357           sum2 = b[row+1];
3358           sum3 = b[row+2];
3359           sum4 = b[row+3];
3360           sum5 = b[row+4];
3361           for (n = 0; n<sz-1; n+=2) {
3362             i1    = idx[0];
3363             i2    = idx[1];
3364             idx  += 2;
3365             tmp0  = x[i1];
3366             tmp1  = x[i2];
3367             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3368             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3369             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3370             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3371             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3372           }
3373           if (n == sz-1) {
3374             tmp0  = x[*idx++];
3375             sum1 -= v1[0] * tmp0;
3376             sum2 -= v2[0] * tmp0;
3377             sum3 -= v3[0] * tmp0;
3378             sum4 -= v4[0] * tmp0;
3379             sum5 -= v5[0] * tmp0;
3380             v1++; v2++; v3++; v4++; v5++;
3381           }
3382           t[row]   = sum1;
3383           t[row+1] = sum2;
3384           t[row+2] = sum3;
3385           t[row+3] = sum4;
3386           t[row+4] = sum5;
3387           sz      = ii[row+1] - diag[row] - 5;
3388           idx     = a->j + diag[row] + 5;
3389           v1 += 5;
3390           v2 += 5;
3391           v3 += 5;
3392           v4 += 5;
3393           v5 += 5;
3394           for (n = 0; n<sz-1; n+=2) {
3395             i1    = idx[0];
3396             i2    = idx[1];
3397             idx  += 2;
3398             tmp0  = x[i1];
3399             tmp1  = x[i2];
3400             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3401             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3402             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3403             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3404             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3405           }
3406           if (n == sz-1) {
3407             tmp0  = x[*idx];
3408             sum1 -= v1[0] * tmp0;
3409             sum2 -= v2[0] * tmp0;
3410             sum3 -= v3[0] * tmp0;
3411             sum4 -= v4[0] * tmp0;
3412             sum5 -= v5[0] * tmp0;
3413           }
3414           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3415           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3416           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3417           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3418           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3419           break;
3420         default:
3421           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3422         }
3423       }
3424       xb   = t;
3425       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3426     } else xb = b;
3427 
3428     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3429 
3430       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3431       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3432         ibdiag -= sizes[i]*sizes[i];
3433 
3434         /* set RHS */
3435         if (xb == b) {
3436           /* whole (old way) */
3437           sz      = ii[row+1] - ii[row];
3438           idx     = a->j + ii[row];
3439           switch (sizes[i]) {
3440           case 5:
3441             v5      = a->a + ii[row-4];
3442           case 4: /* fall through */
3443             v4      = a->a + ii[row-3];
3444           case 3:
3445             v3      = a->a + ii[row-2];
3446           case 2:
3447             v2      = a->a + ii[row-1];
3448           case 1:
3449             v1      = a->a + ii[row];
3450             break;
3451           default:
3452             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3453           }
3454         } else {
3455           /* upper, no diag */
3456           sz      = ii[row+1] - diag[row] - 1;
3457           idx     = a->j + diag[row] + 1;
3458           switch (sizes[i]) {
3459           case 5:
3460             v5      = a->a + diag[row-4] + 5;
3461           case 4: /* fall through */
3462             v4      = a->a + diag[row-3] + 4;
3463           case 3:
3464             v3      = a->a + diag[row-2] + 3;
3465           case 2:
3466             v2      = a->a + diag[row-1] + 2;
3467           case 1:
3468             v1      = a->a + diag[row] + 1;
3469           }
3470         }
3471         /* set sum */
3472         switch (sizes[i]) {
3473         case 5:
3474           sum5 = xb[row-4];
3475         case 4: /* fall through */
3476           sum4 = xb[row-3];
3477         case 3:
3478           sum3 = xb[row-2];
3479         case 2:
3480           sum2 = xb[row-1];
3481         case 1:
3482           /* note that sum1 is associated with the last row */
3483           sum1 = xb[row];
3484         }
3485         /* do sums */
3486         for (n = 0; n<sz-1; n+=2) {
3487             i1    = idx[0];
3488             i2    = idx[1];
3489             idx  += 2;
3490             tmp0  = x[i1];
3491             tmp1  = x[i2];
3492             switch (sizes[i]) {
3493             case 5:
3494               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3495             case 4: /* fall through */
3496               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3497             case 3:
3498               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3499             case 2:
3500               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3501             case 1:
3502               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3503             }
3504         }
3505         /* ragged edge */
3506         if (n == sz-1) {
3507           tmp0  = x[*idx];
3508           switch (sizes[i]) {
3509           case 5:
3510             sum5 -= *v5*tmp0;
3511           case 4: /* fall through */
3512             sum4 -= *v4*tmp0;
3513           case 3:
3514             sum3 -= *v3*tmp0;
3515           case 2:
3516             sum2 -= *v2*tmp0;
3517           case 1:
3518             sum1 -= *v1*tmp0;
3519           }
3520         }
3521         /* update */
3522         if (xb == b) {
3523           /* whole (old way) w/ diag */
3524           switch (sizes[i]) {
3525           case 5:
3526             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3527             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3528             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3529             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3530             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3531             break;
3532           case 4:
3533             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3534             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3535             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3536             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3537             break;
3538           case 3:
3539             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3540             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3541             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3542             break;
3543           case 2:
3544             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3545             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3546             break;
3547           case 1:
3548             x[row--] += sum1*(*ibdiag);
3549             break;
3550           }
3551         } else {
3552           /* no diag so set =  */
3553           switch (sizes[i]) {
3554           case 5:
3555             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3556             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3557             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3558             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3559             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3560             break;
3561           case 4:
3562             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3563             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3564             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3565             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3566             break;
3567           case 3:
3568             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3569             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3570             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3571             break;
3572           case 2:
3573             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3574             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3575             break;
3576           case 1:
3577             x[row--] = sum1*(*ibdiag);
3578             break;
3579           }
3580         }
3581       }
3582       if (xb == b) {
3583         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3584       } else {
3585         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3586       }
3587     }
3588   }
3589   if (flag & SOR_EISENSTAT) {
3590     /*
3591           Apply  (U + D)^-1  where D is now the block diagonal
3592     */
3593     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3594     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3595       ibdiag -= sizes[i]*sizes[i];
3596       sz      = ii[row+1] - diag[row] - 1;
3597       v1      = a->a + diag[row] + 1;
3598       idx     = a->j + diag[row] + 1;
3599       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3600       switch (sizes[i]) {
3601       case 1:
3602 
3603         sum1 = b[row];
3604         for (n = 0; n<sz-1; n+=2) {
3605           i1    = idx[0];
3606           i2    = idx[1];
3607           idx  += 2;
3608           tmp0  = x[i1];
3609           tmp1  = x[i2];
3610           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3611         }
3612 
3613         if (n == sz-1) {
3614           tmp0  = x[*idx];
3615           sum1 -= *v1*tmp0;
3616         }
3617         x[row] = sum1*(*ibdiag);row--;
3618         break;
3619 
3620       case 2:
3621 
3622         sum1 = b[row];
3623         sum2 = b[row-1];
3624         /* note that sum1 is associated with the second of the two rows */
3625         v2 = a->a + diag[row-1] + 2;
3626         for (n = 0; n<sz-1; n+=2) {
3627           i1    = idx[0];
3628           i2    = idx[1];
3629           idx  += 2;
3630           tmp0  = x[i1];
3631           tmp1  = x[i2];
3632           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3633           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3634         }
3635 
3636         if (n == sz-1) {
3637           tmp0  = x[*idx];
3638           sum1 -= *v1*tmp0;
3639           sum2 -= *v2*tmp0;
3640         }
3641         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3642         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3643         row     -= 2;
3644         break;
3645       case 3:
3646 
3647         sum1 = b[row];
3648         sum2 = b[row-1];
3649         sum3 = b[row-2];
3650         v2   = a->a + diag[row-1] + 2;
3651         v3   = a->a + diag[row-2] + 3;
3652         for (n = 0; n<sz-1; n+=2) {
3653           i1    = idx[0];
3654           i2    = idx[1];
3655           idx  += 2;
3656           tmp0  = x[i1];
3657           tmp1  = x[i2];
3658           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3659           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3660           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3661         }
3662 
3663         if (n == sz-1) {
3664           tmp0  = x[*idx];
3665           sum1 -= *v1*tmp0;
3666           sum2 -= *v2*tmp0;
3667           sum3 -= *v3*tmp0;
3668         }
3669         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3670         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3671         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3672         row     -= 3;
3673         break;
3674       case 4:
3675 
3676         sum1 = b[row];
3677         sum2 = b[row-1];
3678         sum3 = b[row-2];
3679         sum4 = b[row-3];
3680         v2   = a->a + diag[row-1] + 2;
3681         v3   = a->a + diag[row-2] + 3;
3682         v4   = a->a + diag[row-3] + 4;
3683         for (n = 0; n<sz-1; n+=2) {
3684           i1    = idx[0];
3685           i2    = idx[1];
3686           idx  += 2;
3687           tmp0  = x[i1];
3688           tmp1  = x[i2];
3689           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3690           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3691           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3692           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3693         }
3694 
3695         if (n == sz-1) {
3696           tmp0  = x[*idx];
3697           sum1 -= *v1*tmp0;
3698           sum2 -= *v2*tmp0;
3699           sum3 -= *v3*tmp0;
3700           sum4 -= *v4*tmp0;
3701         }
3702         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3703         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3704         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3705         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3706         row     -= 4;
3707         break;
3708       case 5:
3709 
3710         sum1 = b[row];
3711         sum2 = b[row-1];
3712         sum3 = b[row-2];
3713         sum4 = b[row-3];
3714         sum5 = b[row-4];
3715         v2   = a->a + diag[row-1] + 2;
3716         v3   = a->a + diag[row-2] + 3;
3717         v4   = a->a + diag[row-3] + 4;
3718         v5   = a->a + diag[row-4] + 5;
3719         for (n = 0; n<sz-1; n+=2) {
3720           i1    = idx[0];
3721           i2    = idx[1];
3722           idx  += 2;
3723           tmp0  = x[i1];
3724           tmp1  = x[i2];
3725           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3726           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3727           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3728           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3729           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3730         }
3731 
3732         if (n == sz-1) {
3733           tmp0  = x[*idx];
3734           sum1 -= *v1*tmp0;
3735           sum2 -= *v2*tmp0;
3736           sum3 -= *v3*tmp0;
3737           sum4 -= *v4*tmp0;
3738           sum5 -= *v5*tmp0;
3739         }
3740         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3741         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3742         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3743         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3744         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3745         row     -= 5;
3746         break;
3747       default:
3748         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3749       }
3750     }
3751     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3752 
3753     /*
3754            t = b - D x    where D is the block diagonal
3755     */
3756     cnt = 0;
3757     for (i=0, row=0; i<m; i++) {
3758       switch (sizes[i]) {
3759       case 1:
3760         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3761         break;
3762       case 2:
3763         x1       = x[row]; x2 = x[row+1];
3764         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3765         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3766         t[row]   = b[row] - tmp1;
3767         t[row+1] = b[row+1] - tmp2; row += 2;
3768         cnt     += 4;
3769         break;
3770       case 3:
3771         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3772         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3773         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3774         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3775         t[row]   = b[row] - tmp1;
3776         t[row+1] = b[row+1] - tmp2;
3777         t[row+2] = b[row+2] - tmp3; row += 3;
3778         cnt     += 9;
3779         break;
3780       case 4:
3781         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3782         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3783         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3784         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3785         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3786         t[row]   = b[row] - tmp1;
3787         t[row+1] = b[row+1] - tmp2;
3788         t[row+2] = b[row+2] - tmp3;
3789         t[row+3] = b[row+3] - tmp4; row += 4;
3790         cnt     += 16;
3791         break;
3792       case 5:
3793         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3794         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3795         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3796         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3797         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3798         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3799         t[row]   = b[row] - tmp1;
3800         t[row+1] = b[row+1] - tmp2;
3801         t[row+2] = b[row+2] - tmp3;
3802         t[row+3] = b[row+3] - tmp4;
3803         t[row+4] = b[row+4] - tmp5;row += 5;
3804         cnt     += 25;
3805         break;
3806       default:
3807         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3808       }
3809     }
3810     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3811 
3812 
3813 
3814     /*
3815           Apply (L + D)^-1 where D is the block diagonal
3816     */
3817     for (i=0, row=0; i<m; i++) {
3818       sz  = diag[row] - ii[row];
3819       v1  = a->a + ii[row];
3820       idx = a->j + ii[row];
3821       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3822       switch (sizes[i]) {
3823       case 1:
3824 
3825         sum1 = t[row];
3826         for (n = 0; n<sz-1; n+=2) {
3827           i1    = idx[0];
3828           i2    = idx[1];
3829           idx  += 2;
3830           tmp0  = t[i1];
3831           tmp1  = t[i2];
3832           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3833         }
3834 
3835         if (n == sz-1) {
3836           tmp0  = t[*idx];
3837           sum1 -= *v1 * tmp0;
3838         }
3839         x[row] += t[row] = sum1*(*ibdiag++); row++;
3840         break;
3841       case 2:
3842         v2   = a->a + ii[row+1];
3843         sum1 = t[row];
3844         sum2 = t[row+1];
3845         for (n = 0; n<sz-1; n+=2) {
3846           i1    = idx[0];
3847           i2    = idx[1];
3848           idx  += 2;
3849           tmp0  = t[i1];
3850           tmp1  = t[i2];
3851           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3852           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3853         }
3854 
3855         if (n == sz-1) {
3856           tmp0  = t[*idx];
3857           sum1 -= v1[0] * tmp0;
3858           sum2 -= v2[0] * tmp0;
3859         }
3860         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3861         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3862         ibdiag   += 4; row += 2;
3863         break;
3864       case 3:
3865         v2   = a->a + ii[row+1];
3866         v3   = a->a + ii[row+2];
3867         sum1 = t[row];
3868         sum2 = t[row+1];
3869         sum3 = t[row+2];
3870         for (n = 0; n<sz-1; n+=2) {
3871           i1    = idx[0];
3872           i2    = idx[1];
3873           idx  += 2;
3874           tmp0  = t[i1];
3875           tmp1  = t[i2];
3876           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3877           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3878           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3879         }
3880 
3881         if (n == sz-1) {
3882           tmp0  = t[*idx];
3883           sum1 -= v1[0] * tmp0;
3884           sum2 -= v2[0] * tmp0;
3885           sum3 -= v3[0] * tmp0;
3886         }
3887         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3888         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3889         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3890         ibdiag   += 9; row += 3;
3891         break;
3892       case 4:
3893         v2   = a->a + ii[row+1];
3894         v3   = a->a + ii[row+2];
3895         v4   = a->a + ii[row+3];
3896         sum1 = t[row];
3897         sum2 = t[row+1];
3898         sum3 = t[row+2];
3899         sum4 = t[row+3];
3900         for (n = 0; n<sz-1; n+=2) {
3901           i1    = idx[0];
3902           i2    = idx[1];
3903           idx  += 2;
3904           tmp0  = t[i1];
3905           tmp1  = t[i2];
3906           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3907           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3908           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3909           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3910         }
3911 
3912         if (n == sz-1) {
3913           tmp0  = t[*idx];
3914           sum1 -= v1[0] * tmp0;
3915           sum2 -= v2[0] * tmp0;
3916           sum3 -= v3[0] * tmp0;
3917           sum4 -= v4[0] * tmp0;
3918         }
3919         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3920         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3921         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3922         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3923         ibdiag   += 16; row += 4;
3924         break;
3925       case 5:
3926         v2   = a->a + ii[row+1];
3927         v3   = a->a + ii[row+2];
3928         v4   = a->a + ii[row+3];
3929         v5   = a->a + ii[row+4];
3930         sum1 = t[row];
3931         sum2 = t[row+1];
3932         sum3 = t[row+2];
3933         sum4 = t[row+3];
3934         sum5 = t[row+4];
3935         for (n = 0; n<sz-1; n+=2) {
3936           i1    = idx[0];
3937           i2    = idx[1];
3938           idx  += 2;
3939           tmp0  = t[i1];
3940           tmp1  = t[i2];
3941           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3942           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3943           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3944           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3945           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3946         }
3947 
3948         if (n == sz-1) {
3949           tmp0  = t[*idx];
3950           sum1 -= v1[0] * tmp0;
3951           sum2 -= v2[0] * tmp0;
3952           sum3 -= v3[0] * tmp0;
3953           sum4 -= v4[0] * tmp0;
3954           sum5 -= v5[0] * tmp0;
3955         }
3956         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3957         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3958         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3959         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3960         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3961         ibdiag   += 25; row += 5;
3962         break;
3963       default:
3964         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3965       }
3966     }
3967     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3968   }
3969   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3970   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3971   PetscFunctionReturn(0);
3972 }
3973 
3974 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3975 {
3976   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3977   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3978   const MatScalar   *bdiag = a->inode.bdiag;
3979   const PetscScalar *b;
3980   PetscErrorCode    ierr;
3981   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3982   const PetscInt    *sizes = a->inode.size;
3983 
3984   PetscFunctionBegin;
3985   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3986   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3987   cnt  = 0;
3988   for (i=0, row=0; i<m; i++) {
3989     switch (sizes[i]) {
3990     case 1:
3991       x[row] = b[row]*bdiag[cnt++];row++;
3992       break;
3993     case 2:
3994       x1       = b[row]; x2 = b[row+1];
3995       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3996       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3997       x[row++] = tmp1;
3998       x[row++] = tmp2;
3999       cnt     += 4;
4000       break;
4001     case 3:
4002       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4003       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4004       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4005       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4006       x[row++] = tmp1;
4007       x[row++] = tmp2;
4008       x[row++] = tmp3;
4009       cnt     += 9;
4010       break;
4011     case 4:
4012       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4013       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4014       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4015       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4016       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4017       x[row++] = tmp1;
4018       x[row++] = tmp2;
4019       x[row++] = tmp3;
4020       x[row++] = tmp4;
4021       cnt     += 16;
4022       break;
4023     case 5:
4024       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4025       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4026       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4027       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4028       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4029       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4030       x[row++] = tmp1;
4031       x[row++] = tmp2;
4032       x[row++] = tmp3;
4033       x[row++] = tmp4;
4034       x[row++] = tmp5;
4035       cnt     += 25;
4036       break;
4037     default:
4038       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4039     }
4040   }
4041   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4042   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4043   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4044   PetscFunctionReturn(0);
4045 }
4046 
4047 /*
4048     samestructure indicates that the matrix has not changed its nonzero structure so we
4049     do not need to recompute the inodes
4050 */
4051 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4052 {
4053   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4054   PetscErrorCode ierr;
4055   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4056   PetscBool      flag;
4057   const PetscInt *idx,*idy,*ii;
4058 
4059   PetscFunctionBegin;
4060   if (!a->inode.use) PetscFunctionReturn(0);
4061   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4062 
4063   m = A->rmap->n;
4064   if (a->inode.size) ns = a->inode.size;
4065   else {
4066     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4067   }
4068 
4069   i          = 0;
4070   node_count = 0;
4071   idx        = a->j;
4072   ii         = a->i;
4073   while (i < m) {                /* For each row */
4074     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4075     /* Limits the number of elements in a node to 'a->inode.limit' */
4076     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4077       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4078       if (nzy != nzx) break;
4079       idy += nzx;              /* Same nonzero pattern */
4080       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4081       if (!flag) break;
4082     }
4083     ns[node_count++] = blk_size;
4084     idx             += blk_size*nzx;
4085     i                = j;
4086   }
4087   /* If not enough inodes found,, do not use inode version of the routines */
4088   if (!m || node_count > .8*m) {
4089     ierr = PetscFree(ns);CHKERRQ(ierr);
4090 
4091     a->inode.node_count       = 0;
4092     a->inode.size             = NULL;
4093     a->inode.use              = PETSC_FALSE;
4094     A->ops->mult              = MatMult_SeqAIJ;
4095     A->ops->sor               = MatSOR_SeqAIJ;
4096     A->ops->multadd           = MatMultAdd_SeqAIJ;
4097     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4098     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4099     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4100     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4101     A->ops->coloringpatch     = 0;
4102     A->ops->multdiagonalblock = 0;
4103 
4104     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4105   } else {
4106     if (!A->factortype) {
4107       A->ops->mult              = MatMult_SeqAIJ_Inode;
4108       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4109       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4110       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4111       if (A->rmap->n == A->cmap->n) {
4112         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4113         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4114         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4115         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4116         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4117       }
4118     } else {
4119       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4120     }
4121     a->inode.node_count = node_count;
4122     a->inode.size       = ns;
4123     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4124   }
4125   a->inode.checked          = PETSC_TRUE;
4126   a->inode.mat_nonzerostate = A->nonzerostate;
4127   PetscFunctionReturn(0);
4128 }
4129 
4130 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4131 {
4132   Mat            B =*C;
4133   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4134   PetscErrorCode ierr;
4135   PetscInt       m=A->rmap->n;
4136 
4137   PetscFunctionBegin;
4138   c->inode.use       = a->inode.use;
4139   c->inode.limit     = a->inode.limit;
4140   c->inode.max_limit = a->inode.max_limit;
4141   if (a->inode.size) {
4142     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4143     c->inode.node_count = a->inode.node_count;
4144     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4145     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4146     if (!B->factortype) {
4147       B->ops->mult              = MatMult_SeqAIJ_Inode;
4148       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4149       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4150       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4151       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4152       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4153       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4154       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4155       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4156     } else {
4157       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4158     }
4159   } else {
4160     c->inode.size       = 0;
4161     c->inode.node_count = 0;
4162   }
4163   c->inode.ibdiagvalid = PETSC_FALSE;
4164   c->inode.ibdiag      = 0;
4165   c->inode.bdiag       = 0;
4166   PetscFunctionReturn(0);
4167 }
4168 
4169 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4170 {
4171   PetscInt       k;
4172   const PetscInt *vi;
4173 
4174   PetscFunctionBegin;
4175   vi = aj + ai[row];
4176   for (k=0; k<nzl; k++) cols[k] = vi[k];
4177   vi        = aj + adiag[row];
4178   cols[nzl] = vi[0];
4179   vi        = aj + adiag[row+1]+1;
4180   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4181   PetscFunctionReturn(0);
4182 }
4183 /*
4184    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4185    Modified from MatSeqAIJCheckInode().
4186 
4187    Input Parameters:
4188 .  Mat A - ILU or LU matrix factor
4189 
4190 */
4191 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4192 {
4193   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4194   PetscErrorCode ierr;
4195   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4196   PetscInt       *cols1,*cols2,*ns;
4197   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4198   PetscBool      flag;
4199 
4200   PetscFunctionBegin;
4201   if (!a->inode.use)    PetscFunctionReturn(0);
4202   if (a->inode.checked) PetscFunctionReturn(0);
4203 
4204   m = A->rmap->n;
4205   if (a->inode.size) ns = a->inode.size;
4206   else {
4207     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4208   }
4209 
4210   i          = 0;
4211   node_count = 0;
4212   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4213   while (i < m) {                /* For each row */
4214     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4215     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4216     nzx  = nzl1 + nzu1 + 1;
4217     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4218 
4219     /* Limits the number of elements in a node to 'a->inode.limit' */
4220     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4221       nzl2 = ai[j+1] - ai[j];
4222       nzu2 = adiag[j] - adiag[j+1] - 1;
4223       nzy  = nzl2 + nzu2 + 1;
4224       if (nzy != nzx) break;
4225       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4226       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4227       if (!flag) break;
4228     }
4229     ns[node_count++] = blk_size;
4230     i                = j;
4231   }
4232   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4233   /* If not enough inodes found,, do not use inode version of the routines */
4234   if (!m || node_count > .8*m) {
4235     ierr = PetscFree(ns);CHKERRQ(ierr);
4236 
4237     a->inode.node_count = 0;
4238     a->inode.size       = NULL;
4239     a->inode.use        = PETSC_FALSE;
4240 
4241     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4242   } else {
4243     A->ops->mult              = 0;
4244     A->ops->sor               = 0;
4245     A->ops->multadd           = 0;
4246     A->ops->getrowij          = 0;
4247     A->ops->restorerowij      = 0;
4248     A->ops->getcolumnij       = 0;
4249     A->ops->restorecolumnij   = 0;
4250     A->ops->coloringpatch     = 0;
4251     A->ops->multdiagonalblock = 0;
4252     a->inode.node_count       = node_count;
4253     a->inode.size             = ns;
4254 
4255     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4256   }
4257   a->inode.checked = PETSC_TRUE;
4258   PetscFunctionReturn(0);
4259 }
4260 
4261 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4262 {
4263   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4264 
4265   PetscFunctionBegin;
4266   a->inode.ibdiagvalid = PETSC_FALSE;
4267   PetscFunctionReturn(0);
4268 }
4269 
4270 /*
4271      This is really ugly. if inodes are used this replaces the
4272   permutations with ones that correspond to rows/cols of the matrix
4273   rather then inode blocks
4274 */
4275 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4276 {
4277   PetscErrorCode ierr;
4278 
4279   PetscFunctionBegin;
4280   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4281   PetscFunctionReturn(0);
4282 }
4283 
4284 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4285 {
4286   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4287   PetscErrorCode ierr;
4288   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4289   const PetscInt *ridx,*cidx;
4290   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4291   PetscInt       nslim_col,*ns_col;
4292   IS             ris = *rperm,cis = *cperm;
4293 
4294   PetscFunctionBegin;
4295   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4296   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4297 
4298   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4299   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4300   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4301 
4302   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4303   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4304 
4305   /* Form the inode structure for the rows of permuted matric using inv perm*/
4306   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4307 
4308   /* Construct the permutations for rows*/
4309   for (i=0,row = 0; i<nslim_row; ++i) {
4310     indx      = ridx[i];
4311     start_val = tns[indx];
4312     end_val   = tns[indx + 1];
4313     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4314   }
4315 
4316   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4317   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4318 
4319   /* Construct permutations for columns */
4320   for (i=0,col=0; i<nslim_col; ++i) {
4321     indx      = cidx[i];
4322     start_val = tns[indx];
4323     end_val   = tns[indx + 1];
4324     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4325   }
4326 
4327   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4328   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4329   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4330   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4331 
4332   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4333   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4334 
4335   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4336   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4337   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4338   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4339   ierr = PetscFree(tns);CHKERRQ(ierr);
4340   PetscFunctionReturn(0);
4341 }
4342 
4343 /*@C
4344    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4345 
4346    Not Collective
4347 
4348    Input Parameter:
4349 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4350 
4351    Output Parameter:
4352 +  node_count - no of inodes present in the matrix.
4353 .  sizes      - an array of size node_count,with sizes of each inode.
4354 -  limit      - the max size used to generate the inodes.
4355 
4356    Level: advanced
4357 
4358    Notes:
4359     This routine returns some internal storage information
4360    of the matrix, it is intended to be used by advanced users.
4361    It should be called after the matrix is assembled.
4362    The contents of the sizes[] array should not be changed.
4363    NULL may be passed for information not requested.
4364 
4365 .keywords: matrix, seqaij, get, inode
4366 
4367 .seealso: MatGetInfo()
4368 @*/
4369 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4370 {
4371   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4372 
4373   PetscFunctionBegin;
4374   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4375   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4376   if (f) {
4377     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4378   }
4379   PetscFunctionReturn(0);
4380 }
4381 
4382 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4383 {
4384   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4385 
4386   PetscFunctionBegin;
4387   if (node_count) *node_count = a->inode.node_count;
4388   if (sizes)      *sizes      = a->inode.size;
4389   if (limit)      *limit      = a->inode.limit;
4390   PetscFunctionReturn(0);
4391 }
4392