xref: /petsc/src/mat/impls/aij/seq/inode.c (revision a6dfd86ebdf75fa024070af26d51b62fd16650a3)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
11 {
12   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
24     for (; count+1 < n; count++,i++);
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n)  {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
67 
68   PetscFunctionBegin;
69   nslim_row = a->inode.node_count;
70   m         = A->rmap->n;
71   n         = A->cmap->n;
72   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
73 
74   /* Use the row_inode as column_inode */
75   nslim_col = nslim_row;
76   ns_col    = ns_row;
77 
78   /* allocate space for reformated inode structure */
79   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
80   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
81   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
82 
83   for (i1=0,col=0; i1<nslim_col; ++i1) {
84     nsz = ns_col[i1];
85     for (i2=0; i2<nsz; ++i2,++col)
86       tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     col  = *j++ + ishift;
101     i2   = tvc[col];
102     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103       ia[i1+1]++;
104       ia[i2+1]++;
105       i2++;                     /* Start col of next node */
106       while((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107       i2 = tvc[col];
108     }
109     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110   }
111 
112   /* shift ia[i] to point to next row */
113   for (i1=1; i1<nslim_row+1; i1++) {
114     row        = ia[i1-1];
115     ia[i1]    += row;
116     work[i1-1] = row - oshift;
117   }
118 
119   /* allocate space for column pointers */
120   nz   = ia[nslim_row] + (!ishift);
121   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
122   *jja = ja;
123 
124  /* loop over lower triangular part putting into ja */
125   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126     j    = aj + ai[row] + ishift;
127     jmax = aj + ai[row+1] + ishift;
128     col  = *j++ + ishift;
129     i2   = tvc[col];
130     while (i2<i1 && j<jmax) {
131       ja[work[i2]++] = i1 + oshift;
132       ja[work[i1]++] = i2 + oshift;
133       ++i2;
134       while((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135       i2 = tvc[col];
136     }
137     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138 
139   }
140   ierr = PetscFree(work);CHKERRQ(ierr);
141   ierr = PetscFree(tns);CHKERRQ(ierr);
142   ierr = PetscFree(tvc);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*
147       This builds nonsymmetric version of nonzero structure,
148 */
149 #undef __FUNCT__
150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152 {
153   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
154   PetscErrorCode ierr;
155   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
157 
158   PetscFunctionBegin;
159   nslim_row = a->inode.node_count;
160   n         = A->cmap->n;
161 
162   /* Create The column_inode for this matrix */
163   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
164 
165   /* allocate space for reformated column_inode structure */
166   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
167   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
168   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
169 
170   for (i1=0,col=0; i1<nslim_col; ++i1) {
171     nsz = ns_col[i1];
172     for (i2=0; i2<nsz; ++i2,++col)
173       tvc[col] = i1;
174   }
175   /* allocate space for row pointers */
176   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
177   *iia = ia;
178   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
180 
181   /* determine the number of columns in each row */
182   ia[0] = oshift;
183   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184     j   = aj + ai[row] + ishift;
185     col = *j++ + ishift;
186     i2  = tvc[col];
187     nz  = ai[row+1] - ai[row];
188     while (nz-- > 0) {           /* off-diagonal elemets */
189       ia[i1+1]++;
190       i2++;                     /* Start col of next node */
191       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
192       if (nz > 0) i2 = tvc[col];
193     }
194   }
195 
196   /* shift ia[i] to point to next row */
197   for (i1=1; i1<nslim_row+1; i1++) {
198     row        = ia[i1-1];
199     ia[i1]    += row;
200     work[i1-1] = row - oshift;
201   }
202 
203   /* allocate space for column pointers */
204   nz   = ia[nslim_row] + (!ishift);
205   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
206   *jja = ja;
207 
208  /* loop over matrix putting into ja */
209   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210     j   = aj + ai[row] + ishift;
211     col = *j++ + ishift;
212     i2  = tvc[col];
213     nz  = ai[row+1] - ai[row];
214     while (nz-- > 0) {
215       ja[work[i1]++] = i2 + oshift;
216       ++i2;
217       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
218       if (nz > 0) i2 = tvc[col];
219     }
220   }
221   ierr = PetscFree(ns_col);CHKERRQ(ierr);
222   ierr = PetscFree(work);CHKERRQ(ierr);
223   ierr = PetscFree(tns);CHKERRQ(ierr);
224   ierr = PetscFree(tvc);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231 {
232   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   *n     = a->inode.node_count;
237   if (!ia) PetscFunctionReturn(0);
238   if (!blockcompressed) {
239     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
240   } else if (symmetric) {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   } else {
243     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   if (!ia) PetscFunctionReturn(0);
256 
257   if (!blockcompressed) {
258     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
259   } else {
260     ierr = PetscFree(*ia);CHKERRQ(ierr);
261     ierr = PetscFree(*ja);CHKERRQ(ierr);
262   }
263 
264   PetscFunctionReturn(0);
265 }
266 
267 /* ----------------------------------------------------------- */
268 
269 #undef __FUNCT__
270 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
271 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
272 {
273   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
274   PetscErrorCode ierr;
275   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
276   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
277 
278   PetscFunctionBegin;
279   nslim_row = a->inode.node_count;
280   n         = A->cmap->n;
281 
282   /* Create The column_inode for this matrix */
283   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
284 
285   /* allocate space for reformated column_inode structure */
286   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
287   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
288   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
289 
290   for (i1=0,col=0; i1<nslim_col; ++i1) {
291     nsz = ns_col[i1];
292     for (i2=0; i2<nsz; ++i2,++col)
293       tvc[col] = i1;
294   }
295   /* allocate space for column pointers */
296   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
297   *iia = ia;
298   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
299   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
300 
301   /* determine the number of columns in each row */
302   ia[0] = oshift;
303   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
304     j   = aj + ai[row] + ishift;
305     col = *j++ + ishift;
306     i2  = tvc[col];
307     nz  = ai[row+1] - ai[row];
308     while (nz-- > 0) {           /* off-diagonal elemets */
309       /* ia[i1+1]++; */
310       ia[i2+1]++;
311       i2++;
312       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
313       if (nz > 0) i2 = tvc[col];
314     }
315   }
316 
317   /* shift ia[i] to point to next col */
318   for (i1=1; i1<nslim_col+1; i1++) {
319     col        = ia[i1-1];
320     ia[i1]    += col;
321     work[i1-1] = col - oshift;
322   }
323 
324   /* allocate space for column pointers */
325   nz   = ia[nslim_col] + (!ishift);
326   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
327   *jja = ja;
328 
329  /* loop over matrix putting into ja */
330   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
331     j   = aj + ai[row] + ishift;
332     col = *j++ + ishift;
333     i2  = tvc[col];
334     nz  = ai[row+1] - ai[row];
335     while (nz-- > 0) {
336       /* ja[work[i1]++] = i2 + oshift; */
337       ja[work[i2]++] = i1 + oshift;
338       i2++;
339       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
340       if (nz > 0) i2 = tvc[col];
341     }
342   }
343   ierr = PetscFree(ns_col);CHKERRQ(ierr);
344   ierr = PetscFree(work);CHKERRQ(ierr);
345   ierr = PetscFree(tns);CHKERRQ(ierr);
346   ierr = PetscFree(tvc);CHKERRQ(ierr);
347   PetscFunctionReturn(0);
348 }
349 
350 #undef __FUNCT__
351 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
352 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
353 {
354   PetscErrorCode ierr;
355 
356   PetscFunctionBegin;
357   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
358   if (!ia) PetscFunctionReturn(0);
359 
360   if (!blockcompressed) {
361     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
362   } else if (symmetric) {
363     /* Since the indices are symmetric it does'nt matter */
364     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
365   } else {
366     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
367   }
368   PetscFunctionReturn(0);
369 }
370 
371 #undef __FUNCT__
372 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
373 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
374 {
375   PetscErrorCode ierr;
376 
377   PetscFunctionBegin;
378   if (!ia) PetscFunctionReturn(0);
379   if (!blockcompressed) {
380     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
381   } else {
382     ierr = PetscFree(*ia);CHKERRQ(ierr);
383     ierr = PetscFree(*ja);CHKERRQ(ierr);
384   }
385   PetscFunctionReturn(0);
386 }
387 
388 /* ----------------------------------------------------------- */
389 
390 #undef __FUNCT__
391 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
392 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
393 {
394   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
395   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
396   PetscScalar       *y;
397   const PetscScalar *x;
398   const MatScalar   *v1,*v2,*v3,*v4,*v5;
399   PetscErrorCode    ierr;
400   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
401 
402 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
403 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
404 #endif
405 
406   PetscFunctionBegin;
407   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
408   node_max = a->inode.node_count;
409   ns       = a->inode.size;     /* Node Size array */
410   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
411   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
412   idx  = a->j;
413   v1   = a->a;
414   ii   = a->i;
415 
416   for (i = 0,row = 0; i< node_max; ++i) {
417     nsz  = ns[i];
418     n    = ii[1] - ii[0];
419     nonzerorow += (n>0)*nsz;
420     ii  += nsz;
421     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
422     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
423     sz   = n;                   /* No of non zeros in this row */
424                                 /* Switch on the size of Node */
425     switch (nsz) {               /* Each loop in 'case' is unrolled */
426     case 1 :
427       sum1  = 0.;
428 
429       for (n = 0; n< sz-1; n+=2) {
430         i1   = idx[0];          /* The instructions are ordered to */
431         i2   = idx[1];          /* make the compiler's job easy */
432         idx += 2;
433         tmp0 = x[i1];
434         tmp1 = x[i2];
435         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
436        }
437 
438       if (n == sz-1) {          /* Take care of the last nonzero  */
439         tmp0  = x[*idx++];
440         sum1 += *v1++ * tmp0;
441       }
442       y[row++]=sum1;
443       break;
444     case 2:
445       sum1  = 0.;
446       sum2  = 0.;
447       v2    = v1 + n;
448 
449       for (n = 0; n< sz-1; n+=2) {
450         i1   = idx[0];
451         i2   = idx[1];
452         idx += 2;
453         tmp0 = x[i1];
454         tmp1 = x[i2];
455         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
456         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
457       }
458       if (n == sz-1) {
459         tmp0  = x[*idx++];
460         sum1 += *v1++ * tmp0;
461         sum2 += *v2++ * tmp0;
462       }
463       y[row++]=sum1;
464       y[row++]=sum2;
465       v1      =v2;              /* Since the next block to be processed starts there*/
466       idx    +=sz;
467       break;
468     case 3:
469       sum1  = 0.;
470       sum2  = 0.;
471       sum3  = 0.;
472       v2    = v1 + n;
473       v3    = v2 + n;
474 
475       for (n = 0; n< sz-1; n+=2) {
476         i1   = idx[0];
477         i2   = idx[1];
478         idx += 2;
479         tmp0 = x[i1];
480         tmp1 = x[i2];
481         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
482         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
483         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
484       }
485       if (n == sz-1) {
486         tmp0  = x[*idx++];
487         sum1 += *v1++ * tmp0;
488         sum2 += *v2++ * tmp0;
489         sum3 += *v3++ * tmp0;
490       }
491       y[row++]=sum1;
492       y[row++]=sum2;
493       y[row++]=sum3;
494       v1       =v3;             /* Since the next block to be processed starts there*/
495       idx     +=2*sz;
496       break;
497     case 4:
498       sum1  = 0.;
499       sum2  = 0.;
500       sum3  = 0.;
501       sum4  = 0.;
502       v2    = v1 + n;
503       v3    = v2 + n;
504       v4    = v3 + n;
505 
506       for (n = 0; n< sz-1; n+=2) {
507         i1   = idx[0];
508         i2   = idx[1];
509         idx += 2;
510         tmp0 = x[i1];
511         tmp1 = x[i2];
512         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
513         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
514         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
515         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
516       }
517       if (n == sz-1) {
518         tmp0  = x[*idx++];
519         sum1 += *v1++ * tmp0;
520         sum2 += *v2++ * tmp0;
521         sum3 += *v3++ * tmp0;
522         sum4 += *v4++ * tmp0;
523       }
524       y[row++]=sum1;
525       y[row++]=sum2;
526       y[row++]=sum3;
527       y[row++]=sum4;
528       v1      =v4;              /* Since the next block to be processed starts there*/
529       idx    +=3*sz;
530       break;
531     case 5:
532       sum1  = 0.;
533       sum2  = 0.;
534       sum3  = 0.;
535       sum4  = 0.;
536       sum5  = 0.;
537       v2    = v1 + n;
538       v3    = v2 + n;
539       v4    = v3 + n;
540       v5    = v4 + n;
541 
542       for (n = 0; n<sz-1; n+=2) {
543         i1   = idx[0];
544         i2   = idx[1];
545         idx += 2;
546         tmp0 = x[i1];
547         tmp1 = x[i2];
548         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
549         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
550         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
551         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
552         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
553       }
554       if (n == sz-1) {
555         tmp0  = x[*idx++];
556         sum1 += *v1++ * tmp0;
557         sum2 += *v2++ * tmp0;
558         sum3 += *v3++ * tmp0;
559         sum4 += *v4++ * tmp0;
560         sum5 += *v5++ * tmp0;
561       }
562       y[row++]=sum1;
563       y[row++]=sum2;
564       y[row++]=sum3;
565       y[row++]=sum4;
566       y[row++]=sum5;
567       v1      =v5;       /* Since the next block to be processed starts there */
568       idx    +=4*sz;
569       break;
570     default :
571       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
572     }
573   }
574   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
575   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
576   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
577   PetscFunctionReturn(0);
578 }
579 /* ----------------------------------------------------------- */
580 /* Almost same code as the MatMult_SeqAIJ_Inode() */
581 #undef __FUNCT__
582 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
583 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
584 {
585   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
586   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
587   MatScalar      *v1,*v2,*v3,*v4,*v5;
588   PetscScalar    *x,*y,*z,*zt;
589   PetscErrorCode ierr;
590   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
591 
592   PetscFunctionBegin;
593   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
594   node_max = a->inode.node_count;
595   ns       = a->inode.size;     /* Node Size array */
596   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
597   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
598   if (zz != yy) {
599     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
600   } else {
601     z = y;
602   }
603   zt = z;
604 
605   idx  = a->j;
606   v1   = a->a;
607   ii   = a->i;
608 
609   for (i = 0,row = 0; i< node_max; ++i) {
610     nsz  = ns[i];
611     n    = ii[1] - ii[0];
612     ii  += nsz;
613     sz   = n;                   /* No of non zeros in this row */
614                                 /* Switch on the size of Node */
615     switch (nsz) {               /* Each loop in 'case' is unrolled */
616     case 1 :
617       sum1  = *zt++;
618 
619       for (n = 0; n< sz-1; n+=2) {
620         i1   = idx[0];          /* The instructions are ordered to */
621         i2   = idx[1];          /* make the compiler's job easy */
622         idx += 2;
623         tmp0 = x[i1];
624         tmp1 = x[i2];
625         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
626        }
627 
628       if (n   == sz-1) {          /* Take care of the last nonzero  */
629         tmp0  = x[*idx++];
630         sum1 += *v1++ * tmp0;
631       }
632       y[row++]=sum1;
633       break;
634     case 2:
635       sum1  = *zt++;
636       sum2  = *zt++;
637       v2    = v1 + n;
638 
639       for (n = 0; n< sz-1; n+=2) {
640         i1   = idx[0];
641         i2   = idx[1];
642         idx += 2;
643         tmp0 = x[i1];
644         tmp1 = x[i2];
645         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
646         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
647       }
648       if (n   == sz-1) {
649         tmp0  = x[*idx++];
650         sum1 += *v1++ * tmp0;
651         sum2 += *v2++ * tmp0;
652       }
653       y[row++]=sum1;
654       y[row++]=sum2;
655       v1      =v2;              /* Since the next block to be processed starts there*/
656       idx    +=sz;
657       break;
658     case 3:
659       sum1  = *zt++;
660       sum2  = *zt++;
661       sum3  = *zt++;
662       v2    = v1 + n;
663       v3    = v2 + n;
664 
665       for (n = 0; n< sz-1; n+=2) {
666         i1   = idx[0];
667         i2   = idx[1];
668         idx += 2;
669         tmp0 = x[i1];
670         tmp1 = x[i2];
671         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
672         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
673         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
674       }
675       if (n == sz-1) {
676         tmp0  = x[*idx++];
677         sum1 += *v1++ * tmp0;
678         sum2 += *v2++ * tmp0;
679         sum3 += *v3++ * tmp0;
680       }
681       y[row++]=sum1;
682       y[row++]=sum2;
683       y[row++]=sum3;
684       v1       =v3;             /* Since the next block to be processed starts there*/
685       idx     +=2*sz;
686       break;
687     case 4:
688       sum1  = *zt++;
689       sum2  = *zt++;
690       sum3  = *zt++;
691       sum4  = *zt++;
692       v2    = v1 + n;
693       v3    = v2 + n;
694       v4    = v3 + n;
695 
696       for (n = 0; n< sz-1; n+=2) {
697         i1   = idx[0];
698         i2   = idx[1];
699         idx += 2;
700         tmp0 = x[i1];
701         tmp1 = x[i2];
702         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
703         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
704         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
705         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
706       }
707       if (n == sz-1) {
708         tmp0  = x[*idx++];
709         sum1 += *v1++ * tmp0;
710         sum2 += *v2++ * tmp0;
711         sum3 += *v3++ * tmp0;
712         sum4 += *v4++ * tmp0;
713       }
714       y[row++]=sum1;
715       y[row++]=sum2;
716       y[row++]=sum3;
717       y[row++]=sum4;
718       v1      =v4;              /* Since the next block to be processed starts there*/
719       idx    +=3*sz;
720       break;
721     case 5:
722       sum1  = *zt++;
723       sum2  = *zt++;
724       sum3  = *zt++;
725       sum4  = *zt++;
726       sum5  = *zt++;
727       v2    = v1 + n;
728       v3    = v2 + n;
729       v4    = v3 + n;
730       v5    = v4 + n;
731 
732       for (n = 0; n<sz-1; n+=2) {
733         i1   = idx[0];
734         i2   = idx[1];
735         idx += 2;
736         tmp0 = x[i1];
737         tmp1 = x[i2];
738         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
739         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
740         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
741         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
742         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
743       }
744       if (n   == sz-1) {
745         tmp0  = x[*idx++];
746         sum1 += *v1++ * tmp0;
747         sum2 += *v2++ * tmp0;
748         sum3 += *v3++ * tmp0;
749         sum4 += *v4++ * tmp0;
750         sum5 += *v5++ * tmp0;
751       }
752       y[row++]=sum1;
753       y[row++]=sum2;
754       y[row++]=sum3;
755       y[row++]=sum4;
756       y[row++]=sum5;
757       v1      =v5;       /* Since the next block to be processed starts there */
758       idx    +=4*sz;
759       break;
760     default :
761       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
762     }
763   }
764   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
765   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
766   if (zz != yy) {
767     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
768   }
769   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
770   PetscFunctionReturn(0);
771 }
772 
773 /* ----------------------------------------------------------- */
774 #undef __FUNCT__
775 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
776 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
777 {
778   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
779   IS                iscol = a->col,isrow = a->row;
780   PetscErrorCode    ierr;
781   const PetscInt    *r,*c,*rout,*cout;
782   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
783   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
784   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
785   PetscScalar       sum1,sum2,sum3,sum4,sum5;
786   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
787   const PetscScalar *b;
788 
789   PetscFunctionBegin;
790   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
791   node_max = a->inode.node_count;
792   ns       = a->inode.size;     /* Node Size array */
793 
794   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
795   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
796   tmp  = a->solve_work;
797 
798   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
799   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
800 
801   /* forward solve the lower triangular */
802   tmps = tmp ;
803   aa   = a_a ;
804   aj   = a_j ;
805   ad   = a->diag;
806 
807   for (i = 0,row = 0; i< node_max; ++i) {
808     nsz = ns[i];
809     aii = ai[row];
810     v1  = aa + aii;
811     vi  = aj + aii;
812     nz  = ad[row]- aii;
813     if (i < node_max-1) {
814       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
815       * but our indexing to determine it's size could. */
816       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
817       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
818       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
819       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
820     }
821 
822     switch (nsz) {               /* Each loop in 'case' is unrolled */
823     case 1 :
824       sum1 = b[*r++];
825       for (j=0; j<nz-1; j+=2) {
826         i0   = vi[0];
827         i1   = vi[1];
828         vi  +=2;
829         tmp0 = tmps[i0];
830         tmp1 = tmps[i1];
831         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
832       }
833       if (j == nz-1) {
834         tmp0 = tmps[*vi++];
835         sum1 -= *v1++ *tmp0;
836       }
837       tmp[row ++]=sum1;
838       break;
839     case 2:
840       sum1 = b[*r++];
841       sum2 = b[*r++];
842       v2   = aa + ai[row+1];
843 
844       for (j=0; j<nz-1; j+=2) {
845         i0   = vi[0];
846         i1   = vi[1];
847         vi  +=2;
848         tmp0 = tmps[i0];
849         tmp1 = tmps[i1];
850         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
851         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
852       }
853       if (j == nz-1) {
854         tmp0 = tmps[*vi++];
855         sum1 -= *v1++ *tmp0;
856         sum2 -= *v2++ *tmp0;
857       }
858       sum2 -= *v2++ * sum1;
859       tmp[row ++]=sum1;
860       tmp[row ++]=sum2;
861       break;
862     case 3:
863       sum1 = b[*r++];
864       sum2 = b[*r++];
865       sum3 = b[*r++];
866       v2   = aa + ai[row+1];
867       v3   = aa + ai[row+2];
868 
869       for (j=0; j<nz-1; j+=2) {
870         i0   = vi[0];
871         i1   = vi[1];
872         vi  +=2;
873         tmp0 = tmps[i0];
874         tmp1 = tmps[i1];
875         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
876         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
877         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
878       }
879       if (j == nz-1) {
880         tmp0 = tmps[*vi++];
881         sum1 -= *v1++ *tmp0;
882         sum2 -= *v2++ *tmp0;
883         sum3 -= *v3++ *tmp0;
884       }
885       sum2 -= *v2++ * sum1;
886       sum3 -= *v3++ * sum1;
887       sum3 -= *v3++ * sum2;
888       tmp[row ++]=sum1;
889       tmp[row ++]=sum2;
890       tmp[row ++]=sum3;
891       break;
892 
893     case 4:
894       sum1 = b[*r++];
895       sum2 = b[*r++];
896       sum3 = b[*r++];
897       sum4 = b[*r++];
898       v2   = aa + ai[row+1];
899       v3   = aa + ai[row+2];
900       v4   = aa + ai[row+3];
901 
902       for (j=0; j<nz-1; j+=2) {
903         i0   = vi[0];
904         i1   = vi[1];
905         vi  +=2;
906         tmp0 = tmps[i0];
907         tmp1 = tmps[i1];
908         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
909         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
910         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
911         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
912       }
913       if (j == nz-1) {
914         tmp0 = tmps[*vi++];
915         sum1 -= *v1++ *tmp0;
916         sum2 -= *v2++ *tmp0;
917         sum3 -= *v3++ *tmp0;
918         sum4 -= *v4++ *tmp0;
919       }
920       sum2 -= *v2++ * sum1;
921       sum3 -= *v3++ * sum1;
922       sum4 -= *v4++ * sum1;
923       sum3 -= *v3++ * sum2;
924       sum4 -= *v4++ * sum2;
925       sum4 -= *v4++ * sum3;
926 
927       tmp[row ++]=sum1;
928       tmp[row ++]=sum2;
929       tmp[row ++]=sum3;
930       tmp[row ++]=sum4;
931       break;
932     case 5:
933       sum1 = b[*r++];
934       sum2 = b[*r++];
935       sum3 = b[*r++];
936       sum4 = b[*r++];
937       sum5 = b[*r++];
938       v2   = aa + ai[row+1];
939       v3   = aa + ai[row+2];
940       v4   = aa + ai[row+3];
941       v5   = aa + ai[row+4];
942 
943       for (j=0; j<nz-1; j+=2) {
944         i0   = vi[0];
945         i1   = vi[1];
946         vi  +=2;
947         tmp0 = tmps[i0];
948         tmp1 = tmps[i1];
949         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
950         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
951         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
952         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
953         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
954       }
955       if (j == nz-1) {
956         tmp0 = tmps[*vi++];
957         sum1 -= *v1++ *tmp0;
958         sum2 -= *v2++ *tmp0;
959         sum3 -= *v3++ *tmp0;
960         sum4 -= *v4++ *tmp0;
961         sum5 -= *v5++ *tmp0;
962       }
963 
964       sum2 -= *v2++ * sum1;
965       sum3 -= *v3++ * sum1;
966       sum4 -= *v4++ * sum1;
967       sum5 -= *v5++ * sum1;
968       sum3 -= *v3++ * sum2;
969       sum4 -= *v4++ * sum2;
970       sum5 -= *v5++ * sum2;
971       sum4 -= *v4++ * sum3;
972       sum5 -= *v5++ * sum3;
973       sum5 -= *v5++ * sum4;
974 
975       tmp[row ++]=sum1;
976       tmp[row ++]=sum2;
977       tmp[row ++]=sum3;
978       tmp[row ++]=sum4;
979       tmp[row ++]=sum5;
980       break;
981     default:
982       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
983     }
984   }
985   /* backward solve the upper triangular */
986   for (i=node_max -1 ,row = n-1 ; i>=0; i--) {
987     nsz = ns[i];
988     aii = ai[row+1] -1;
989     v1  = aa + aii;
990     vi  = aj + aii;
991     nz  = aii- ad[row];
992     switch (nsz) {               /* Each loop in 'case' is unrolled */
993     case 1 :
994       sum1 = tmp[row];
995 
996       for (j=nz ; j>1; j-=2) {
997         vi  -=2;
998         i0   = vi[2];
999         i1   = vi[1];
1000         tmp0 = tmps[i0];
1001         tmp1 = tmps[i1];
1002         v1   -= 2;
1003         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1004       }
1005       if (j==1) {
1006         tmp0  = tmps[*vi--];
1007         sum1 -= *v1-- * tmp0;
1008       }
1009       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1010       break;
1011     case 2 :
1012       sum1 = tmp[row];
1013       sum2 = tmp[row -1];
1014       v2   = aa + ai[row]-1;
1015       for (j=nz ; j>1; j-=2) {
1016         vi  -=2;
1017         i0   = vi[2];
1018         i1   = vi[1];
1019         tmp0 = tmps[i0];
1020         tmp1 = tmps[i1];
1021         v1   -= 2;
1022         v2   -= 2;
1023         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1024         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1025       }
1026       if (j==1) {
1027         tmp0  = tmps[*vi--];
1028         sum1 -= *v1-- * tmp0;
1029         sum2 -= *v2-- * tmp0;
1030       }
1031 
1032       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1033       sum2   -= *v2-- * tmp0;
1034       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1035       break;
1036     case 3 :
1037       sum1 = tmp[row];
1038       sum2 = tmp[row -1];
1039       sum3 = tmp[row -2];
1040       v2   = aa + ai[row]-1;
1041       v3   = aa + ai[row -1]-1;
1042       for (j=nz ; j>1; j-=2) {
1043         vi  -=2;
1044         i0   = vi[2];
1045         i1   = vi[1];
1046         tmp0 = tmps[i0];
1047         tmp1 = tmps[i1];
1048         v1   -= 2;
1049         v2   -= 2;
1050         v3   -= 2;
1051         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1052         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1053         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1054       }
1055       if (j==1) {
1056         tmp0  = tmps[*vi--];
1057         sum1 -= *v1-- * tmp0;
1058         sum2 -= *v2-- * tmp0;
1059         sum3 -= *v3-- * tmp0;
1060       }
1061       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1062       sum2   -= *v2-- * tmp0;
1063       sum3   -= *v3-- * tmp0;
1064       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1065       sum3   -= *v3-- * tmp0;
1066       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1067 
1068       break;
1069     case 4 :
1070       sum1 = tmp[row];
1071       sum2 = tmp[row -1];
1072       sum3 = tmp[row -2];
1073       sum4 = tmp[row -3];
1074       v2   = aa + ai[row]-1;
1075       v3   = aa + ai[row -1]-1;
1076       v4   = aa + ai[row -2]-1;
1077 
1078       for (j=nz ; j>1; j-=2) {
1079         vi  -=2;
1080         i0   = vi[2];
1081         i1   = vi[1];
1082         tmp0 = tmps[i0];
1083         tmp1 = tmps[i1];
1084         v1  -= 2;
1085         v2  -= 2;
1086         v3  -= 2;
1087         v4  -= 2;
1088         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1089         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1090         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1091         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1092       }
1093       if (j==1) {
1094         tmp0  = tmps[*vi--];
1095         sum1 -= *v1-- * tmp0;
1096         sum2 -= *v2-- * tmp0;
1097         sum3 -= *v3-- * tmp0;
1098         sum4 -= *v4-- * tmp0;
1099       }
1100 
1101       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1102       sum2   -= *v2-- * tmp0;
1103       sum3   -= *v3-- * tmp0;
1104       sum4   -= *v4-- * tmp0;
1105       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1106       sum3   -= *v3-- * tmp0;
1107       sum4   -= *v4-- * tmp0;
1108       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1109       sum4   -= *v4-- * tmp0;
1110       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1111       break;
1112     case 5 :
1113       sum1 = tmp[row];
1114       sum2 = tmp[row -1];
1115       sum3 = tmp[row -2];
1116       sum4 = tmp[row -3];
1117       sum5 = tmp[row -4];
1118       v2   = aa + ai[row]-1;
1119       v3   = aa + ai[row -1]-1;
1120       v4   = aa + ai[row -2]-1;
1121       v5   = aa + ai[row -3]-1;
1122       for (j=nz ; j>1; j-=2) {
1123         vi  -= 2;
1124         i0   = vi[2];
1125         i1   = vi[1];
1126         tmp0 = tmps[i0];
1127         tmp1 = tmps[i1];
1128         v1   -= 2;
1129         v2   -= 2;
1130         v3   -= 2;
1131         v4   -= 2;
1132         v5   -= 2;
1133         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1134         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1135         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1136         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1137         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1138       }
1139       if (j==1) {
1140         tmp0  = tmps[*vi--];
1141         sum1 -= *v1-- * tmp0;
1142         sum2 -= *v2-- * tmp0;
1143         sum3 -= *v3-- * tmp0;
1144         sum4 -= *v4-- * tmp0;
1145         sum5 -= *v5-- * tmp0;
1146       }
1147 
1148       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1149       sum2   -= *v2-- * tmp0;
1150       sum3   -= *v3-- * tmp0;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1154       sum3   -= *v3-- * tmp0;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1158       sum4   -= *v4-- * tmp0;
1159       sum5   -= *v5-- * tmp0;
1160       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1161       sum5   -= *v5-- * tmp0;
1162       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1163       break;
1164     default:
1165       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1166     }
1167   }
1168   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1169   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1170   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1173   PetscFunctionReturn(0);
1174 }
1175 
1176 #undef __FUNCT__
1177 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1178 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1179 {
1180   Mat              C=B;
1181   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1182   IS               isrow = b->row,isicol = b->icol;
1183   PetscErrorCode   ierr;
1184   const PetscInt   *r,*ic,*ics;
1185   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1186   PetscInt         i,j,k,nz,nzL,row,*pj;
1187   const PetscInt   *ajtmp,*bjtmp;
1188   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1189   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1190   FactorShiftCtx   sctx;
1191   const PetscInt   *ddiag;
1192   PetscReal        rs;
1193   MatScalar        d;
1194   PetscInt         inod,nodesz,node_max,col;
1195   const PetscInt   *ns;
1196   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1197 
1198   PetscFunctionBegin;
1199   /* MatPivotSetUp(): initialize shift context sctx */
1200   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1201 
1202   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1203     ddiag          = a->diag;
1204     sctx.shift_top = info->zeropivot;
1205     for (i=0; i<n; i++) {
1206       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1207       d  = (aa)[ddiag[i]];
1208       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1209       v  = aa+ai[i];
1210       nz = ai[i+1] - ai[i];
1211       for (j=0; j<nz; j++)
1212         rs += PetscAbsScalar(v[j]);
1213       if (rs>sctx.shift_top) sctx.shift_top = rs;
1214     }
1215     sctx.shift_top   *= 1.1;
1216     sctx.nshift_max   = 5;
1217     sctx.shift_lo     = 0.;
1218     sctx.shift_hi     = 1.;
1219   }
1220 
1221   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1222   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1223 
1224   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1225   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1226   rtmp2 = rtmp1 + n;
1227   rtmp3 = rtmp2 + n;
1228   rtmp4 = rtmp3 + n;
1229   ics   = ic;
1230 
1231   node_max = a->inode.node_count;
1232   ns       = a->inode.size;
1233   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1234 
1235   /* If max inode size > 4, split it into two inodes.*/
1236   /* also map the inode sizes according to the ordering */
1237   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1238   for (i=0,j=0; i<node_max; ++i,++j) {
1239     if (ns[i] > 4) {
1240       tmp_vec1[j] = 4;
1241       ++j;
1242       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1243     } else {
1244       tmp_vec1[j] = ns[i];
1245     }
1246   }
1247   /* Use the correct node_max */
1248   node_max = j;
1249 
1250   /* Now reorder the inode info based on mat re-ordering info */
1251   /* First create a row -> inode_size_array_index map */
1252   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1253   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1254   for (i=0,row=0; i<node_max; i++) {
1255     nodesz = tmp_vec1[i];
1256     for (j=0; j<nodesz; j++,row++) {
1257       nsmap[row] = i;
1258     }
1259   }
1260   /* Using nsmap, create a reordered ns structure */
1261   for (i=0,j=0; i< node_max; i++) {
1262     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1263     tmp_vec2[i]  = nodesz;
1264     j           += nodesz;
1265   }
1266   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1267   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1268 
1269   /* Now use the correct ns */
1270   ns = tmp_vec2;
1271 
1272   do {
1273     sctx.newshift = PETSC_FALSE;
1274     /* Now loop over each block-row, and do the factorization */
1275     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1276       nodesz = ns[inod];
1277 
1278       switch (nodesz) {
1279       case 1:
1280       /*----------*/
1281         /* zero rtmp1 */
1282         /* L part */
1283         nz    = bi[i+1] - bi[i];
1284         bjtmp = bj + bi[i];
1285         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1286 
1287         /* U part */
1288         nz = bdiag[i]-bdiag[i+1];
1289         bjtmp = bj + bdiag[i+1]+1;
1290         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1291 
1292         /* load in initial (unfactored row) */
1293         nz    = ai[r[i]+1] - ai[r[i]];
1294         ajtmp = aj + ai[r[i]];
1295         v     = aa + ai[r[i]];
1296         for (j=0; j<nz; j++) {
1297           rtmp1[ics[ajtmp[j]]] = v[j];
1298         }
1299         /* ZeropivotApply() */
1300         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1301 
1302         /* elimination */
1303         bjtmp = bj + bi[i];
1304         row   = *bjtmp++;
1305         nzL   = bi[i+1] - bi[i];
1306         for (k=0; k < nzL;k++) {
1307           pc = rtmp1 + row;
1308           if (*pc != 0.0) {
1309             pv   = b->a + bdiag[row];
1310             mul1 = *pc * (*pv);
1311             *pc  = mul1;
1312             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1313             pv = b->a + bdiag[row+1]+1;
1314             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1315             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1316             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1317           }
1318           row = *bjtmp++;
1319         }
1320 
1321         /* finished row so stick it into b->a */
1322         rs = 0.0;
1323         /* L part */
1324         pv = b->a + bi[i] ;
1325         pj = b->j + bi[i] ;
1326         nz = bi[i+1] - bi[i];
1327         for (j=0; j<nz; j++) {
1328           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1329         }
1330 
1331         /* U part */
1332         pv = b->a + bdiag[i+1]+1;
1333         pj = b->j + bdiag[i+1]+1;
1334         nz = bdiag[i] - bdiag[i+1]-1;
1335         for (j=0; j<nz; j++) {
1336           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1337         }
1338 
1339         /* Check zero pivot */
1340         sctx.rs = rs;
1341         sctx.pv = rtmp1[i];
1342         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1343         if (sctx.newshift) break;
1344 
1345         /* Mark diagonal and invert diagonal for simplier triangular solves */
1346         pv  = b->a + bdiag[i];
1347         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1348         break;
1349 
1350       case 2:
1351       /*----------*/
1352         /* zero rtmp1 and rtmp2 */
1353         /* L part */
1354         nz    = bi[i+1] - bi[i];
1355         bjtmp = bj + bi[i];
1356         for  (j=0; j<nz; j++) {
1357           col = bjtmp[j];
1358           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1359         }
1360 
1361         /* U part */
1362         nz = bdiag[i]-bdiag[i+1];
1363         bjtmp = bj + bdiag[i+1]+1;
1364         for  (j=0; j<nz; j++) {
1365           col = bjtmp[j];
1366           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1367         }
1368 
1369         /* load in initial (unfactored row) */
1370         nz    = ai[r[i]+1] - ai[r[i]];
1371         ajtmp = aj + ai[r[i]];
1372         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1373         for (j=0; j<nz; j++) {
1374           col = ics[ajtmp[j]];
1375           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1376         }
1377         /* ZeropivotApply(): shift the diagonal of the matrix  */
1378         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1379 
1380         /* elimination */
1381         bjtmp = bj + bi[i];
1382         row   = *bjtmp++; /* pivot row */
1383         nzL   = bi[i+1] - bi[i];
1384         for (k=0; k < nzL;k++) {
1385           pc1 = rtmp1 + row;
1386           pc2 = rtmp2 + row;
1387           if (*pc1 != 0.0 || *pc2 != 0.0) {
1388             pv   = b->a + bdiag[row];
1389             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1390             *pc1 = mul1;       *pc2 = mul2;
1391 
1392             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1393             pv = b->a + bdiag[row+1]+1;
1394             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1395             for (j=0; j<nz; j++) {
1396               col = pj[j];
1397               rtmp1[col] -= mul1 * pv[j];
1398               rtmp2[col] -= mul2 * pv[j];
1399             }
1400             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1401           }
1402           row = *bjtmp++;
1403         }
1404 
1405         /* finished row i; check zero pivot, then stick row i into b->a */
1406         rs  = 0.0;
1407         /* L part */
1408         pc1 = b->a + bi[i];
1409         pj  = b->j + bi[i] ;
1410         nz  = bi[i+1] - bi[i];
1411         for (j=0; j<nz; j++) {
1412           col = pj[j];
1413           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1414         }
1415         /* U part */
1416         pc1 = b->a + bdiag[i+1]+1;
1417         pj  = b->j + bdiag[i+1]+1;
1418         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1419         for (j=0; j<nz; j++) {
1420           col = pj[j];
1421           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1422         }
1423 
1424         sctx.rs  = rs;
1425         sctx.pv  = rtmp1[i];
1426         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1427         if (sctx.newshift) break;
1428         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1429         *pc1 = 1.0/sctx.pv;
1430 
1431         /* Now take care of diagonal 2x2 block. */
1432         pc2 = rtmp2 + i;
1433         if (*pc2 != 0.0) {
1434           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1435           *pc2 = mul1;          /* insert L entry */
1436           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1437           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1438           for (j=0; j<nz; j++) {
1439             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1440           }
1441           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1442         }
1443 
1444         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1445         rs = 0.0;
1446         /* L part */
1447         pc2 = b->a + bi[i+1];
1448         pj  = b->j + bi[i+1] ;
1449         nz  = bi[i+2] - bi[i+1];
1450         for (j=0; j<nz; j++) {
1451           col = pj[j];
1452           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1453         }
1454         /* U part */
1455         pc2 = b->a + bdiag[i+2]+1;
1456         pj  = b->j + bdiag[i+2]+1;
1457         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1458         for (j=0; j<nz; j++) {
1459           col = pj[j];
1460           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1461         }
1462 
1463         sctx.rs  = rs;
1464         sctx.pv  = rtmp2[i+1];
1465         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1466         if (sctx.newshift) break;
1467         pc2  = b->a + bdiag[i+1];
1468         *pc2 = 1.0/sctx.pv;
1469         break;
1470 
1471       case 3:
1472       /*----------*/
1473         /* zero rtmp */
1474         /* L part */
1475         nz    = bi[i+1] - bi[i];
1476         bjtmp = bj + bi[i];
1477         for  (j=0; j<nz; j++) {
1478           col = bjtmp[j];
1479           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1480         }
1481 
1482         /* U part */
1483         nz = bdiag[i]-bdiag[i+1];
1484         bjtmp = bj + bdiag[i+1]+1;
1485         for  (j=0; j<nz; j++) {
1486           col = bjtmp[j];
1487           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1488         }
1489 
1490         /* load in initial (unfactored row) */
1491         nz    = ai[r[i]+1] - ai[r[i]];
1492         ajtmp = aj + ai[r[i]];
1493         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1494         for (j=0; j<nz; j++) {
1495           col = ics[ajtmp[j]];
1496           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1497         }
1498         /* ZeropivotApply(): shift the diagonal of the matrix  */
1499         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1500 
1501         /* elimination */
1502         bjtmp = bj + bi[i];
1503         row   = *bjtmp++; /* pivot row */
1504         nzL   = bi[i+1] - bi[i];
1505         for (k=0; k < nzL;k++) {
1506           pc1 = rtmp1 + row;
1507           pc2 = rtmp2 + row;
1508           pc3 = rtmp3 + row;
1509           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1510             pv  = b->a + bdiag[row];
1511             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1512             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1513 
1514             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1515             pv = b->a + bdiag[row+1]+1;
1516             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1517             for (j=0; j<nz; j++) {
1518               col = pj[j];
1519               rtmp1[col] -= mul1 * pv[j];
1520               rtmp2[col] -= mul2 * pv[j];
1521               rtmp3[col] -= mul3 * pv[j];
1522             }
1523             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1524           }
1525           row = *bjtmp++;
1526         }
1527 
1528         /* finished row i; check zero pivot, then stick row i into b->a */
1529         rs  = 0.0;
1530         /* L part */
1531         pc1 = b->a + bi[i];
1532         pj  = b->j + bi[i] ;
1533         nz  = bi[i+1] - bi[i];
1534         for (j=0; j<nz; j++) {
1535           col = pj[j];
1536           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1537         }
1538         /* U part */
1539         pc1 = b->a + bdiag[i+1]+1;
1540         pj  = b->j + bdiag[i+1]+1;
1541         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1542         for (j=0; j<nz; j++) {
1543           col = pj[j];
1544           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1545         }
1546 
1547         sctx.rs  = rs;
1548         sctx.pv  = rtmp1[i];
1549         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1550         if (sctx.newshift) break;
1551         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1552         *pc1 = 1.0/sctx.pv;
1553 
1554         /* Now take care of 1st column of diagonal 3x3 block. */
1555         pc2 = rtmp2 + i;
1556         pc3 = rtmp3 + i;
1557         if (*pc2 != 0.0 || *pc3 != 0.0) {
1558           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1559           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1560           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1561           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1562           for (j=0; j<nz; j++) {
1563             col = pj[j];
1564             rtmp2[col] -= mul2 * rtmp1[col];
1565             rtmp3[col] -= mul3 * rtmp1[col];
1566           }
1567           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1568         }
1569 
1570         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1571         rs = 0.0;
1572         /* L part */
1573         pc2 = b->a + bi[i+1];
1574         pj  = b->j + bi[i+1] ;
1575         nz  = bi[i+2] - bi[i+1];
1576         for (j=0; j<nz; j++) {
1577           col = pj[j];
1578           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1579         }
1580         /* U part */
1581         pc2 = b->a + bdiag[i+2]+1;
1582         pj  = b->j + bdiag[i+2]+1;
1583         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1584         for (j=0; j<nz; j++) {
1585           col = pj[j];
1586           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1587         }
1588 
1589         sctx.rs  = rs;
1590         sctx.pv  = rtmp2[i+1];
1591         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1592         if (sctx.newshift) break;
1593         pc2  = b->a + bdiag[i+1];
1594         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1595 
1596         /* Now take care of 2nd column of diagonal 3x3 block. */
1597         pc3 = rtmp3 + i+1;
1598         if (*pc3 != 0.0) {
1599           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1600           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1601           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1602           for (j=0; j<nz; j++) {
1603             col = pj[j];
1604             rtmp3[col] -= mul3 * rtmp2[col];
1605           }
1606           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1607         }
1608 
1609         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1610         rs = 0.0;
1611         /* L part */
1612         pc3 = b->a + bi[i+2];
1613         pj  = b->j + bi[i+2] ;
1614         nz  = bi[i+3] - bi[i+2];
1615         for (j=0; j<nz; j++) {
1616           col = pj[j];
1617           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1618         }
1619         /* U part */
1620         pc3 = b->a + bdiag[i+3]+1;
1621         pj  = b->j + bdiag[i+3]+1;
1622         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1623         for (j=0; j<nz; j++) {
1624           col = pj[j];
1625           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1626         }
1627 
1628         sctx.rs  = rs;
1629         sctx.pv  = rtmp3[i+2];
1630         ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1631         if (sctx.newshift) break;
1632         pc3  = b->a + bdiag[i+2];
1633         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1634         break;
1635       case 4:
1636       /*----------*/
1637         /* zero rtmp */
1638         /* L part */
1639         nz    = bi[i+1] - bi[i];
1640         bjtmp = bj + bi[i];
1641         for  (j=0; j<nz; j++) {
1642           col = bjtmp[j];
1643           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1644         }
1645 
1646         /* U part */
1647         nz = bdiag[i]-bdiag[i+1];
1648         bjtmp = bj + bdiag[i+1]+1;
1649         for  (j=0; j<nz; j++) {
1650           col = bjtmp[j];
1651           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1652         }
1653 
1654         /* load in initial (unfactored row) */
1655         nz    = ai[r[i]+1] - ai[r[i]];
1656         ajtmp = aj + ai[r[i]];
1657         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1658         for (j=0; j<nz; j++) {
1659           col = ics[ajtmp[j]];
1660           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1661         }
1662         /* ZeropivotApply(): shift the diagonal of the matrix  */
1663         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1664 
1665         /* elimination */
1666         bjtmp = bj + bi[i];
1667         row   = *bjtmp++; /* pivot row */
1668         nzL   = bi[i+1] - bi[i];
1669         for (k=0; k < nzL;k++) {
1670           pc1 = rtmp1 + row;
1671           pc2 = rtmp2 + row;
1672           pc3 = rtmp3 + row;
1673           pc4 = rtmp4 + row;
1674           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1675             pv  = b->a + bdiag[row];
1676             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1677             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1678 
1679             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1680             pv = b->a + bdiag[row+1]+1;
1681             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1682             for (j=0; j<nz; j++) {
1683               col = pj[j];
1684               rtmp1[col] -= mul1 * pv[j];
1685               rtmp2[col] -= mul2 * pv[j];
1686               rtmp3[col] -= mul3 * pv[j];
1687               rtmp4[col] -= mul4 * pv[j];
1688             }
1689             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1690           }
1691           row = *bjtmp++;
1692         }
1693 
1694         /* finished row i; check zero pivot, then stick row i into b->a */
1695         rs  = 0.0;
1696         /* L part */
1697         pc1 = b->a + bi[i];
1698         pj  = b->j + bi[i] ;
1699         nz  = bi[i+1] - bi[i];
1700         for (j=0; j<nz; j++) {
1701           col = pj[j];
1702           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1703         }
1704         /* U part */
1705         pc1 = b->a + bdiag[i+1]+1;
1706         pj  = b->j + bdiag[i+1]+1;
1707         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1708         for (j=0; j<nz; j++) {
1709           col = pj[j];
1710           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1711         }
1712 
1713         sctx.rs  = rs;
1714         sctx.pv  = rtmp1[i];
1715         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1716         if (sctx.newshift) break;
1717         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1718         *pc1 = 1.0/sctx.pv;
1719 
1720         /* Now take care of 1st column of diagonal 4x4 block. */
1721         pc2 = rtmp2 + i;
1722         pc3 = rtmp3 + i;
1723         pc4 = rtmp4 + i;
1724         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1725           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1726           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1727           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1728           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1729           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1730           for (j=0; j<nz; j++) {
1731             col = pj[j];
1732             rtmp2[col] -= mul2 * rtmp1[col];
1733             rtmp3[col] -= mul3 * rtmp1[col];
1734             rtmp4[col] -= mul4 * rtmp1[col];
1735           }
1736           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1737         }
1738 
1739         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1740         rs = 0.0;
1741         /* L part */
1742         pc2 = b->a + bi[i+1];
1743         pj  = b->j + bi[i+1] ;
1744         nz  = bi[i+2] - bi[i+1];
1745         for (j=0; j<nz; j++) {
1746           col = pj[j];
1747           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1748         }
1749         /* U part */
1750         pc2 = b->a + bdiag[i+2]+1;
1751         pj  = b->j + bdiag[i+2]+1;
1752         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1753         for (j=0; j<nz; j++) {
1754           col = pj[j];
1755           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1756         }
1757 
1758         sctx.rs  = rs;
1759         sctx.pv  = rtmp2[i+1];
1760         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1761         if (sctx.newshift) break;
1762         pc2  = b->a + bdiag[i+1];
1763         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1764 
1765         /* Now take care of 2nd column of diagonal 4x4 block. */
1766         pc3 = rtmp3 + i+1;
1767         pc4 = rtmp4 + i+1;
1768         if (*pc3 != 0.0 || *pc4 != 0.0) {
1769           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1770           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1771           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1772           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1773           for (j=0; j<nz; j++) {
1774             col = pj[j];
1775             rtmp3[col] -= mul3 * rtmp2[col];
1776             rtmp4[col] -= mul4 * rtmp2[col];
1777           }
1778           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1779         }
1780 
1781         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1782         rs = 0.0;
1783         /* L part */
1784         pc3 = b->a + bi[i+2];
1785         pj  = b->j + bi[i+2] ;
1786         nz  = bi[i+3] - bi[i+2];
1787         for (j=0; j<nz; j++) {
1788           col = pj[j];
1789           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1790         }
1791         /* U part */
1792         pc3 = b->a + bdiag[i+3]+1;
1793         pj  = b->j + bdiag[i+3]+1;
1794         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1795         for (j=0; j<nz; j++) {
1796           col = pj[j];
1797           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1798         }
1799 
1800         sctx.rs  = rs;
1801         sctx.pv  = rtmp3[i+2];
1802         ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1803         if (sctx.newshift) break;
1804         pc3  = b->a + bdiag[i+2];
1805         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1806 
1807         /* Now take care of 3rd column of diagonal 4x4 block. */
1808         pc4 = rtmp4 + i+2;
1809         if (*pc4 != 0.0) {
1810           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1811           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1812           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1813           for (j=0; j<nz; j++) {
1814             col = pj[j];
1815             rtmp4[col] -= mul4 * rtmp3[col];
1816           }
1817           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1818         }
1819 
1820         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1821         rs = 0.0;
1822         /* L part */
1823         pc4 = b->a + bi[i+3];
1824         pj  = b->j + bi[i+3] ;
1825         nz  = bi[i+4] - bi[i+3];
1826         for (j=0; j<nz; j++) {
1827           col = pj[j];
1828           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1829         }
1830         /* U part */
1831         pc4 = b->a + bdiag[i+4]+1;
1832         pj  = b->j + bdiag[i+4]+1;
1833         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1834         for (j=0; j<nz; j++) {
1835           col = pj[j];
1836           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1837         }
1838 
1839         sctx.rs  = rs;
1840         sctx.pv  = rtmp4[i+3];
1841         ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1842         if (sctx.newshift) break;
1843         pc4  = b->a + bdiag[i+3];
1844         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1845          break;
1846 
1847       default:
1848         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1849       }
1850       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1851       i += nodesz;                 /* Update the row */
1852     }
1853 
1854     /* MatPivotRefine() */
1855     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1856       /*
1857        * if no shift in this attempt & shifting & started shifting & can refine,
1858        * then try lower shift
1859        */
1860       sctx.shift_hi       = sctx.shift_fraction;
1861       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1862       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1863       sctx.newshift       = PETSC_TRUE;
1864       sctx.nshift++;
1865     }
1866   } while (sctx.newshift);
1867 
1868   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1869   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1870   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1871   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1872 
1873   C->ops->solve              = MatSolve_SeqAIJ;
1874   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1875   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1876   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1877   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1878   C->assembled    = PETSC_TRUE;
1879   C->preallocated = PETSC_TRUE;
1880   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1881 
1882   /* MatShiftView(A,info,&sctx) */
1883   if (sctx.nshift) {
1884     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1885       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1886     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1887       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1888     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1889       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1890     }
1891   }
1892   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1893   PetscFunctionReturn(0);
1894 }
1895 
1896 #undef __FUNCT__
1897 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1898 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1899 {
1900   Mat               C = B;
1901   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1902   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1903   PetscErrorCode    ierr;
1904   const PetscInt    *r,*ic,*c,*ics;
1905   PetscInt          n = A->rmap->n,*bi = b->i;
1906   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1907   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1908   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1909   PetscScalar       mul1,mul2,mul3,tmp;
1910   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1911   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1912   PetscReal         rs=0.0;
1913   FactorShiftCtx    sctx;
1914 
1915   PetscFunctionBegin;
1916   sctx.shift_top      = 0;
1917   sctx.nshift_max     = 0;
1918   sctx.shift_lo       = 0;
1919   sctx.shift_hi       = 0;
1920   sctx.shift_fraction = 0;
1921 
1922   /* if both shift schemes are chosen by user, only use info->shiftpd */
1923   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1924     sctx.shift_top = 0;
1925     for (i=0; i<n; i++) {
1926       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1927       rs    = 0.0;
1928       ajtmp = aj + ai[i];
1929       rtmp1 = aa + ai[i];
1930       nz = ai[i+1] - ai[i];
1931       for (j=0; j<nz; j++) {
1932         if (*ajtmp != i) {
1933           rs += PetscAbsScalar(*rtmp1++);
1934         } else {
1935           rs -= PetscRealPart(*rtmp1++);
1936         }
1937         ajtmp++;
1938       }
1939       if (rs>sctx.shift_top) sctx.shift_top = rs;
1940     }
1941     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1942     sctx.shift_top *= 1.1;
1943     sctx.nshift_max = 5;
1944     sctx.shift_lo   = 0.;
1945     sctx.shift_hi   = 1.;
1946   }
1947   sctx.shift_amount = 0;
1948   sctx.nshift       = 0;
1949 
1950   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1951   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1952   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1953   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1954   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1955   ics   = ic ;
1956   rtmp22 = rtmp11 + n;
1957   rtmp33 = rtmp22 + n;
1958 
1959   node_max = a->inode.node_count;
1960   ns       = a->inode.size;
1961   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1962 
1963   /* If max inode size > 3, split it into two inodes.*/
1964   /* also map the inode sizes according to the ordering */
1965   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1966   for (i=0,j=0; i<node_max; ++i,++j) {
1967     if (ns[i]>3) {
1968       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1969       ++j;
1970       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1971     } else {
1972       tmp_vec1[j] = ns[i];
1973     }
1974   }
1975   /* Use the correct node_max */
1976   node_max = j;
1977 
1978   /* Now reorder the inode info based on mat re-ordering info */
1979   /* First create a row -> inode_size_array_index map */
1980   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1981   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1982   for (i=0,row=0; i<node_max; i++) {
1983     nodesz = tmp_vec1[i];
1984     for (j=0; j<nodesz; j++,row++) {
1985       nsmap[row] = i;
1986     }
1987   }
1988   /* Using nsmap, create a reordered ns structure */
1989   for (i=0,j=0; i< node_max; i++) {
1990     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1991     tmp_vec2[i]  = nodesz;
1992     j           += nodesz;
1993   }
1994   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1995   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1996   /* Now use the correct ns */
1997   ns = tmp_vec2;
1998 
1999   do {
2000     sctx.newshift = PETSC_FALSE;
2001     /* Now loop over each block-row, and do the factorization */
2002     for (i=0,row=0; i<node_max; i++) {
2003       nodesz = ns[i];
2004       nz     = bi[row+1] - bi[row];
2005       bjtmp  = bj + bi[row];
2006 
2007       switch (nodesz) {
2008       case 1:
2009         for  (j=0; j<nz; j++) {
2010           idx        = bjtmp[j];
2011           rtmp11[idx] = 0.0;
2012         }
2013 
2014         /* load in initial (unfactored row) */
2015         idx    = r[row];
2016         nz_tmp = ai[idx+1] - ai[idx];
2017         ajtmp  = aj + ai[idx];
2018         v1     = aa + ai[idx];
2019 
2020         for (j=0; j<nz_tmp; j++) {
2021           idx        = ics[ajtmp[j]];
2022           rtmp11[idx] = v1[j];
2023         }
2024         rtmp11[ics[r[row]]] += sctx.shift_amount;
2025 
2026         prow = *bjtmp++ ;
2027         while (prow < row) {
2028           pc1 = rtmp11 + prow;
2029           if (*pc1 != 0.0) {
2030             pv   = ba + bd[prow];
2031             pj   = nbj + bd[prow];
2032             mul1 = *pc1 * *pv++;
2033             *pc1 = mul1;
2034             nz_tmp = bi[prow+1] - bd[prow] - 1;
2035             ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2036             for (j=0; j<nz_tmp; j++) {
2037               tmp = pv[j];
2038               idx = pj[j];
2039               rtmp11[idx] -= mul1 * tmp;
2040             }
2041           }
2042           prow = *bjtmp++ ;
2043         }
2044         pj  = bj + bi[row];
2045         pc1 = ba + bi[row];
2046 
2047         sctx.pv    = rtmp11[row];
2048         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2049         rs         = 0.0;
2050         for (j=0; j<nz; j++) {
2051           idx    = pj[j];
2052           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2053           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2054         }
2055         sctx.rs  = rs;
2056         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2057         if (sctx.newshift) goto endofwhile;
2058         break;
2059 
2060       case 2:
2061         for (j=0; j<nz; j++) {
2062           idx        = bjtmp[j];
2063           rtmp11[idx] = 0.0;
2064           rtmp22[idx] = 0.0;
2065         }
2066 
2067         /* load in initial (unfactored row) */
2068         idx    = r[row];
2069         nz_tmp = ai[idx+1] - ai[idx];
2070         ajtmp  = aj + ai[idx];
2071         v1     = aa + ai[idx];
2072         v2     = aa + ai[idx+1];
2073         for (j=0; j<nz_tmp; j++) {
2074           idx        = ics[ajtmp[j]];
2075           rtmp11[idx] = v1[j];
2076           rtmp22[idx] = v2[j];
2077         }
2078         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2079         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2080 
2081         prow = *bjtmp++ ;
2082         while (prow < row) {
2083           pc1 = rtmp11 + prow;
2084           pc2 = rtmp22 + prow;
2085           if (*pc1 != 0.0 || *pc2 != 0.0) {
2086             pv   = ba + bd[prow];
2087             pj   = nbj + bd[prow];
2088             mul1 = *pc1 * *pv;
2089             mul2 = *pc2 * *pv;
2090             ++pv;
2091             *pc1 = mul1;
2092             *pc2 = mul2;
2093 
2094             nz_tmp = bi[prow+1] - bd[prow] - 1;
2095             for (j=0; j<nz_tmp; j++) {
2096               tmp = pv[j];
2097               idx = pj[j];
2098               rtmp11[idx] -= mul1 * tmp;
2099               rtmp22[idx] -= mul2 * tmp;
2100             }
2101             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2102           }
2103           prow = *bjtmp++ ;
2104         }
2105 
2106         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2107         pc1 = rtmp11 + prow;
2108         pc2 = rtmp22 + prow;
2109 
2110         sctx.pv = *pc1;
2111         pj      = bj + bi[prow];
2112         rs      = 0.0;
2113         for (j=0; j<nz; j++) {
2114           idx = pj[j];
2115           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2116         }
2117         sctx.rs = rs;
2118         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2119         if (sctx.newshift) goto endofwhile;
2120 
2121         if (*pc2 != 0.0) {
2122           pj     = nbj + bd[prow];
2123           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2124           *pc2   = mul2;
2125           nz_tmp = bi[prow+1] - bd[prow] - 1;
2126           for (j=0; j<nz_tmp; j++) {
2127             idx = pj[j] ;
2128             tmp = rtmp11[idx];
2129             rtmp22[idx] -= mul2 * tmp;
2130           }
2131           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2132         }
2133 
2134         pj  = bj + bi[row];
2135         pc1 = ba + bi[row];
2136         pc2 = ba + bi[row+1];
2137 
2138         sctx.pv = rtmp22[row+1];
2139         rs = 0.0;
2140         rtmp11[row]   = 1.0/rtmp11[row];
2141         rtmp22[row+1] = 1.0/rtmp22[row+1];
2142         /* copy row entries from dense representation to sparse */
2143         for (j=0; j<nz; j++) {
2144           idx    = pj[j];
2145           pc1[j] = rtmp11[idx];
2146           pc2[j] = rtmp22[idx];
2147           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2148         }
2149         sctx.rs = rs;
2150         ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2151         if (sctx.newshift) goto endofwhile;
2152         break;
2153 
2154       case 3:
2155         for  (j=0; j<nz; j++) {
2156           idx        = bjtmp[j];
2157           rtmp11[idx] = 0.0;
2158           rtmp22[idx] = 0.0;
2159           rtmp33[idx] = 0.0;
2160         }
2161         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2162         idx    = r[row];
2163         nz_tmp = ai[idx+1] - ai[idx];
2164         ajtmp = aj + ai[idx];
2165         v1    = aa + ai[idx];
2166         v2    = aa + ai[idx+1];
2167         v3    = aa + ai[idx+2];
2168         for (j=0; j<nz_tmp; j++) {
2169           idx        = ics[ajtmp[j]];
2170           rtmp11[idx] = v1[j];
2171           rtmp22[idx] = v2[j];
2172           rtmp33[idx] = v3[j];
2173         }
2174         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2175         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2176         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2177 
2178         /* loop over all pivot row blocks above this row block */
2179         prow = *bjtmp++ ;
2180         while (prow < row) {
2181           pc1 = rtmp11 + prow;
2182           pc2 = rtmp22 + prow;
2183           pc3 = rtmp33 + prow;
2184           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2185             pv   = ba  + bd[prow];
2186             pj   = nbj + bd[prow];
2187             mul1 = *pc1 * *pv;
2188             mul2 = *pc2 * *pv;
2189             mul3 = *pc3 * *pv;
2190             ++pv;
2191             *pc1 = mul1;
2192             *pc2 = mul2;
2193             *pc3 = mul3;
2194 
2195             nz_tmp = bi[prow+1] - bd[prow] - 1;
2196             /* update this row based on pivot row */
2197             for (j=0; j<nz_tmp; j++) {
2198               tmp = pv[j];
2199               idx = pj[j];
2200               rtmp11[idx] -= mul1 * tmp;
2201               rtmp22[idx] -= mul2 * tmp;
2202               rtmp33[idx] -= mul3 * tmp;
2203             }
2204             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2205           }
2206           prow = *bjtmp++ ;
2207         }
2208 
2209         /* Now take care of diagonal 3x3 block in this set of rows */
2210         /* note: prow = row here */
2211         pc1 = rtmp11 + prow;
2212         pc2 = rtmp22 + prow;
2213         pc3 = rtmp33 + prow;
2214 
2215         sctx.pv = *pc1;
2216         pj      = bj + bi[prow];
2217         rs      = 0.0;
2218         for (j=0; j<nz; j++) {
2219           idx = pj[j];
2220           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2221         }
2222         sctx.rs = rs;
2223         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2224         if (sctx.newshift) goto endofwhile;
2225 
2226         if (*pc2 != 0.0 || *pc3 != 0.0) {
2227           mul2 = (*pc2)/(*pc1);
2228           mul3 = (*pc3)/(*pc1);
2229           *pc2 = mul2;
2230           *pc3 = mul3;
2231           nz_tmp = bi[prow+1] - bd[prow] - 1;
2232           pj     = nbj + bd[prow];
2233           for (j=0; j<nz_tmp; j++) {
2234             idx = pj[j] ;
2235             tmp = rtmp11[idx];
2236             rtmp22[idx] -= mul2 * tmp;
2237             rtmp33[idx] -= mul3 * tmp;
2238           }
2239           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2240         }
2241         ++prow;
2242 
2243         pc2 = rtmp22 + prow;
2244         pc3 = rtmp33 + prow;
2245         sctx.pv = *pc2;
2246         pj      = bj + bi[prow];
2247         rs      = 0.0;
2248         for (j=0; j<nz; j++) {
2249           idx = pj[j];
2250           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2251         }
2252         sctx.rs = rs;
2253         ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2254         if (sctx.newshift) goto endofwhile;
2255 
2256         if (*pc3 != 0.0) {
2257           mul3   = (*pc3)/(*pc2);
2258           *pc3   = mul3;
2259           pj     = nbj + bd[prow];
2260           nz_tmp = bi[prow+1] - bd[prow] - 1;
2261           for (j=0; j<nz_tmp; j++) {
2262             idx = pj[j] ;
2263             tmp = rtmp22[idx];
2264             rtmp33[idx] -= mul3 * tmp;
2265           }
2266           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2267         }
2268 
2269         pj  = bj + bi[row];
2270         pc1 = ba + bi[row];
2271         pc2 = ba + bi[row+1];
2272         pc3 = ba + bi[row+2];
2273 
2274         sctx.pv = rtmp33[row+2];
2275         rs = 0.0;
2276         rtmp11[row]   = 1.0/rtmp11[row];
2277         rtmp22[row+1] = 1.0/rtmp22[row+1];
2278         rtmp33[row+2] = 1.0/rtmp33[row+2];
2279         /* copy row entries from dense representation to sparse */
2280         for (j=0; j<nz; j++) {
2281           idx    = pj[j];
2282           pc1[j] = rtmp11[idx];
2283           pc2[j] = rtmp22[idx];
2284           pc3[j] = rtmp33[idx];
2285           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2286         }
2287 
2288         sctx.rs = rs;
2289         ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2290         if (sctx.newshift) goto endofwhile;
2291         break;
2292 
2293       default:
2294         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2295       }
2296       row += nodesz;                 /* Update the row */
2297     }
2298     endofwhile:;
2299   } while (sctx.newshift);
2300   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2301   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2302   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2303   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2304   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2305   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2306   /* do not set solve add, since MatSolve_Inode + Add is faster */
2307   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2308   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2309   C->assembled   = PETSC_TRUE;
2310   C->preallocated = PETSC_TRUE;
2311   if (sctx.nshift) {
2312     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2313       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2314     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2315       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2316     }
2317   }
2318   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2319   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2320   PetscFunctionReturn(0);
2321 }
2322 
2323 
2324 /* ----------------------------------------------------------- */
2325 #undef __FUNCT__
2326 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2327 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2328 {
2329   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2330   IS                iscol = a->col,isrow = a->row;
2331   PetscErrorCode    ierr;
2332   const PetscInt    *r,*c,*rout,*cout;
2333   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2334   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2335   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2336   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2337   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2338   const PetscScalar *b;
2339 
2340   PetscFunctionBegin;
2341   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2342   node_max = a->inode.node_count;
2343   ns       = a->inode.size;     /* Node Size array */
2344 
2345   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2346   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2347   tmp  = a->solve_work;
2348 
2349   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2350   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2351 
2352   /* forward solve the lower triangular */
2353   tmps = tmp ;
2354   aa   = a_a ;
2355   aj   = a_j ;
2356   ad   = a->diag;
2357 
2358   for (i = 0,row = 0; i< node_max; ++i) {
2359     nsz = ns[i];
2360     aii = ai[row];
2361     v1  = aa + aii;
2362     vi  = aj + aii;
2363     nz  = ai[row+1]- ai[row];
2364 
2365     if (i < node_max-1) {
2366       /* Prefetch the indices for the next block */
2367       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2368       /* Prefetch the data for the next block */
2369       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2370     }
2371 
2372     switch (nsz) {               /* Each loop in 'case' is unrolled */
2373     case 1 :
2374       sum1 = b[r[row]];
2375       for (j=0; j<nz-1; j+=2) {
2376         i0   = vi[j];
2377         i1   = vi[j+1];
2378         tmp0 = tmps[i0];
2379         tmp1 = tmps[i1];
2380         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2381       }
2382       if (j == nz-1) {
2383         tmp0 = tmps[vi[j]];
2384         sum1 -= v1[j]*tmp0;
2385       }
2386       tmp[row++]=sum1;
2387       break;
2388     case 2:
2389       sum1 = b[r[row]];
2390       sum2 = b[r[row+1]];
2391       v2   = aa + ai[row+1];
2392 
2393       for (j=0; j<nz-1; j+=2) {
2394         i0   = vi[j];
2395         i1   = vi[j+1];
2396         tmp0 = tmps[i0];
2397         tmp1 = tmps[i1];
2398         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2399         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2400       }
2401       if (j == nz-1) {
2402         tmp0 = tmps[vi[j]];
2403         sum1 -= v1[j] *tmp0;
2404         sum2 -= v2[j] *tmp0;
2405       }
2406       sum2 -= v2[nz] * sum1;
2407       tmp[row ++]=sum1;
2408       tmp[row ++]=sum2;
2409       break;
2410     case 3:
2411       sum1 = b[r[row]];
2412       sum2 = b[r[row+1]];
2413       sum3 = b[r[row+2]];
2414       v2   = aa + ai[row+1];
2415       v3   = aa + ai[row+2];
2416 
2417       for (j=0; j<nz-1; j+=2) {
2418         i0   = vi[j];
2419         i1   = vi[j+1];
2420         tmp0 = tmps[i0];
2421         tmp1 = tmps[i1];
2422         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2423         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2424         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2425       }
2426       if (j == nz-1) {
2427         tmp0 = tmps[vi[j]];
2428         sum1 -= v1[j] *tmp0;
2429         sum2 -= v2[j] *tmp0;
2430         sum3 -= v3[j] *tmp0;
2431       }
2432       sum2 -= v2[nz] * sum1;
2433       sum3 -= v3[nz] * sum1;
2434       sum3 -= v3[nz+1] * sum2;
2435       tmp[row ++]=sum1;
2436       tmp[row ++]=sum2;
2437       tmp[row ++]=sum3;
2438       break;
2439 
2440     case 4:
2441       sum1 = b[r[row]];
2442       sum2 = b[r[row+1]];
2443       sum3 = b[r[row+2]];
2444       sum4 = b[r[row+3]];
2445       v2   = aa + ai[row+1];
2446       v3   = aa + ai[row+2];
2447       v4   = aa + ai[row+3];
2448 
2449       for (j=0; j<nz-1; j+=2) {
2450         i0   = vi[j];
2451         i1   = vi[j+1];
2452         tmp0 = tmps[i0];
2453         tmp1 = tmps[i1];
2454         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2455         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2456         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2457         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2458       }
2459       if (j == nz-1) {
2460         tmp0 = tmps[vi[j]];
2461         sum1 -= v1[j] *tmp0;
2462         sum2 -= v2[j] *tmp0;
2463         sum3 -= v3[j] *tmp0;
2464         sum4 -= v4[j] *tmp0;
2465       }
2466       sum2 -= v2[nz] * sum1;
2467       sum3 -= v3[nz] * sum1;
2468       sum4 -= v4[nz] * sum1;
2469       sum3 -= v3[nz+1] * sum2;
2470       sum4 -= v4[nz+1] * sum2;
2471       sum4 -= v4[nz+2] * sum3;
2472 
2473       tmp[row ++]=sum1;
2474       tmp[row ++]=sum2;
2475       tmp[row ++]=sum3;
2476       tmp[row ++]=sum4;
2477       break;
2478     case 5:
2479       sum1 = b[r[row]];
2480       sum2 = b[r[row+1]];
2481       sum3 = b[r[row+2]];
2482       sum4 = b[r[row+3]];
2483       sum5 = b[r[row+4]];
2484       v2   = aa + ai[row+1];
2485       v3   = aa + ai[row+2];
2486       v4   = aa + ai[row+3];
2487       v5   = aa + ai[row+4];
2488 
2489       for (j=0; j<nz-1; j+=2) {
2490         i0   = vi[j];
2491         i1   = vi[j+1];
2492         tmp0 = tmps[i0];
2493         tmp1 = tmps[i1];
2494         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2495         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2496         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2497         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2498         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2499       }
2500       if (j == nz-1) {
2501         tmp0 = tmps[vi[j]];
2502         sum1 -= v1[j] *tmp0;
2503         sum2 -= v2[j] *tmp0;
2504         sum3 -= v3[j] *tmp0;
2505         sum4 -= v4[j] *tmp0;
2506         sum5 -= v5[j] *tmp0;
2507       }
2508 
2509       sum2 -= v2[nz] * sum1;
2510       sum3 -= v3[nz] * sum1;
2511       sum4 -= v4[nz] * sum1;
2512       sum5 -= v5[nz] * sum1;
2513       sum3 -= v3[nz+1] * sum2;
2514       sum4 -= v4[nz+1] * sum2;
2515       sum5 -= v5[nz+1] * sum2;
2516       sum4 -= v4[nz+2] * sum3;
2517       sum5 -= v5[nz+2] * sum3;
2518       sum5 -= v5[nz+3] * sum4;
2519 
2520       tmp[row ++]=sum1;
2521       tmp[row ++]=sum2;
2522       tmp[row ++]=sum3;
2523       tmp[row ++]=sum4;
2524       tmp[row ++]=sum5;
2525       break;
2526     default:
2527       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2528     }
2529   }
2530   /* backward solve the upper triangular */
2531   for (i=node_max -1 ,row = n-1 ; i>=0; i--) {
2532     nsz = ns[i];
2533     aii = ad[row+1] + 1;
2534     v1  = aa + aii;
2535     vi  = aj + aii;
2536     nz  = ad[row]- ad[row+1] - 1;
2537 
2538     if (i > 0) {
2539       /* Prefetch the indices for the next block */
2540       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2541       /* Prefetch the data for the next block */
2542       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2543     }
2544 
2545     switch (nsz) {               /* Each loop in 'case' is unrolled */
2546     case 1 :
2547       sum1 = tmp[row];
2548 
2549       for (j=0 ; j<nz-1; j+=2) {
2550         i0   = vi[j];
2551         i1   = vi[j+1];
2552         tmp0 = tmps[i0];
2553         tmp1 = tmps[i1];
2554         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2555       }
2556       if (j == nz-1) {
2557         tmp0  = tmps[vi[j]];
2558         sum1 -= v1[j]*tmp0;
2559       }
2560       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2561       break;
2562     case 2 :
2563       sum1 = tmp[row];
2564       sum2 = tmp[row-1];
2565       v2   = aa + ad[row] + 1;
2566       for (j=0 ; j<nz-1; j+=2) {
2567         i0   = vi[j];
2568         i1   = vi[j+1];
2569         tmp0 = tmps[i0];
2570         tmp1 = tmps[i1];
2571         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2572         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2573       }
2574       if (j == nz-1) {
2575         tmp0  = tmps[vi[j]];
2576         sum1 -= v1[j]* tmp0;
2577         sum2 -= v2[j+1]* tmp0;
2578       }
2579 
2580       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2581       sum2   -= v2[0] * tmp0;
2582       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2583       break;
2584     case 3 :
2585       sum1 = tmp[row];
2586       sum2 = tmp[row -1];
2587       sum3 = tmp[row -2];
2588       v2   = aa + ad[row] + 1;
2589       v3   = aa + ad[row -1] + 1;
2590       for (j=0 ; j<nz-1; j+=2) {
2591         i0   = vi[j];
2592         i1   = vi[j+1];
2593         tmp0 = tmps[i0];
2594         tmp1 = tmps[i1];
2595         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2596         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2597         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2598       }
2599       if (j== nz-1) {
2600         tmp0  = tmps[vi[j]];
2601         sum1 -= v1[j] * tmp0;
2602         sum2 -= v2[j+1] * tmp0;
2603         sum3 -= v3[j+2] * tmp0;
2604       }
2605       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2606       sum2   -= v2[0]* tmp0;
2607       sum3   -= v3[1] * tmp0;
2608       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2609       sum3   -= v3[0]* tmp0;
2610       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2611 
2612       break;
2613     case 4 :
2614       sum1 = tmp[row];
2615       sum2 = tmp[row -1];
2616       sum3 = tmp[row -2];
2617       sum4 = tmp[row -3];
2618       v2   = aa + ad[row]+1;
2619       v3   = aa + ad[row -1]+1;
2620       v4   = aa + ad[row -2]+1;
2621 
2622       for (j=0 ; j<nz-1; j+=2) {
2623         i0   = vi[j];
2624         i1   = vi[j+1];
2625         tmp0 = tmps[i0];
2626         tmp1 = tmps[i1];
2627         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2628         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2629         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2630         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2631       }
2632       if (j== nz-1) {
2633         tmp0  = tmps[vi[j]];
2634         sum1 -= v1[j] * tmp0;
2635         sum2 -= v2[j+1] * tmp0;
2636         sum3 -= v3[j+2] * tmp0;
2637         sum4 -= v4[j+3] * tmp0;
2638       }
2639 
2640       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2641       sum2   -= v2[0] * tmp0;
2642       sum3   -= v3[1] * tmp0;
2643       sum4   -= v4[2] * tmp0;
2644       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2645       sum3   -= v3[0] * tmp0;
2646       sum4   -= v4[1] * tmp0;
2647       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2648       sum4   -= v4[0] * tmp0;
2649       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2650       break;
2651     case 5 :
2652       sum1 = tmp[row];
2653       sum2 = tmp[row -1];
2654       sum3 = tmp[row -2];
2655       sum4 = tmp[row -3];
2656       sum5 = tmp[row -4];
2657       v2   = aa + ad[row]+1;
2658       v3   = aa + ad[row -1]+1;
2659       v4   = aa + ad[row -2]+1;
2660       v5   = aa + ad[row -3]+1;
2661       for (j=0 ; j<nz-1; j+=2) {
2662         i0   = vi[j];
2663         i1   = vi[j+1];
2664         tmp0 = tmps[i0];
2665         tmp1 = tmps[i1];
2666         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2667         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2668         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2669         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2670         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2671       }
2672       if (j==nz-1) {
2673         tmp0  = tmps[vi[j]];
2674         sum1 -= v1[j] * tmp0;
2675         sum2 -= v2[j+1] * tmp0;
2676         sum3 -= v3[j+2] * tmp0;
2677         sum4 -= v4[j+3] * tmp0;
2678         sum5 -= v5[j+4] * tmp0;
2679       }
2680 
2681       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2682       sum2   -= v2[0] * tmp0;
2683       sum3   -= v3[1] * tmp0;
2684       sum4   -= v4[2] * tmp0;
2685       sum5   -= v5[3] * tmp0;
2686       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2687       sum3   -= v3[0] * tmp0;
2688       sum4   -= v4[1] * tmp0;
2689       sum5   -= v5[2] * tmp0;
2690       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2691       sum4   -= v4[0] * tmp0;
2692       sum5   -= v5[1] * tmp0;
2693       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2694       sum5   -= v5[0] * tmp0;
2695       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2696       break;
2697     default:
2698       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2699     }
2700   }
2701   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2702   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2703   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2704   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2705   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2706   PetscFunctionReturn(0);
2707 }
2708 
2709 
2710 /*
2711      Makes a longer coloring[] array and calls the usual code with that
2712 */
2713 #undef __FUNCT__
2714 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2715 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2716 {
2717   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2718   PetscErrorCode  ierr;
2719   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2720   PetscInt        *colorused,i;
2721   ISColoringValue *newcolor;
2722 
2723   PetscFunctionBegin;
2724   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2725   /* loop over inodes, marking a color for each column*/
2726   row = 0;
2727   for (i=0; i<m; i++) {
2728     for (j=0; j<ns[i]; j++) {
2729       newcolor[row++] = coloring[i] + j*ncolors;
2730     }
2731   }
2732 
2733   /* eliminate unneeded colors */
2734   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2735   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2736   for (i=0; i<n; i++) {
2737     colorused[newcolor[i]] = 1;
2738   }
2739 
2740   for (i=1; i<5*ncolors; i++) {
2741     colorused[i] += colorused[i-1];
2742   }
2743   ncolors = colorused[5*ncolors-1];
2744   for (i=0; i<n; i++) {
2745     newcolor[i] = colorused[newcolor[i]]-1;
2746   }
2747   ierr = PetscFree(colorused);CHKERRQ(ierr);
2748   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2749   ierr = PetscFree(coloring);CHKERRQ(ierr);
2750   PetscFunctionReturn(0);
2751 }
2752 
2753 #include <../src/mat/blockinvert.h>
2754 
2755 #undef __FUNCT__
2756 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2757 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2758 {
2759   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2760   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2761   MatScalar          *ibdiag,*bdiag,work[25],*t;
2762   PetscScalar        *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2763   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2764   const PetscScalar  *xb, *b;
2765   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2766   PetscErrorCode     ierr;
2767   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2768   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2769 
2770   PetscFunctionBegin;
2771   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2772   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2773 
2774   if (!a->inode.ibdiagvalid) {
2775     if (!a->inode.ibdiag) {
2776       /* calculate space needed for diagonal blocks */
2777       for (i=0; i<m; i++) {
2778         cnt += sizes[i]*sizes[i];
2779       }
2780       a->inode.bdiagsize = cnt;
2781       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2782     }
2783 
2784     /* copy over the diagonal blocks and invert them */
2785     ibdiag = a->inode.ibdiag;
2786     bdiag  = a->inode.bdiag;
2787     cnt = 0;
2788     for (i=0, row = 0; i<m; i++) {
2789       for (j=0; j<sizes[i]; j++) {
2790         for (k=0; k<sizes[i]; k++) {
2791           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2792         }
2793       }
2794       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2795 
2796       switch(sizes[i]) {
2797         case 1:
2798           /* Create matrix data structure */
2799           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2800           ibdiag[cnt] = 1.0/ibdiag[cnt];
2801           break;
2802         case 2:
2803           ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2804           break;
2805         case 3:
2806           ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2807           break;
2808         case 4:
2809           ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2810           break;
2811         case 5:
2812           ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2813           break;
2814        default:
2815          SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2816       }
2817       cnt += sizes[i]*sizes[i];
2818       row += sizes[i];
2819     }
2820     a->inode.ibdiagvalid = PETSC_TRUE;
2821   }
2822   ibdiag = a->inode.ibdiag;
2823   bdiag  = a->inode.bdiag;
2824   t = a->inode.ssor_work;
2825 
2826   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2827   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2828   CHKMEMQ;
2829   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2830   if (flag & SOR_ZERO_INITIAL_GUESS) {
2831     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2832 
2833       for (i=0, row=0; i<m; i++) {
2834         sz  = diag[row] - ii[row];
2835         v1  = a->a + ii[row];
2836         idx = a->j + ii[row];
2837 
2838         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2839         switch (sizes[i]) {
2840           case 1:
2841 
2842             sum1  = b[row];
2843             for (n = 0; n<sz-1; n+=2) {
2844               i1   = idx[0];
2845               i2   = idx[1];
2846               idx += 2;
2847               tmp0 = x[i1];
2848               tmp1 = x[i2];
2849               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2850             }
2851 
2852             if (n == sz-1) {
2853               tmp0  = x[*idx];
2854               sum1 -= *v1 * tmp0;
2855             }
2856             t[row] = sum1;
2857             x[row++] = sum1*(*ibdiag++);
2858             break;
2859           case 2:
2860             v2    = a->a + ii[row+1];
2861             sum1  = b[row];
2862             sum2  = b[row+1];
2863             for (n = 0; n<sz-1; n+=2) {
2864               i1   = idx[0];
2865               i2   = idx[1];
2866               idx += 2;
2867               tmp0 = x[i1];
2868               tmp1 = x[i2];
2869               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2870               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2871             }
2872 
2873             if (n == sz-1) {
2874               tmp0  = x[*idx];
2875               sum1 -= v1[0] * tmp0;
2876               sum2 -= v2[0] * tmp0;
2877             }
2878             t[row]   = sum1;
2879             t[row+1] = sum2;
2880             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2881             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2882             ibdiag  += 4;
2883             break;
2884           case 3:
2885             v2    = a->a + ii[row+1];
2886             v3    = a->a + ii[row+2];
2887             sum1  = b[row];
2888             sum2  = b[row+1];
2889             sum3  = b[row+2];
2890             for (n = 0; n<sz-1; n+=2) {
2891               i1   = idx[0];
2892               i2   = idx[1];
2893               idx += 2;
2894               tmp0 = x[i1];
2895               tmp1 = x[i2];
2896               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2897               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2898               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2899             }
2900 
2901             if (n == sz-1) {
2902               tmp0  = x[*idx];
2903               sum1 -= v1[0] * tmp0;
2904               sum2 -= v2[0] * tmp0;
2905               sum3 -= v3[0] * tmp0;
2906             }
2907             t[row]   = sum1;
2908             t[row+1] = sum2;
2909             t[row+2] = sum3;
2910             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2911             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2912             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2913             ibdiag  += 9;
2914             break;
2915           case 4:
2916             v2    = a->a + ii[row+1];
2917             v3    = a->a + ii[row+2];
2918             v4    = a->a + ii[row+3];
2919             sum1  = b[row];
2920             sum2  = b[row+1];
2921             sum3  = b[row+2];
2922             sum4  = b[row+3];
2923             for (n = 0; n<sz-1; n+=2) {
2924               i1   = idx[0];
2925               i2   = idx[1];
2926               idx += 2;
2927               tmp0 = x[i1];
2928               tmp1 = x[i2];
2929               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2930               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2931               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2932               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2933             }
2934 
2935             if (n == sz-1) {
2936               tmp0  = x[*idx];
2937               sum1 -= v1[0] * tmp0;
2938               sum2 -= v2[0] * tmp0;
2939               sum3 -= v3[0] * tmp0;
2940               sum4 -= v4[0] * tmp0;
2941             }
2942             t[row]   = sum1;
2943             t[row+1] = sum2;
2944             t[row+2] = sum3;
2945             t[row+3] = sum4;
2946             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2947             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2948             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2949             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2950             ibdiag  += 16;
2951             break;
2952           case 5:
2953             v2    = a->a + ii[row+1];
2954             v3    = a->a + ii[row+2];
2955             v4    = a->a + ii[row+3];
2956             v5    = a->a + ii[row+4];
2957             sum1  = b[row];
2958             sum2  = b[row+1];
2959             sum3  = b[row+2];
2960             sum4  = b[row+3];
2961             sum5  = b[row+4];
2962             for (n = 0; n<sz-1; n+=2) {
2963               i1   = idx[0];
2964               i2   = idx[1];
2965               idx += 2;
2966               tmp0 = x[i1];
2967               tmp1 = x[i2];
2968               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2969               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2970               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2971               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2972               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2973             }
2974 
2975             if (n == sz-1) {
2976               tmp0  = x[*idx];
2977               sum1 -= v1[0] * tmp0;
2978               sum2 -= v2[0] * tmp0;
2979               sum3 -= v3[0] * tmp0;
2980               sum4 -= v4[0] * tmp0;
2981               sum5 -= v5[0] * tmp0;
2982             }
2983             t[row]   = sum1;
2984             t[row+1] = sum2;
2985             t[row+2] = sum3;
2986             t[row+3] = sum4;
2987             t[row+4] = sum5;
2988             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2989             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2990             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2991             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2992             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2993             ibdiag  += 25;
2994             break;
2995           default:
2996             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2997         }
2998       }
2999 
3000       xb = t;
3001       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3002     } else xb = b;
3003     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3004 
3005       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3006       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3007         ibdiag -= sizes[i]*sizes[i];
3008         sz      = ii[row+1] - diag[row] - 1;
3009         v1      = a->a + diag[row] + 1;
3010         idx     = a->j + diag[row] + 1;
3011 
3012         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3013         switch (sizes[i]) {
3014           case 1:
3015 
3016             sum1  = xb[row];
3017             for (n = 0; n<sz-1; n+=2) {
3018               i1   = idx[0];
3019               i2   = idx[1];
3020               idx += 2;
3021               tmp0 = x[i1];
3022               tmp1 = x[i2];
3023               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3024             }
3025 
3026             if (n == sz-1) {
3027               tmp0  = x[*idx];
3028               sum1 -= *v1*tmp0;
3029             }
3030             x[row--] = sum1*(*ibdiag);
3031             break;
3032 
3033           case 2:
3034 
3035             sum1  = xb[row];
3036             sum2  = xb[row-1];
3037             /* note that sum1 is associated with the second of the two rows */
3038             v2    = a->a + diag[row-1] + 2;
3039             for (n = 0; n<sz-1; n+=2) {
3040               i1   = idx[0];
3041               i2   = idx[1];
3042               idx += 2;
3043               tmp0 = x[i1];
3044               tmp1 = x[i2];
3045               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3046               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3047             }
3048 
3049             if (n == sz-1) {
3050               tmp0  = x[*idx];
3051               sum1 -= *v1*tmp0;
3052               sum2 -= *v2*tmp0;
3053             }
3054             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3055             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3056             break;
3057           case 3:
3058 
3059             sum1  = xb[row];
3060             sum2  = xb[row-1];
3061             sum3  = xb[row-2];
3062             v2    = a->a + diag[row-1] + 2;
3063             v3    = a->a + diag[row-2] + 3;
3064             for (n = 0; n<sz-1; n+=2) {
3065               i1   = idx[0];
3066               i2   = idx[1];
3067               idx += 2;
3068               tmp0 = x[i1];
3069               tmp1 = x[i2];
3070               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3071               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3072               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3073             }
3074 
3075             if (n == sz-1) {
3076               tmp0  = x[*idx];
3077               sum1 -= *v1*tmp0;
3078               sum2 -= *v2*tmp0;
3079               sum3 -= *v3*tmp0;
3080             }
3081             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3082             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3083             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3084             break;
3085           case 4:
3086 
3087             sum1  = xb[row];
3088             sum2  = xb[row-1];
3089             sum3  = xb[row-2];
3090             sum4  = xb[row-3];
3091             v2    = a->a + diag[row-1] + 2;
3092             v3    = a->a + diag[row-2] + 3;
3093             v4    = a->a + diag[row-3] + 4;
3094             for (n = 0; n<sz-1; n+=2) {
3095               i1   = idx[0];
3096               i2   = idx[1];
3097               idx += 2;
3098               tmp0 = x[i1];
3099               tmp1 = x[i2];
3100               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3101               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3102               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3103               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3104             }
3105 
3106             if (n == sz-1) {
3107               tmp0  = x[*idx];
3108               sum1 -= *v1*tmp0;
3109               sum2 -= *v2*tmp0;
3110               sum3 -= *v3*tmp0;
3111               sum4 -= *v4*tmp0;
3112             }
3113             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3114             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3115             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3116             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3117             break;
3118           case 5:
3119 
3120             sum1  = xb[row];
3121             sum2  = xb[row-1];
3122             sum3  = xb[row-2];
3123             sum4  = xb[row-3];
3124             sum5  = xb[row-4];
3125             v2    = a->a + diag[row-1] + 2;
3126             v3    = a->a + diag[row-2] + 3;
3127             v4    = a->a + diag[row-3] + 4;
3128             v5    = a->a + diag[row-4] + 5;
3129             for (n = 0; n<sz-1; n+=2) {
3130               i1   = idx[0];
3131               i2   = idx[1];
3132               idx += 2;
3133               tmp0 = x[i1];
3134               tmp1 = x[i2];
3135               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3136               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3137               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3138               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3139               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3140             }
3141 
3142             if (n == sz-1) {
3143               tmp0  = x[*idx];
3144               sum1 -= *v1*tmp0;
3145               sum2 -= *v2*tmp0;
3146               sum3 -= *v3*tmp0;
3147               sum4 -= *v4*tmp0;
3148               sum5 -= *v5*tmp0;
3149             }
3150             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3151             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3152             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3153             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3154             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3155             break;
3156           default:
3157             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3158         }
3159       }
3160 
3161       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3162     }
3163     its--;
3164   }
3165   while (its--) {
3166 
3167     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3168       ibdiag = a->inode.ibdiag;
3169 
3170       for (i=0, row=0; i<m; i++) {
3171         sz  = ii[row + 1] - ii[row];
3172         v1  = a->a + ii[row];
3173         idx = a->j + ii[row];
3174 
3175         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3176         switch (sizes[i]) {
3177           case 1:
3178 
3179             sum1  = b[row];
3180             for (n = 0; n<sz-1; n+=2) {
3181               i1   = idx[0];
3182               i2   = idx[1];
3183               idx += 2;
3184               tmp0 = x[i1];
3185               tmp1 = x[i2];
3186               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3187             }
3188 
3189             if (n == sz-1) {
3190               tmp0  = x[*idx];
3191               sum1 -= *v1 * tmp0;
3192             }
3193 
3194             /* in MatSOR_SeqAIJ this line would be
3195              *
3196              * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3197              *
3198              * but omega == 1, so this becomes
3199              *
3200              * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++);
3201              *
3202              * but bdiag and ibdiag cancel each other, so we can change this
3203              * to adding sum1*(*ibdiag++).  We can skip bdiag for the larger
3204              * block sizes as well
3205              */
3206             x[row++] += sum1*(*ibdiag++);
3207             break;
3208           case 2:
3209             v2    = a->a + ii[row+1];
3210             sum1  = b[row];
3211             sum2  = b[row+1];
3212             for (n = 0; n<sz-1; n+=2) {
3213               i1   = idx[0];
3214               i2   = idx[1];
3215               idx += 2;
3216               tmp0 = x[i1];
3217               tmp1 = x[i2];
3218               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3219               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3220             }
3221 
3222             if (n == sz-1) {
3223               tmp0  = x[*idx];
3224               sum1 -= v1[0] * tmp0;
3225               sum2 -= v2[0] * tmp0;
3226             }
3227             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2];
3228             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3];
3229             ibdiag  += 4;
3230             break;
3231           case 3:
3232             v2    = a->a + ii[row+1];
3233             v3    = a->a + ii[row+2];
3234             sum1  = b[row];
3235             sum2  = b[row+1];
3236             sum3  = b[row+2];
3237             for (n = 0; n<sz-1; n+=2) {
3238               i1   = idx[0];
3239               i2   = idx[1];
3240               idx += 2;
3241               tmp0 = x[i1];
3242               tmp1 = x[i2];
3243               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3244               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3245               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3246             }
3247 
3248             if (n == sz-1) {
3249               tmp0  = x[*idx];
3250               sum1 -= v1[0] * tmp0;
3251               sum2 -= v2[0] * tmp0;
3252               sum3 -= v3[0] * tmp0;
3253             }
3254             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3255             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3256             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3257             ibdiag  += 9;
3258             break;
3259           case 4:
3260             v2    = a->a + ii[row+1];
3261             v3    = a->a + ii[row+2];
3262             v4    = a->a + ii[row+3];
3263             sum1  = b[row];
3264             sum2  = b[row+1];
3265             sum3  = b[row+2];
3266             sum4  = b[row+3];
3267             for (n = 0; n<sz-1; n+=2) {
3268               i1   = idx[0];
3269               i2   = idx[1];
3270               idx += 2;
3271               tmp0 = x[i1];
3272               tmp1 = x[i2];
3273               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3274               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3275               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3276               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3277             }
3278 
3279             if (n == sz-1) {
3280               tmp0  = x[*idx];
3281               sum1 -= v1[0] * tmp0;
3282               sum2 -= v2[0] * tmp0;
3283               sum3 -= v3[0] * tmp0;
3284               sum4 -= v4[0] * tmp0;
3285             }
3286             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3287             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3288             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3289             x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3290             ibdiag  += 16;
3291             break;
3292           case 5:
3293             v2    = a->a + ii[row+1];
3294             v3    = a->a + ii[row+2];
3295             v4    = a->a + ii[row+3];
3296             v5    = a->a + ii[row+4];
3297             sum1  = b[row];
3298             sum2  = b[row+1];
3299             sum3  = b[row+2];
3300             sum4  = b[row+3];
3301             sum5  = b[row+4];
3302             for (n = 0; n<sz-1; n+=2) {
3303               i1   = idx[0];
3304               i2   = idx[1];
3305               idx += 2;
3306               tmp0 = x[i1];
3307               tmp1 = x[i2];
3308               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3309               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3310               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3311               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3312               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3313             }
3314 
3315             if (n == sz-1) {
3316               tmp0  = x[*idx];
3317               sum1 -= v1[0] * tmp0;
3318               sum2 -= v2[0] * tmp0;
3319               sum3 -= v3[0] * tmp0;
3320               sum4 -= v4[0] * tmp0;
3321               sum5 -= v5[0] * tmp0;
3322             }
3323             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3324             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3325             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3326             x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3327             x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3328             ibdiag  += 25;
3329             break;
3330           default:
3331             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3332         }
3333       }
3334 
3335       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3336     }
3337     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3338 
3339       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3340       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3341         ibdiag -= sizes[i]*sizes[i];
3342         sz      = ii[row+1] - ii[row];
3343         v1      = a->a + ii[row];
3344         idx     = a->j + ii[row];
3345 
3346         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3347         switch (sizes[i]) {
3348           case 1:
3349 
3350             sum1  = b[row];
3351             for (n = 0; n<sz-1; n+=2) {
3352               i1   = idx[0];
3353               i2   = idx[1];
3354               idx += 2;
3355               tmp0 = x[i1];
3356               tmp1 = x[i2];
3357               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3358             }
3359 
3360             if (n == sz-1) {
3361               tmp0  = x[*idx];
3362               sum1 -= *v1*tmp0;
3363             }
3364             x[row--] += sum1*(*ibdiag);
3365             break;
3366 
3367           case 2:
3368 
3369             sum1  = b[row];
3370             sum2  = b[row-1];
3371             /* note that sum1 is associated with the second of the two rows */
3372             v2    = a->a + ii[row - 1];
3373             for (n = 0; n<sz-1; n+=2) {
3374               i1   = idx[0];
3375               i2   = idx[1];
3376               idx += 2;
3377               tmp0 = x[i1];
3378               tmp1 = x[i2];
3379               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3380               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3381             }
3382 
3383             if (n == sz-1) {
3384               tmp0  = x[*idx];
3385               sum1 -= *v1*tmp0;
3386               sum2 -= *v2*tmp0;
3387             }
3388             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3389             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3390             break;
3391           case 3:
3392 
3393             sum1  = b[row];
3394             sum2  = b[row-1];
3395             sum3  = b[row-2];
3396             v2    = a->a + ii[row-1];
3397             v3    = a->a + ii[row-2];
3398             for (n = 0; n<sz-1; n+=2) {
3399               i1   = idx[0];
3400               i2   = idx[1];
3401               idx += 2;
3402               tmp0 = x[i1];
3403               tmp1 = x[i2];
3404               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3405               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3406               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3407             }
3408 
3409             if (n == sz-1) {
3410               tmp0  = x[*idx];
3411               sum1 -= *v1*tmp0;
3412               sum2 -= *v2*tmp0;
3413               sum3 -= *v3*tmp0;
3414             }
3415             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3416             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3417             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3418             break;
3419           case 4:
3420 
3421             sum1  = b[row];
3422             sum2  = b[row-1];
3423             sum3  = b[row-2];
3424             sum4  = b[row-3];
3425             v2    = a->a + ii[row-1];
3426             v3    = a->a + ii[row-2];
3427             v4    = a->a + ii[row-3];
3428             for (n = 0; n<sz-1; n+=2) {
3429               i1   = idx[0];
3430               i2   = idx[1];
3431               idx += 2;
3432               tmp0 = x[i1];
3433               tmp1 = x[i2];
3434               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3435               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3436               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3437               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3438             }
3439 
3440             if (n == sz-1) {
3441               tmp0  = x[*idx];
3442               sum1 -= *v1*tmp0;
3443               sum2 -= *v2*tmp0;
3444               sum3 -= *v3*tmp0;
3445               sum4 -= *v4*tmp0;
3446             }
3447             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3448             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3449             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3450             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3451             break;
3452           case 5:
3453 
3454             sum1  = b[row];
3455             sum2  = b[row-1];
3456             sum3  = b[row-2];
3457             sum4  = b[row-3];
3458             sum5  = b[row-4];
3459             v2    = a->a + ii[row-1];
3460             v3    = a->a + ii[row-2];
3461             v4    = a->a + ii[row-3];
3462             v5    = a->a + ii[row-4];
3463             for (n = 0; n<sz-1; n+=2) {
3464               i1   = idx[0];
3465               i2   = idx[1];
3466               idx += 2;
3467               tmp0 = x[i1];
3468               tmp1 = x[i2];
3469               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3470               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3471               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3472               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3473               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3474             }
3475 
3476             if (n == sz-1) {
3477               tmp0  = x[*idx];
3478               sum1 -= *v1*tmp0;
3479               sum2 -= *v2*tmp0;
3480               sum3 -= *v3*tmp0;
3481               sum4 -= *v4*tmp0;
3482               sum5 -= *v5*tmp0;
3483             }
3484             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3485             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3486             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3487             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3488             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3489             break;
3490           default:
3491             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3492         }
3493       }
3494 
3495       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3496     }
3497   }
3498   if (flag & SOR_EISENSTAT) {
3499     /*
3500           Apply  (U + D)^-1  where D is now the block diagonal
3501     */
3502     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3503     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3504       ibdiag -= sizes[i]*sizes[i];
3505       sz      = ii[row+1] - diag[row] - 1;
3506       v1      = a->a + diag[row] + 1;
3507       idx     = a->j + diag[row] + 1;
3508       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3509       switch (sizes[i]) {
3510         case 1:
3511 
3512           sum1  = b[row];
3513           for (n = 0; n<sz-1; n+=2) {
3514             i1   = idx[0];
3515             i2   = idx[1];
3516             idx += 2;
3517             tmp0 = x[i1];
3518             tmp1 = x[i2];
3519             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3520           }
3521 
3522           if (n == sz-1) {
3523             tmp0  = x[*idx];
3524             sum1 -= *v1*tmp0;
3525           }
3526           x[row] = sum1*(*ibdiag);row--;
3527           break;
3528 
3529         case 2:
3530 
3531           sum1  = b[row];
3532           sum2  = b[row-1];
3533           /* note that sum1 is associated with the second of the two rows */
3534           v2    = a->a + diag[row-1] + 2;
3535           for (n = 0; n<sz-1; n+=2) {
3536             i1   = idx[0];
3537             i2   = idx[1];
3538             idx += 2;
3539             tmp0 = x[i1];
3540             tmp1 = x[i2];
3541             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3542             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3543           }
3544 
3545           if (n == sz-1) {
3546             tmp0  = x[*idx];
3547             sum1 -= *v1*tmp0;
3548             sum2 -= *v2*tmp0;
3549           }
3550           x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3551           x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3552           row -= 2;
3553           break;
3554         case 3:
3555 
3556           sum1  = b[row];
3557           sum2  = b[row-1];
3558           sum3  = b[row-2];
3559           v2    = a->a + diag[row-1] + 2;
3560           v3    = a->a + diag[row-2] + 3;
3561           for (n = 0; n<sz-1; n+=2) {
3562             i1   = idx[0];
3563             i2   = idx[1];
3564             idx += 2;
3565             tmp0 = x[i1];
3566             tmp1 = x[i2];
3567             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3568             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3569             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3570           }
3571 
3572           if (n == sz-1) {
3573             tmp0  = x[*idx];
3574             sum1 -= *v1*tmp0;
3575             sum2 -= *v2*tmp0;
3576             sum3 -= *v3*tmp0;
3577           }
3578           x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3579           x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3580           x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3581           row -= 3;
3582           break;
3583         case 4:
3584 
3585           sum1  = b[row];
3586           sum2  = b[row-1];
3587           sum3  = b[row-2];
3588           sum4  = b[row-3];
3589           v2    = a->a + diag[row-1] + 2;
3590           v3    = a->a + diag[row-2] + 3;
3591           v4    = a->a + diag[row-3] + 4;
3592           for (n = 0; n<sz-1; n+=2) {
3593             i1   = idx[0];
3594             i2   = idx[1];
3595             idx += 2;
3596             tmp0 = x[i1];
3597             tmp1 = x[i2];
3598             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3599             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3600             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3601             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3602           }
3603 
3604           if (n == sz-1) {
3605             tmp0  = x[*idx];
3606             sum1 -= *v1*tmp0;
3607             sum2 -= *v2*tmp0;
3608             sum3 -= *v3*tmp0;
3609             sum4 -= *v4*tmp0;
3610           }
3611           x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3612           x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3613           x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3614           x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3615           row -= 4;
3616           break;
3617         case 5:
3618 
3619           sum1  = b[row];
3620           sum2  = b[row-1];
3621           sum3  = b[row-2];
3622           sum4  = b[row-3];
3623           sum5  = b[row-4];
3624           v2    = a->a + diag[row-1] + 2;
3625           v3    = a->a + diag[row-2] + 3;
3626           v4    = a->a + diag[row-3] + 4;
3627           v5    = a->a + diag[row-4] + 5;
3628           for (n = 0; n<sz-1; n+=2) {
3629             i1   = idx[0];
3630             i2   = idx[1];
3631             idx += 2;
3632             tmp0 = x[i1];
3633             tmp1 = x[i2];
3634             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3635             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3636             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3637             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3638             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3639           }
3640 
3641           if (n == sz-1) {
3642             tmp0  = x[*idx];
3643             sum1 -= *v1*tmp0;
3644             sum2 -= *v2*tmp0;
3645             sum3 -= *v3*tmp0;
3646             sum4 -= *v4*tmp0;
3647             sum5 -= *v5*tmp0;
3648           }
3649           x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3650           x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3651           x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3652           x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3653           x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3654           row -= 5;
3655           break;
3656         default:
3657           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3658       }
3659     }
3660     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3661 
3662     /*
3663            t = b - D x    where D is the block diagonal
3664     */
3665     cnt = 0;
3666     for (i=0, row=0; i<m; i++) {
3667       switch (sizes[i]) {
3668         case 1:
3669           t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3670           break;
3671         case 2:
3672           x1   = x[row]; x2 = x[row+1];
3673           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3674           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3675           t[row]   = b[row] - tmp1;
3676           t[row+1] = b[row+1] - tmp2; row += 2;
3677           cnt += 4;
3678           break;
3679         case 3:
3680           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3681           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3682           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3683           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3684           t[row] = b[row] - tmp1;
3685           t[row+1] = b[row+1] - tmp2;
3686           t[row+2] = b[row+2] - tmp3; row += 3;
3687           cnt += 9;
3688           break;
3689         case 4:
3690           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3691           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3692           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3693           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3694           tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3695           t[row] = b[row] - tmp1;
3696           t[row+1] = b[row+1] - tmp2;
3697           t[row+2] = b[row+2] - tmp3;
3698           t[row+3] = b[row+3] - tmp4; row += 4;
3699           cnt += 16;
3700           break;
3701         case 5:
3702           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3703           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3704           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3705           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3706           tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3707           tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3708           t[row] = b[row] - tmp1;
3709           t[row+1] = b[row+1] - tmp2;
3710           t[row+2] = b[row+2] - tmp3;
3711           t[row+3] = b[row+3] - tmp4;
3712           t[row+4] = b[row+4] - tmp5;row += 5;
3713           cnt += 25;
3714           break;
3715         default:
3716           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3717       }
3718     }
3719     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3720 
3721 
3722 
3723     /*
3724           Apply (L + D)^-1 where D is the block diagonal
3725     */
3726     for (i=0, row=0; i<m; i++) {
3727       sz  = diag[row] - ii[row];
3728       v1  = a->a + ii[row];
3729       idx = a->j + ii[row];
3730       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3731       switch (sizes[i]) {
3732         case 1:
3733 
3734           sum1  = t[row];
3735           for (n = 0; n<sz-1; n+=2) {
3736             i1   = idx[0];
3737             i2   = idx[1];
3738             idx += 2;
3739             tmp0 = t[i1];
3740             tmp1 = t[i2];
3741             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3742           }
3743 
3744           if (n == sz-1) {
3745             tmp0  = t[*idx];
3746             sum1 -= *v1 * tmp0;
3747           }
3748           x[row] += t[row] = sum1*(*ibdiag++); row++;
3749           break;
3750         case 2:
3751           v2    = a->a + ii[row+1];
3752           sum1  = t[row];
3753           sum2  = t[row+1];
3754           for (n = 0; n<sz-1; n+=2) {
3755             i1   = idx[0];
3756             i2   = idx[1];
3757             idx += 2;
3758             tmp0 = t[i1];
3759             tmp1 = t[i2];
3760             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3761             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3762           }
3763 
3764           if (n == sz-1) {
3765             tmp0  = t[*idx];
3766             sum1 -= v1[0] * tmp0;
3767             sum2 -= v2[0] * tmp0;
3768           }
3769           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3770           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3771           ibdiag  += 4; row += 2;
3772           break;
3773         case 3:
3774           v2    = a->a + ii[row+1];
3775           v3    = a->a + ii[row+2];
3776           sum1  = t[row];
3777           sum2  = t[row+1];
3778           sum3  = t[row+2];
3779           for (n = 0; n<sz-1; n+=2) {
3780             i1   = idx[0];
3781             i2   = idx[1];
3782             idx += 2;
3783             tmp0 = t[i1];
3784             tmp1 = t[i2];
3785             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3786             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3787             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3788           }
3789 
3790           if (n == sz-1) {
3791             tmp0  = t[*idx];
3792             sum1 -= v1[0] * tmp0;
3793             sum2 -= v2[0] * tmp0;
3794             sum3 -= v3[0] * tmp0;
3795           }
3796           x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3797           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3798           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3799           ibdiag  += 9; row += 3;
3800           break;
3801         case 4:
3802           v2    = a->a + ii[row+1];
3803           v3    = a->a + ii[row+2];
3804           v4    = a->a + ii[row+3];
3805           sum1  = t[row];
3806           sum2  = t[row+1];
3807           sum3  = t[row+2];
3808           sum4  = t[row+3];
3809           for (n = 0; n<sz-1; n+=2) {
3810             i1   = idx[0];
3811             i2   = idx[1];
3812             idx += 2;
3813             tmp0 = t[i1];
3814             tmp1 = t[i2];
3815             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3816             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3817             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3818             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3819           }
3820 
3821           if (n == sz-1) {
3822             tmp0  = t[*idx];
3823             sum1 -= v1[0] * tmp0;
3824             sum2 -= v2[0] * tmp0;
3825             sum3 -= v3[0] * tmp0;
3826             sum4 -= v4[0] * tmp0;
3827           }
3828           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3829           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3830           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3831           x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3832           ibdiag  += 16; row += 4;
3833           break;
3834         case 5:
3835           v2    = a->a + ii[row+1];
3836           v3    = a->a + ii[row+2];
3837           v4    = a->a + ii[row+3];
3838           v5    = a->a + ii[row+4];
3839           sum1  = t[row];
3840           sum2  = t[row+1];
3841           sum3  = t[row+2];
3842           sum4  = t[row+3];
3843           sum5  = t[row+4];
3844           for (n = 0; n<sz-1; n+=2) {
3845             i1   = idx[0];
3846             i2   = idx[1];
3847             idx += 2;
3848             tmp0 = t[i1];
3849             tmp1 = t[i2];
3850             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3851             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3852             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3853             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3854             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3855           }
3856 
3857           if (n == sz-1) {
3858             tmp0  = t[*idx];
3859             sum1 -= v1[0] * tmp0;
3860             sum2 -= v2[0] * tmp0;
3861             sum3 -= v3[0] * tmp0;
3862             sum4 -= v4[0] * tmp0;
3863             sum5 -= v5[0] * tmp0;
3864           }
3865           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3866           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3867           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3868           x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3869           x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3870           ibdiag  += 25; row += 5;
3871           break;
3872         default:
3873           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3874       }
3875     }
3876     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3877   }
3878   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3879   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3880   CHKMEMQ;  PetscFunctionReturn(0);
3881 }
3882 
3883 #undef __FUNCT__
3884 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3885 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3886 {
3887   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3888   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3889   const MatScalar    *bdiag = a->inode.bdiag;
3890   const PetscScalar  *b;
3891   PetscErrorCode      ierr;
3892   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3893   const PetscInt      *sizes = a->inode.size;
3894 
3895   PetscFunctionBegin;
3896   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3897   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3898   cnt = 0;
3899   for (i=0, row=0; i<m; i++) {
3900     switch (sizes[i]) {
3901       case 1:
3902         x[row] = b[row]*bdiag[cnt++];row++;
3903         break;
3904       case 2:
3905         x1   = b[row]; x2 = b[row+1];
3906         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3907         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3908         x[row++] = tmp1;
3909         x[row++] = tmp2;
3910         cnt += 4;
3911         break;
3912       case 3:
3913         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3914         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3915         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3916         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3917         x[row++] = tmp1;
3918         x[row++] = tmp2;
3919         x[row++] = tmp3;
3920         cnt += 9;
3921         break;
3922       case 4:
3923         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3924         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3925         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3926         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3927         tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3928         x[row++] = tmp1;
3929         x[row++] = tmp2;
3930         x[row++] = tmp3;
3931         x[row++] = tmp4;
3932         cnt += 16;
3933         break;
3934       case 5:
3935         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3936         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3937         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3938         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3939         tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3940         tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3941         x[row++] = tmp1;
3942         x[row++] = tmp2;
3943         x[row++] = tmp3;
3944         x[row++] = tmp4;
3945         x[row++] = tmp5;
3946         cnt += 25;
3947         break;
3948       default:
3949         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3950     }
3951   }
3952   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3953   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3954   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3955   PetscFunctionReturn(0);
3956 }
3957 
3958 /*
3959     samestructure indicates that the matrix has not changed its nonzero structure so we
3960     do not need to recompute the inodes
3961 */
3962 #undef __FUNCT__
3963 #define __FUNCT__ "Mat_CheckInode"
3964 PetscErrorCode Mat_CheckInode(Mat A,PetscBool  samestructure)
3965 {
3966   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3967   PetscErrorCode ierr;
3968   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3969   PetscBool      flag;
3970 
3971   PetscFunctionBegin;
3972   if (!a->inode.use)                     PetscFunctionReturn(0);
3973   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3974 
3975 
3976   m = A->rmap->n;
3977   if (a->inode.size) {ns = a->inode.size;}
3978   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3979 
3980   i          = 0;
3981   node_count = 0;
3982   idx        = a->j;
3983   ii         = a->i;
3984   while (i < m) {                /* For each row */
3985     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3986     /* Limits the number of elements in a node to 'a->inode.limit' */
3987     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3988       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3989       if (nzy != nzx) break;
3990       idy  += nzx;             /* Same nonzero pattern */
3991       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3992       if (!flag) break;
3993     }
3994     ns[node_count++] = blk_size;
3995     idx += blk_size*nzx;
3996     i    = j;
3997   }
3998   /* If not enough inodes found,, do not use inode version of the routines */
3999   if (!m || node_count > .8*m) {
4000     ierr = PetscFree(ns);CHKERRQ(ierr);
4001     a->inode.node_count       = 0;
4002     a->inode.size             = PETSC_NULL;
4003     a->inode.use              = PETSC_FALSE;
4004     A->ops->mult              = MatMult_SeqAIJ;
4005     A->ops->sor               = MatSOR_SeqAIJ;
4006     A->ops->multadd           = MatMultAdd_SeqAIJ;
4007     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4008     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4009     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4010     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4011     A->ops->coloringpatch     = 0;
4012     A->ops->multdiagonalblock = 0;
4013     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4014   } else {
4015     if (!A->factortype) {
4016       A->ops->mult              = MatMult_SeqAIJ_Inode;
4017       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4018       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4019       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4020       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4021       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4022       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4023       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4024       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4025     } else {
4026       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
4027     }
4028     a->inode.node_count       = node_count;
4029     a->inode.size             = ns;
4030     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4031   }
4032   a->inode.checked = PETSC_TRUE;
4033   PetscFunctionReturn(0);
4034 }
4035 
4036 #undef __FUNCT__
4037 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4038 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4039 {
4040   Mat            B=*C;
4041   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4042   PetscErrorCode ierr;
4043   PetscInt       m=A->rmap->n;
4044 
4045   PetscFunctionBegin;
4046 
4047   c->inode.use          = a->inode.use;
4048   c->inode.limit        = a->inode.limit;
4049   c->inode.max_limit    = a->inode.max_limit;
4050   if (a->inode.size) {
4051     ierr                = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr);
4052     c->inode.node_count = a->inode.node_count;
4053     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4054     /* note the table of functions below should match that in Mat_CheckInode() */
4055     if (!B->factortype) {
4056       B->ops->mult              = MatMult_SeqAIJ_Inode;
4057       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4058       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4059       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4060       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4061       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4062       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4063       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4064       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4065     } else {
4066       B->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
4067     }
4068   } else {
4069     c->inode.size       = 0;
4070     c->inode.node_count = 0;
4071   }
4072   c->inode.ibdiagvalid = PETSC_FALSE;
4073   c->inode.ibdiag      = 0;
4074   c->inode.bdiag       = 0;
4075   PetscFunctionReturn(0);
4076 }
4077 
4078 #undef __FUNCT__
4079 #define __FUNCT__ "MatGetRow_FactoredLU"
4080 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
4081 {
4082   PetscInt k, *vi;
4083   PetscFunctionBegin;
4084   vi = aj + ai[row];
4085   for (k=0;k<nzl;k++) cols[k] = vi[k];
4086   vi = aj + adiag[row];
4087   cols[nzl] = vi[0];
4088   vi = aj + adiag[row+1]+1;
4089   for (k=0;k<nzu;k++) cols[nzl+1+k] = vi[k];
4090   PetscFunctionReturn(0);
4091 }
4092 /*
4093    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4094    Modified from Mat_CheckInode().
4095 
4096    Input Parameters:
4097 +  Mat A - ILU or LU matrix factor
4098 -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
4099     do not need to recompute the inodes
4100 */
4101 #undef __FUNCT__
4102 #define __FUNCT__ "Mat_CheckInode_FactorLU"
4103 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool  samestructure)
4104 {
4105   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4106   PetscErrorCode ierr;
4107   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4108   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
4109   PetscBool      flag;
4110 
4111   PetscFunctionBegin;
4112   if (!a->inode.use)                     PetscFunctionReturn(0);
4113   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
4114 
4115   m = A->rmap->n;
4116   if (a->inode.size) {ns = a->inode.size;}
4117   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
4118 
4119   i          = 0;
4120   node_count = 0;
4121   while (i < m) {                /* For each row */
4122     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4123     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4124     nzx  = nzl1 + nzu1 + 1;
4125     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
4126     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4127 
4128     /* Limits the number of elements in a node to 'a->inode.limit' */
4129     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4130       nzl2    = ai[j+1] - ai[j];
4131       nzu2    = adiag[j] - adiag[j+1] - 1;
4132       nzy     = nzl2 + nzu2 + 1;
4133       if ( nzy != nzx) break;
4134       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
4135       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4136       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4137       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
4138       ierr = PetscFree(cols2);CHKERRQ(ierr);
4139     }
4140     ns[node_count++] = blk_size;
4141     ierr = PetscFree(cols1);CHKERRQ(ierr);
4142     i    = j;
4143   }
4144   /* If not enough inodes found,, do not use inode version of the routines */
4145   if (!m || node_count > .8*m) {
4146     ierr = PetscFree(ns);CHKERRQ(ierr);
4147     a->inode.node_count     = 0;
4148     a->inode.size           = PETSC_NULL;
4149     a->inode.use            = PETSC_FALSE;
4150     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4151   } else {
4152     A->ops->mult              = 0;
4153     A->ops->sor               = 0;
4154     A->ops->multadd           = 0;
4155     A->ops->getrowij          = 0;
4156     A->ops->restorerowij      = 0;
4157     A->ops->getcolumnij       = 0;
4158     A->ops->restorecolumnij   = 0;
4159     A->ops->coloringpatch     = 0;
4160     A->ops->multdiagonalblock = 0;
4161     A->ops->solve             = MatSolve_SeqAIJ_Inode;
4162     a->inode.node_count       = node_count;
4163     a->inode.size             = ns;
4164     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4165   }
4166   a->inode.checked = PETSC_TRUE;
4167   PetscFunctionReturn(0);
4168 }
4169 
4170 #undef __FUNCT__
4171 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4172 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4173 {
4174   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
4175 
4176   PetscFunctionBegin;
4177   a->inode.ibdiagvalid = PETSC_FALSE;
4178   PetscFunctionReturn(0);
4179 }
4180 
4181 /*
4182      This is really ugly. if inodes are used this replaces the
4183   permutations with ones that correspond to rows/cols of the matrix
4184   rather then inode blocks
4185 */
4186 #undef __FUNCT__
4187 #define __FUNCT__ "MatInodeAdjustForInodes"
4188 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4189 {
4190   PetscErrorCode ierr;
4191 
4192   PetscFunctionBegin;
4193   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4194   PetscFunctionReturn(0);
4195 }
4196 
4197 EXTERN_C_BEGIN
4198 #undef __FUNCT__
4199 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4200 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4201 {
4202   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
4203   PetscErrorCode ierr;
4204   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4205   const PetscInt *ridx,*cidx;
4206   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4207   PetscInt       nslim_col,*ns_col;
4208   IS             ris = *rperm,cis = *cperm;
4209 
4210   PetscFunctionBegin;
4211   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4212   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4213 
4214   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4215   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
4216   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
4217 
4218   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4219   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4220 
4221   /* Form the inode structure for the rows of permuted matric using inv perm*/
4222   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4223 
4224   /* Construct the permutations for rows*/
4225   for (i=0,row = 0; i<nslim_row; ++i) {
4226     indx      = ridx[i];
4227     start_val = tns[indx];
4228     end_val   = tns[indx + 1];
4229     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4230   }
4231 
4232   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4233   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4234 
4235  /* Construct permutations for columns */
4236   for (i=0,col=0; i<nslim_col; ++i) {
4237     indx      = cidx[i];
4238     start_val = tns[indx];
4239     end_val   = tns[indx + 1];
4240     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4241   }
4242 
4243   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4244   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4245   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4246   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4247 
4248   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4249   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4250 
4251   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4252   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4253   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4254   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4255   ierr = PetscFree(tns);CHKERRQ(ierr);
4256   PetscFunctionReturn(0);
4257 }
4258 EXTERN_C_END
4259 
4260 #undef __FUNCT__
4261 #define __FUNCT__ "MatInodeGetInodeSizes"
4262 /*@C
4263    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4264 
4265    Not Collective
4266 
4267    Input Parameter:
4268 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4269 
4270    Output Parameter:
4271 +  node_count - no of inodes present in the matrix.
4272 .  sizes      - an array of size node_count,with sizes of each inode.
4273 -  limit      - the max size used to generate the inodes.
4274 
4275    Level: advanced
4276 
4277    Notes: This routine returns some internal storage information
4278    of the matrix, it is intended to be used by advanced users.
4279    It should be called after the matrix is assembled.
4280    The contents of the sizes[] array should not be changed.
4281    PETSC_NULL may be passed for information not requested.
4282 
4283 .keywords: matrix, seqaij, get, inode
4284 
4285 .seealso: MatGetInfo()
4286 @*/
4287 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4288 {
4289   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4290 
4291   PetscFunctionBegin;
4292   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4293   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
4294   if (f) {
4295     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4296   }
4297   PetscFunctionReturn(0);
4298 }
4299 
4300 EXTERN_C_BEGIN
4301 #undef __FUNCT__
4302 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4303 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4304 {
4305   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4306 
4307   PetscFunctionBegin;
4308   if (node_count) *node_count = a->inode.node_count;
4309   if (sizes)      *sizes      = a->inode.size;
4310   if (limit)      *limit      = a->inode.limit;
4311   PetscFunctionReturn(0);
4312 }
4313 EXTERN_C_END
4314