xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 0700a8246d308f50502909ba325e6169d3ee27eb)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1195   FactorShiftCtx   sctx;
1196   const PetscInt   *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,node_max,col;
1200   const PetscInt   *ns;
1201   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap,newshift;
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++)
1216 	rs += PetscAbsScalar(v[j]);
1217       if (rs>sctx.shift_top) sctx.shift_top = rs;
1218     }
1219     sctx.shift_top   *= 1.1;
1220     sctx.nshift_max   = 5;
1221     sctx.shift_lo     = 0.;
1222     sctx.shift_hi     = 1.;
1223   }
1224 
1225   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1226   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1227 
1228   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1229   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1230   rtmp2 = rtmp1 + n;
1231   rtmp3 = rtmp2 + n;
1232   rtmp4 = rtmp3 + n;
1233   ics   = ic;
1234 
1235   node_max = a->inode.node_count;
1236   ns       = a->inode.size;
1237   if (!ns){
1238     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1239   }
1240 
1241   /* If max inode size > 4, split it into two inodes.*/
1242   /* also map the inode sizes according to the ordering */
1243   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1244   for (i=0,j=0; i<node_max; ++i,++j){
1245     if (ns[i]>3) {
1246       tmp_vec1[j] = ns[i]/2;
1247       ++j;
1248       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1249     } else {
1250       tmp_vec1[j] = ns[i];
1251     }
1252   }
1253   /* Use the correct node_max */
1254   node_max = j;
1255 
1256   /* Now reorder the inode info based on mat re-ordering info */
1257   /* First create a row -> inode_size_array_index map */
1258   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1259   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1260   for (i=0,row=0; i<node_max; i++) {
1261     nodesz = tmp_vec1[i];
1262     for (j=0; j<nodesz; j++,row++) {
1263       nsmap[row] = i;
1264     }
1265   }
1266   /* Using nsmap, create a reordered ns structure */
1267   for (i=0,j=0; i< node_max; i++) {
1268     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1269     tmp_vec2[i]  = nodesz;
1270     j           += nodesz;
1271   }
1272   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1273   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1274 
1275   /* Now use the correct ns */
1276   ns = tmp_vec2;
1277 
1278   do {
1279     sctx.useshift = PETSC_FALSE;
1280     /* Now loop over each block-row, and do the factorization */
1281     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1282       nodesz = ns[inod];
1283 
1284       switch (nodesz){
1285       case 1:
1286       /*----------*/
1287         /* zero rtmp1 */
1288         /* L part */
1289         nz    = bi[i+1] - bi[i];
1290         bjtmp = bj + bi[i];
1291         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1292 
1293         /* U part */
1294         nz = bdiag[i]-bdiag[i+1];
1295         bjtmp = bj + bdiag[i+1]+1;
1296         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1297 
1298         /* load in initial (unfactored row) */
1299         nz    = ai[r[i]+1] - ai[r[i]];
1300         ajtmp = aj + ai[r[i]];
1301         v     = aa + ai[r[i]];
1302         for (j=0; j<nz; j++) {
1303           rtmp1[ics[ajtmp[j]]] = v[j];
1304         }
1305         /* ZeropivotApply() */
1306         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1307 
1308         /* elimination */
1309         bjtmp = bj + bi[i];
1310         row   = *bjtmp++;
1311         nzL   = bi[i+1] - bi[i];
1312         for(k=0; k < nzL;k++) {
1313           pc = rtmp1 + row;
1314           if (*pc != 0.0) {
1315             pv   = b->a + bdiag[row];
1316             mul1 = *pc * (*pv);
1317             *pc  = mul1;
1318             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1319             pv = b->a + bdiag[row+1]+1;
1320             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1321             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1322             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1323           }
1324           row = *bjtmp++;
1325         }
1326 
1327         /* finished row so stick it into b->a */
1328         rs = 0.0;
1329         /* L part */
1330         pv = b->a + bi[i] ;
1331         pj = b->j + bi[i] ;
1332         nz = bi[i+1] - bi[i];
1333         for (j=0; j<nz; j++) {
1334           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1335         }
1336 
1337         /* U part */
1338         pv = b->a + bdiag[i+1]+1;
1339         pj = b->j + bdiag[i+1]+1;
1340         nz = bdiag[i] - bdiag[i+1]-1;
1341         for (j=0; j<nz; j++) {
1342           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1343         }
1344 
1345         /* Check zero pivot */
1346         sctx.rs = rs;
1347         sctx.pv = rtmp1[i];
1348         ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr);
1349 	if(newshift == 1) break;
1350 
1351         /* Mark diagonal and invert diagonal for simplier triangular solves */
1352         pv  = b->a + bdiag[i];
1353         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1354         break;
1355 
1356       case 2:
1357       /*----------*/
1358         /* zero rtmp1 and rtmp2 */
1359         /* L part */
1360         nz    = bi[i+1] - bi[i];
1361         bjtmp = bj + bi[i];
1362         for  (j=0; j<nz; j++) {
1363           col = bjtmp[j];
1364           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1365         }
1366 
1367         /* U part */
1368         nz = bdiag[i]-bdiag[i+1];
1369         bjtmp = bj + bdiag[i+1]+1;
1370         for  (j=0; j<nz; j++) {
1371           col = bjtmp[j];
1372           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1373         }
1374 
1375         /* load in initial (unfactored row) */
1376         nz    = ai[r[i]+1] - ai[r[i]];
1377         ajtmp = aj + ai[r[i]];
1378         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1379         for (j=0; j<nz; j++) {
1380           col = ics[ajtmp[j]];
1381           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1382         }
1383         /* ZeropivotApply(): shift the diagonal of the matrix  */
1384         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1385 
1386         /* elimination */
1387         bjtmp = bj + bi[i];
1388         row   = *bjtmp++; /* pivot row */
1389         nzL   = bi[i+1] - bi[i];
1390         for(k=0; k < nzL;k++) {
1391           pc1 = rtmp1 + row;
1392           pc2 = rtmp2 + row;
1393           if (*pc1 != 0.0 || *pc2 != 0.0) {
1394             pv   = b->a + bdiag[row];
1395             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1396             *pc1 = mul1;       *pc2 = mul2;
1397 
1398             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1399             pv = b->a + bdiag[row+1]+1;
1400             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1401             for (j=0; j<nz; j++){
1402               col = pj[j];
1403               rtmp1[col] -= mul1 * pv[j];
1404               rtmp2[col] -= mul2 * pv[j];
1405             }
1406             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1407           }
1408           row = *bjtmp++;
1409         }
1410 
1411         /* finished row i; check zero pivot, then stick row i into b->a */
1412         rs  = 0.0;
1413         /* L part */
1414         pc1 = b->a + bi[i];
1415         pj  = b->j + bi[i] ;
1416         nz  = bi[i+1] - bi[i];
1417         for (j=0; j<nz; j++) {
1418           col = pj[j];
1419           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1420         }
1421         /* U part */
1422         pc1 = b->a + bdiag[i+1]+1;
1423         pj  = b->j + bdiag[i+1]+1;
1424         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1425         for (j=0; j<nz; j++) {
1426           col = pj[j];
1427           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1428         }
1429 
1430         sctx.rs  = rs;
1431         sctx.pv  = rtmp1[i];
1432         ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr);
1433 	if(newshift == 1) break;
1434         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1435         *pc1 = 1.0/sctx.pv;
1436 
1437         /* Now take care of diagonal 2x2 block. */
1438         pc2 = rtmp2 + i;
1439         if (*pc2 != 0.0){
1440           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1441           *pc2 = mul1;          /* insert L entry */
1442           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1443           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1444           for (j=0; j<nz; j++) {
1445             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1446           }
1447           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1448         }
1449 
1450         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1451         rs = 0.0;
1452         /* L part */
1453         pc2 = b->a + bi[i+1];
1454         pj  = b->j + bi[i+1] ;
1455         nz  = bi[i+2] - bi[i+1];
1456         for (j=0; j<nz; j++) {
1457           col = pj[j];
1458           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1459         }
1460         /* U part */
1461         pc2 = b->a + bdiag[i+2]+1;
1462         pj  = b->j + bdiag[i+2]+1;
1463         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1464         for (j=0; j<nz; j++) {
1465           col = pj[j];
1466           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1467         }
1468 
1469         sctx.rs  = rs;
1470         sctx.pv  = rtmp2[i+1];
1471         ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr);
1472 	if(newshift == 1) break;
1473         pc2  = b->a + bdiag[i+1];
1474         *pc2 = 1.0/sctx.pv;
1475         break;
1476 
1477       case 3:
1478       /*----------*/
1479         /* zero rtmp */
1480         /* L part */
1481         nz    = bi[i+1] - bi[i];
1482         bjtmp = bj + bi[i];
1483         for  (j=0; j<nz; j++) {
1484           col = bjtmp[j];
1485           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1486         }
1487 
1488         /* U part */
1489         nz = bdiag[i]-bdiag[i+1];
1490         bjtmp = bj + bdiag[i+1]+1;
1491         for  (j=0; j<nz; j++) {
1492           col = bjtmp[j];
1493           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1494         }
1495 
1496         /* load in initial (unfactored row) */
1497         nz    = ai[r[i]+1] - ai[r[i]];
1498         ajtmp = aj + ai[r[i]];
1499         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1500         for (j=0; j<nz; j++) {
1501           col = ics[ajtmp[j]];
1502           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1503         }
1504         /* ZeropivotApply(): shift the diagonal of the matrix  */
1505         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1506 
1507         /* elimination */
1508         bjtmp = bj + bi[i];
1509         row   = *bjtmp++; /* pivot row */
1510         nzL   = bi[i+1] - bi[i];
1511         for(k=0; k < nzL;k++) {
1512           pc1 = rtmp1 + row;
1513           pc2 = rtmp2 + row;
1514           pc3 = rtmp3 + row;
1515           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1516             pv  = b->a + bdiag[row];
1517             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1518             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1519 
1520             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1521             pv = b->a + bdiag[row+1]+1;
1522             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1523             for (j=0; j<nz; j++){
1524               col = pj[j];
1525               rtmp1[col] -= mul1 * pv[j];
1526               rtmp2[col] -= mul2 * pv[j];
1527               rtmp3[col] -= mul3 * pv[j];
1528             }
1529             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1530           }
1531           row = *bjtmp++;
1532         }
1533 
1534         /* finished row i; check zero pivot, then stick row i into b->a */
1535         rs  = 0.0;
1536         /* L part */
1537         pc1 = b->a + bi[i];
1538         pj  = b->j + bi[i] ;
1539         nz  = bi[i+1] - bi[i];
1540         for (j=0; j<nz; j++) {
1541           col = pj[j];
1542           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1543         }
1544         /* U part */
1545         pc1 = b->a + bdiag[i+1]+1;
1546         pj  = b->j + bdiag[i+1]+1;
1547         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1548         for (j=0; j<nz; j++) {
1549           col = pj[j];
1550           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1551         }
1552 
1553         sctx.rs  = rs;
1554         sctx.pv  = rtmp1[i];
1555         ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr);
1556 	if(newshift == 1) break;
1557         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1558         *pc1 = 1.0/sctx.pv;
1559 
1560         /* Now take care of 1st column of diagonal 3x3 block. */
1561         pc2 = rtmp2 + i;
1562         pc3 = rtmp3 + i;
1563         if (*pc2 != 0.0 || *pc3 != 0.0){
1564           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1565           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1566           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1567           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1568           for (j=0; j<nz; j++) {
1569             col = pj[j];
1570             rtmp2[col] -= mul2 * rtmp1[col];
1571             rtmp3[col] -= mul3 * rtmp1[col];
1572           }
1573           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1574         }
1575 
1576         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1577         rs = 0.0;
1578         /* L part */
1579         pc2 = b->a + bi[i+1];
1580         pj  = b->j + bi[i+1] ;
1581         nz  = bi[i+2] - bi[i+1];
1582         for (j=0; j<nz; j++) {
1583           col = pj[j];
1584           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1585         }
1586         /* U part */
1587         pc2 = b->a + bdiag[i+2]+1;
1588         pj  = b->j + bdiag[i+2]+1;
1589         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1590         for (j=0; j<nz; j++) {
1591           col = pj[j];
1592           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1593         }
1594 
1595         sctx.rs  = rs;
1596         sctx.pv  = rtmp2[i+1];
1597         ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr);
1598 	if(newshift == 1) break;
1599         pc2  = b->a + bdiag[i+1];
1600         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1601 
1602         /* Now take care of 2nd column of diagonal 3x3 block. */
1603         pc3 = rtmp3 + i+1;
1604         if (*pc3 != 0.0){
1605           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1606           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1607           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1608           for (j=0; j<nz; j++) {
1609             col = pj[j];
1610             rtmp3[col] -= mul3 * rtmp2[col];
1611           }
1612           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1613         }
1614 
1615         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1616         rs = 0.0;
1617         /* L part */
1618         pc3 = b->a + bi[i+2];
1619         pj  = b->j + bi[i+2] ;
1620         nz  = bi[i+3] - bi[i+2];
1621         for (j=0; j<nz; j++) {
1622           col = pj[j];
1623           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1624         }
1625         /* U part */
1626         pc3 = b->a + bdiag[i+3]+1;
1627         pj  = b->j + bdiag[i+3]+1;
1628         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1629         for (j=0; j<nz; j++) {
1630           col = pj[j];
1631           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1632         }
1633 
1634         sctx.rs  = rs;
1635         sctx.pv  = rtmp3[i+2];
1636         ierr = MatPivotCheck(info,&sctx,i+2,&newshift);CHKERRQ(ierr);
1637 	if(newshift == 1) break;
1638         pc3  = b->a + bdiag[i+2];
1639         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1640         break;
1641       case 4:
1642       /*----------*/
1643         /* zero rtmp */
1644         /* L part */
1645         nz    = bi[i+1] - bi[i];
1646         bjtmp = bj + bi[i];
1647         for  (j=0; j<nz; j++) {
1648           col = bjtmp[j];
1649           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1650         }
1651 
1652         /* U part */
1653         nz = bdiag[i]-bdiag[i+1];
1654         bjtmp = bj + bdiag[i+1]+1;
1655         for  (j=0; j<nz; j++) {
1656           col = bjtmp[j];
1657           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1658         }
1659 
1660         /* load in initial (unfactored row) */
1661         nz    = ai[r[i]+1] - ai[r[i]];
1662         ajtmp = aj + ai[r[i]];
1663         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1664         for (j=0; j<nz; j++) {
1665           col = ics[ajtmp[j]];
1666           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1667         }
1668         /* ZeropivotApply(): shift the diagonal of the matrix  */
1669         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1670 
1671         /* elimination */
1672         bjtmp = bj + bi[i];
1673         row   = *bjtmp++; /* pivot row */
1674         nzL   = bi[i+1] - bi[i];
1675         for(k=0; k < nzL;k++) {
1676           pc1 = rtmp1 + row;
1677           pc2 = rtmp2 + row;
1678           pc3 = rtmp3 + row;
1679 	  pc4 = rtmp4 + row;
1680           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1681             pv  = b->a + bdiag[row];
1682             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1683             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1684 
1685             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1686             pv = b->a + bdiag[row+1]+1;
1687             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1688             for (j=0; j<nz; j++){
1689               col = pj[j];
1690               rtmp1[col] -= mul1 * pv[j];
1691               rtmp2[col] -= mul2 * pv[j];
1692               rtmp3[col] -= mul3 * pv[j];
1693 	      rtmp4[col] -= mul4 * pv[j];
1694             }
1695             ierr = PetscLogFlops(8*nz);CHKERRQ(ierr);
1696           }
1697           row = *bjtmp++;
1698         }
1699 
1700         /* finished row i; check zero pivot, then stick row i into b->a */
1701         rs  = 0.0;
1702         /* L part */
1703         pc1 = b->a + bi[i];
1704         pj  = b->j + bi[i] ;
1705         nz  = bi[i+1] - bi[i];
1706         for (j=0; j<nz; j++) {
1707           col = pj[j];
1708           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1709         }
1710         /* U part */
1711         pc1 = b->a + bdiag[i+1]+1;
1712         pj  = b->j + bdiag[i+1]+1;
1713         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1714         for (j=0; j<nz; j++) {
1715           col = pj[j];
1716           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1717         }
1718 
1719         sctx.rs  = rs;
1720         sctx.pv  = rtmp1[i];
1721         ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr);
1722 	if(newshift == 1) break;
1723         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1724         *pc1 = 1.0/sctx.pv;
1725 
1726         /* Now take care of 1st column of diagonal 4x4 block. */
1727         pc2 = rtmp2 + i;
1728         pc3 = rtmp3 + i;
1729 	pc4 = rtmp4 + i;
1730         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1731           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1732           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1733 	  mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1734           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1735           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1736           for (j=0; j<nz; j++) {
1737             col = pj[j];
1738             rtmp2[col] -= mul2 * rtmp1[col];
1739             rtmp3[col] -= mul3 * rtmp1[col];
1740 	    rtmp4[col] -= mul4 * rtmp1[col];
1741           }
1742           ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1743         }
1744 
1745         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1746         rs = 0.0;
1747         /* L part */
1748         pc2 = b->a + bi[i+1];
1749         pj  = b->j + bi[i+1] ;
1750         nz  = bi[i+2] - bi[i+1];
1751         for (j=0; j<nz; j++) {
1752           col = pj[j];
1753           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1754         }
1755         /* U part */
1756         pc2 = b->a + bdiag[i+2]+1;
1757         pj  = b->j + bdiag[i+2]+1;
1758         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1759         for (j=0; j<nz; j++) {
1760           col = pj[j];
1761           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1762         }
1763 
1764         sctx.rs  = rs;
1765         sctx.pv  = rtmp2[i+1];
1766         ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr);
1767 	if(newshift == 1) break;
1768         pc2  = b->a + bdiag[i+1];
1769         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1770 
1771         /* Now take care of 2nd column of diagonal 4x4 block. */
1772         pc3 = rtmp3 + i+1;
1773 	pc4 = rtmp4 + i+1;
1774         if (*pc3 != 0.0 || *pc4 != 0.0){
1775           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1776 	  mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1777           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1778           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1779           for (j=0; j<nz; j++) {
1780             col = pj[j];
1781             rtmp3[col] -= mul3 * rtmp2[col];
1782 	    rtmp4[col] -= mul4 * rtmp2[col];
1783           }
1784           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1785         }
1786 
1787         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1788         rs = 0.0;
1789         /* L part */
1790         pc3 = b->a + bi[i+2];
1791         pj  = b->j + bi[i+2] ;
1792         nz  = bi[i+3] - bi[i+2];
1793         for (j=0; j<nz; j++) {
1794           col = pj[j];
1795           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1796         }
1797         /* U part */
1798         pc3 = b->a + bdiag[i+3]+1;
1799         pj  = b->j + bdiag[i+3]+1;
1800         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1801         for (j=0; j<nz; j++) {
1802           col = pj[j];
1803           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1804         }
1805 
1806         sctx.rs  = rs;
1807         sctx.pv  = rtmp3[i+2];
1808         ierr = MatPivotCheck(info,&sctx,i+2,&newshift);CHKERRQ(ierr);
1809 	if(newshift == 1) break;
1810         pc3  = b->a + bdiag[i+2];
1811         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1812 
1813 	/* Now take care of 3rd column of diagonal 4x4 block. */
1814 	pc4 = rtmp4 + i+2;
1815         if (*pc4 != 0.0){
1816 	  mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1817           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1818           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1819           for (j=0; j<nz; j++) {
1820             col = pj[j];
1821 	    rtmp4[col] -= mul4 * rtmp3[col];
1822           }
1823           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1824         }
1825 
1826         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1827         rs = 0.0;
1828         /* L part */
1829         pc4 = b->a + bi[i+3];
1830         pj  = b->j + bi[i+3] ;
1831         nz  = bi[i+4] - bi[i+3];
1832         for (j=0; j<nz; j++) {
1833           col = pj[j];
1834           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1835         }
1836         /* U part */
1837         pc4 = b->a + bdiag[i+4]+1;
1838         pj  = b->j + bdiag[i+4]+1;
1839         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1840         for (j=0; j<nz; j++) {
1841           col = pj[j];
1842           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1843         }
1844 
1845         sctx.rs  = rs;
1846         sctx.pv  = rtmp4[i+3];
1847         ierr = MatPivotCheck(info,&sctx,i+3,&newshift);CHKERRQ(ierr);
1848 	if(newshift == 1) break;
1849         pc4  = b->a + bdiag[i+3];
1850         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1851          break;
1852 
1853       default:
1854         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
1855       }
1856       i += nodesz;                 /* Update the row */
1857     }
1858 
1859     /* MatPivotRefine() */
1860     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1861       /*
1862        * if no shift in this attempt & shifting & started shifting & can refine,
1863        * then try lower shift
1864        */
1865       sctx.shift_hi       = sctx.shift_fraction;
1866       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1867       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1868       sctx.useshift       = PETSC_TRUE;
1869       sctx.nshift++;
1870     }
1871   } while (sctx.useshift);
1872 
1873   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1874   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1875   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1876   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1877 
1878   C->ops->solve              = MatSolve_SeqAIJ;
1879   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1880   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1881   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1882   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1883   C->assembled    = PETSC_TRUE;
1884   C->preallocated = PETSC_TRUE;
1885   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1886 
1887   /* MatShiftView(A,info,&sctx) */
1888   if (sctx.nshift){
1889     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1890       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1891     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1892       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1893     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1894       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1895     }
1896   }
1897   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1898   PetscFunctionReturn(0);
1899 }
1900 
1901 #undef __FUNCT__
1902 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1903 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1904 {
1905   Mat               C = B;
1906   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1907   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1908   PetscErrorCode    ierr;
1909   const PetscInt    *r,*ic,*c,*ics;
1910   PetscInt          n = A->rmap->n,*bi = b->i;
1911   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1912   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1913   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1914   PetscScalar       mul1,mul2,mul3,tmp;
1915   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1916   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1917   PetscReal         rs=0.0;
1918   FactorShiftCtx    sctx;
1919   PetscInt          newshift;
1920 
1921   PetscFunctionBegin;
1922   sctx.shift_top      = 0;
1923   sctx.nshift_max     = 0;
1924   sctx.shift_lo       = 0;
1925   sctx.shift_hi       = 0;
1926   sctx.shift_fraction = 0;
1927 
1928   /* if both shift schemes are chosen by user, only use info->shiftpd */
1929   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1930     sctx.shift_top = 0;
1931     for (i=0; i<n; i++) {
1932       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1933       rs    = 0.0;
1934       ajtmp = aj + ai[i];
1935       rtmp1 = aa + ai[i];
1936       nz = ai[i+1] - ai[i];
1937       for (j=0; j<nz; j++){
1938         if (*ajtmp != i){
1939           rs += PetscAbsScalar(*rtmp1++);
1940         } else {
1941           rs -= PetscRealPart(*rtmp1++);
1942         }
1943         ajtmp++;
1944       }
1945       if (rs>sctx.shift_top) sctx.shift_top = rs;
1946     }
1947     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1948     sctx.shift_top *= 1.1;
1949     sctx.nshift_max = 5;
1950     sctx.shift_lo   = 0.;
1951     sctx.shift_hi   = 1.;
1952   }
1953   sctx.shift_amount = 0;
1954   sctx.nshift       = 0;
1955 
1956   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1957   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1958   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1959   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1960   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1961   ics   = ic ;
1962   rtmp22 = rtmp11 + n;
1963   rtmp33 = rtmp22 + n;
1964 
1965   node_max = a->inode.node_count;
1966   ns       = a->inode.size;
1967   if (!ns){
1968     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1969   }
1970 
1971   /* If max inode size > 3, split it into two inodes.*/
1972   /* also map the inode sizes according to the ordering */
1973   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1974   for (i=0,j=0; i<node_max; ++i,++j){
1975     if (ns[i]>3) {
1976       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1977       ++j;
1978       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1979     } else {
1980       tmp_vec1[j] = ns[i];
1981     }
1982   }
1983   /* Use the correct node_max */
1984   node_max = j;
1985 
1986   /* Now reorder the inode info based on mat re-ordering info */
1987   /* First create a row -> inode_size_array_index map */
1988   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1989   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1990   for (i=0,row=0; i<node_max; i++) {
1991     nodesz = tmp_vec1[i];
1992     for (j=0; j<nodesz; j++,row++) {
1993       nsmap[row] = i;
1994     }
1995   }
1996   /* Using nsmap, create a reordered ns structure */
1997   for (i=0,j=0; i< node_max; i++) {
1998     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1999     tmp_vec2[i]  = nodesz;
2000     j           += nodesz;
2001   }
2002   ierr = PetscFree(nsmap);CHKERRQ(ierr);
2003   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2004   /* Now use the correct ns */
2005   ns = tmp_vec2;
2006 
2007   do {
2008     sctx.useshift = PETSC_FALSE;
2009     /* Now loop over each block-row, and do the factorization */
2010     for (i=0,row=0; i<node_max; i++) {
2011       nodesz = ns[i];
2012       nz     = bi[row+1] - bi[row];
2013       bjtmp  = bj + bi[row];
2014 
2015       switch (nodesz){
2016       case 1:
2017         for  (j=0; j<nz; j++){
2018           idx        = bjtmp[j];
2019           rtmp11[idx] = 0.0;
2020         }
2021 
2022         /* load in initial (unfactored row) */
2023         idx    = r[row];
2024         nz_tmp = ai[idx+1] - ai[idx];
2025         ajtmp  = aj + ai[idx];
2026         v1     = aa + ai[idx];
2027 
2028         for (j=0; j<nz_tmp; j++) {
2029           idx        = ics[ajtmp[j]];
2030           rtmp11[idx] = v1[j];
2031         }
2032         rtmp11[ics[r[row]]] += sctx.shift_amount;
2033 
2034         prow = *bjtmp++ ;
2035         while (prow < row) {
2036           pc1 = rtmp11 + prow;
2037           if (*pc1 != 0.0){
2038             pv   = ba + bd[prow];
2039             pj   = nbj + bd[prow];
2040             mul1 = *pc1 * *pv++;
2041             *pc1 = mul1;
2042             nz_tmp = bi[prow+1] - bd[prow] - 1;
2043             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2044             for (j=0; j<nz_tmp; j++) {
2045               tmp = pv[j];
2046               idx = pj[j];
2047               rtmp11[idx] -= mul1 * tmp;
2048             }
2049           }
2050           prow = *bjtmp++ ;
2051         }
2052         pj  = bj + bi[row];
2053         pc1 = ba + bi[row];
2054 
2055         sctx.pv    = rtmp11[row];
2056         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2057         rs         = 0.0;
2058         for (j=0; j<nz; j++) {
2059           idx    = pj[j];
2060           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2061           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2062         }
2063         sctx.rs  = rs;
2064         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2065         if (newshift == 1) goto endofwhile;
2066         break;
2067 
2068       case 2:
2069         for (j=0; j<nz; j++) {
2070           idx        = bjtmp[j];
2071           rtmp11[idx] = 0.0;
2072           rtmp22[idx] = 0.0;
2073         }
2074 
2075         /* load in initial (unfactored row) */
2076         idx    = r[row];
2077         nz_tmp = ai[idx+1] - ai[idx];
2078         ajtmp  = aj + ai[idx];
2079         v1     = aa + ai[idx];
2080         v2     = aa + ai[idx+1];
2081         for (j=0; j<nz_tmp; j++) {
2082           idx        = ics[ajtmp[j]];
2083           rtmp11[idx] = v1[j];
2084           rtmp22[idx] = v2[j];
2085         }
2086         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2087         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2088 
2089         prow = *bjtmp++ ;
2090         while (prow < row) {
2091           pc1 = rtmp11 + prow;
2092           pc2 = rtmp22 + prow;
2093           if (*pc1 != 0.0 || *pc2 != 0.0){
2094             pv   = ba + bd[prow];
2095             pj   = nbj + bd[prow];
2096             mul1 = *pc1 * *pv;
2097             mul2 = *pc2 * *pv;
2098             ++pv;
2099             *pc1 = mul1;
2100             *pc2 = mul2;
2101 
2102             nz_tmp = bi[prow+1] - bd[prow] - 1;
2103             for (j=0; j<nz_tmp; j++) {
2104               tmp = pv[j];
2105               idx = pj[j];
2106               rtmp11[idx] -= mul1 * tmp;
2107               rtmp22[idx] -= mul2 * tmp;
2108             }
2109             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2110           }
2111           prow = *bjtmp++ ;
2112         }
2113 
2114         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2115         pc1 = rtmp11 + prow;
2116         pc2 = rtmp22 + prow;
2117 
2118         sctx.pv = *pc1;
2119         pj      = bj + bi[prow];
2120         rs      = 0.0;
2121         for (j=0; j<nz; j++){
2122           idx = pj[j];
2123           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2124         }
2125         sctx.rs = rs;
2126         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2127         if (newshift == 1) goto endofwhile;
2128 
2129         if (*pc2 != 0.0){
2130           pj     = nbj + bd[prow];
2131           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2132           *pc2   = mul2;
2133           nz_tmp = bi[prow+1] - bd[prow] - 1;
2134           for (j=0; j<nz_tmp; j++) {
2135             idx = pj[j] ;
2136             tmp = rtmp11[idx];
2137             rtmp22[idx] -= mul2 * tmp;
2138           }
2139           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2140         }
2141 
2142         pj  = bj + bi[row];
2143         pc1 = ba + bi[row];
2144         pc2 = ba + bi[row+1];
2145 
2146         sctx.pv = rtmp22[row+1];
2147         rs = 0.0;
2148         rtmp11[row]   = 1.0/rtmp11[row];
2149         rtmp22[row+1] = 1.0/rtmp22[row+1];
2150         /* copy row entries from dense representation to sparse */
2151         for (j=0; j<nz; j++) {
2152           idx    = pj[j];
2153           pc1[j] = rtmp11[idx];
2154           pc2[j] = rtmp22[idx];
2155           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2156         }
2157         sctx.rs = rs;
2158         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
2159         if (newshift == 1) goto endofwhile;
2160         break;
2161 
2162       case 3:
2163         for  (j=0; j<nz; j++) {
2164           idx        = bjtmp[j];
2165           rtmp11[idx] = 0.0;
2166           rtmp22[idx] = 0.0;
2167           rtmp33[idx] = 0.0;
2168         }
2169         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2170         idx    = r[row];
2171         nz_tmp = ai[idx+1] - ai[idx];
2172         ajtmp = aj + ai[idx];
2173         v1    = aa + ai[idx];
2174         v2    = aa + ai[idx+1];
2175         v3    = aa + ai[idx+2];
2176         for (j=0; j<nz_tmp; j++) {
2177           idx        = ics[ajtmp[j]];
2178           rtmp11[idx] = v1[j];
2179           rtmp22[idx] = v2[j];
2180           rtmp33[idx] = v3[j];
2181         }
2182         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2183         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2184         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2185 
2186         /* loop over all pivot row blocks above this row block */
2187         prow = *bjtmp++ ;
2188         while (prow < row) {
2189           pc1 = rtmp11 + prow;
2190           pc2 = rtmp22 + prow;
2191           pc3 = rtmp33 + prow;
2192           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2193             pv   = ba  + bd[prow];
2194             pj   = nbj + bd[prow];
2195             mul1 = *pc1 * *pv;
2196             mul2 = *pc2 * *pv;
2197             mul3 = *pc3 * *pv;
2198             ++pv;
2199             *pc1 = mul1;
2200             *pc2 = mul2;
2201             *pc3 = mul3;
2202 
2203             nz_tmp = bi[prow+1] - bd[prow] - 1;
2204             /* update this row based on pivot row */
2205             for (j=0; j<nz_tmp; j++) {
2206               tmp = pv[j];
2207               idx = pj[j];
2208               rtmp11[idx] -= mul1 * tmp;
2209               rtmp22[idx] -= mul2 * tmp;
2210               rtmp33[idx] -= mul3 * tmp;
2211             }
2212             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
2213           }
2214           prow = *bjtmp++ ;
2215         }
2216 
2217         /* Now take care of diagonal 3x3 block in this set of rows */
2218         /* note: prow = row here */
2219         pc1 = rtmp11 + prow;
2220         pc2 = rtmp22 + prow;
2221         pc3 = rtmp33 + prow;
2222 
2223         sctx.pv = *pc1;
2224         pj      = bj + bi[prow];
2225         rs      = 0.0;
2226         for (j=0; j<nz; j++){
2227           idx = pj[j];
2228           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2229         }
2230         sctx.rs = rs;
2231         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2232         if (newshift == 1) goto endofwhile;
2233 
2234         if (*pc2 != 0.0 || *pc3 != 0.0){
2235           mul2 = (*pc2)/(*pc1);
2236           mul3 = (*pc3)/(*pc1);
2237           *pc2 = mul2;
2238           *pc3 = mul3;
2239           nz_tmp = bi[prow+1] - bd[prow] - 1;
2240           pj     = nbj + bd[prow];
2241           for (j=0; j<nz_tmp; j++) {
2242             idx = pj[j] ;
2243             tmp = rtmp11[idx];
2244             rtmp22[idx] -= mul2 * tmp;
2245             rtmp33[idx] -= mul3 * tmp;
2246           }
2247           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2248         }
2249         ++prow;
2250 
2251         pc2 = rtmp22 + prow;
2252         pc3 = rtmp33 + prow;
2253         sctx.pv = *pc2;
2254         pj      = bj + bi[prow];
2255         rs      = 0.0;
2256         for (j=0; j<nz; j++){
2257           idx = pj[j];
2258           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2259         }
2260         sctx.rs = rs;
2261         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
2262         if (newshift == 1) goto endofwhile;
2263 
2264         if (*pc3 != 0.0){
2265           mul3   = (*pc3)/(*pc2);
2266           *pc3   = mul3;
2267           pj     = nbj + bd[prow];
2268           nz_tmp = bi[prow+1] - bd[prow] - 1;
2269           for (j=0; j<nz_tmp; j++) {
2270             idx = pj[j] ;
2271             tmp = rtmp22[idx];
2272             rtmp33[idx] -= mul3 * tmp;
2273           }
2274           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2275         }
2276 
2277         pj  = bj + bi[row];
2278         pc1 = ba + bi[row];
2279         pc2 = ba + bi[row+1];
2280         pc3 = ba + bi[row+2];
2281 
2282         sctx.pv = rtmp33[row+2];
2283         rs = 0.0;
2284         rtmp11[row]   = 1.0/rtmp11[row];
2285         rtmp22[row+1] = 1.0/rtmp22[row+1];
2286         rtmp33[row+2] = 1.0/rtmp33[row+2];
2287         /* copy row entries from dense representation to sparse */
2288         for (j=0; j<nz; j++) {
2289           idx    = pj[j];
2290           pc1[j] = rtmp11[idx];
2291           pc2[j] = rtmp22[idx];
2292           pc3[j] = rtmp33[idx];
2293           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2294         }
2295 
2296         sctx.rs = rs;
2297         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
2298         if (newshift == 1) goto endofwhile;
2299         break;
2300 
2301       default:
2302         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
2303       }
2304       row += nodesz;                 /* Update the row */
2305     }
2306     endofwhile:;
2307   } while (sctx.useshift);
2308   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2309   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2310   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2311   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2312   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2313   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2314   /* do not set solve add, since MatSolve_Inode + Add is faster */
2315   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2316   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2317   C->assembled   = PETSC_TRUE;
2318   C->preallocated = PETSC_TRUE;
2319   if (sctx.nshift) {
2320     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2321       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2322     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2323       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2324     }
2325   }
2326   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2327   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2328   PetscFunctionReturn(0);
2329 }
2330 
2331 
2332 /* ----------------------------------------------------------- */
2333 #undef __FUNCT__
2334 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2335 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2336 {
2337   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2338   IS                iscol = a->col,isrow = a->row;
2339   PetscErrorCode    ierr;
2340   const PetscInt    *r,*c,*rout,*cout;
2341   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2342   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2343   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2344   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2345   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2346   const PetscScalar *b;
2347 
2348   PetscFunctionBegin;
2349   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
2350   node_max = a->inode.node_count;
2351   ns       = a->inode.size;     /* Node Size array */
2352 
2353   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2354   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2355   tmp  = a->solve_work;
2356 
2357   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2358   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2359 
2360   /* forward solve the lower triangular */
2361   tmps = tmp ;
2362   aa   = a_a ;
2363   aj   = a_j ;
2364   ad   = a->diag;
2365 
2366   for (i = 0,row = 0; i< node_max; ++i){
2367     nsz = ns[i];
2368     aii = ai[row];
2369     v1  = aa + aii;
2370     vi  = aj + aii;
2371     nz  = ai[row+1]- ai[row];
2372 
2373     if (i < node_max-1) {
2374       /* Prefetch the indices for the next block */
2375       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2376       /* Prefetch the data for the next block */
2377       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2378     }
2379 
2380     switch (nsz){               /* Each loop in 'case' is unrolled */
2381     case 1 :
2382       sum1 = b[r[row]];
2383       for(j=0; j<nz-1; j+=2){
2384         i0   = vi[j];
2385         i1   = vi[j+1];
2386         tmp0 = tmps[i0];
2387         tmp1 = tmps[i1];
2388         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2389       }
2390       if(j == nz-1){
2391         tmp0 = tmps[vi[j]];
2392         sum1 -= v1[j]*tmp0;
2393       }
2394       tmp[row++]=sum1;
2395       break;
2396     case 2:
2397       sum1 = b[r[row]];
2398       sum2 = b[r[row+1]];
2399       v2   = aa + ai[row+1];
2400 
2401       for(j=0; j<nz-1; j+=2){
2402         i0   = vi[j];
2403         i1   = vi[j+1];
2404         tmp0 = tmps[i0];
2405         tmp1 = tmps[i1];
2406         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2407         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2408       }
2409       if(j == nz-1){
2410         tmp0 = tmps[vi[j]];
2411         sum1 -= v1[j] *tmp0;
2412         sum2 -= v2[j] *tmp0;
2413       }
2414       sum2 -= v2[nz] * sum1;
2415       tmp[row ++]=sum1;
2416       tmp[row ++]=sum2;
2417       break;
2418     case 3:
2419       sum1 = b[r[row]];
2420       sum2 = b[r[row+1]];
2421       sum3 = b[r[row+2]];
2422       v2   = aa + ai[row+1];
2423       v3   = aa + ai[row+2];
2424 
2425       for (j=0; j<nz-1; j+=2){
2426         i0   = vi[j];
2427         i1   = vi[j+1];
2428         tmp0 = tmps[i0];
2429         tmp1 = tmps[i1];
2430         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2431         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2432         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2433       }
2434       if (j == nz-1){
2435         tmp0 = tmps[vi[j]];
2436         sum1 -= v1[j] *tmp0;
2437         sum2 -= v2[j] *tmp0;
2438         sum3 -= v3[j] *tmp0;
2439       }
2440       sum2 -= v2[nz] * sum1;
2441       sum3 -= v3[nz] * sum1;
2442       sum3 -= v3[nz+1] * sum2;
2443       tmp[row ++]=sum1;
2444       tmp[row ++]=sum2;
2445       tmp[row ++]=sum3;
2446       break;
2447 
2448     case 4:
2449       sum1 = b[r[row]];
2450       sum2 = b[r[row+1]];
2451       sum3 = b[r[row+2]];
2452       sum4 = b[r[row+3]];
2453       v2   = aa + ai[row+1];
2454       v3   = aa + ai[row+2];
2455       v4   = aa + ai[row+3];
2456 
2457       for (j=0; j<nz-1; j+=2){
2458         i0   = vi[j];
2459         i1   = vi[j+1];
2460         tmp0 = tmps[i0];
2461         tmp1 = tmps[i1];
2462         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2463         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2464         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2465         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2466       }
2467       if (j == nz-1){
2468         tmp0 = tmps[vi[j]];
2469         sum1 -= v1[j] *tmp0;
2470         sum2 -= v2[j] *tmp0;
2471         sum3 -= v3[j] *tmp0;
2472         sum4 -= v4[j] *tmp0;
2473       }
2474       sum2 -= v2[nz] * sum1;
2475       sum3 -= v3[nz] * sum1;
2476       sum4 -= v4[nz] * sum1;
2477       sum3 -= v3[nz+1] * sum2;
2478       sum4 -= v4[nz+1] * sum2;
2479       sum4 -= v4[nz+2] * sum3;
2480 
2481       tmp[row ++]=sum1;
2482       tmp[row ++]=sum2;
2483       tmp[row ++]=sum3;
2484       tmp[row ++]=sum4;
2485       break;
2486     case 5:
2487       sum1 = b[r[row]];
2488       sum2 = b[r[row+1]];
2489       sum3 = b[r[row+2]];
2490       sum4 = b[r[row+3]];
2491       sum5 = b[r[row+4]];
2492       v2   = aa + ai[row+1];
2493       v3   = aa + ai[row+2];
2494       v4   = aa + ai[row+3];
2495       v5   = aa + ai[row+4];
2496 
2497       for (j=0; j<nz-1; j+=2){
2498         i0   = vi[j];
2499         i1   = vi[j+1];
2500         tmp0 = tmps[i0];
2501         tmp1 = tmps[i1];
2502         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2503         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2504         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2505         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2506         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2507       }
2508       if (j == nz-1){
2509         tmp0 = tmps[vi[j]];
2510         sum1 -= v1[j] *tmp0;
2511         sum2 -= v2[j] *tmp0;
2512         sum3 -= v3[j] *tmp0;
2513         sum4 -= v4[j] *tmp0;
2514         sum5 -= v5[j] *tmp0;
2515       }
2516 
2517       sum2 -= v2[nz] * sum1;
2518       sum3 -= v3[nz] * sum1;
2519       sum4 -= v4[nz] * sum1;
2520       sum5 -= v5[nz] * sum1;
2521       sum3 -= v3[nz+1] * sum2;
2522       sum4 -= v4[nz+1] * sum2;
2523       sum5 -= v5[nz+1] * sum2;
2524       sum4 -= v4[nz+2] * sum3;
2525       sum5 -= v5[nz+2] * sum3;
2526       sum5 -= v5[nz+3] * sum4;
2527 
2528       tmp[row ++]=sum1;
2529       tmp[row ++]=sum2;
2530       tmp[row ++]=sum3;
2531       tmp[row ++]=sum4;
2532       tmp[row ++]=sum5;
2533       break;
2534     default:
2535       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2536     }
2537   }
2538   /* backward solve the upper triangular */
2539   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2540     nsz = ns[i];
2541     aii = ad[row+1] + 1;
2542     v1  = aa + aii;
2543     vi  = aj + aii;
2544     nz  = ad[row]- ad[row+1] - 1;
2545 
2546     if (i > 0) {
2547       /* Prefetch the indices for the next block */
2548       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2549       /* Prefetch the data for the next block */
2550       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2551     }
2552 
2553     switch (nsz){               /* Each loop in 'case' is unrolled */
2554     case 1 :
2555       sum1 = tmp[row];
2556 
2557       for(j=0 ; j<nz-1; j+=2){
2558         i0   = vi[j];
2559         i1   = vi[j+1];
2560         tmp0 = tmps[i0];
2561         tmp1 = tmps[i1];
2562         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2563       }
2564       if (j == nz-1){
2565         tmp0  = tmps[vi[j]];
2566         sum1 -= v1[j]*tmp0;
2567       }
2568       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2569       break;
2570     case 2 :
2571       sum1 = tmp[row];
2572       sum2 = tmp[row-1];
2573       v2   = aa + ad[row] + 1;
2574       for (j=0 ; j<nz-1; j+=2){
2575         i0   = vi[j];
2576         i1   = vi[j+1];
2577         tmp0 = tmps[i0];
2578         tmp1 = tmps[i1];
2579         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2580         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2581       }
2582       if (j == nz-1){
2583         tmp0  = tmps[vi[j]];
2584         sum1 -= v1[j]* tmp0;
2585         sum2 -= v2[j+1]* tmp0;
2586       }
2587 
2588       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2589       sum2   -= v2[0] * tmp0;
2590       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2591       break;
2592     case 3 :
2593       sum1 = tmp[row];
2594       sum2 = tmp[row -1];
2595       sum3 = tmp[row -2];
2596       v2   = aa + ad[row] + 1;
2597       v3   = aa + ad[row -1] + 1;
2598       for (j=0 ; j<nz-1; j+=2){
2599         i0   = vi[j];
2600         i1   = vi[j+1];
2601         tmp0 = tmps[i0];
2602         tmp1 = tmps[i1];
2603         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2604         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2605         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2606       }
2607       if (j== nz-1){
2608         tmp0  = tmps[vi[j]];
2609         sum1 -= v1[j] * tmp0;
2610         sum2 -= v2[j+1] * tmp0;
2611         sum3 -= v3[j+2] * tmp0;
2612       }
2613       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2614       sum2   -= v2[0]* tmp0;
2615       sum3   -= v3[1] * tmp0;
2616       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2617       sum3   -= v3[0]* tmp0;
2618       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2619 
2620       break;
2621     case 4 :
2622       sum1 = tmp[row];
2623       sum2 = tmp[row -1];
2624       sum3 = tmp[row -2];
2625       sum4 = tmp[row -3];
2626       v2   = aa + ad[row]+1;
2627       v3   = aa + ad[row -1]+1;
2628       v4   = aa + ad[row -2]+1;
2629 
2630       for (j=0 ; j<nz-1; j+=2){
2631         i0   = vi[j];
2632         i1   = vi[j+1];
2633         tmp0 = tmps[i0];
2634         tmp1 = tmps[i1];
2635         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2636         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2637         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2638         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2639       }
2640       if (j== nz-1){
2641         tmp0  = tmps[vi[j]];
2642         sum1 -= v1[j] * tmp0;
2643         sum2 -= v2[j+1] * tmp0;
2644         sum3 -= v3[j+2] * tmp0;
2645         sum4 -= v4[j+3] * tmp0;
2646       }
2647 
2648       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2649       sum2   -= v2[0] * tmp0;
2650       sum3   -= v3[1] * tmp0;
2651       sum4   -= v4[2] * tmp0;
2652       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2653       sum3   -= v3[0] * tmp0;
2654       sum4   -= v4[1] * tmp0;
2655       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2656       sum4   -= v4[0] * tmp0;
2657       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2658       break;
2659     case 5 :
2660       sum1 = tmp[row];
2661       sum2 = tmp[row -1];
2662       sum3 = tmp[row -2];
2663       sum4 = tmp[row -3];
2664       sum5 = tmp[row -4];
2665       v2   = aa + ad[row]+1;
2666       v3   = aa + ad[row -1]+1;
2667       v4   = aa + ad[row -2]+1;
2668       v5   = aa + ad[row -3]+1;
2669       for (j=0 ; j<nz-1; j+=2){
2670         i0   = vi[j];
2671         i1   = vi[j+1];
2672         tmp0 = tmps[i0];
2673         tmp1 = tmps[i1];
2674         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2675         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2676         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2677         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2678         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2679       }
2680       if (j==nz-1){
2681         tmp0  = tmps[vi[j]];
2682         sum1 -= v1[j] * tmp0;
2683         sum2 -= v2[j+1] * tmp0;
2684         sum3 -= v3[j+2] * tmp0;
2685         sum4 -= v4[j+3] * tmp0;
2686         sum5 -= v5[j+4] * tmp0;
2687       }
2688 
2689       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2690       sum2   -= v2[0] * tmp0;
2691       sum3   -= v3[1] * tmp0;
2692       sum4   -= v4[2] * tmp0;
2693       sum5   -= v5[3] * tmp0;
2694       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2695       sum3   -= v3[0] * tmp0;
2696       sum4   -= v4[1] * tmp0;
2697       sum5   -= v5[2] * tmp0;
2698       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2699       sum4   -= v4[0] * tmp0;
2700       sum5   -= v5[1] * tmp0;
2701       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2702       sum5   -= v5[0] * tmp0;
2703       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2704       break;
2705     default:
2706       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2707     }
2708   }
2709   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2710   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2711   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2712   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2713   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2714   PetscFunctionReturn(0);
2715 }
2716 
2717 
2718 /*
2719      Makes a longer coloring[] array and calls the usual code with that
2720 */
2721 #undef __FUNCT__
2722 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2723 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2724 {
2725   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2726   PetscErrorCode  ierr;
2727   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2728   PetscInt        *colorused,i;
2729   ISColoringValue *newcolor;
2730 
2731   PetscFunctionBegin;
2732   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2733   /* loop over inodes, marking a color for each column*/
2734   row = 0;
2735   for (i=0; i<m; i++){
2736     for (j=0; j<ns[i]; j++) {
2737       newcolor[row++] = coloring[i] + j*ncolors;
2738     }
2739   }
2740 
2741   /* eliminate unneeded colors */
2742   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2743   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2744   for (i=0; i<n; i++) {
2745     colorused[newcolor[i]] = 1;
2746   }
2747 
2748   for (i=1; i<5*ncolors; i++) {
2749     colorused[i] += colorused[i-1];
2750   }
2751   ncolors = colorused[5*ncolors-1];
2752   for (i=0; i<n; i++) {
2753     newcolor[i] = colorused[newcolor[i]]-1;
2754   }
2755   ierr = PetscFree(colorused);CHKERRQ(ierr);
2756   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2757   ierr = PetscFree(coloring);CHKERRQ(ierr);
2758   PetscFunctionReturn(0);
2759 }
2760 
2761 #include "../src/mat/blockinvert.h"
2762 
2763 #undef __FUNCT__
2764 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2765 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2766 {
2767   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2768   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2769   MatScalar          *ibdiag,*bdiag,work[25];
2770   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2771   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2772   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2773   PetscErrorCode     ierr;
2774   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2775   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2776 
2777   PetscFunctionBegin;
2778   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2779   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2780   if (its > 1) {
2781     /* switch to non-inode version */
2782     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2783     PetscFunctionReturn(0);
2784   }
2785 
2786   if (!a->inode.ibdiagvalid) {
2787     if (!a->inode.ibdiag) {
2788       /* calculate space needed for diagonal blocks */
2789       for (i=0; i<m; i++) {
2790 	cnt += sizes[i]*sizes[i];
2791       }
2792       a->inode.bdiagsize = cnt;
2793       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2794     }
2795 
2796     /* copy over the diagonal blocks and invert them */
2797     ibdiag = a->inode.ibdiag;
2798     bdiag  = a->inode.bdiag;
2799     cnt = 0;
2800     for (i=0, row = 0; i<m; i++) {
2801       for (j=0; j<sizes[i]; j++) {
2802         for (k=0; k<sizes[i]; k++) {
2803           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2804         }
2805       }
2806       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2807 
2808       switch(sizes[i]) {
2809         case 1:
2810           /* Create matrix data structure */
2811           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2812           ibdiag[cnt] = 1.0/ibdiag[cnt];
2813           break;
2814         case 2:
2815           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2816           break;
2817         case 3:
2818           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2819           break;
2820         case 4:
2821           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2822           break;
2823         case 5:
2824           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2825           break;
2826        default:
2827 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2828       }
2829       cnt += sizes[i]*sizes[i];
2830       row += sizes[i];
2831     }
2832     a->inode.ibdiagvalid = PETSC_TRUE;
2833   }
2834   ibdiag = a->inode.ibdiag;
2835   bdiag  = a->inode.bdiag;
2836 
2837 
2838   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2839   if (flag & SOR_ZERO_INITIAL_GUESS) {
2840     PetscScalar *b;
2841     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2842     if (xx != bb) {
2843       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2844     } else {
2845       b = x;
2846     }
2847     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2848 
2849       for (i=0, row=0; i<m; i++) {
2850         sz  = diag[row] - ii[row];
2851         v1  = a->a + ii[row];
2852         idx = a->j + ii[row];
2853 
2854         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2855         switch (sizes[i]){
2856           case 1:
2857 
2858             sum1  = b[row];
2859             for(n = 0; n<sz-1; n+=2) {
2860               i1   = idx[0];
2861               i2   = idx[1];
2862               idx += 2;
2863               tmp0 = x[i1];
2864               tmp1 = x[i2];
2865               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2866             }
2867 
2868             if (n == sz-1){
2869               tmp0  = x[*idx];
2870               sum1 -= *v1 * tmp0;
2871             }
2872             x[row++] = sum1*(*ibdiag++);
2873             break;
2874           case 2:
2875             v2    = a->a + ii[row+1];
2876             sum1  = b[row];
2877             sum2  = b[row+1];
2878             for(n = 0; n<sz-1; n+=2) {
2879               i1   = idx[0];
2880               i2   = idx[1];
2881               idx += 2;
2882               tmp0 = x[i1];
2883               tmp1 = x[i2];
2884               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2885               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2886             }
2887 
2888             if (n == sz-1){
2889               tmp0  = x[*idx];
2890               sum1 -= v1[0] * tmp0;
2891               sum2 -= v2[0] * tmp0;
2892             }
2893             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2894             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2895             ibdiag  += 4;
2896             break;
2897           case 3:
2898             v2    = a->a + ii[row+1];
2899             v3    = a->a + ii[row+2];
2900             sum1  = b[row];
2901             sum2  = b[row+1];
2902             sum3  = b[row+2];
2903             for(n = 0; n<sz-1; n+=2) {
2904               i1   = idx[0];
2905               i2   = idx[1];
2906               idx += 2;
2907               tmp0 = x[i1];
2908               tmp1 = x[i2];
2909               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2910               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2911               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2912             }
2913 
2914             if (n == sz-1){
2915               tmp0  = x[*idx];
2916               sum1 -= v1[0] * tmp0;
2917               sum2 -= v2[0] * tmp0;
2918               sum3 -= v3[0] * tmp0;
2919             }
2920             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2921             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2922             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2923             ibdiag  += 9;
2924             break;
2925           case 4:
2926             v2    = a->a + ii[row+1];
2927             v3    = a->a + ii[row+2];
2928             v4    = a->a + ii[row+3];
2929             sum1  = b[row];
2930             sum2  = b[row+1];
2931             sum3  = b[row+2];
2932             sum4  = b[row+3];
2933             for(n = 0; n<sz-1; n+=2) {
2934               i1   = idx[0];
2935               i2   = idx[1];
2936               idx += 2;
2937               tmp0 = x[i1];
2938               tmp1 = x[i2];
2939               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2940               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2941               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2942               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2943             }
2944 
2945             if (n == sz-1){
2946               tmp0  = x[*idx];
2947               sum1 -= v1[0] * tmp0;
2948               sum2 -= v2[0] * tmp0;
2949               sum3 -= v3[0] * tmp0;
2950               sum4 -= v4[0] * tmp0;
2951             }
2952             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2953             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2954             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2955             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2956             ibdiag  += 16;
2957             break;
2958           case 5:
2959             v2    = a->a + ii[row+1];
2960             v3    = a->a + ii[row+2];
2961             v4    = a->a + ii[row+3];
2962             v5    = a->a + ii[row+4];
2963             sum1  = b[row];
2964             sum2  = b[row+1];
2965             sum3  = b[row+2];
2966             sum4  = b[row+3];
2967             sum5  = b[row+4];
2968             for(n = 0; n<sz-1; n+=2) {
2969               i1   = idx[0];
2970               i2   = idx[1];
2971               idx += 2;
2972               tmp0 = x[i1];
2973               tmp1 = x[i2];
2974               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2975               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2976               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2977               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2978               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2979             }
2980 
2981             if (n == sz-1){
2982               tmp0  = x[*idx];
2983               sum1 -= v1[0] * tmp0;
2984               sum2 -= v2[0] * tmp0;
2985               sum3 -= v3[0] * tmp0;
2986               sum4 -= v4[0] * tmp0;
2987               sum5 -= v5[0] * tmp0;
2988             }
2989             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2990             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2991             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2992             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2993             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2994             ibdiag  += 25;
2995             break;
2996 	  default:
2997    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2998         }
2999       }
3000 
3001       xb = x;
3002       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3003     } else xb = b;
3004     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
3005         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
3006       cnt = 0;
3007       for (i=0, row=0; i<m; i++) {
3008 
3009         switch (sizes[i]){
3010           case 1:
3011             x[row++] *= bdiag[cnt++];
3012             break;
3013           case 2:
3014             x1   = x[row]; x2 = x[row+1];
3015             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3016             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3017             x[row++] = tmp1;
3018             x[row++] = tmp2;
3019             cnt += 4;
3020             break;
3021           case 3:
3022             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3023             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3024             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3025             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3026             x[row++] = tmp1;
3027             x[row++] = tmp2;
3028             x[row++] = tmp3;
3029             cnt += 9;
3030             break;
3031           case 4:
3032             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3033             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3034             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3035             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3036             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3037             x[row++] = tmp1;
3038             x[row++] = tmp2;
3039             x[row++] = tmp3;
3040             x[row++] = tmp4;
3041             cnt += 16;
3042             break;
3043           case 5:
3044             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3045             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3046             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3047             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3048             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3049             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3050             x[row++] = tmp1;
3051             x[row++] = tmp2;
3052             x[row++] = tmp3;
3053             x[row++] = tmp4;
3054             x[row++] = tmp5;
3055             cnt += 25;
3056             break;
3057 	  default:
3058    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3059         }
3060       }
3061       ierr = PetscLogFlops(m);CHKERRQ(ierr);
3062     }
3063     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3064 
3065       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3066       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3067         ibdiag -= sizes[i]*sizes[i];
3068         sz      = ii[row+1] - diag[row] - 1;
3069         v1      = a->a + diag[row] + 1;
3070         idx     = a->j + diag[row] + 1;
3071 
3072         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3073         switch (sizes[i]){
3074           case 1:
3075 
3076             sum1  = xb[row];
3077             for(n = 0; n<sz-1; n+=2) {
3078               i1   = idx[0];
3079               i2   = idx[1];
3080               idx += 2;
3081               tmp0 = x[i1];
3082               tmp1 = x[i2];
3083               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3084             }
3085 
3086             if (n == sz-1){
3087               tmp0  = x[*idx];
3088               sum1 -= *v1*tmp0;
3089             }
3090             x[row--] = sum1*(*ibdiag);
3091             break;
3092 
3093           case 2:
3094 
3095             sum1  = xb[row];
3096             sum2  = xb[row-1];
3097             /* note that sum1 is associated with the second of the two rows */
3098             v2    = a->a + diag[row-1] + 2;
3099             for(n = 0; n<sz-1; n+=2) {
3100               i1   = idx[0];
3101               i2   = idx[1];
3102               idx += 2;
3103               tmp0 = x[i1];
3104               tmp1 = x[i2];
3105               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3106               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3107             }
3108 
3109             if (n == sz-1){
3110               tmp0  = x[*idx];
3111               sum1 -= *v1*tmp0;
3112               sum2 -= *v2*tmp0;
3113             }
3114             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3115             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3116             break;
3117           case 3:
3118 
3119             sum1  = xb[row];
3120             sum2  = xb[row-1];
3121             sum3  = xb[row-2];
3122             v2    = a->a + diag[row-1] + 2;
3123             v3    = a->a + diag[row-2] + 3;
3124             for(n = 0; n<sz-1; n+=2) {
3125               i1   = idx[0];
3126               i2   = idx[1];
3127               idx += 2;
3128               tmp0 = x[i1];
3129               tmp1 = x[i2];
3130               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3131               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3132               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3133             }
3134 
3135             if (n == sz-1){
3136               tmp0  = x[*idx];
3137               sum1 -= *v1*tmp0;
3138               sum2 -= *v2*tmp0;
3139               sum3 -= *v3*tmp0;
3140             }
3141             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3142             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3143             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3144             break;
3145           case 4:
3146 
3147             sum1  = xb[row];
3148             sum2  = xb[row-1];
3149             sum3  = xb[row-2];
3150             sum4  = xb[row-3];
3151             v2    = a->a + diag[row-1] + 2;
3152             v3    = a->a + diag[row-2] + 3;
3153             v4    = a->a + diag[row-3] + 4;
3154             for(n = 0; n<sz-1; n+=2) {
3155               i1   = idx[0];
3156               i2   = idx[1];
3157               idx += 2;
3158               tmp0 = x[i1];
3159               tmp1 = x[i2];
3160               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3161               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3162               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3163               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3164             }
3165 
3166             if (n == sz-1){
3167               tmp0  = x[*idx];
3168               sum1 -= *v1*tmp0;
3169               sum2 -= *v2*tmp0;
3170               sum3 -= *v3*tmp0;
3171               sum4 -= *v4*tmp0;
3172             }
3173             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3174             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3175             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3176             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3177             break;
3178           case 5:
3179 
3180             sum1  = xb[row];
3181             sum2  = xb[row-1];
3182             sum3  = xb[row-2];
3183             sum4  = xb[row-3];
3184             sum5  = xb[row-4];
3185             v2    = a->a + diag[row-1] + 2;
3186             v3    = a->a + diag[row-2] + 3;
3187             v4    = a->a + diag[row-3] + 4;
3188             v5    = a->a + diag[row-4] + 5;
3189             for(n = 0; n<sz-1; n+=2) {
3190               i1   = idx[0];
3191               i2   = idx[1];
3192               idx += 2;
3193               tmp0 = x[i1];
3194               tmp1 = x[i2];
3195               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3196               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3197               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3198               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3199               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3200             }
3201 
3202             if (n == sz-1){
3203               tmp0  = x[*idx];
3204               sum1 -= *v1*tmp0;
3205               sum2 -= *v2*tmp0;
3206               sum3 -= *v3*tmp0;
3207               sum4 -= *v4*tmp0;
3208               sum5 -= *v5*tmp0;
3209             }
3210             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3211             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3212             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3213             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3214             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3215             break;
3216 	  default:
3217    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3218         }
3219       }
3220 
3221       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3222     }
3223     its--;
3224     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3225     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
3226   }
3227   if (flag & SOR_EISENSTAT) {
3228     const PetscScalar *b;
3229     MatScalar         *t = a->inode.ssor_work;
3230 
3231     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3232     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3233     /*
3234           Apply  (U + D)^-1  where D is now the block diagonal
3235     */
3236     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3237     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3238       ibdiag -= sizes[i]*sizes[i];
3239       sz      = ii[row+1] - diag[row] - 1;
3240       v1      = a->a + diag[row] + 1;
3241       idx     = a->j + diag[row] + 1;
3242       CHKMEMQ;
3243       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3244       switch (sizes[i]){
3245         case 1:
3246 
3247 	  sum1  = b[row];
3248 	  for(n = 0; n<sz-1; n+=2) {
3249 	    i1   = idx[0];
3250 	    i2   = idx[1];
3251 	    idx += 2;
3252 	    tmp0 = x[i1];
3253 	    tmp1 = x[i2];
3254 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3255 	  }
3256 
3257 	  if (n == sz-1){
3258 	    tmp0  = x[*idx];
3259 	    sum1 -= *v1*tmp0;
3260 	  }
3261 	  x[row] = sum1*(*ibdiag);row--;
3262 	  break;
3263 
3264         case 2:
3265 
3266 	  sum1  = b[row];
3267 	  sum2  = b[row-1];
3268 	  /* note that sum1 is associated with the second of the two rows */
3269 	  v2    = a->a + diag[row-1] + 2;
3270 	  for(n = 0; n<sz-1; n+=2) {
3271 	    i1   = idx[0];
3272 	    i2   = idx[1];
3273 	    idx += 2;
3274 	    tmp0 = x[i1];
3275 	    tmp1 = x[i2];
3276 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3277 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3278 	  }
3279 
3280 	  if (n == sz-1){
3281 	    tmp0  = x[*idx];
3282 	    sum1 -= *v1*tmp0;
3283 	    sum2 -= *v2*tmp0;
3284 	  }
3285 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3286 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3287           row -= 2;
3288 	  break;
3289         case 3:
3290 
3291 	  sum1  = b[row];
3292 	  sum2  = b[row-1];
3293 	  sum3  = b[row-2];
3294 	  v2    = a->a + diag[row-1] + 2;
3295 	  v3    = a->a + diag[row-2] + 3;
3296 	  for(n = 0; n<sz-1; n+=2) {
3297 	    i1   = idx[0];
3298 	    i2   = idx[1];
3299 	    idx += 2;
3300 	    tmp0 = x[i1];
3301 	    tmp1 = x[i2];
3302 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3303 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3304 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3305 	  }
3306 
3307 	  if (n == sz-1){
3308 	    tmp0  = x[*idx];
3309 	    sum1 -= *v1*tmp0;
3310 	    sum2 -= *v2*tmp0;
3311 	    sum3 -= *v3*tmp0;
3312 	  }
3313 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3314 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3315 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3316           row -= 3;
3317 	  break;
3318         case 4:
3319 
3320 	  sum1  = b[row];
3321 	  sum2  = b[row-1];
3322 	  sum3  = b[row-2];
3323 	  sum4  = b[row-3];
3324 	  v2    = a->a + diag[row-1] + 2;
3325 	  v3    = a->a + diag[row-2] + 3;
3326 	  v4    = a->a + diag[row-3] + 4;
3327 	  for(n = 0; n<sz-1; n+=2) {
3328 	    i1   = idx[0];
3329 	    i2   = idx[1];
3330 	    idx += 2;
3331 	    tmp0 = x[i1];
3332 	    tmp1 = x[i2];
3333 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3334 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3335 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3336 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3337 	  }
3338 
3339 	  if (n == sz-1){
3340 	    tmp0  = x[*idx];
3341 	    sum1 -= *v1*tmp0;
3342 	    sum2 -= *v2*tmp0;
3343 	    sum3 -= *v3*tmp0;
3344 	    sum4 -= *v4*tmp0;
3345 	  }
3346 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3347 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3348 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3349 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3350           row -= 4;
3351 	  break;
3352         case 5:
3353 
3354 	  sum1  = b[row];
3355 	  sum2  = b[row-1];
3356 	  sum3  = b[row-2];
3357 	  sum4  = b[row-3];
3358 	  sum5  = b[row-4];
3359 	  v2    = a->a + diag[row-1] + 2;
3360 	  v3    = a->a + diag[row-2] + 3;
3361 	  v4    = a->a + diag[row-3] + 4;
3362 	  v5    = a->a + diag[row-4] + 5;
3363 	  for(n = 0; n<sz-1; n+=2) {
3364 	    i1   = idx[0];
3365 	    i2   = idx[1];
3366 	    idx += 2;
3367 	    tmp0 = x[i1];
3368 	    tmp1 = x[i2];
3369 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3370 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3371 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3372 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3373 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3374 	  }
3375 
3376 	  if (n == sz-1){
3377 	    tmp0  = x[*idx];
3378 	    sum1 -= *v1*tmp0;
3379 	    sum2 -= *v2*tmp0;
3380 	    sum3 -= *v3*tmp0;
3381 	    sum4 -= *v4*tmp0;
3382 	    sum5 -= *v5*tmp0;
3383 	  }
3384 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3385 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3386 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3387 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3388 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3389           row -= 5;
3390 	  break;
3391         default:
3392 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3393       }
3394       CHKMEMQ;
3395     }
3396     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3397 
3398     /*
3399            t = b - D x    where D is the block diagonal
3400     */
3401     cnt = 0;
3402     for (i=0, row=0; i<m; i++) {
3403       CHKMEMQ;
3404       switch (sizes[i]){
3405         case 1:
3406 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3407 	  break;
3408         case 2:
3409 	  x1   = x[row]; x2 = x[row+1];
3410 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3411 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3412 	  t[row]   = b[row] - tmp1;
3413 	  t[row+1] = b[row+1] - tmp2; row += 2;
3414 	  cnt += 4;
3415 	  break;
3416         case 3:
3417 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3418 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3419 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3420 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3421 	  t[row] = b[row] - tmp1;
3422 	  t[row+1] = b[row+1] - tmp2;
3423 	  t[row+2] = b[row+2] - tmp3; row += 3;
3424 	  cnt += 9;
3425 	  break;
3426         case 4:
3427 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3428 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3429 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3430 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3431 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3432 	  t[row] = b[row] - tmp1;
3433 	  t[row+1] = b[row+1] - tmp2;
3434 	  t[row+2] = b[row+2] - tmp3;
3435 	  t[row+3] = b[row+3] - tmp4; row += 4;
3436 	  cnt += 16;
3437 	  break;
3438         case 5:
3439 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3440 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3441 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3442 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3443 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3444 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3445 	  t[row] = b[row] - tmp1;
3446 	  t[row+1] = b[row+1] - tmp2;
3447 	  t[row+2] = b[row+2] - tmp3;
3448 	  t[row+3] = b[row+3] - tmp4;
3449 	  t[row+4] = b[row+4] - tmp5;row += 5;
3450 	  cnt += 25;
3451 	  break;
3452         default:
3453 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3454       }
3455       CHKMEMQ;
3456     }
3457     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3458 
3459 
3460 
3461     /*
3462           Apply (L + D)^-1 where D is the block diagonal
3463     */
3464     for (i=0, row=0; i<m; i++) {
3465       sz  = diag[row] - ii[row];
3466       v1  = a->a + ii[row];
3467       idx = a->j + ii[row];
3468       CHKMEMQ;
3469       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3470       switch (sizes[i]){
3471         case 1:
3472 
3473 	  sum1  = t[row];
3474 	  for(n = 0; n<sz-1; n+=2) {
3475 	    i1   = idx[0];
3476 	    i2   = idx[1];
3477 	    idx += 2;
3478 	    tmp0 = t[i1];
3479 	    tmp1 = t[i2];
3480 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3481 	  }
3482 
3483 	  if (n == sz-1){
3484 	    tmp0  = t[*idx];
3485 	    sum1 -= *v1 * tmp0;
3486 	  }
3487 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3488 	  break;
3489         case 2:
3490 	  v2    = a->a + ii[row+1];
3491 	  sum1  = t[row];
3492 	  sum2  = t[row+1];
3493 	  for(n = 0; n<sz-1; n+=2) {
3494 	    i1   = idx[0];
3495 	    i2   = idx[1];
3496 	    idx += 2;
3497 	    tmp0 = t[i1];
3498 	    tmp1 = t[i2];
3499 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3500 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3501 	  }
3502 
3503 	  if (n == sz-1){
3504 	    tmp0  = t[*idx];
3505 	    sum1 -= v1[0] * tmp0;
3506 	    sum2 -= v2[0] * tmp0;
3507 	  }
3508 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3509 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3510 	  ibdiag  += 4; row += 2;
3511 	  break;
3512         case 3:
3513 	  v2    = a->a + ii[row+1];
3514 	  v3    = a->a + ii[row+2];
3515 	  sum1  = t[row];
3516 	  sum2  = t[row+1];
3517 	  sum3  = t[row+2];
3518 	  for(n = 0; n<sz-1; n+=2) {
3519 	    i1   = idx[0];
3520 	    i2   = idx[1];
3521 	    idx += 2;
3522 	    tmp0 = t[i1];
3523 	    tmp1 = t[i2];
3524 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3525 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3526 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3527 	  }
3528 
3529 	  if (n == sz-1){
3530 	    tmp0  = t[*idx];
3531 	    sum1 -= v1[0] * tmp0;
3532 	    sum2 -= v2[0] * tmp0;
3533 	    sum3 -= v3[0] * tmp0;
3534 	  }
3535 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3536 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3537 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3538 	  ibdiag  += 9; row += 3;
3539 	  break;
3540         case 4:
3541 	  v2    = a->a + ii[row+1];
3542 	  v3    = a->a + ii[row+2];
3543 	  v4    = a->a + ii[row+3];
3544 	  sum1  = t[row];
3545 	  sum2  = t[row+1];
3546 	  sum3  = t[row+2];
3547 	  sum4  = t[row+3];
3548 	  for(n = 0; n<sz-1; n+=2) {
3549 	    i1   = idx[0];
3550 	    i2   = idx[1];
3551 	    idx += 2;
3552 	    tmp0 = t[i1];
3553 	    tmp1 = t[i2];
3554 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3555 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3556 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3557 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3558 	  }
3559 
3560 	  if (n == sz-1){
3561 	    tmp0  = t[*idx];
3562 	    sum1 -= v1[0] * tmp0;
3563 	    sum2 -= v2[0] * tmp0;
3564 	    sum3 -= v3[0] * tmp0;
3565 	    sum4 -= v4[0] * tmp0;
3566 	  }
3567 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3568 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3569 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3570 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3571 	  ibdiag  += 16; row += 4;
3572 	  break;
3573         case 5:
3574 	  v2    = a->a + ii[row+1];
3575 	  v3    = a->a + ii[row+2];
3576 	  v4    = a->a + ii[row+3];
3577 	  v5    = a->a + ii[row+4];
3578 	  sum1  = t[row];
3579 	  sum2  = t[row+1];
3580 	  sum3  = t[row+2];
3581 	  sum4  = t[row+3];
3582 	  sum5  = t[row+4];
3583 	  for(n = 0; n<sz-1; n+=2) {
3584 	    i1   = idx[0];
3585 	    i2   = idx[1];
3586 	    idx += 2;
3587 	    tmp0 = t[i1];
3588 	    tmp1 = t[i2];
3589 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3590 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3591 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3592 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3593 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3594 	  }
3595 
3596 	  if (n == sz-1){
3597 	    tmp0  = t[*idx];
3598 	    sum1 -= v1[0] * tmp0;
3599 	    sum2 -= v2[0] * tmp0;
3600 	    sum3 -= v3[0] * tmp0;
3601 	    sum4 -= v4[0] * tmp0;
3602 	    sum5 -= v5[0] * tmp0;
3603 	  }
3604 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3605 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3606 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3607 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3608 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3609 	  ibdiag  += 25; row += 5;
3610 	  break;
3611         default:
3612 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3613       }
3614       CHKMEMQ;
3615     }
3616     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3617     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3618     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3619   }
3620   PetscFunctionReturn(0);
3621 }
3622 
3623 #undef __FUNCT__
3624 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3625 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3626 {
3627   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3628   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3629   const MatScalar    *bdiag = a->inode.bdiag;
3630   const PetscScalar  *b;
3631   PetscErrorCode      ierr;
3632   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3633   const PetscInt      *sizes = a->inode.size;
3634 
3635   PetscFunctionBegin;
3636   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3637   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3638   cnt = 0;
3639   for (i=0, row=0; i<m; i++) {
3640     switch (sizes[i]){
3641       case 1:
3642 	x[row] = b[row]*bdiag[cnt++];row++;
3643 	break;
3644       case 2:
3645 	x1   = b[row]; x2 = b[row+1];
3646 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3647 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3648 	x[row++] = tmp1;
3649 	x[row++] = tmp2;
3650 	cnt += 4;
3651 	break;
3652       case 3:
3653 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3654 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3655 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3656 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3657 	x[row++] = tmp1;
3658 	x[row++] = tmp2;
3659 	x[row++] = tmp3;
3660 	cnt += 9;
3661 	break;
3662       case 4:
3663 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3664 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3665 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3666 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3667 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3668 	x[row++] = tmp1;
3669 	x[row++] = tmp2;
3670 	x[row++] = tmp3;
3671 	x[row++] = tmp4;
3672 	cnt += 16;
3673 	break;
3674       case 5:
3675 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3676 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3677 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3678 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3679 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3680 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3681 	x[row++] = tmp1;
3682 	x[row++] = tmp2;
3683 	x[row++] = tmp3;
3684 	x[row++] = tmp4;
3685 	x[row++] = tmp5;
3686 	cnt += 25;
3687 	break;
3688       default:
3689 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3690     }
3691   }
3692   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3693   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3694   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3695   PetscFunctionReturn(0);
3696 }
3697 
3698 /*
3699     samestructure indicates that the matrix has not changed its nonzero structure so we
3700     do not need to recompute the inodes
3701 */
3702 #undef __FUNCT__
3703 #define __FUNCT__ "Mat_CheckInode"
3704 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3705 {
3706   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3707   PetscErrorCode ierr;
3708   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3709   PetscTruth     flag;
3710 
3711   PetscFunctionBegin;
3712   if (!a->inode.use)                     PetscFunctionReturn(0);
3713   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3714 
3715 
3716   m = A->rmap->n;
3717   if (a->inode.size) {ns = a->inode.size;}
3718   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3719 
3720   i          = 0;
3721   node_count = 0;
3722   idx        = a->j;
3723   ii         = a->i;
3724   while (i < m){                /* For each row */
3725     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3726     /* Limits the number of elements in a node to 'a->inode.limit' */
3727     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3728       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3729       if (nzy != nzx) break;
3730       idy  += nzx;             /* Same nonzero pattern */
3731       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3732       if (!flag) break;
3733     }
3734     ns[node_count++] = blk_size;
3735     idx += blk_size*nzx;
3736     i    = j;
3737   }
3738   /* If not enough inodes found,, do not use inode version of the routines */
3739   if (!a->inode.size && m && node_count > .9*m) {
3740     ierr = PetscFree(ns);CHKERRQ(ierr);
3741     a->inode.node_count     = 0;
3742     a->inode.size           = PETSC_NULL;
3743     a->inode.use            = PETSC_FALSE;
3744     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3745   } else {
3746     if (!A->factortype) {
3747       A->ops->mult              = MatMult_SeqAIJ_Inode;
3748       A->ops->sor               = MatSOR_SeqAIJ_Inode;
3749       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3750       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3751       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3752       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3753       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3754       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3755       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3756     } else {
3757       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
3758     }
3759     a->inode.node_count       = node_count;
3760     a->inode.size             = ns;
3761     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3762   }
3763   PetscFunctionReturn(0);
3764 }
3765 
3766 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3767 PetscInt __k, *__vi; \
3768 __vi = aj + ai[row];				\
3769 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3770 __vi = aj + adiag[row];				\
3771 cols[nzl] = __vi[0];\
3772 __vi = aj + adiag[row+1]+1;\
3773 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3774 
3775 
3776 /*
3777    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3778    Modified from Mat_CheckInode().
3779 
3780    Input Parameters:
3781 +  Mat A - ILU or LU matrix factor
3782 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3783     do not need to recompute the inodes
3784 */
3785 #undef __FUNCT__
3786 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3787 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3788 {
3789   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3790   PetscErrorCode ierr;
3791   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3792   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3793   PetscTruth     flag;
3794 
3795   PetscFunctionBegin;
3796   if (!a->inode.use)                     PetscFunctionReturn(0);
3797   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3798 
3799   m = A->rmap->n;
3800   if (a->inode.size) {ns = a->inode.size;}
3801   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3802 
3803   i          = 0;
3804   node_count = 0;
3805   while (i < m){                /* For each row */
3806     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3807     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3808     nzx  = nzl1 + nzu1 + 1;
3809     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3810     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3811 
3812     /* Limits the number of elements in a node to 'a->inode.limit' */
3813     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3814       nzl2    = ai[j+1] - ai[j];
3815       nzu2    = adiag[j] - adiag[j+1] - 1;
3816       nzy     = nzl2 + nzu2 + 1;
3817       if( nzy != nzx) break;
3818       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3819       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3820       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3821       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3822       ierr = PetscFree(cols2);CHKERRQ(ierr);
3823     }
3824     ns[node_count++] = blk_size;
3825     ierr = PetscFree(cols1);CHKERRQ(ierr);
3826     i    = j;
3827   }
3828   /* If not enough inodes found,, do not use inode version of the routines */
3829   if (!a->inode.size && m && node_count > .9*m) {
3830     ierr = PetscFree(ns);CHKERRQ(ierr);
3831     a->inode.node_count     = 0;
3832     a->inode.size           = PETSC_NULL;
3833     a->inode.use            = PETSC_FALSE;
3834     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3835   } else {
3836     A->ops->mult              = 0;
3837     A->ops->sor               = 0;
3838     A->ops->multadd           = 0;
3839     A->ops->getrowij          = 0;
3840     A->ops->restorerowij      = 0;
3841     A->ops->getcolumnij       = 0;
3842     A->ops->restorecolumnij   = 0;
3843     A->ops->coloringpatch     = 0;
3844     A->ops->multdiagonalblock = 0;
3845     A->ops->solve             = MatSolve_SeqAIJ_Inode;
3846     a->inode.node_count       = node_count;
3847     a->inode.size             = ns;
3848     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3849   }
3850   PetscFunctionReturn(0);
3851 }
3852 
3853 /*
3854      This is really ugly. if inodes are used this replaces the
3855   permutations with ones that correspond to rows/cols of the matrix
3856   rather then inode blocks
3857 */
3858 #undef __FUNCT__
3859 #define __FUNCT__ "MatInodeAdjustForInodes"
3860 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3861 {
3862   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3863 
3864   PetscFunctionBegin;
3865   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3866   if (f) {
3867     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3868   }
3869   PetscFunctionReturn(0);
3870 }
3871 
3872 EXTERN_C_BEGIN
3873 #undef __FUNCT__
3874 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3875 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3876 {
3877   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3878   PetscErrorCode ierr;
3879   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3880   const PetscInt *ridx,*cidx;
3881   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3882   PetscInt       nslim_col,*ns_col;
3883   IS             ris = *rperm,cis = *cperm;
3884 
3885   PetscFunctionBegin;
3886   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3887   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3888 
3889   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3890   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3891   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3892 
3893   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3894   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3895 
3896   /* Form the inode structure for the rows of permuted matric using inv perm*/
3897   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3898 
3899   /* Construct the permutations for rows*/
3900   for (i=0,row = 0; i<nslim_row; ++i){
3901     indx      = ridx[i];
3902     start_val = tns[indx];
3903     end_val   = tns[indx + 1];
3904     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3905   }
3906 
3907   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3908   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3909 
3910  /* Construct permutations for columns */
3911   for (i=0,col=0; i<nslim_col; ++i){
3912     indx      = cidx[i];
3913     start_val = tns[indx];
3914     end_val   = tns[indx + 1];
3915     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3916   }
3917 
3918   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3919   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3920   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3921   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3922 
3923   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3924   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3925 
3926   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3927   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3928   ierr = ISDestroy(cis);CHKERRQ(ierr);
3929   ierr = ISDestroy(ris);CHKERRQ(ierr);
3930   ierr = PetscFree(tns);CHKERRQ(ierr);
3931   PetscFunctionReturn(0);
3932 }
3933 EXTERN_C_END
3934 
3935 #undef __FUNCT__
3936 #define __FUNCT__ "MatInodeGetInodeSizes"
3937 /*@C
3938    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3939 
3940    Collective on Mat
3941 
3942    Input Parameter:
3943 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3944 
3945    Output Parameter:
3946 +  node_count - no of inodes present in the matrix.
3947 .  sizes      - an array of size node_count,with sizes of each inode.
3948 -  limit      - the max size used to generate the inodes.
3949 
3950    Level: advanced
3951 
3952    Notes: This routine returns some internal storage information
3953    of the matrix, it is intended to be used by advanced users.
3954    It should be called after the matrix is assembled.
3955    The contents of the sizes[] array should not be changed.
3956    PETSC_NULL may be passed for information not requested.
3957 
3958 .keywords: matrix, seqaij, get, inode
3959 
3960 .seealso: MatGetInfo()
3961 @*/
3962 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3963 {
3964   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3965 
3966   PetscFunctionBegin;
3967   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3968   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3969   if (f) {
3970     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3971   }
3972   PetscFunctionReturn(0);
3973 }
3974 
3975 EXTERN_C_BEGIN
3976 #undef __FUNCT__
3977 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3978 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3979 {
3980   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3981 
3982   PetscFunctionBegin;
3983   if (node_count) *node_count = a->inode.node_count;
3984   if (sizes)      *sizes      = a->inode.size;
3985   if (limit)      *limit      = a->inode.limit;
3986   PetscFunctionReturn(0);
3987 }
3988 EXTERN_C_END
3989