xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 84df9cb40eca90ea9b18a456fab7a4ecc7f6c1a4)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
11 {
12   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
24     for(; count+1 < n; count++,i++);
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for(; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n)  {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
67 
68   PetscFunctionBegin;
69   nslim_row = a->inode.node_count;
70   m         = A->rmap->n;
71   n         = A->cmap->n;
72   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
73 
74   /* Use the row_inode as column_inode */
75   nslim_col = nslim_row;
76   ns_col    = ns_row;
77 
78   /* allocate space for reformated inode structure */
79   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
80   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
81   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
82 
83   for (i1=0,col=0; i1<nslim_col; ++i1){
84     nsz = ns_col[i1];
85     for (i2=0; i2<nsz; ++i2,++col)
86       tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     i2   = 0;
101     col  = *j++ + ishift;
102     i2   = tvc[col];
103     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
104       ia[i1+1]++;
105       ia[i2+1]++;
106       i2++;                     /* Start col of next node */
107       while((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
108       i2 = tvc[col];
109     }
110     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
111   }
112 
113   /* shift ia[i] to point to next row */
114   for (i1=1; i1<nslim_row+1; i1++) {
115     row        = ia[i1-1];
116     ia[i1]    += row;
117     work[i1-1] = row - oshift;
118   }
119 
120   /* allocate space for column pointers */
121   nz   = ia[nslim_row] + (!ishift);
122   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
123   *jja = ja;
124 
125  /* loop over lower triangular part putting into ja */
126   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
127     j    = aj + ai[row] + ishift;
128     jmax = aj + ai[row+1] + ishift;
129     i2   = 0;                     /* Col inode index */
130     col  = *j++ + ishift;
131     i2   = tvc[col];
132     while (i2<i1 && j<jmax) {
133       ja[work[i2]++] = i1 + oshift;
134       ja[work[i1]++] = i2 + oshift;
135       ++i2;
136       while((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
137       i2 = tvc[col];
138     }
139     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
140 
141   }
142   ierr = PetscFree(work);CHKERRQ(ierr);
143   ierr = PetscFree(tns);CHKERRQ(ierr);
144   ierr = PetscFree(tvc);CHKERRQ(ierr);
145   PetscFunctionReturn(0);
146 }
147 
148 /*
149       This builds nonsymmetric version of nonzero structure,
150 */
151 #undef __FUNCT__
152 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
153 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
154 {
155   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
156   PetscErrorCode ierr;
157   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
158   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
159 
160   PetscFunctionBegin;
161   nslim_row = a->inode.node_count;
162   n         = A->cmap->n;
163 
164   /* Create The column_inode for this matrix */
165   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
166 
167   /* allocate space for reformated column_inode structure */
168   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
169   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
170   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
171 
172   for (i1=0,col=0; i1<nslim_col; ++i1){
173     nsz = ns_col[i1];
174     for (i2=0; i2<nsz; ++i2,++col)
175       tvc[col] = i1;
176   }
177   /* allocate space for row pointers */
178   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
179   *iia = ia;
180   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
181   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
182 
183   /* determine the number of columns in each row */
184   ia[0] = oshift;
185   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
186     j   = aj + ai[row] + ishift;
187     col = *j++ + ishift;
188     i2  = tvc[col];
189     nz  = ai[row+1] - ai[row];
190     while (nz-- > 0) {           /* off-diagonal elemets */
191       ia[i1+1]++;
192       i2++;                     /* Start col of next node */
193       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
194       if (nz > 0) i2 = tvc[col];
195     }
196   }
197 
198   /* shift ia[i] to point to next row */
199   for (i1=1; i1<nslim_row+1; i1++) {
200     row        = ia[i1-1];
201     ia[i1]    += row;
202     work[i1-1] = row - oshift;
203   }
204 
205   /* allocate space for column pointers */
206   nz   = ia[nslim_row] + (!ishift);
207   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
208   *jja = ja;
209 
210  /* loop over matrix putting into ja */
211   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
212     j   = aj + ai[row] + ishift;
213     i2  = 0;                     /* Col inode index */
214     col = *j++ + ishift;
215     i2  = tvc[col];
216     nz  = ai[row+1] - ai[row];
217     while (nz-- > 0) {
218       ja[work[i1]++] = i2 + oshift;
219       ++i2;
220       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
221       if (nz > 0) i2 = tvc[col];
222     }
223   }
224   ierr = PetscFree(ns_col);CHKERRQ(ierr);
225   ierr = PetscFree(work);CHKERRQ(ierr);
226   ierr = PetscFree(tns);CHKERRQ(ierr);
227   ierr = PetscFree(tvc);CHKERRQ(ierr);
228   PetscFunctionReturn(0);
229 }
230 
231 #undef __FUNCT__
232 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
233 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
234 {
235   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
236   PetscErrorCode ierr;
237 
238   PetscFunctionBegin;
239   *n     = a->inode.node_count;
240   if (!ia) PetscFunctionReturn(0);
241   if (!blockcompressed) {
242     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
243   } else if (symmetric) {
244     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
245   } else {
246     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
247   }
248   PetscFunctionReturn(0);
249 }
250 
251 #undef __FUNCT__
252 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
253 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
254 {
255   PetscErrorCode ierr;
256 
257   PetscFunctionBegin;
258   if (!ia) PetscFunctionReturn(0);
259 
260   if (!blockcompressed) {
261     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
262   } else {
263     ierr = PetscFree(*ia);CHKERRQ(ierr);
264     ierr = PetscFree(*ja);CHKERRQ(ierr);
265   }
266 
267   PetscFunctionReturn(0);
268 }
269 
270 /* ----------------------------------------------------------- */
271 
272 #undef __FUNCT__
273 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
274 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
275 {
276   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
277   PetscErrorCode ierr;
278   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
279   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
280 
281   PetscFunctionBegin;
282   nslim_row = a->inode.node_count;
283   n         = A->cmap->n;
284 
285   /* Create The column_inode for this matrix */
286   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
287 
288   /* allocate space for reformated column_inode structure */
289   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
290   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
291   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
292 
293   for (i1=0,col=0; i1<nslim_col; ++i1){
294     nsz = ns_col[i1];
295     for (i2=0; i2<nsz; ++i2,++col)
296       tvc[col] = i1;
297   }
298   /* allocate space for column pointers */
299   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
300   *iia = ia;
301   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
302   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
303 
304   /* determine the number of columns in each row */
305   ia[0] = oshift;
306   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
307     j   = aj + ai[row] + ishift;
308     col = *j++ + ishift;
309     i2  = tvc[col];
310     nz  = ai[row+1] - ai[row];
311     while (nz-- > 0) {           /* off-diagonal elemets */
312       /* ia[i1+1]++; */
313       ia[i2+1]++;
314       i2++;
315       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
316       if (nz > 0) i2 = tvc[col];
317     }
318   }
319 
320   /* shift ia[i] to point to next col */
321   for (i1=1; i1<nslim_col+1; i1++) {
322     col        = ia[i1-1];
323     ia[i1]    += col;
324     work[i1-1] = col - oshift;
325   }
326 
327   /* allocate space for column pointers */
328   nz   = ia[nslim_col] + (!ishift);
329   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
330   *jja = ja;
331 
332  /* loop over matrix putting into ja */
333   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
334     j   = aj + ai[row] + ishift;
335     i2  = 0;                     /* Col inode index */
336     col = *j++ + ishift;
337     i2  = tvc[col];
338     nz  = ai[row+1] - ai[row];
339     while (nz-- > 0) {
340       /* ja[work[i1]++] = i2 + oshift; */
341       ja[work[i2]++] = i1 + oshift;
342       i2++;
343       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
344       if (nz > 0) i2 = tvc[col];
345     }
346   }
347   ierr = PetscFree(ns_col);CHKERRQ(ierr);
348   ierr = PetscFree(work);CHKERRQ(ierr);
349   ierr = PetscFree(tns);CHKERRQ(ierr);
350   ierr = PetscFree(tvc);CHKERRQ(ierr);
351   PetscFunctionReturn(0);
352 }
353 
354 #undef __FUNCT__
355 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
356 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
357 {
358   PetscErrorCode ierr;
359 
360   PetscFunctionBegin;
361   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
362   if (!ia) PetscFunctionReturn(0);
363 
364   if (!blockcompressed) {
365     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
366   } else if (symmetric) {
367     /* Since the indices are symmetric it does'nt matter */
368     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
369   } else {
370     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
371   }
372   PetscFunctionReturn(0);
373 }
374 
375 #undef __FUNCT__
376 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
377 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
378 {
379   PetscErrorCode ierr;
380 
381   PetscFunctionBegin;
382   if (!ia) PetscFunctionReturn(0);
383   if (!blockcompressed) {
384     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
385   } else {
386     ierr = PetscFree(*ia);CHKERRQ(ierr);
387     ierr = PetscFree(*ja);CHKERRQ(ierr);
388   }
389   PetscFunctionReturn(0);
390 }
391 
392 /* ----------------------------------------------------------- */
393 
394 #undef __FUNCT__
395 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
396 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
397 {
398   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
399   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
400   PetscScalar       *y;
401   const PetscScalar *x;
402   const MatScalar   *v1,*v2,*v3,*v4,*v5;
403   PetscErrorCode    ierr;
404   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
405 
406 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
407 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
408 #endif
409 
410   PetscFunctionBegin;
411   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
412   node_max = a->inode.node_count;
413   ns       = a->inode.size;     /* Node Size array */
414   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
415   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
416   idx  = a->j;
417   v1   = a->a;
418   ii   = a->i;
419 
420   for (i = 0,row = 0; i< node_max; ++i){
421     nsz  = ns[i];
422     n    = ii[1] - ii[0];
423     nonzerorow += (n>0)*nsz;
424     ii  += nsz;
425     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
426     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
427     sz   = n;                   /* No of non zeros in this row */
428                                 /* Switch on the size of Node */
429     switch (nsz){               /* Each loop in 'case' is unrolled */
430     case 1 :
431       sum1  = 0.;
432 
433       for(n = 0; n< sz-1; n+=2) {
434         i1   = idx[0];          /* The instructions are ordered to */
435         i2   = idx[1];          /* make the compiler's job easy */
436         idx += 2;
437         tmp0 = x[i1];
438         tmp1 = x[i2];
439         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
440        }
441 
442       if (n == sz-1){          /* Take care of the last nonzero  */
443         tmp0  = x[*idx++];
444         sum1 += *v1++ * tmp0;
445       }
446       y[row++]=sum1;
447       break;
448     case 2:
449       sum1  = 0.;
450       sum2  = 0.;
451       v2    = v1 + n;
452 
453       for (n = 0; n< sz-1; n+=2) {
454         i1   = idx[0];
455         i2   = idx[1];
456         idx += 2;
457         tmp0 = x[i1];
458         tmp1 = x[i2];
459         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
460         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
461       }
462       if (n == sz-1){
463         tmp0  = x[*idx++];
464         sum1 += *v1++ * tmp0;
465         sum2 += *v2++ * tmp0;
466       }
467       y[row++]=sum1;
468       y[row++]=sum2;
469       v1      =v2;              /* Since the next block to be processed starts there*/
470       idx    +=sz;
471       break;
472     case 3:
473       sum1  = 0.;
474       sum2  = 0.;
475       sum3  = 0.;
476       v2    = v1 + n;
477       v3    = v2 + n;
478 
479       for (n = 0; n< sz-1; n+=2) {
480         i1   = idx[0];
481         i2   = idx[1];
482         idx += 2;
483         tmp0 = x[i1];
484         tmp1 = x[i2];
485         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
486         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
487         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
488       }
489       if (n == sz-1){
490         tmp0  = x[*idx++];
491         sum1 += *v1++ * tmp0;
492         sum2 += *v2++ * tmp0;
493         sum3 += *v3++ * tmp0;
494       }
495       y[row++]=sum1;
496       y[row++]=sum2;
497       y[row++]=sum3;
498       v1       =v3;             /* Since the next block to be processed starts there*/
499       idx     +=2*sz;
500       break;
501     case 4:
502       sum1  = 0.;
503       sum2  = 0.;
504       sum3  = 0.;
505       sum4  = 0.;
506       v2    = v1 + n;
507       v3    = v2 + n;
508       v4    = v3 + n;
509 
510       for (n = 0; n< sz-1; n+=2) {
511         i1   = idx[0];
512         i2   = idx[1];
513         idx += 2;
514         tmp0 = x[i1];
515         tmp1 = x[i2];
516         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
517         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
518         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
519         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
520       }
521       if (n == sz-1){
522         tmp0  = x[*idx++];
523         sum1 += *v1++ * tmp0;
524         sum2 += *v2++ * tmp0;
525         sum3 += *v3++ * tmp0;
526         sum4 += *v4++ * tmp0;
527       }
528       y[row++]=sum1;
529       y[row++]=sum2;
530       y[row++]=sum3;
531       y[row++]=sum4;
532       v1      =v4;              /* Since the next block to be processed starts there*/
533       idx    +=3*sz;
534       break;
535     case 5:
536       sum1  = 0.;
537       sum2  = 0.;
538       sum3  = 0.;
539       sum4  = 0.;
540       sum5  = 0.;
541       v2    = v1 + n;
542       v3    = v2 + n;
543       v4    = v3 + n;
544       v5    = v4 + n;
545 
546       for (n = 0; n<sz-1; n+=2) {
547         i1   = idx[0];
548         i2   = idx[1];
549         idx += 2;
550         tmp0 = x[i1];
551         tmp1 = x[i2];
552         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
553         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
554         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
555         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
556         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
557       }
558       if (n == sz-1){
559         tmp0  = x[*idx++];
560         sum1 += *v1++ * tmp0;
561         sum2 += *v2++ * tmp0;
562         sum3 += *v3++ * tmp0;
563         sum4 += *v4++ * tmp0;
564         sum5 += *v5++ * tmp0;
565       }
566       y[row++]=sum1;
567       y[row++]=sum2;
568       y[row++]=sum3;
569       y[row++]=sum4;
570       y[row++]=sum5;
571       v1      =v5;       /* Since the next block to be processed starts there */
572       idx    +=4*sz;
573       break;
574     default :
575       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
576     }
577   }
578   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
579   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
580   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
581   PetscFunctionReturn(0);
582 }
583 /* ----------------------------------------------------------- */
584 /* Almost same code as the MatMult_SeqAIJ_Inode() */
585 #undef __FUNCT__
586 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
587 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
588 {
589   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
590   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
591   MatScalar      *v1,*v2,*v3,*v4,*v5;
592   PetscScalar    *x,*y,*z,*zt;
593   PetscErrorCode ierr;
594   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
595 
596   PetscFunctionBegin;
597   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
598   node_max = a->inode.node_count;
599   ns       = a->inode.size;     /* Node Size array */
600   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
601   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
602   if (zz != yy) {
603     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
604   } else {
605     z = y;
606   }
607   zt = z;
608 
609   idx  = a->j;
610   v1   = a->a;
611   ii   = a->i;
612 
613   for (i = 0,row = 0; i< node_max; ++i){
614     nsz  = ns[i];
615     n    = ii[1] - ii[0];
616     ii  += nsz;
617     sz   = n;                   /* No of non zeros in this row */
618                                 /* Switch on the size of Node */
619     switch (nsz){               /* Each loop in 'case' is unrolled */
620     case 1 :
621       sum1  = *zt++;
622 
623       for(n = 0; n< sz-1; n+=2) {
624         i1   = idx[0];          /* The instructions are ordered to */
625         i2   = idx[1];          /* make the compiler's job easy */
626         idx += 2;
627         tmp0 = x[i1];
628         tmp1 = x[i2];
629         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
630        }
631 
632       if(n   == sz-1){          /* Take care of the last nonzero  */
633         tmp0  = x[*idx++];
634         sum1 += *v1++ * tmp0;
635       }
636       y[row++]=sum1;
637       break;
638     case 2:
639       sum1  = *zt++;
640       sum2  = *zt++;
641       v2    = v1 + n;
642 
643       for(n = 0; n< sz-1; n+=2) {
644         i1   = idx[0];
645         i2   = idx[1];
646         idx += 2;
647         tmp0 = x[i1];
648         tmp1 = x[i2];
649         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
650         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
651       }
652       if(n   == sz-1){
653         tmp0  = x[*idx++];
654         sum1 += *v1++ * tmp0;
655         sum2 += *v2++ * tmp0;
656       }
657       y[row++]=sum1;
658       y[row++]=sum2;
659       v1      =v2;              /* Since the next block to be processed starts there*/
660       idx    +=sz;
661       break;
662     case 3:
663       sum1  = *zt++;
664       sum2  = *zt++;
665       sum3  = *zt++;
666       v2    = v1 + n;
667       v3    = v2 + n;
668 
669       for (n = 0; n< sz-1; n+=2) {
670         i1   = idx[0];
671         i2   = idx[1];
672         idx += 2;
673         tmp0 = x[i1];
674         tmp1 = x[i2];
675         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
676         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
677         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
678       }
679       if (n == sz-1){
680         tmp0  = x[*idx++];
681         sum1 += *v1++ * tmp0;
682         sum2 += *v2++ * tmp0;
683         sum3 += *v3++ * tmp0;
684       }
685       y[row++]=sum1;
686       y[row++]=sum2;
687       y[row++]=sum3;
688       v1       =v3;             /* Since the next block to be processed starts there*/
689       idx     +=2*sz;
690       break;
691     case 4:
692       sum1  = *zt++;
693       sum2  = *zt++;
694       sum3  = *zt++;
695       sum4  = *zt++;
696       v2    = v1 + n;
697       v3    = v2 + n;
698       v4    = v3 + n;
699 
700       for (n = 0; n< sz-1; n+=2) {
701         i1   = idx[0];
702         i2   = idx[1];
703         idx += 2;
704         tmp0 = x[i1];
705         tmp1 = x[i2];
706         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
707         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
708         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
709         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
710       }
711       if (n == sz-1){
712         tmp0  = x[*idx++];
713         sum1 += *v1++ * tmp0;
714         sum2 += *v2++ * tmp0;
715         sum3 += *v3++ * tmp0;
716         sum4 += *v4++ * tmp0;
717       }
718       y[row++]=sum1;
719       y[row++]=sum2;
720       y[row++]=sum3;
721       y[row++]=sum4;
722       v1      =v4;              /* Since the next block to be processed starts there*/
723       idx    +=3*sz;
724       break;
725     case 5:
726       sum1  = *zt++;
727       sum2  = *zt++;
728       sum3  = *zt++;
729       sum4  = *zt++;
730       sum5  = *zt++;
731       v2    = v1 + n;
732       v3    = v2 + n;
733       v4    = v3 + n;
734       v5    = v4 + n;
735 
736       for (n = 0; n<sz-1; n+=2) {
737         i1   = idx[0];
738         i2   = idx[1];
739         idx += 2;
740         tmp0 = x[i1];
741         tmp1 = x[i2];
742         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
743         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
744         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
745         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
746         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
747       }
748       if(n   == sz-1){
749         tmp0  = x[*idx++];
750         sum1 += *v1++ * tmp0;
751         sum2 += *v2++ * tmp0;
752         sum3 += *v3++ * tmp0;
753         sum4 += *v4++ * tmp0;
754         sum5 += *v5++ * tmp0;
755       }
756       y[row++]=sum1;
757       y[row++]=sum2;
758       y[row++]=sum3;
759       y[row++]=sum4;
760       y[row++]=sum5;
761       v1      =v5;       /* Since the next block to be processed starts there */
762       idx    +=4*sz;
763       break;
764     default :
765       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
766     }
767   }
768   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
769   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
770   if (zz != yy) {
771     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
772   }
773   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
774   PetscFunctionReturn(0);
775 }
776 
777 /* ----------------------------------------------------------- */
778 #undef __FUNCT__
779 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
780 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
781 {
782   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
783   IS                iscol = a->col,isrow = a->row;
784   PetscErrorCode    ierr;
785   const PetscInt    *r,*c,*rout,*cout;
786   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
787   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
788   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
789   PetscScalar       sum1,sum2,sum3,sum4,sum5;
790   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
791   const PetscScalar *b;
792 
793   PetscFunctionBegin;
794   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
795   node_max = a->inode.node_count;
796   ns       = a->inode.size;     /* Node Size array */
797 
798   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
799   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
800   tmp  = a->solve_work;
801 
802   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
803   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
804 
805   /* forward solve the lower triangular */
806   tmps = tmp ;
807   aa   = a_a ;
808   aj   = a_j ;
809   ad   = a->diag;
810 
811   for (i = 0,row = 0; i< node_max; ++i){
812     nsz = ns[i];
813     aii = ai[row];
814     v1  = aa + aii;
815     vi  = aj + aii;
816     nz  = ad[row]- aii;
817     if (i < node_max-1) {
818       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
819       * but our indexing to determine it's size could. */
820       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
821       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
822       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
823       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
824     }
825 
826     switch (nsz){               /* Each loop in 'case' is unrolled */
827     case 1 :
828       sum1 = b[*r++];
829       for(j=0; j<nz-1; j+=2){
830         i0   = vi[0];
831         i1   = vi[1];
832         vi  +=2;
833         tmp0 = tmps[i0];
834         tmp1 = tmps[i1];
835         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
836       }
837       if(j == nz-1){
838         tmp0 = tmps[*vi++];
839         sum1 -= *v1++ *tmp0;
840       }
841       tmp[row ++]=sum1;
842       break;
843     case 2:
844       sum1 = b[*r++];
845       sum2 = b[*r++];
846       v2   = aa + ai[row+1];
847 
848       for(j=0; j<nz-1; j+=2){
849         i0   = vi[0];
850         i1   = vi[1];
851         vi  +=2;
852         tmp0 = tmps[i0];
853         tmp1 = tmps[i1];
854         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
855         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
856       }
857       if(j == nz-1){
858         tmp0 = tmps[*vi++];
859         sum1 -= *v1++ *tmp0;
860         sum2 -= *v2++ *tmp0;
861       }
862       sum2 -= *v2++ * sum1;
863       tmp[row ++]=sum1;
864       tmp[row ++]=sum2;
865       break;
866     case 3:
867       sum1 = b[*r++];
868       sum2 = b[*r++];
869       sum3 = b[*r++];
870       v2   = aa + ai[row+1];
871       v3   = aa + ai[row+2];
872 
873       for (j=0; j<nz-1; j+=2){
874         i0   = vi[0];
875         i1   = vi[1];
876         vi  +=2;
877         tmp0 = tmps[i0];
878         tmp1 = tmps[i1];
879         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
880         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
881         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
882       }
883       if (j == nz-1){
884         tmp0 = tmps[*vi++];
885         sum1 -= *v1++ *tmp0;
886         sum2 -= *v2++ *tmp0;
887         sum3 -= *v3++ *tmp0;
888       }
889       sum2 -= *v2++ * sum1;
890       sum3 -= *v3++ * sum1;
891       sum3 -= *v3++ * sum2;
892       tmp[row ++]=sum1;
893       tmp[row ++]=sum2;
894       tmp[row ++]=sum3;
895       break;
896 
897     case 4:
898       sum1 = b[*r++];
899       sum2 = b[*r++];
900       sum3 = b[*r++];
901       sum4 = b[*r++];
902       v2   = aa + ai[row+1];
903       v3   = aa + ai[row+2];
904       v4   = aa + ai[row+3];
905 
906       for (j=0; j<nz-1; j+=2){
907         i0   = vi[0];
908         i1   = vi[1];
909         vi  +=2;
910         tmp0 = tmps[i0];
911         tmp1 = tmps[i1];
912         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916       }
917       if (j == nz-1){
918         tmp0 = tmps[*vi++];
919         sum1 -= *v1++ *tmp0;
920         sum2 -= *v2++ *tmp0;
921         sum3 -= *v3++ *tmp0;
922         sum4 -= *v4++ *tmp0;
923       }
924       sum2 -= *v2++ * sum1;
925       sum3 -= *v3++ * sum1;
926       sum4 -= *v4++ * sum1;
927       sum3 -= *v3++ * sum2;
928       sum4 -= *v4++ * sum2;
929       sum4 -= *v4++ * sum3;
930 
931       tmp[row ++]=sum1;
932       tmp[row ++]=sum2;
933       tmp[row ++]=sum3;
934       tmp[row ++]=sum4;
935       break;
936     case 5:
937       sum1 = b[*r++];
938       sum2 = b[*r++];
939       sum3 = b[*r++];
940       sum4 = b[*r++];
941       sum5 = b[*r++];
942       v2   = aa + ai[row+1];
943       v3   = aa + ai[row+2];
944       v4   = aa + ai[row+3];
945       v5   = aa + ai[row+4];
946 
947       for (j=0; j<nz-1; j+=2){
948         i0   = vi[0];
949         i1   = vi[1];
950         vi  +=2;
951         tmp0 = tmps[i0];
952         tmp1 = tmps[i1];
953         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958       }
959       if (j == nz-1){
960         tmp0 = tmps[*vi++];
961         sum1 -= *v1++ *tmp0;
962         sum2 -= *v2++ *tmp0;
963         sum3 -= *v3++ *tmp0;
964         sum4 -= *v4++ *tmp0;
965         sum5 -= *v5++ *tmp0;
966       }
967 
968       sum2 -= *v2++ * sum1;
969       sum3 -= *v3++ * sum1;
970       sum4 -= *v4++ * sum1;
971       sum5 -= *v5++ * sum1;
972       sum3 -= *v3++ * sum2;
973       sum4 -= *v4++ * sum2;
974       sum5 -= *v5++ * sum2;
975       sum4 -= *v4++ * sum3;
976       sum5 -= *v5++ * sum3;
977       sum5 -= *v5++ * sum4;
978 
979       tmp[row ++]=sum1;
980       tmp[row ++]=sum2;
981       tmp[row ++]=sum3;
982       tmp[row ++]=sum4;
983       tmp[row ++]=sum5;
984       break;
985     default:
986       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987     }
988   }
989   /* backward solve the upper triangular */
990   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
991     nsz = ns[i];
992     aii = ai[row+1] -1;
993     v1  = aa + aii;
994     vi  = aj + aii;
995     nz  = aii- ad[row];
996     switch (nsz){               /* Each loop in 'case' is unrolled */
997     case 1 :
998       sum1 = tmp[row];
999 
1000       for(j=nz ; j>1; j-=2){
1001         vi  -=2;
1002         i0   = vi[2];
1003         i1   = vi[1];
1004         tmp0 = tmps[i0];
1005         tmp1 = tmps[i1];
1006         v1   -= 2;
1007         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008       }
1009       if (j==1){
1010         tmp0  = tmps[*vi--];
1011         sum1 -= *v1-- * tmp0;
1012       }
1013       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014       break;
1015     case 2 :
1016       sum1 = tmp[row];
1017       sum2 = tmp[row -1];
1018       v2   = aa + ai[row]-1;
1019       for (j=nz ; j>1; j-=2){
1020         vi  -=2;
1021         i0   = vi[2];
1022         i1   = vi[1];
1023         tmp0 = tmps[i0];
1024         tmp1 = tmps[i1];
1025         v1   -= 2;
1026         v2   -= 2;
1027         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029       }
1030       if (j==1){
1031         tmp0  = tmps[*vi--];
1032         sum1 -= *v1-- * tmp0;
1033         sum2 -= *v2-- * tmp0;
1034       }
1035 
1036       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037       sum2   -= *v2-- * tmp0;
1038       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039       break;
1040     case 3 :
1041       sum1 = tmp[row];
1042       sum2 = tmp[row -1];
1043       sum3 = tmp[row -2];
1044       v2   = aa + ai[row]-1;
1045       v3   = aa + ai[row -1]-1;
1046       for (j=nz ; j>1; j-=2){
1047         vi  -=2;
1048         i0   = vi[2];
1049         i1   = vi[1];
1050         tmp0 = tmps[i0];
1051         tmp1 = tmps[i1];
1052         v1   -= 2;
1053         v2   -= 2;
1054         v3   -= 2;
1055         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058       }
1059       if (j==1){
1060         tmp0  = tmps[*vi--];
1061         sum1 -= *v1-- * tmp0;
1062         sum2 -= *v2-- * tmp0;
1063         sum3 -= *v3-- * tmp0;
1064       }
1065       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066       sum2   -= *v2-- * tmp0;
1067       sum3   -= *v3-- * tmp0;
1068       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069       sum3   -= *v3-- * tmp0;
1070       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1071 
1072       break;
1073     case 4 :
1074       sum1 = tmp[row];
1075       sum2 = tmp[row -1];
1076       sum3 = tmp[row -2];
1077       sum4 = tmp[row -3];
1078       v2   = aa + ai[row]-1;
1079       v3   = aa + ai[row -1]-1;
1080       v4   = aa + ai[row -2]-1;
1081 
1082       for (j=nz ; j>1; j-=2){
1083         vi  -=2;
1084         i0   = vi[2];
1085         i1   = vi[1];
1086         tmp0 = tmps[i0];
1087         tmp1 = tmps[i1];
1088         v1  -= 2;
1089         v2  -= 2;
1090         v3  -= 2;
1091         v4  -= 2;
1092         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096       }
1097       if (j==1){
1098         tmp0  = tmps[*vi--];
1099         sum1 -= *v1-- * tmp0;
1100         sum2 -= *v2-- * tmp0;
1101         sum3 -= *v3-- * tmp0;
1102         sum4 -= *v4-- * tmp0;
1103       }
1104 
1105       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106       sum2   -= *v2-- * tmp0;
1107       sum3   -= *v3-- * tmp0;
1108       sum4   -= *v4-- * tmp0;
1109       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110       sum3   -= *v3-- * tmp0;
1111       sum4   -= *v4-- * tmp0;
1112       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113       sum4   -= *v4-- * tmp0;
1114       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115       break;
1116     case 5 :
1117       sum1 = tmp[row];
1118       sum2 = tmp[row -1];
1119       sum3 = tmp[row -2];
1120       sum4 = tmp[row -3];
1121       sum5 = tmp[row -4];
1122       v2   = aa + ai[row]-1;
1123       v3   = aa + ai[row -1]-1;
1124       v4   = aa + ai[row -2]-1;
1125       v5   = aa + ai[row -3]-1;
1126       for (j=nz ; j>1; j-=2){
1127         vi  -= 2;
1128         i0   = vi[2];
1129         i1   = vi[1];
1130         tmp0 = tmps[i0];
1131         tmp1 = tmps[i1];
1132         v1   -= 2;
1133         v2   -= 2;
1134         v3   -= 2;
1135         v4   -= 2;
1136         v5   -= 2;
1137         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142       }
1143       if (j==1){
1144         tmp0  = tmps[*vi--];
1145         sum1 -= *v1-- * tmp0;
1146         sum2 -= *v2-- * tmp0;
1147         sum3 -= *v3-- * tmp0;
1148         sum4 -= *v4-- * tmp0;
1149         sum5 -= *v5-- * tmp0;
1150       }
1151 
1152       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153       sum2   -= *v2-- * tmp0;
1154       sum3   -= *v3-- * tmp0;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158       sum3   -= *v3-- * tmp0;
1159       sum4   -= *v4-- * tmp0;
1160       sum5   -= *v5-- * tmp0;
1161       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162       sum4   -= *v4-- * tmp0;
1163       sum5   -= *v5-- * tmp0;
1164       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165       sum5   -= *v5-- * tmp0;
1166       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167       break;
1168     default:
1169       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170     }
1171   }
1172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1174   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1176   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #undef __FUNCT__
1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183 {
1184   Mat              C=B;
1185   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1186   IS               isrow = b->row,isicol = b->icol;
1187   PetscErrorCode   ierr;
1188   const PetscInt   *r,*ic,*ics;
1189   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190   PetscInt         i,j,k,nz,nzL,row,*pj;
1191   const PetscInt   *ajtmp,*bjtmp;
1192   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194   FactorShiftCtx   sctx;
1195   const PetscInt   *ddiag;
1196   PetscReal        rs;
1197   MatScalar        d;
1198   PetscInt         inod,nodesz,node_max,col;
1199   const PetscInt   *ns;
1200   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1201 
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++)
1216 	rs += PetscAbsScalar(v[j]);
1217       if (rs>sctx.shift_top) sctx.shift_top = rs;
1218     }
1219     sctx.shift_top   *= 1.1;
1220     sctx.nshift_max   = 5;
1221     sctx.shift_lo     = 0.;
1222     sctx.shift_hi     = 1.;
1223   }
1224 
1225   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1226   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1227 
1228   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1229   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1230   rtmp2 = rtmp1 + n;
1231   rtmp3 = rtmp2 + n;
1232   rtmp4 = rtmp3 + n;
1233   ics   = ic;
1234 
1235   node_max = a->inode.node_count;
1236   ns       = a->inode.size;
1237   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1238 
1239   /* If max inode size > 4, split it into two inodes.*/
1240   /* also map the inode sizes according to the ordering */
1241   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1242   for (i=0,j=0; i<node_max; ++i,++j){
1243     if (ns[i] > 4) {
1244       tmp_vec1[j] = 4;
1245       ++j;
1246       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1247     } else {
1248       tmp_vec1[j] = ns[i];
1249     }
1250   }
1251   /* Use the correct node_max */
1252   node_max = j;
1253 
1254   /* Now reorder the inode info based on mat re-ordering info */
1255   /* First create a row -> inode_size_array_index map */
1256   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1257   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1258   for (i=0,row=0; i<node_max; i++) {
1259     nodesz = tmp_vec1[i];
1260     for (j=0; j<nodesz; j++,row++) {
1261       nsmap[row] = i;
1262     }
1263   }
1264   /* Using nsmap, create a reordered ns structure */
1265   for (i=0,j=0; i< node_max; i++) {
1266     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1267     tmp_vec2[i]  = nodesz;
1268     j           += nodesz;
1269   }
1270   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1271   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1272 
1273   /* Now use the correct ns */
1274   ns = tmp_vec2;
1275 
1276   do {
1277     sctx.newshift = PETSC_FALSE;
1278     /* Now loop over each block-row, and do the factorization */
1279     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1280       nodesz = ns[inod];
1281 
1282       switch (nodesz){
1283       case 1:
1284       /*----------*/
1285         /* zero rtmp1 */
1286         /* L part */
1287         nz    = bi[i+1] - bi[i];
1288         bjtmp = bj + bi[i];
1289         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1290 
1291         /* U part */
1292         nz = bdiag[i]-bdiag[i+1];
1293         bjtmp = bj + bdiag[i+1]+1;
1294         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1295 
1296         /* load in initial (unfactored row) */
1297         nz    = ai[r[i]+1] - ai[r[i]];
1298         ajtmp = aj + ai[r[i]];
1299         v     = aa + ai[r[i]];
1300         for (j=0; j<nz; j++) {
1301           rtmp1[ics[ajtmp[j]]] = v[j];
1302         }
1303         /* ZeropivotApply() */
1304         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1305 
1306         /* elimination */
1307         bjtmp = bj + bi[i];
1308         row   = *bjtmp++;
1309         nzL   = bi[i+1] - bi[i];
1310         for(k=0; k < nzL;k++) {
1311           pc = rtmp1 + row;
1312           if (*pc != 0.0) {
1313             pv   = b->a + bdiag[row];
1314             mul1 = *pc * (*pv);
1315             *pc  = mul1;
1316             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1317             pv = b->a + bdiag[row+1]+1;
1318             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1319             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1320             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1321           }
1322           row = *bjtmp++;
1323         }
1324 
1325         /* finished row so stick it into b->a */
1326         rs = 0.0;
1327         /* L part */
1328         pv = b->a + bi[i] ;
1329         pj = b->j + bi[i] ;
1330         nz = bi[i+1] - bi[i];
1331         for (j=0; j<nz; j++) {
1332           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1333         }
1334 
1335         /* U part */
1336         pv = b->a + bdiag[i+1]+1;
1337         pj = b->j + bdiag[i+1]+1;
1338         nz = bdiag[i] - bdiag[i+1]-1;
1339         for (j=0; j<nz; j++) {
1340           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1341         }
1342 
1343         /* Check zero pivot */
1344         sctx.rs = rs;
1345         sctx.pv = rtmp1[i];
1346         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1347 	if(sctx.newshift) break;
1348 
1349         /* Mark diagonal and invert diagonal for simplier triangular solves */
1350         pv  = b->a + bdiag[i];
1351         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1352         break;
1353 
1354       case 2:
1355       /*----------*/
1356         /* zero rtmp1 and rtmp2 */
1357         /* L part */
1358         nz    = bi[i+1] - bi[i];
1359         bjtmp = bj + bi[i];
1360         for  (j=0; j<nz; j++) {
1361           col = bjtmp[j];
1362           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363         }
1364 
1365         /* U part */
1366         nz = bdiag[i]-bdiag[i+1];
1367         bjtmp = bj + bdiag[i+1]+1;
1368         for  (j=0; j<nz; j++) {
1369           col = bjtmp[j];
1370           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1371         }
1372 
1373         /* load in initial (unfactored row) */
1374         nz    = ai[r[i]+1] - ai[r[i]];
1375         ajtmp = aj + ai[r[i]];
1376         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1377         for (j=0; j<nz; j++) {
1378           col = ics[ajtmp[j]];
1379           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1380         }
1381         /* ZeropivotApply(): shift the diagonal of the matrix  */
1382         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1383 
1384         /* elimination */
1385         bjtmp = bj + bi[i];
1386         row   = *bjtmp++; /* pivot row */
1387         nzL   = bi[i+1] - bi[i];
1388         for(k=0; k < nzL;k++) {
1389           pc1 = rtmp1 + row;
1390           pc2 = rtmp2 + row;
1391           if (*pc1 != 0.0 || *pc2 != 0.0) {
1392             pv   = b->a + bdiag[row];
1393             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1394             *pc1 = mul1;       *pc2 = mul2;
1395 
1396             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1397             pv = b->a + bdiag[row+1]+1;
1398             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1399             for (j=0; j<nz; j++){
1400               col = pj[j];
1401               rtmp1[col] -= mul1 * pv[j];
1402               rtmp2[col] -= mul2 * pv[j];
1403             }
1404             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1405           }
1406           row = *bjtmp++;
1407         }
1408 
1409         /* finished row i; check zero pivot, then stick row i into b->a */
1410         rs  = 0.0;
1411         /* L part */
1412         pc1 = b->a + bi[i];
1413         pj  = b->j + bi[i] ;
1414         nz  = bi[i+1] - bi[i];
1415         for (j=0; j<nz; j++) {
1416           col = pj[j];
1417           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418         }
1419         /* U part */
1420         pc1 = b->a + bdiag[i+1]+1;
1421         pj  = b->j + bdiag[i+1]+1;
1422         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1423         for (j=0; j<nz; j++) {
1424           col = pj[j];
1425           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1426         }
1427 
1428         sctx.rs  = rs;
1429         sctx.pv  = rtmp1[i];
1430         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1431 	if(sctx.newshift) break;
1432         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1433         *pc1 = 1.0/sctx.pv;
1434 
1435         /* Now take care of diagonal 2x2 block. */
1436         pc2 = rtmp2 + i;
1437         if (*pc2 != 0.0){
1438           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1439           *pc2 = mul1;          /* insert L entry */
1440           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1441           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1442           for (j=0; j<nz; j++) {
1443             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1444           }
1445           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1446         }
1447 
1448         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1449         rs = 0.0;
1450         /* L part */
1451         pc2 = b->a + bi[i+1];
1452         pj  = b->j + bi[i+1] ;
1453         nz  = bi[i+2] - bi[i+1];
1454         for (j=0; j<nz; j++) {
1455           col = pj[j];
1456           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457         }
1458         /* U part */
1459         pc2 = b->a + bdiag[i+2]+1;
1460         pj  = b->j + bdiag[i+2]+1;
1461         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1462         for (j=0; j<nz; j++) {
1463           col = pj[j];
1464           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1465         }
1466 
1467         sctx.rs  = rs;
1468         sctx.pv  = rtmp2[i+1];
1469         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1470 	if(sctx.newshift) break;
1471         pc2  = b->a + bdiag[i+1];
1472         *pc2 = 1.0/sctx.pv;
1473         break;
1474 
1475       case 3:
1476       /*----------*/
1477         /* zero rtmp */
1478         /* L part */
1479         nz    = bi[i+1] - bi[i];
1480         bjtmp = bj + bi[i];
1481         for  (j=0; j<nz; j++) {
1482           col = bjtmp[j];
1483           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484         }
1485 
1486         /* U part */
1487         nz = bdiag[i]-bdiag[i+1];
1488         bjtmp = bj + bdiag[i+1]+1;
1489         for  (j=0; j<nz; j++) {
1490           col = bjtmp[j];
1491           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1492         }
1493 
1494         /* load in initial (unfactored row) */
1495         nz    = ai[r[i]+1] - ai[r[i]];
1496         ajtmp = aj + ai[r[i]];
1497         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1498         for (j=0; j<nz; j++) {
1499           col = ics[ajtmp[j]];
1500           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1501         }
1502         /* ZeropivotApply(): shift the diagonal of the matrix  */
1503         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1504 
1505         /* elimination */
1506         bjtmp = bj + bi[i];
1507         row   = *bjtmp++; /* pivot row */
1508         nzL   = bi[i+1] - bi[i];
1509         for(k=0; k < nzL;k++) {
1510           pc1 = rtmp1 + row;
1511           pc2 = rtmp2 + row;
1512           pc3 = rtmp3 + row;
1513           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1514             pv  = b->a + bdiag[row];
1515             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1516             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1517 
1518             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1519             pv = b->a + bdiag[row+1]+1;
1520             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1521             for (j=0; j<nz; j++){
1522               col = pj[j];
1523               rtmp1[col] -= mul1 * pv[j];
1524               rtmp2[col] -= mul2 * pv[j];
1525               rtmp3[col] -= mul3 * pv[j];
1526             }
1527             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1528           }
1529           row = *bjtmp++;
1530         }
1531 
1532         /* finished row i; check zero pivot, then stick row i into b->a */
1533         rs  = 0.0;
1534         /* L part */
1535         pc1 = b->a + bi[i];
1536         pj  = b->j + bi[i] ;
1537         nz  = bi[i+1] - bi[i];
1538         for (j=0; j<nz; j++) {
1539           col = pj[j];
1540           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541         }
1542         /* U part */
1543         pc1 = b->a + bdiag[i+1]+1;
1544         pj  = b->j + bdiag[i+1]+1;
1545         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1546         for (j=0; j<nz; j++) {
1547           col = pj[j];
1548           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1549         }
1550 
1551         sctx.rs  = rs;
1552         sctx.pv  = rtmp1[i];
1553         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1554 	if(sctx.newshift) break;
1555         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1556         *pc1 = 1.0/sctx.pv;
1557 
1558         /* Now take care of 1st column of diagonal 3x3 block. */
1559         pc2 = rtmp2 + i;
1560         pc3 = rtmp3 + i;
1561         if (*pc2 != 0.0 || *pc3 != 0.0){
1562           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1563           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1564           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1565           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1566           for (j=0; j<nz; j++) {
1567             col = pj[j];
1568             rtmp2[col] -= mul2 * rtmp1[col];
1569             rtmp3[col] -= mul3 * rtmp1[col];
1570           }
1571           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1572         }
1573 
1574         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1575         rs = 0.0;
1576         /* L part */
1577         pc2 = b->a + bi[i+1];
1578         pj  = b->j + bi[i+1] ;
1579         nz  = bi[i+2] - bi[i+1];
1580         for (j=0; j<nz; j++) {
1581           col = pj[j];
1582           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1583         }
1584         /* U part */
1585         pc2 = b->a + bdiag[i+2]+1;
1586         pj  = b->j + bdiag[i+2]+1;
1587         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1588         for (j=0; j<nz; j++) {
1589           col = pj[j];
1590           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1591         }
1592 
1593         sctx.rs  = rs;
1594         sctx.pv  = rtmp2[i+1];
1595         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1596 	if(sctx.newshift) break;
1597         pc2  = b->a + bdiag[i+1];
1598         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1599 
1600         /* Now take care of 2nd column of diagonal 3x3 block. */
1601         pc3 = rtmp3 + i+1;
1602         if (*pc3 != 0.0){
1603           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1604           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1605           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1606           for (j=0; j<nz; j++) {
1607             col = pj[j];
1608             rtmp3[col] -= mul3 * rtmp2[col];
1609           }
1610           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1611         }
1612 
1613         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1614         rs = 0.0;
1615         /* L part */
1616         pc3 = b->a + bi[i+2];
1617         pj  = b->j + bi[i+2] ;
1618         nz  = bi[i+3] - bi[i+2];
1619         for (j=0; j<nz; j++) {
1620           col = pj[j];
1621           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1622         }
1623         /* U part */
1624         pc3 = b->a + bdiag[i+3]+1;
1625         pj  = b->j + bdiag[i+3]+1;
1626         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1627         for (j=0; j<nz; j++) {
1628           col = pj[j];
1629           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1630         }
1631 
1632         sctx.rs  = rs;
1633         sctx.pv  = rtmp3[i+2];
1634         ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1635 	if(sctx.newshift) break;
1636         pc3  = b->a + bdiag[i+2];
1637         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1638         break;
1639       case 4:
1640       /*----------*/
1641         /* zero rtmp */
1642         /* L part */
1643         nz    = bi[i+1] - bi[i];
1644         bjtmp = bj + bi[i];
1645         for  (j=0; j<nz; j++) {
1646           col = bjtmp[j];
1647           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1648         }
1649 
1650         /* U part */
1651         nz = bdiag[i]-bdiag[i+1];
1652         bjtmp = bj + bdiag[i+1]+1;
1653         for  (j=0; j<nz; j++) {
1654           col = bjtmp[j];
1655           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1656         }
1657 
1658         /* load in initial (unfactored row) */
1659         nz    = ai[r[i]+1] - ai[r[i]];
1660         ajtmp = aj + ai[r[i]];
1661         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1662         for (j=0; j<nz; j++) {
1663           col = ics[ajtmp[j]];
1664           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1665         }
1666         /* ZeropivotApply(): shift the diagonal of the matrix  */
1667         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1668 
1669         /* elimination */
1670         bjtmp = bj + bi[i];
1671         row   = *bjtmp++; /* pivot row */
1672         nzL   = bi[i+1] - bi[i];
1673         for(k=0; k < nzL;k++) {
1674           pc1 = rtmp1 + row;
1675           pc2 = rtmp2 + row;
1676           pc3 = rtmp3 + row;
1677 	  pc4 = rtmp4 + row;
1678           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1679             pv  = b->a + bdiag[row];
1680             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1681             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1682 
1683             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1684             pv = b->a + bdiag[row+1]+1;
1685             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1686             for (j=0; j<nz; j++){
1687               col = pj[j];
1688               rtmp1[col] -= mul1 * pv[j];
1689               rtmp2[col] -= mul2 * pv[j];
1690               rtmp3[col] -= mul3 * pv[j];
1691 	      rtmp4[col] -= mul4 * pv[j];
1692             }
1693             ierr = PetscLogFlops(8*nz);CHKERRQ(ierr);
1694           }
1695           row = *bjtmp++;
1696         }
1697 
1698         /* finished row i; check zero pivot, then stick row i into b->a */
1699         rs  = 0.0;
1700         /* L part */
1701         pc1 = b->a + bi[i];
1702         pj  = b->j + bi[i] ;
1703         nz  = bi[i+1] - bi[i];
1704         for (j=0; j<nz; j++) {
1705           col = pj[j];
1706           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1707         }
1708         /* U part */
1709         pc1 = b->a + bdiag[i+1]+1;
1710         pj  = b->j + bdiag[i+1]+1;
1711         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1712         for (j=0; j<nz; j++) {
1713           col = pj[j];
1714           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1715         }
1716 
1717         sctx.rs  = rs;
1718         sctx.pv  = rtmp1[i];
1719         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1720 	if(sctx.newshift) break;
1721         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1722         *pc1 = 1.0/sctx.pv;
1723 
1724         /* Now take care of 1st column of diagonal 4x4 block. */
1725         pc2 = rtmp2 + i;
1726         pc3 = rtmp3 + i;
1727 	pc4 = rtmp4 + i;
1728         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1729           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1730           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1731 	  mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1732           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1733           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1734           for (j=0; j<nz; j++) {
1735             col = pj[j];
1736             rtmp2[col] -= mul2 * rtmp1[col];
1737             rtmp3[col] -= mul3 * rtmp1[col];
1738 	    rtmp4[col] -= mul4 * rtmp1[col];
1739           }
1740           ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1741         }
1742 
1743         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1744         rs = 0.0;
1745         /* L part */
1746         pc2 = b->a + bi[i+1];
1747         pj  = b->j + bi[i+1] ;
1748         nz  = bi[i+2] - bi[i+1];
1749         for (j=0; j<nz; j++) {
1750           col = pj[j];
1751           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1752         }
1753         /* U part */
1754         pc2 = b->a + bdiag[i+2]+1;
1755         pj  = b->j + bdiag[i+2]+1;
1756         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1757         for (j=0; j<nz; j++) {
1758           col = pj[j];
1759           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1760         }
1761 
1762         sctx.rs  = rs;
1763         sctx.pv  = rtmp2[i+1];
1764         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1765 	if(sctx.newshift) break;
1766         pc2  = b->a + bdiag[i+1];
1767         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1768 
1769         /* Now take care of 2nd column of diagonal 4x4 block. */
1770         pc3 = rtmp3 + i+1;
1771 	pc4 = rtmp4 + i+1;
1772         if (*pc3 != 0.0 || *pc4 != 0.0){
1773           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1774 	  mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1775           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1776           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1777           for (j=0; j<nz; j++) {
1778             col = pj[j];
1779             rtmp3[col] -= mul3 * rtmp2[col];
1780 	    rtmp4[col] -= mul4 * rtmp2[col];
1781           }
1782           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1783         }
1784 
1785         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1786         rs = 0.0;
1787         /* L part */
1788         pc3 = b->a + bi[i+2];
1789         pj  = b->j + bi[i+2] ;
1790         nz  = bi[i+3] - bi[i+2];
1791         for (j=0; j<nz; j++) {
1792           col = pj[j];
1793           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1794         }
1795         /* U part */
1796         pc3 = b->a + bdiag[i+3]+1;
1797         pj  = b->j + bdiag[i+3]+1;
1798         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1799         for (j=0; j<nz; j++) {
1800           col = pj[j];
1801           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1802         }
1803 
1804         sctx.rs  = rs;
1805         sctx.pv  = rtmp3[i+2];
1806         ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1807 	if(sctx.newshift) break;
1808         pc3  = b->a + bdiag[i+2];
1809         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1810 
1811 	/* Now take care of 3rd column of diagonal 4x4 block. */
1812 	pc4 = rtmp4 + i+2;
1813         if (*pc4 != 0.0){
1814 	  mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1815           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1816           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1817           for (j=0; j<nz; j++) {
1818             col = pj[j];
1819 	    rtmp4[col] -= mul4 * rtmp3[col];
1820           }
1821           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1822         }
1823 
1824         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1825         rs = 0.0;
1826         /* L part */
1827         pc4 = b->a + bi[i+3];
1828         pj  = b->j + bi[i+3] ;
1829         nz  = bi[i+4] - bi[i+3];
1830         for (j=0; j<nz; j++) {
1831           col = pj[j];
1832           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1833         }
1834         /* U part */
1835         pc4 = b->a + bdiag[i+4]+1;
1836         pj  = b->j + bdiag[i+4]+1;
1837         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1838         for (j=0; j<nz; j++) {
1839           col = pj[j];
1840           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1841         }
1842 
1843         sctx.rs  = rs;
1844         sctx.pv  = rtmp4[i+3];
1845         ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1846 	if(sctx.newshift) break;
1847         pc4  = b->a + bdiag[i+3];
1848         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1849          break;
1850 
1851       default:
1852         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1853       }
1854       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1855       i += nodesz;                 /* Update the row */
1856     }
1857 
1858     /* MatPivotRefine() */
1859     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1860       /*
1861        * if no shift in this attempt & shifting & started shifting & can refine,
1862        * then try lower shift
1863        */
1864       sctx.shift_hi       = sctx.shift_fraction;
1865       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1866       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1867       sctx.newshift       = PETSC_TRUE;
1868       sctx.nshift++;
1869     }
1870   } while (sctx.newshift);
1871 
1872   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1873   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1874   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1875   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1876 
1877   C->ops->solve              = MatSolve_SeqAIJ;
1878   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1879   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1880   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1881   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1882   C->assembled    = PETSC_TRUE;
1883   C->preallocated = PETSC_TRUE;
1884   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1885 
1886   /* MatShiftView(A,info,&sctx) */
1887   if (sctx.nshift){
1888     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1889       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1890     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1891       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1892     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1893       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1894     }
1895   }
1896   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1897   PetscFunctionReturn(0);
1898 }
1899 
1900 #undef __FUNCT__
1901 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1902 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1903 {
1904   Mat               C = B;
1905   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1906   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1907   PetscErrorCode    ierr;
1908   const PetscInt    *r,*ic,*c,*ics;
1909   PetscInt          n = A->rmap->n,*bi = b->i;
1910   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1911   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1912   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1913   PetscScalar       mul1,mul2,mul3,tmp;
1914   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1915   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1916   PetscReal         rs=0.0;
1917   FactorShiftCtx    sctx;
1918 
1919   PetscFunctionBegin;
1920   sctx.shift_top      = 0;
1921   sctx.nshift_max     = 0;
1922   sctx.shift_lo       = 0;
1923   sctx.shift_hi       = 0;
1924   sctx.shift_fraction = 0;
1925 
1926   /* if both shift schemes are chosen by user, only use info->shiftpd */
1927   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1928     sctx.shift_top = 0;
1929     for (i=0; i<n; i++) {
1930       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1931       rs    = 0.0;
1932       ajtmp = aj + ai[i];
1933       rtmp1 = aa + ai[i];
1934       nz = ai[i+1] - ai[i];
1935       for (j=0; j<nz; j++){
1936         if (*ajtmp != i){
1937           rs += PetscAbsScalar(*rtmp1++);
1938         } else {
1939           rs -= PetscRealPart(*rtmp1++);
1940         }
1941         ajtmp++;
1942       }
1943       if (rs>sctx.shift_top) sctx.shift_top = rs;
1944     }
1945     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1946     sctx.shift_top *= 1.1;
1947     sctx.nshift_max = 5;
1948     sctx.shift_lo   = 0.;
1949     sctx.shift_hi   = 1.;
1950   }
1951   sctx.shift_amount = 0;
1952   sctx.nshift       = 0;
1953 
1954   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1955   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1956   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1957   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1958   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1959   ics   = ic ;
1960   rtmp22 = rtmp11 + n;
1961   rtmp33 = rtmp22 + n;
1962 
1963   node_max = a->inode.node_count;
1964   ns       = a->inode.size;
1965   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1966 
1967   /* If max inode size > 3, split it into two inodes.*/
1968   /* also map the inode sizes according to the ordering */
1969   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1970   for (i=0,j=0; i<node_max; ++i,++j){
1971     if (ns[i]>3) {
1972       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1973       ++j;
1974       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1975     } else {
1976       tmp_vec1[j] = ns[i];
1977     }
1978   }
1979   /* Use the correct node_max */
1980   node_max = j;
1981 
1982   /* Now reorder the inode info based on mat re-ordering info */
1983   /* First create a row -> inode_size_array_index map */
1984   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1985   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1986   for (i=0,row=0; i<node_max; i++) {
1987     nodesz = tmp_vec1[i];
1988     for (j=0; j<nodesz; j++,row++) {
1989       nsmap[row] = i;
1990     }
1991   }
1992   /* Using nsmap, create a reordered ns structure */
1993   for (i=0,j=0; i< node_max; i++) {
1994     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1995     tmp_vec2[i]  = nodesz;
1996     j           += nodesz;
1997   }
1998   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1999   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2000   /* Now use the correct ns */
2001   ns = tmp_vec2;
2002 
2003   do {
2004     sctx.newshift = PETSC_FALSE;
2005     /* Now loop over each block-row, and do the factorization */
2006     for (i=0,row=0; i<node_max; i++) {
2007       nodesz = ns[i];
2008       nz     = bi[row+1] - bi[row];
2009       bjtmp  = bj + bi[row];
2010 
2011       switch (nodesz){
2012       case 1:
2013         for  (j=0; j<nz; j++){
2014           idx        = bjtmp[j];
2015           rtmp11[idx] = 0.0;
2016         }
2017 
2018         /* load in initial (unfactored row) */
2019         idx    = r[row];
2020         nz_tmp = ai[idx+1] - ai[idx];
2021         ajtmp  = aj + ai[idx];
2022         v1     = aa + ai[idx];
2023 
2024         for (j=0; j<nz_tmp; j++) {
2025           idx        = ics[ajtmp[j]];
2026           rtmp11[idx] = v1[j];
2027         }
2028         rtmp11[ics[r[row]]] += sctx.shift_amount;
2029 
2030         prow = *bjtmp++ ;
2031         while (prow < row) {
2032           pc1 = rtmp11 + prow;
2033           if (*pc1 != 0.0){
2034             pv   = ba + bd[prow];
2035             pj   = nbj + bd[prow];
2036             mul1 = *pc1 * *pv++;
2037             *pc1 = mul1;
2038             nz_tmp = bi[prow+1] - bd[prow] - 1;
2039             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2040             for (j=0; j<nz_tmp; j++) {
2041               tmp = pv[j];
2042               idx = pj[j];
2043               rtmp11[idx] -= mul1 * tmp;
2044             }
2045           }
2046           prow = *bjtmp++ ;
2047         }
2048         pj  = bj + bi[row];
2049         pc1 = ba + bi[row];
2050 
2051         sctx.pv    = rtmp11[row];
2052         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2053         rs         = 0.0;
2054         for (j=0; j<nz; j++) {
2055           idx    = pj[j];
2056           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2057           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2058         }
2059         sctx.rs  = rs;
2060         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2061         if (sctx.newshift) goto endofwhile;
2062         break;
2063 
2064       case 2:
2065         for (j=0; j<nz; j++) {
2066           idx        = bjtmp[j];
2067           rtmp11[idx] = 0.0;
2068           rtmp22[idx] = 0.0;
2069         }
2070 
2071         /* load in initial (unfactored row) */
2072         idx    = r[row];
2073         nz_tmp = ai[idx+1] - ai[idx];
2074         ajtmp  = aj + ai[idx];
2075         v1     = aa + ai[idx];
2076         v2     = aa + ai[idx+1];
2077         for (j=0; j<nz_tmp; j++) {
2078           idx        = ics[ajtmp[j]];
2079           rtmp11[idx] = v1[j];
2080           rtmp22[idx] = v2[j];
2081         }
2082         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2083         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2084 
2085         prow = *bjtmp++ ;
2086         while (prow < row) {
2087           pc1 = rtmp11 + prow;
2088           pc2 = rtmp22 + prow;
2089           if (*pc1 != 0.0 || *pc2 != 0.0){
2090             pv   = ba + bd[prow];
2091             pj   = nbj + bd[prow];
2092             mul1 = *pc1 * *pv;
2093             mul2 = *pc2 * *pv;
2094             ++pv;
2095             *pc1 = mul1;
2096             *pc2 = mul2;
2097 
2098             nz_tmp = bi[prow+1] - bd[prow] - 1;
2099             for (j=0; j<nz_tmp; j++) {
2100               tmp = pv[j];
2101               idx = pj[j];
2102               rtmp11[idx] -= mul1 * tmp;
2103               rtmp22[idx] -= mul2 * tmp;
2104             }
2105             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2106           }
2107           prow = *bjtmp++ ;
2108         }
2109 
2110         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2111         pc1 = rtmp11 + prow;
2112         pc2 = rtmp22 + prow;
2113 
2114         sctx.pv = *pc1;
2115         pj      = bj + bi[prow];
2116         rs      = 0.0;
2117         for (j=0; j<nz; j++){
2118           idx = pj[j];
2119           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2120         }
2121         sctx.rs = rs;
2122         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2123         if (sctx.newshift) goto endofwhile;
2124 
2125         if (*pc2 != 0.0){
2126           pj     = nbj + bd[prow];
2127           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2128           *pc2   = mul2;
2129           nz_tmp = bi[prow+1] - bd[prow] - 1;
2130           for (j=0; j<nz_tmp; j++) {
2131             idx = pj[j] ;
2132             tmp = rtmp11[idx];
2133             rtmp22[idx] -= mul2 * tmp;
2134           }
2135           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2136         }
2137 
2138         pj  = bj + bi[row];
2139         pc1 = ba + bi[row];
2140         pc2 = ba + bi[row+1];
2141 
2142         sctx.pv = rtmp22[row+1];
2143         rs = 0.0;
2144         rtmp11[row]   = 1.0/rtmp11[row];
2145         rtmp22[row+1] = 1.0/rtmp22[row+1];
2146         /* copy row entries from dense representation to sparse */
2147         for (j=0; j<nz; j++) {
2148           idx    = pj[j];
2149           pc1[j] = rtmp11[idx];
2150           pc2[j] = rtmp22[idx];
2151           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2152         }
2153         sctx.rs = rs;
2154         ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2155         if (sctx.newshift) goto endofwhile;
2156         break;
2157 
2158       case 3:
2159         for  (j=0; j<nz; j++) {
2160           idx        = bjtmp[j];
2161           rtmp11[idx] = 0.0;
2162           rtmp22[idx] = 0.0;
2163           rtmp33[idx] = 0.0;
2164         }
2165         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2166         idx    = r[row];
2167         nz_tmp = ai[idx+1] - ai[idx];
2168         ajtmp = aj + ai[idx];
2169         v1    = aa + ai[idx];
2170         v2    = aa + ai[idx+1];
2171         v3    = aa + ai[idx+2];
2172         for (j=0; j<nz_tmp; j++) {
2173           idx        = ics[ajtmp[j]];
2174           rtmp11[idx] = v1[j];
2175           rtmp22[idx] = v2[j];
2176           rtmp33[idx] = v3[j];
2177         }
2178         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2179         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2180         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2181 
2182         /* loop over all pivot row blocks above this row block */
2183         prow = *bjtmp++ ;
2184         while (prow < row) {
2185           pc1 = rtmp11 + prow;
2186           pc2 = rtmp22 + prow;
2187           pc3 = rtmp33 + prow;
2188           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2189             pv   = ba  + bd[prow];
2190             pj   = nbj + bd[prow];
2191             mul1 = *pc1 * *pv;
2192             mul2 = *pc2 * *pv;
2193             mul3 = *pc3 * *pv;
2194             ++pv;
2195             *pc1 = mul1;
2196             *pc2 = mul2;
2197             *pc3 = mul3;
2198 
2199             nz_tmp = bi[prow+1] - bd[prow] - 1;
2200             /* update this row based on pivot row */
2201             for (j=0; j<nz_tmp; j++) {
2202               tmp = pv[j];
2203               idx = pj[j];
2204               rtmp11[idx] -= mul1 * tmp;
2205               rtmp22[idx] -= mul2 * tmp;
2206               rtmp33[idx] -= mul3 * tmp;
2207             }
2208             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
2209           }
2210           prow = *bjtmp++ ;
2211         }
2212 
2213         /* Now take care of diagonal 3x3 block in this set of rows */
2214         /* note: prow = row here */
2215         pc1 = rtmp11 + prow;
2216         pc2 = rtmp22 + prow;
2217         pc3 = rtmp33 + prow;
2218 
2219         sctx.pv = *pc1;
2220         pj      = bj + bi[prow];
2221         rs      = 0.0;
2222         for (j=0; j<nz; j++){
2223           idx = pj[j];
2224           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2225         }
2226         sctx.rs = rs;
2227         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2228         if (sctx.newshift) goto endofwhile;
2229 
2230         if (*pc2 != 0.0 || *pc3 != 0.0){
2231           mul2 = (*pc2)/(*pc1);
2232           mul3 = (*pc3)/(*pc1);
2233           *pc2 = mul2;
2234           *pc3 = mul3;
2235           nz_tmp = bi[prow+1] - bd[prow] - 1;
2236           pj     = nbj + bd[prow];
2237           for (j=0; j<nz_tmp; j++) {
2238             idx = pj[j] ;
2239             tmp = rtmp11[idx];
2240             rtmp22[idx] -= mul2 * tmp;
2241             rtmp33[idx] -= mul3 * tmp;
2242           }
2243           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2244         }
2245         ++prow;
2246 
2247         pc2 = rtmp22 + prow;
2248         pc3 = rtmp33 + prow;
2249         sctx.pv = *pc2;
2250         pj      = bj + bi[prow];
2251         rs      = 0.0;
2252         for (j=0; j<nz; j++){
2253           idx = pj[j];
2254           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2255         }
2256         sctx.rs = rs;
2257         ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2258         if (sctx.newshift) goto endofwhile;
2259 
2260         if (*pc3 != 0.0){
2261           mul3   = (*pc3)/(*pc2);
2262           *pc3   = mul3;
2263           pj     = nbj + bd[prow];
2264           nz_tmp = bi[prow+1] - bd[prow] - 1;
2265           for (j=0; j<nz_tmp; j++) {
2266             idx = pj[j] ;
2267             tmp = rtmp22[idx];
2268             rtmp33[idx] -= mul3 * tmp;
2269           }
2270           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2271         }
2272 
2273         pj  = bj + bi[row];
2274         pc1 = ba + bi[row];
2275         pc2 = ba + bi[row+1];
2276         pc3 = ba + bi[row+2];
2277 
2278         sctx.pv = rtmp33[row+2];
2279         rs = 0.0;
2280         rtmp11[row]   = 1.0/rtmp11[row];
2281         rtmp22[row+1] = 1.0/rtmp22[row+1];
2282         rtmp33[row+2] = 1.0/rtmp33[row+2];
2283         /* copy row entries from dense representation to sparse */
2284         for (j=0; j<nz; j++) {
2285           idx    = pj[j];
2286           pc1[j] = rtmp11[idx];
2287           pc2[j] = rtmp22[idx];
2288           pc3[j] = rtmp33[idx];
2289           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2290         }
2291 
2292         sctx.rs = rs;
2293         ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2294         if (sctx.newshift) goto endofwhile;
2295         break;
2296 
2297       default:
2298         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2299       }
2300       row += nodesz;                 /* Update the row */
2301     }
2302     endofwhile:;
2303   } while (sctx.newshift);
2304   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2305   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2306   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2307   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2308   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2309   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2310   /* do not set solve add, since MatSolve_Inode + Add is faster */
2311   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2312   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2313   C->assembled   = PETSC_TRUE;
2314   C->preallocated = PETSC_TRUE;
2315   if (sctx.nshift) {
2316     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2317       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2318     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2319       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2320     }
2321   }
2322   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2323   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2324   PetscFunctionReturn(0);
2325 }
2326 
2327 
2328 /* ----------------------------------------------------------- */
2329 #undef __FUNCT__
2330 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2331 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2332 {
2333   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2334   IS                iscol = a->col,isrow = a->row;
2335   PetscErrorCode    ierr;
2336   const PetscInt    *r,*c,*rout,*cout;
2337   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2338   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2339   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2340   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2341   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2342   const PetscScalar *b;
2343 
2344   PetscFunctionBegin;
2345   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2346   node_max = a->inode.node_count;
2347   ns       = a->inode.size;     /* Node Size array */
2348 
2349   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2350   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2351   tmp  = a->solve_work;
2352 
2353   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2354   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2355 
2356   /* forward solve the lower triangular */
2357   tmps = tmp ;
2358   aa   = a_a ;
2359   aj   = a_j ;
2360   ad   = a->diag;
2361 
2362   for (i = 0,row = 0; i< node_max; ++i){
2363     nsz = ns[i];
2364     aii = ai[row];
2365     v1  = aa + aii;
2366     vi  = aj + aii;
2367     nz  = ai[row+1]- ai[row];
2368 
2369     if (i < node_max-1) {
2370       /* Prefetch the indices for the next block */
2371       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2372       /* Prefetch the data for the next block */
2373       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2374     }
2375 
2376     switch (nsz){               /* Each loop in 'case' is unrolled */
2377     case 1 :
2378       sum1 = b[r[row]];
2379       for(j=0; j<nz-1; j+=2){
2380         i0   = vi[j];
2381         i1   = vi[j+1];
2382         tmp0 = tmps[i0];
2383         tmp1 = tmps[i1];
2384         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2385       }
2386       if(j == nz-1){
2387         tmp0 = tmps[vi[j]];
2388         sum1 -= v1[j]*tmp0;
2389       }
2390       tmp[row++]=sum1;
2391       break;
2392     case 2:
2393       sum1 = b[r[row]];
2394       sum2 = b[r[row+1]];
2395       v2   = aa + ai[row+1];
2396 
2397       for(j=0; j<nz-1; j+=2){
2398         i0   = vi[j];
2399         i1   = vi[j+1];
2400         tmp0 = tmps[i0];
2401         tmp1 = tmps[i1];
2402         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2403         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2404       }
2405       if(j == nz-1){
2406         tmp0 = tmps[vi[j]];
2407         sum1 -= v1[j] *tmp0;
2408         sum2 -= v2[j] *tmp0;
2409       }
2410       sum2 -= v2[nz] * sum1;
2411       tmp[row ++]=sum1;
2412       tmp[row ++]=sum2;
2413       break;
2414     case 3:
2415       sum1 = b[r[row]];
2416       sum2 = b[r[row+1]];
2417       sum3 = b[r[row+2]];
2418       v2   = aa + ai[row+1];
2419       v3   = aa + ai[row+2];
2420 
2421       for (j=0; j<nz-1; j+=2){
2422         i0   = vi[j];
2423         i1   = vi[j+1];
2424         tmp0 = tmps[i0];
2425         tmp1 = tmps[i1];
2426         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2427         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2428         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2429       }
2430       if (j == nz-1){
2431         tmp0 = tmps[vi[j]];
2432         sum1 -= v1[j] *tmp0;
2433         sum2 -= v2[j] *tmp0;
2434         sum3 -= v3[j] *tmp0;
2435       }
2436       sum2 -= v2[nz] * sum1;
2437       sum3 -= v3[nz] * sum1;
2438       sum3 -= v3[nz+1] * sum2;
2439       tmp[row ++]=sum1;
2440       tmp[row ++]=sum2;
2441       tmp[row ++]=sum3;
2442       break;
2443 
2444     case 4:
2445       sum1 = b[r[row]];
2446       sum2 = b[r[row+1]];
2447       sum3 = b[r[row+2]];
2448       sum4 = b[r[row+3]];
2449       v2   = aa + ai[row+1];
2450       v3   = aa + ai[row+2];
2451       v4   = aa + ai[row+3];
2452 
2453       for (j=0; j<nz-1; j+=2){
2454         i0   = vi[j];
2455         i1   = vi[j+1];
2456         tmp0 = tmps[i0];
2457         tmp1 = tmps[i1];
2458         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2459         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2460         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2461         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2462       }
2463       if (j == nz-1){
2464         tmp0 = tmps[vi[j]];
2465         sum1 -= v1[j] *tmp0;
2466         sum2 -= v2[j] *tmp0;
2467         sum3 -= v3[j] *tmp0;
2468         sum4 -= v4[j] *tmp0;
2469       }
2470       sum2 -= v2[nz] * sum1;
2471       sum3 -= v3[nz] * sum1;
2472       sum4 -= v4[nz] * sum1;
2473       sum3 -= v3[nz+1] * sum2;
2474       sum4 -= v4[nz+1] * sum2;
2475       sum4 -= v4[nz+2] * sum3;
2476 
2477       tmp[row ++]=sum1;
2478       tmp[row ++]=sum2;
2479       tmp[row ++]=sum3;
2480       tmp[row ++]=sum4;
2481       break;
2482     case 5:
2483       sum1 = b[r[row]];
2484       sum2 = b[r[row+1]];
2485       sum3 = b[r[row+2]];
2486       sum4 = b[r[row+3]];
2487       sum5 = b[r[row+4]];
2488       v2   = aa + ai[row+1];
2489       v3   = aa + ai[row+2];
2490       v4   = aa + ai[row+3];
2491       v5   = aa + ai[row+4];
2492 
2493       for (j=0; j<nz-1; j+=2){
2494         i0   = vi[j];
2495         i1   = vi[j+1];
2496         tmp0 = tmps[i0];
2497         tmp1 = tmps[i1];
2498         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2499         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2500         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2501         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2502         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2503       }
2504       if (j == nz-1){
2505         tmp0 = tmps[vi[j]];
2506         sum1 -= v1[j] *tmp0;
2507         sum2 -= v2[j] *tmp0;
2508         sum3 -= v3[j] *tmp0;
2509         sum4 -= v4[j] *tmp0;
2510         sum5 -= v5[j] *tmp0;
2511       }
2512 
2513       sum2 -= v2[nz] * sum1;
2514       sum3 -= v3[nz] * sum1;
2515       sum4 -= v4[nz] * sum1;
2516       sum5 -= v5[nz] * sum1;
2517       sum3 -= v3[nz+1] * sum2;
2518       sum4 -= v4[nz+1] * sum2;
2519       sum5 -= v5[nz+1] * sum2;
2520       sum4 -= v4[nz+2] * sum3;
2521       sum5 -= v5[nz+2] * sum3;
2522       sum5 -= v5[nz+3] * sum4;
2523 
2524       tmp[row ++]=sum1;
2525       tmp[row ++]=sum2;
2526       tmp[row ++]=sum3;
2527       tmp[row ++]=sum4;
2528       tmp[row ++]=sum5;
2529       break;
2530     default:
2531       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2532     }
2533   }
2534   /* backward solve the upper triangular */
2535   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2536     nsz = ns[i];
2537     aii = ad[row+1] + 1;
2538     v1  = aa + aii;
2539     vi  = aj + aii;
2540     nz  = ad[row]- ad[row+1] - 1;
2541 
2542     if (i > 0) {
2543       /* Prefetch the indices for the next block */
2544       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2545       /* Prefetch the data for the next block */
2546       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2547     }
2548 
2549     switch (nsz){               /* Each loop in 'case' is unrolled */
2550     case 1 :
2551       sum1 = tmp[row];
2552 
2553       for(j=0 ; j<nz-1; j+=2){
2554         i0   = vi[j];
2555         i1   = vi[j+1];
2556         tmp0 = tmps[i0];
2557         tmp1 = tmps[i1];
2558         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2559       }
2560       if (j == nz-1){
2561         tmp0  = tmps[vi[j]];
2562         sum1 -= v1[j]*tmp0;
2563       }
2564       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2565       break;
2566     case 2 :
2567       sum1 = tmp[row];
2568       sum2 = tmp[row-1];
2569       v2   = aa + ad[row] + 1;
2570       for (j=0 ; j<nz-1; j+=2){
2571         i0   = vi[j];
2572         i1   = vi[j+1];
2573         tmp0 = tmps[i0];
2574         tmp1 = tmps[i1];
2575         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2576         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2577       }
2578       if (j == nz-1){
2579         tmp0  = tmps[vi[j]];
2580         sum1 -= v1[j]* tmp0;
2581         sum2 -= v2[j+1]* tmp0;
2582       }
2583 
2584       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2585       sum2   -= v2[0] * tmp0;
2586       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2587       break;
2588     case 3 :
2589       sum1 = tmp[row];
2590       sum2 = tmp[row -1];
2591       sum3 = tmp[row -2];
2592       v2   = aa + ad[row] + 1;
2593       v3   = aa + ad[row -1] + 1;
2594       for (j=0 ; j<nz-1; j+=2){
2595         i0   = vi[j];
2596         i1   = vi[j+1];
2597         tmp0 = tmps[i0];
2598         tmp1 = tmps[i1];
2599         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2600         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2601         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2602       }
2603       if (j== nz-1){
2604         tmp0  = tmps[vi[j]];
2605         sum1 -= v1[j] * tmp0;
2606         sum2 -= v2[j+1] * tmp0;
2607         sum3 -= v3[j+2] * tmp0;
2608       }
2609       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2610       sum2   -= v2[0]* tmp0;
2611       sum3   -= v3[1] * tmp0;
2612       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2613       sum3   -= v3[0]* tmp0;
2614       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2615 
2616       break;
2617     case 4 :
2618       sum1 = tmp[row];
2619       sum2 = tmp[row -1];
2620       sum3 = tmp[row -2];
2621       sum4 = tmp[row -3];
2622       v2   = aa + ad[row]+1;
2623       v3   = aa + ad[row -1]+1;
2624       v4   = aa + ad[row -2]+1;
2625 
2626       for (j=0 ; j<nz-1; j+=2){
2627         i0   = vi[j];
2628         i1   = vi[j+1];
2629         tmp0 = tmps[i0];
2630         tmp1 = tmps[i1];
2631         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2632         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2633         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2634         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2635       }
2636       if (j== nz-1){
2637         tmp0  = tmps[vi[j]];
2638         sum1 -= v1[j] * tmp0;
2639         sum2 -= v2[j+1] * tmp0;
2640         sum3 -= v3[j+2] * tmp0;
2641         sum4 -= v4[j+3] * tmp0;
2642       }
2643 
2644       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2645       sum2   -= v2[0] * tmp0;
2646       sum3   -= v3[1] * tmp0;
2647       sum4   -= v4[2] * tmp0;
2648       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2649       sum3   -= v3[0] * tmp0;
2650       sum4   -= v4[1] * tmp0;
2651       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2652       sum4   -= v4[0] * tmp0;
2653       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2654       break;
2655     case 5 :
2656       sum1 = tmp[row];
2657       sum2 = tmp[row -1];
2658       sum3 = tmp[row -2];
2659       sum4 = tmp[row -3];
2660       sum5 = tmp[row -4];
2661       v2   = aa + ad[row]+1;
2662       v3   = aa + ad[row -1]+1;
2663       v4   = aa + ad[row -2]+1;
2664       v5   = aa + ad[row -3]+1;
2665       for (j=0 ; j<nz-1; j+=2){
2666         i0   = vi[j];
2667         i1   = vi[j+1];
2668         tmp0 = tmps[i0];
2669         tmp1 = tmps[i1];
2670         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2671         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2672         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2673         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2674         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2675       }
2676       if (j==nz-1){
2677         tmp0  = tmps[vi[j]];
2678         sum1 -= v1[j] * tmp0;
2679         sum2 -= v2[j+1] * tmp0;
2680         sum3 -= v3[j+2] * tmp0;
2681         sum4 -= v4[j+3] * tmp0;
2682         sum5 -= v5[j+4] * tmp0;
2683       }
2684 
2685       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2686       sum2   -= v2[0] * tmp0;
2687       sum3   -= v3[1] * tmp0;
2688       sum4   -= v4[2] * tmp0;
2689       sum5   -= v5[3] * tmp0;
2690       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2691       sum3   -= v3[0] * tmp0;
2692       sum4   -= v4[1] * tmp0;
2693       sum5   -= v5[2] * tmp0;
2694       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2695       sum4   -= v4[0] * tmp0;
2696       sum5   -= v5[1] * tmp0;
2697       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2698       sum5   -= v5[0] * tmp0;
2699       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2700       break;
2701     default:
2702       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2703     }
2704   }
2705   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2706   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2707   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2708   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2709   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2710   PetscFunctionReturn(0);
2711 }
2712 
2713 
2714 /*
2715      Makes a longer coloring[] array and calls the usual code with that
2716 */
2717 #undef __FUNCT__
2718 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2719 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2720 {
2721   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2722   PetscErrorCode  ierr;
2723   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2724   PetscInt        *colorused,i;
2725   ISColoringValue *newcolor;
2726 
2727   PetscFunctionBegin;
2728   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2729   /* loop over inodes, marking a color for each column*/
2730   row = 0;
2731   for (i=0; i<m; i++){
2732     for (j=0; j<ns[i]; j++) {
2733       newcolor[row++] = coloring[i] + j*ncolors;
2734     }
2735   }
2736 
2737   /* eliminate unneeded colors */
2738   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2739   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2740   for (i=0; i<n; i++) {
2741     colorused[newcolor[i]] = 1;
2742   }
2743 
2744   for (i=1; i<5*ncolors; i++) {
2745     colorused[i] += colorused[i-1];
2746   }
2747   ncolors = colorused[5*ncolors-1];
2748   for (i=0; i<n; i++) {
2749     newcolor[i] = colorused[newcolor[i]]-1;
2750   }
2751   ierr = PetscFree(colorused);CHKERRQ(ierr);
2752   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2753   ierr = PetscFree(coloring);CHKERRQ(ierr);
2754   PetscFunctionReturn(0);
2755 }
2756 
2757 #include <../src/mat/blockinvert.h>
2758 
2759 #undef __FUNCT__
2760 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2761 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2762 {
2763   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2764   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2765   MatScalar          *ibdiag,*bdiag,work[25];
2766   PetscScalar        *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2767   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2768   const PetscScalar  *xb;
2769   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2770   PetscErrorCode     ierr;
2771   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2772   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2773 
2774   PetscFunctionBegin;
2775   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2776   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2777   if (its > 1) {
2778     /* switch to non-inode version */
2779     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2780     PetscFunctionReturn(0);
2781   }
2782 
2783   if (!a->inode.ibdiagvalid) {
2784     if (!a->inode.ibdiag) {
2785       /* calculate space needed for diagonal blocks */
2786       for (i=0; i<m; i++) {
2787 	cnt += sizes[i]*sizes[i];
2788       }
2789       a->inode.bdiagsize = cnt;
2790       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2791     }
2792 
2793     /* copy over the diagonal blocks and invert them */
2794     ibdiag = a->inode.ibdiag;
2795     bdiag  = a->inode.bdiag;
2796     cnt = 0;
2797     for (i=0, row = 0; i<m; i++) {
2798       for (j=0; j<sizes[i]; j++) {
2799         for (k=0; k<sizes[i]; k++) {
2800           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2801         }
2802       }
2803       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2804 
2805       switch(sizes[i]) {
2806         case 1:
2807           /* Create matrix data structure */
2808           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2809           ibdiag[cnt] = 1.0/ibdiag[cnt];
2810           break;
2811         case 2:
2812           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2813           break;
2814         case 3:
2815           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2816           break;
2817         case 4:
2818           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2819           break;
2820         case 5:
2821           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2822           break;
2823        default:
2824 	 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2825       }
2826       cnt += sizes[i]*sizes[i];
2827       row += sizes[i];
2828     }
2829     a->inode.ibdiagvalid = PETSC_TRUE;
2830   }
2831   ibdiag = a->inode.ibdiag;
2832   bdiag  = a->inode.bdiag;
2833 
2834 
2835   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2836   if (flag & SOR_ZERO_INITIAL_GUESS) {
2837     const PetscScalar *b;
2838     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2839     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2840     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2841 
2842       for (i=0, row=0; i<m; i++) {
2843         sz  = diag[row] - ii[row];
2844         v1  = a->a + ii[row];
2845         idx = a->j + ii[row];
2846 
2847         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2848         switch (sizes[i]){
2849           case 1:
2850 
2851             sum1  = b[row];
2852             for(n = 0; n<sz-1; n+=2) {
2853               i1   = idx[0];
2854               i2   = idx[1];
2855               idx += 2;
2856               tmp0 = x[i1];
2857               tmp1 = x[i2];
2858               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2859             }
2860 
2861             if (n == sz-1){
2862               tmp0  = x[*idx];
2863               sum1 -= *v1 * tmp0;
2864             }
2865             x[row++] = sum1*(*ibdiag++);
2866             break;
2867           case 2:
2868             v2    = a->a + ii[row+1];
2869             sum1  = b[row];
2870             sum2  = b[row+1];
2871             for(n = 0; n<sz-1; n+=2) {
2872               i1   = idx[0];
2873               i2   = idx[1];
2874               idx += 2;
2875               tmp0 = x[i1];
2876               tmp1 = x[i2];
2877               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2878               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2879             }
2880 
2881             if (n == sz-1){
2882               tmp0  = x[*idx];
2883               sum1 -= v1[0] * tmp0;
2884               sum2 -= v2[0] * tmp0;
2885             }
2886             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2887             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2888             ibdiag  += 4;
2889             break;
2890           case 3:
2891             v2    = a->a + ii[row+1];
2892             v3    = a->a + ii[row+2];
2893             sum1  = b[row];
2894             sum2  = b[row+1];
2895             sum3  = b[row+2];
2896             for(n = 0; n<sz-1; n+=2) {
2897               i1   = idx[0];
2898               i2   = idx[1];
2899               idx += 2;
2900               tmp0 = x[i1];
2901               tmp1 = x[i2];
2902               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2903               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2904               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2905             }
2906 
2907             if (n == sz-1){
2908               tmp0  = x[*idx];
2909               sum1 -= v1[0] * tmp0;
2910               sum2 -= v2[0] * tmp0;
2911               sum3 -= v3[0] * tmp0;
2912             }
2913             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2914             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2915             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2916             ibdiag  += 9;
2917             break;
2918           case 4:
2919             v2    = a->a + ii[row+1];
2920             v3    = a->a + ii[row+2];
2921             v4    = a->a + ii[row+3];
2922             sum1  = b[row];
2923             sum2  = b[row+1];
2924             sum3  = b[row+2];
2925             sum4  = b[row+3];
2926             for(n = 0; n<sz-1; n+=2) {
2927               i1   = idx[0];
2928               i2   = idx[1];
2929               idx += 2;
2930               tmp0 = x[i1];
2931               tmp1 = x[i2];
2932               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2933               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2934               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2935               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2936             }
2937 
2938             if (n == sz-1){
2939               tmp0  = x[*idx];
2940               sum1 -= v1[0] * tmp0;
2941               sum2 -= v2[0] * tmp0;
2942               sum3 -= v3[0] * tmp0;
2943               sum4 -= v4[0] * tmp0;
2944             }
2945             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2946             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2947             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2948             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2949             ibdiag  += 16;
2950             break;
2951           case 5:
2952             v2    = a->a + ii[row+1];
2953             v3    = a->a + ii[row+2];
2954             v4    = a->a + ii[row+3];
2955             v5    = a->a + ii[row+4];
2956             sum1  = b[row];
2957             sum2  = b[row+1];
2958             sum3  = b[row+2];
2959             sum4  = b[row+3];
2960             sum5  = b[row+4];
2961             for(n = 0; n<sz-1; n+=2) {
2962               i1   = idx[0];
2963               i2   = idx[1];
2964               idx += 2;
2965               tmp0 = x[i1];
2966               tmp1 = x[i2];
2967               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2968               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2969               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2970               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2971               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2972             }
2973 
2974             if (n == sz-1){
2975               tmp0  = x[*idx];
2976               sum1 -= v1[0] * tmp0;
2977               sum2 -= v2[0] * tmp0;
2978               sum3 -= v3[0] * tmp0;
2979               sum4 -= v4[0] * tmp0;
2980               sum5 -= v5[0] * tmp0;
2981             }
2982             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2983             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2984             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2985             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2986             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2987             ibdiag  += 25;
2988             break;
2989 	  default:
2990    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2991         }
2992       }
2993 
2994       xb = x;
2995       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2996     } else xb = b;
2997     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2998         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2999       cnt = 0;
3000       for (i=0, row=0; i<m; i++) {
3001 
3002         switch (sizes[i]){
3003           case 1:
3004             x[row++] *= bdiag[cnt++];
3005             break;
3006           case 2:
3007             x1   = x[row]; x2 = x[row+1];
3008             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3009             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3010             x[row++] = tmp1;
3011             x[row++] = tmp2;
3012             cnt += 4;
3013             break;
3014           case 3:
3015             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3016             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3017             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3018             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3019             x[row++] = tmp1;
3020             x[row++] = tmp2;
3021             x[row++] = tmp3;
3022             cnt += 9;
3023             break;
3024           case 4:
3025             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3026             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3027             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3028             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3029             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3030             x[row++] = tmp1;
3031             x[row++] = tmp2;
3032             x[row++] = tmp3;
3033             x[row++] = tmp4;
3034             cnt += 16;
3035             break;
3036           case 5:
3037             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3038             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3039             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3040             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3041             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3042             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3043             x[row++] = tmp1;
3044             x[row++] = tmp2;
3045             x[row++] = tmp3;
3046             x[row++] = tmp4;
3047             x[row++] = tmp5;
3048             cnt += 25;
3049             break;
3050 	  default:
3051    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3052         }
3053       }
3054       ierr = PetscLogFlops(m);CHKERRQ(ierr);
3055     }
3056     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3057 
3058       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3059       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3060         ibdiag -= sizes[i]*sizes[i];
3061         sz      = ii[row+1] - diag[row] - 1;
3062         v1      = a->a + diag[row] + 1;
3063         idx     = a->j + diag[row] + 1;
3064 
3065         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3066         switch (sizes[i]){
3067           case 1:
3068 
3069             sum1  = xb[row];
3070             for(n = 0; n<sz-1; n+=2) {
3071               i1   = idx[0];
3072               i2   = idx[1];
3073               idx += 2;
3074               tmp0 = x[i1];
3075               tmp1 = x[i2];
3076               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3077             }
3078 
3079             if (n == sz-1){
3080               tmp0  = x[*idx];
3081               sum1 -= *v1*tmp0;
3082             }
3083             x[row--] = sum1*(*ibdiag);
3084             break;
3085 
3086           case 2:
3087 
3088             sum1  = xb[row];
3089             sum2  = xb[row-1];
3090             /* note that sum1 is associated with the second of the two rows */
3091             v2    = a->a + diag[row-1] + 2;
3092             for(n = 0; n<sz-1; n+=2) {
3093               i1   = idx[0];
3094               i2   = idx[1];
3095               idx += 2;
3096               tmp0 = x[i1];
3097               tmp1 = x[i2];
3098               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3099               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3100             }
3101 
3102             if (n == sz-1){
3103               tmp0  = x[*idx];
3104               sum1 -= *v1*tmp0;
3105               sum2 -= *v2*tmp0;
3106             }
3107             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3108             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3109             break;
3110           case 3:
3111 
3112             sum1  = xb[row];
3113             sum2  = xb[row-1];
3114             sum3  = xb[row-2];
3115             v2    = a->a + diag[row-1] + 2;
3116             v3    = a->a + diag[row-2] + 3;
3117             for(n = 0; n<sz-1; n+=2) {
3118               i1   = idx[0];
3119               i2   = idx[1];
3120               idx += 2;
3121               tmp0 = x[i1];
3122               tmp1 = x[i2];
3123               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3124               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3125               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3126             }
3127 
3128             if (n == sz-1){
3129               tmp0  = x[*idx];
3130               sum1 -= *v1*tmp0;
3131               sum2 -= *v2*tmp0;
3132               sum3 -= *v3*tmp0;
3133             }
3134             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3135             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3136             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3137             break;
3138           case 4:
3139 
3140             sum1  = xb[row];
3141             sum2  = xb[row-1];
3142             sum3  = xb[row-2];
3143             sum4  = xb[row-3];
3144             v2    = a->a + diag[row-1] + 2;
3145             v3    = a->a + diag[row-2] + 3;
3146             v4    = a->a + diag[row-3] + 4;
3147             for(n = 0; n<sz-1; n+=2) {
3148               i1   = idx[0];
3149               i2   = idx[1];
3150               idx += 2;
3151               tmp0 = x[i1];
3152               tmp1 = x[i2];
3153               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3154               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3155               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3156               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3157             }
3158 
3159             if (n == sz-1){
3160               tmp0  = x[*idx];
3161               sum1 -= *v1*tmp0;
3162               sum2 -= *v2*tmp0;
3163               sum3 -= *v3*tmp0;
3164               sum4 -= *v4*tmp0;
3165             }
3166             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3167             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3168             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3169             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3170             break;
3171           case 5:
3172 
3173             sum1  = xb[row];
3174             sum2  = xb[row-1];
3175             sum3  = xb[row-2];
3176             sum4  = xb[row-3];
3177             sum5  = xb[row-4];
3178             v2    = a->a + diag[row-1] + 2;
3179             v3    = a->a + diag[row-2] + 3;
3180             v4    = a->a + diag[row-3] + 4;
3181             v5    = a->a + diag[row-4] + 5;
3182             for(n = 0; n<sz-1; n+=2) {
3183               i1   = idx[0];
3184               i2   = idx[1];
3185               idx += 2;
3186               tmp0 = x[i1];
3187               tmp1 = x[i2];
3188               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3189               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3190               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3191               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3192               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3193             }
3194 
3195             if (n == sz-1){
3196               tmp0  = x[*idx];
3197               sum1 -= *v1*tmp0;
3198               sum2 -= *v2*tmp0;
3199               sum3 -= *v3*tmp0;
3200               sum4 -= *v4*tmp0;
3201               sum5 -= *v5*tmp0;
3202             }
3203             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3204             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3205             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3206             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3207             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3208             break;
3209 	  default:
3210    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3211         }
3212       }
3213 
3214       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3215     }
3216     its--;
3217     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3218     ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3219   }
3220   if (flag & SOR_EISENSTAT) {
3221     const PetscScalar *b;
3222     MatScalar         *t = a->inode.ssor_work;
3223 
3224     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3225     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3226     /*
3227           Apply  (U + D)^-1  where D is now the block diagonal
3228     */
3229     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3230     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3231       ibdiag -= sizes[i]*sizes[i];
3232       sz      = ii[row+1] - diag[row] - 1;
3233       v1      = a->a + diag[row] + 1;
3234       idx     = a->j + diag[row] + 1;
3235       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3236       switch (sizes[i]){
3237         case 1:
3238 
3239 	  sum1  = b[row];
3240 	  for(n = 0; n<sz-1; n+=2) {
3241 	    i1   = idx[0];
3242 	    i2   = idx[1];
3243 	    idx += 2;
3244 	    tmp0 = x[i1];
3245 	    tmp1 = x[i2];
3246 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3247 	  }
3248 
3249 	  if (n == sz-1){
3250 	    tmp0  = x[*idx];
3251 	    sum1 -= *v1*tmp0;
3252 	  }
3253 	  x[row] = sum1*(*ibdiag);row--;
3254 	  break;
3255 
3256         case 2:
3257 
3258 	  sum1  = b[row];
3259 	  sum2  = b[row-1];
3260 	  /* note that sum1 is associated with the second of the two rows */
3261 	  v2    = a->a + diag[row-1] + 2;
3262 	  for(n = 0; n<sz-1; n+=2) {
3263 	    i1   = idx[0];
3264 	    i2   = idx[1];
3265 	    idx += 2;
3266 	    tmp0 = x[i1];
3267 	    tmp1 = x[i2];
3268 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3269 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3270 	  }
3271 
3272 	  if (n == sz-1){
3273 	    tmp0  = x[*idx];
3274 	    sum1 -= *v1*tmp0;
3275 	    sum2 -= *v2*tmp0;
3276 	  }
3277 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3278 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3279           row -= 2;
3280 	  break;
3281         case 3:
3282 
3283 	  sum1  = b[row];
3284 	  sum2  = b[row-1];
3285 	  sum3  = b[row-2];
3286 	  v2    = a->a + diag[row-1] + 2;
3287 	  v3    = a->a + diag[row-2] + 3;
3288 	  for(n = 0; n<sz-1; n+=2) {
3289 	    i1   = idx[0];
3290 	    i2   = idx[1];
3291 	    idx += 2;
3292 	    tmp0 = x[i1];
3293 	    tmp1 = x[i2];
3294 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3295 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3296 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3297 	  }
3298 
3299 	  if (n == sz-1){
3300 	    tmp0  = x[*idx];
3301 	    sum1 -= *v1*tmp0;
3302 	    sum2 -= *v2*tmp0;
3303 	    sum3 -= *v3*tmp0;
3304 	  }
3305 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3306 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3307 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3308           row -= 3;
3309 	  break;
3310         case 4:
3311 
3312 	  sum1  = b[row];
3313 	  sum2  = b[row-1];
3314 	  sum3  = b[row-2];
3315 	  sum4  = b[row-3];
3316 	  v2    = a->a + diag[row-1] + 2;
3317 	  v3    = a->a + diag[row-2] + 3;
3318 	  v4    = a->a + diag[row-3] + 4;
3319 	  for(n = 0; n<sz-1; n+=2) {
3320 	    i1   = idx[0];
3321 	    i2   = idx[1];
3322 	    idx += 2;
3323 	    tmp0 = x[i1];
3324 	    tmp1 = x[i2];
3325 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3326 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3327 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3328 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3329 	  }
3330 
3331 	  if (n == sz-1){
3332 	    tmp0  = x[*idx];
3333 	    sum1 -= *v1*tmp0;
3334 	    sum2 -= *v2*tmp0;
3335 	    sum3 -= *v3*tmp0;
3336 	    sum4 -= *v4*tmp0;
3337 	  }
3338 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3339 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3340 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3341 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3342           row -= 4;
3343 	  break;
3344         case 5:
3345 
3346 	  sum1  = b[row];
3347 	  sum2  = b[row-1];
3348 	  sum3  = b[row-2];
3349 	  sum4  = b[row-3];
3350 	  sum5  = b[row-4];
3351 	  v2    = a->a + diag[row-1] + 2;
3352 	  v3    = a->a + diag[row-2] + 3;
3353 	  v4    = a->a + diag[row-3] + 4;
3354 	  v5    = a->a + diag[row-4] + 5;
3355 	  for(n = 0; n<sz-1; n+=2) {
3356 	    i1   = idx[0];
3357 	    i2   = idx[1];
3358 	    idx += 2;
3359 	    tmp0 = x[i1];
3360 	    tmp1 = x[i2];
3361 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3362 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3363 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3364 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3365 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3366 	  }
3367 
3368 	  if (n == sz-1){
3369 	    tmp0  = x[*idx];
3370 	    sum1 -= *v1*tmp0;
3371 	    sum2 -= *v2*tmp0;
3372 	    sum3 -= *v3*tmp0;
3373 	    sum4 -= *v4*tmp0;
3374 	    sum5 -= *v5*tmp0;
3375 	  }
3376 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3377 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3378 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3379 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3380 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3381           row -= 5;
3382 	  break;
3383         default:
3384 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3385       }
3386     }
3387     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3388 
3389     /*
3390            t = b - D x    where D is the block diagonal
3391     */
3392     cnt = 0;
3393     for (i=0, row=0; i<m; i++) {
3394       switch (sizes[i]){
3395         case 1:
3396 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3397 	  break;
3398         case 2:
3399 	  x1   = x[row]; x2 = x[row+1];
3400 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3401 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3402 	  t[row]   = b[row] - tmp1;
3403 	  t[row+1] = b[row+1] - tmp2; row += 2;
3404 	  cnt += 4;
3405 	  break;
3406         case 3:
3407 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3408 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3409 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3410 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3411 	  t[row] = b[row] - tmp1;
3412 	  t[row+1] = b[row+1] - tmp2;
3413 	  t[row+2] = b[row+2] - tmp3; row += 3;
3414 	  cnt += 9;
3415 	  break;
3416         case 4:
3417 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3418 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3419 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3420 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3421 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3422 	  t[row] = b[row] - tmp1;
3423 	  t[row+1] = b[row+1] - tmp2;
3424 	  t[row+2] = b[row+2] - tmp3;
3425 	  t[row+3] = b[row+3] - tmp4; row += 4;
3426 	  cnt += 16;
3427 	  break;
3428         case 5:
3429 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3430 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3431 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3432 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3433 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3434 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3435 	  t[row] = b[row] - tmp1;
3436 	  t[row+1] = b[row+1] - tmp2;
3437 	  t[row+2] = b[row+2] - tmp3;
3438 	  t[row+3] = b[row+3] - tmp4;
3439 	  t[row+4] = b[row+4] - tmp5;row += 5;
3440 	  cnt += 25;
3441 	  break;
3442         default:
3443 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3444       }
3445     }
3446     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3447 
3448 
3449 
3450     /*
3451           Apply (L + D)^-1 where D is the block diagonal
3452     */
3453     for (i=0, row=0; i<m; i++) {
3454       sz  = diag[row] - ii[row];
3455       v1  = a->a + ii[row];
3456       idx = a->j + ii[row];
3457       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3458       switch (sizes[i]){
3459         case 1:
3460 
3461 	  sum1  = t[row];
3462 	  for(n = 0; n<sz-1; n+=2) {
3463 	    i1   = idx[0];
3464 	    i2   = idx[1];
3465 	    idx += 2;
3466 	    tmp0 = t[i1];
3467 	    tmp1 = t[i2];
3468 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3469 	  }
3470 
3471 	  if (n == sz-1){
3472 	    tmp0  = t[*idx];
3473 	    sum1 -= *v1 * tmp0;
3474 	  }
3475 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3476 	  break;
3477         case 2:
3478 	  v2    = a->a + ii[row+1];
3479 	  sum1  = t[row];
3480 	  sum2  = t[row+1];
3481 	  for(n = 0; n<sz-1; n+=2) {
3482 	    i1   = idx[0];
3483 	    i2   = idx[1];
3484 	    idx += 2;
3485 	    tmp0 = t[i1];
3486 	    tmp1 = t[i2];
3487 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3488 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3489 	  }
3490 
3491 	  if (n == sz-1){
3492 	    tmp0  = t[*idx];
3493 	    sum1 -= v1[0] * tmp0;
3494 	    sum2 -= v2[0] * tmp0;
3495 	  }
3496 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3497 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3498 	  ibdiag  += 4; row += 2;
3499 	  break;
3500         case 3:
3501 	  v2    = a->a + ii[row+1];
3502 	  v3    = a->a + ii[row+2];
3503 	  sum1  = t[row];
3504 	  sum2  = t[row+1];
3505 	  sum3  = t[row+2];
3506 	  for(n = 0; n<sz-1; n+=2) {
3507 	    i1   = idx[0];
3508 	    i2   = idx[1];
3509 	    idx += 2;
3510 	    tmp0 = t[i1];
3511 	    tmp1 = t[i2];
3512 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3513 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3514 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3515 	  }
3516 
3517 	  if (n == sz-1){
3518 	    tmp0  = t[*idx];
3519 	    sum1 -= v1[0] * tmp0;
3520 	    sum2 -= v2[0] * tmp0;
3521 	    sum3 -= v3[0] * tmp0;
3522 	  }
3523 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3524 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3525 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3526 	  ibdiag  += 9; row += 3;
3527 	  break;
3528         case 4:
3529 	  v2    = a->a + ii[row+1];
3530 	  v3    = a->a + ii[row+2];
3531 	  v4    = a->a + ii[row+3];
3532 	  sum1  = t[row];
3533 	  sum2  = t[row+1];
3534 	  sum3  = t[row+2];
3535 	  sum4  = t[row+3];
3536 	  for(n = 0; n<sz-1; n+=2) {
3537 	    i1   = idx[0];
3538 	    i2   = idx[1];
3539 	    idx += 2;
3540 	    tmp0 = t[i1];
3541 	    tmp1 = t[i2];
3542 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3543 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3544 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3545 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3546 	  }
3547 
3548 	  if (n == sz-1){
3549 	    tmp0  = t[*idx];
3550 	    sum1 -= v1[0] * tmp0;
3551 	    sum2 -= v2[0] * tmp0;
3552 	    sum3 -= v3[0] * tmp0;
3553 	    sum4 -= v4[0] * tmp0;
3554 	  }
3555 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3556 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3557 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3558 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3559 	  ibdiag  += 16; row += 4;
3560 	  break;
3561         case 5:
3562 	  v2    = a->a + ii[row+1];
3563 	  v3    = a->a + ii[row+2];
3564 	  v4    = a->a + ii[row+3];
3565 	  v5    = a->a + ii[row+4];
3566 	  sum1  = t[row];
3567 	  sum2  = t[row+1];
3568 	  sum3  = t[row+2];
3569 	  sum4  = t[row+3];
3570 	  sum5  = t[row+4];
3571 	  for(n = 0; n<sz-1; n+=2) {
3572 	    i1   = idx[0];
3573 	    i2   = idx[1];
3574 	    idx += 2;
3575 	    tmp0 = t[i1];
3576 	    tmp1 = t[i2];
3577 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3578 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3579 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3580 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3581 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3582 	  }
3583 
3584 	  if (n == sz-1){
3585 	    tmp0  = t[*idx];
3586 	    sum1 -= v1[0] * tmp0;
3587 	    sum2 -= v2[0] * tmp0;
3588 	    sum3 -= v3[0] * tmp0;
3589 	    sum4 -= v4[0] * tmp0;
3590 	    sum5 -= v5[0] * tmp0;
3591 	  }
3592 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3593 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3594 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3595 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3596 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3597 	  ibdiag  += 25; row += 5;
3598 	  break;
3599         default:
3600 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3601       }
3602     }
3603     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3604     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3605     ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3606   }
3607   PetscFunctionReturn(0);
3608 }
3609 
3610 #undef __FUNCT__
3611 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3612 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3613 {
3614   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3615   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3616   const MatScalar    *bdiag = a->inode.bdiag;
3617   const PetscScalar  *b;
3618   PetscErrorCode      ierr;
3619   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3620   const PetscInt      *sizes = a->inode.size;
3621 
3622   PetscFunctionBegin;
3623   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3624   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3625   cnt = 0;
3626   for (i=0, row=0; i<m; i++) {
3627     switch (sizes[i]){
3628       case 1:
3629 	x[row] = b[row]*bdiag[cnt++];row++;
3630 	break;
3631       case 2:
3632 	x1   = b[row]; x2 = b[row+1];
3633 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3634 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3635 	x[row++] = tmp1;
3636 	x[row++] = tmp2;
3637 	cnt += 4;
3638 	break;
3639       case 3:
3640 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3641 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3642 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3643 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3644 	x[row++] = tmp1;
3645 	x[row++] = tmp2;
3646 	x[row++] = tmp3;
3647 	cnt += 9;
3648 	break;
3649       case 4:
3650 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3651 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3652 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3653 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3654 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3655 	x[row++] = tmp1;
3656 	x[row++] = tmp2;
3657 	x[row++] = tmp3;
3658 	x[row++] = tmp4;
3659 	cnt += 16;
3660 	break;
3661       case 5:
3662 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3663 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3664 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3665 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3666 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3667 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3668 	x[row++] = tmp1;
3669 	x[row++] = tmp2;
3670 	x[row++] = tmp3;
3671 	x[row++] = tmp4;
3672 	x[row++] = tmp5;
3673 	cnt += 25;
3674 	break;
3675       default:
3676 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3677     }
3678   }
3679   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3680   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3681   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3682   PetscFunctionReturn(0);
3683 }
3684 
3685 /*
3686     samestructure indicates that the matrix has not changed its nonzero structure so we
3687     do not need to recompute the inodes
3688 */
3689 #undef __FUNCT__
3690 #define __FUNCT__ "Mat_CheckInode"
3691 PetscErrorCode Mat_CheckInode(Mat A,PetscBool  samestructure)
3692 {
3693   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3694   PetscErrorCode ierr;
3695   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3696   PetscBool      flag;
3697 
3698   PetscFunctionBegin;
3699   if (!a->inode.use)                     PetscFunctionReturn(0);
3700   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3701 
3702 
3703   m = A->rmap->n;
3704   if (a->inode.size) {ns = a->inode.size;}
3705   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3706 
3707   i          = 0;
3708   node_count = 0;
3709   idx        = a->j;
3710   ii         = a->i;
3711   while (i < m){                /* For each row */
3712     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3713     /* Limits the number of elements in a node to 'a->inode.limit' */
3714     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3715       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3716       if (nzy != nzx) break;
3717       idy  += nzx;             /* Same nonzero pattern */
3718       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3719       if (!flag) break;
3720     }
3721     ns[node_count++] = blk_size;
3722     idx += blk_size*nzx;
3723     i    = j;
3724   }
3725   /* If not enough inodes found,, do not use inode version of the routines */
3726   if (!m || node_count > .8*m) {
3727     ierr = PetscFree(ns);CHKERRQ(ierr);
3728     a->inode.node_count       = 0;
3729     a->inode.size             = PETSC_NULL;
3730     a->inode.use              = PETSC_FALSE;
3731     A->ops->mult              = MatMult_SeqAIJ;
3732     A->ops->sor               = MatSOR_SeqAIJ;
3733     A->ops->multadd           = MatMultAdd_SeqAIJ;
3734     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
3735     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
3736     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
3737     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
3738     A->ops->coloringpatch     = 0;
3739     A->ops->multdiagonalblock = 0;
3740     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3741   } else {
3742     if (!A->factortype) {
3743       A->ops->mult              = MatMult_SeqAIJ_Inode;
3744       A->ops->sor               = MatSOR_SeqAIJ_Inode;
3745       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3746       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3747       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3748       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3749       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3750       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3751       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3752     } else {
3753       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
3754     }
3755     a->inode.node_count       = node_count;
3756     a->inode.size             = ns;
3757     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3758   }
3759   a->inode.checked = PETSC_TRUE;
3760   PetscFunctionReturn(0);
3761 }
3762 
3763 #undef __FUNCT__
3764 #define __FUNCT__ "MatGetRow_FactoredLU"
3765 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
3766 {
3767   PetscInt k, *vi;
3768   PetscFunctionBegin;
3769   vi = aj + ai[row];
3770   for(k=0;k<nzl;k++) cols[k] = vi[k];
3771   vi = aj + adiag[row];
3772   cols[nzl] = vi[0];
3773   vi = aj + adiag[row+1]+1;
3774   for(k=0;k<nzu;k++) cols[nzl+1+k] = vi[k];
3775   PetscFunctionReturn(0);
3776 }
3777 /*
3778    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3779    Modified from Mat_CheckInode().
3780 
3781    Input Parameters:
3782 +  Mat A - ILU or LU matrix factor
3783 -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
3784     do not need to recompute the inodes
3785 */
3786 #undef __FUNCT__
3787 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3788 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool  samestructure)
3789 {
3790   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3791   PetscErrorCode ierr;
3792   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3793   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3794   PetscBool      flag;
3795 
3796   PetscFunctionBegin;
3797   if (!a->inode.use)                     PetscFunctionReturn(0);
3798   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3799 
3800   m = A->rmap->n;
3801   if (a->inode.size) {ns = a->inode.size;}
3802   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3803 
3804   i          = 0;
3805   node_count = 0;
3806   while (i < m){                /* For each row */
3807     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3808     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3809     nzx  = nzl1 + nzu1 + 1;
3810     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3811     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3812 
3813     /* Limits the number of elements in a node to 'a->inode.limit' */
3814     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3815       nzl2    = ai[j+1] - ai[j];
3816       nzu2    = adiag[j] - adiag[j+1] - 1;
3817       nzy     = nzl2 + nzu2 + 1;
3818       if( nzy != nzx) break;
3819       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3820       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
3821       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3822       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3823       ierr = PetscFree(cols2);CHKERRQ(ierr);
3824     }
3825     ns[node_count++] = blk_size;
3826     ierr = PetscFree(cols1);CHKERRQ(ierr);
3827     i    = j;
3828   }
3829   /* If not enough inodes found,, do not use inode version of the routines */
3830   if (!m || node_count > .8*m) {
3831     ierr = PetscFree(ns);CHKERRQ(ierr);
3832     a->inode.node_count     = 0;
3833     a->inode.size           = PETSC_NULL;
3834     a->inode.use            = PETSC_FALSE;
3835     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3836   } else {
3837     A->ops->mult              = 0;
3838     A->ops->sor               = 0;
3839     A->ops->multadd           = 0;
3840     A->ops->getrowij          = 0;
3841     A->ops->restorerowij      = 0;
3842     A->ops->getcolumnij       = 0;
3843     A->ops->restorecolumnij   = 0;
3844     A->ops->coloringpatch     = 0;
3845     A->ops->multdiagonalblock = 0;
3846     A->ops->solve             = MatSolve_SeqAIJ_Inode;
3847     a->inode.node_count       = node_count;
3848     a->inode.size             = ns;
3849     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3850   }
3851   a->inode.checked = PETSC_TRUE;
3852   PetscFunctionReturn(0);
3853 }
3854 
3855 /*
3856      This is really ugly. if inodes are used this replaces the
3857   permutations with ones that correspond to rows/cols of the matrix
3858   rather then inode blocks
3859 */
3860 #undef __FUNCT__
3861 #define __FUNCT__ "MatInodeAdjustForInodes"
3862 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3863 {
3864   PetscErrorCode ierr;
3865 
3866   PetscFunctionBegin;
3867   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
3868   PetscFunctionReturn(0);
3869 }
3870 
3871 EXTERN_C_BEGIN
3872 #undef __FUNCT__
3873 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
3874 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3875 {
3876   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3877   PetscErrorCode ierr;
3878   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3879   const PetscInt *ridx,*cidx;
3880   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3881   PetscInt       nslim_col,*ns_col;
3882   IS             ris = *rperm,cis = *cperm;
3883 
3884   PetscFunctionBegin;
3885   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3886   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3887 
3888   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3889   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3890   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3891 
3892   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3893   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3894 
3895   /* Form the inode structure for the rows of permuted matric using inv perm*/
3896   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3897 
3898   /* Construct the permutations for rows*/
3899   for (i=0,row = 0; i<nslim_row; ++i){
3900     indx      = ridx[i];
3901     start_val = tns[indx];
3902     end_val   = tns[indx + 1];
3903     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3904   }
3905 
3906   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3907   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3908 
3909  /* Construct permutations for columns */
3910   for (i=0,col=0; i<nslim_col; ++i){
3911     indx      = cidx[i];
3912     start_val = tns[indx];
3913     end_val   = tns[indx + 1];
3914     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3915   }
3916 
3917   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
3918   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3919   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
3920   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3921 
3922   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3923   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3924 
3925   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3926   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3927   ierr = ISDestroy(&cis);CHKERRQ(ierr);
3928   ierr = ISDestroy(&ris);CHKERRQ(ierr);
3929   ierr = PetscFree(tns);CHKERRQ(ierr);
3930   PetscFunctionReturn(0);
3931 }
3932 EXTERN_C_END
3933 
3934 #undef __FUNCT__
3935 #define __FUNCT__ "MatInodeGetInodeSizes"
3936 /*@C
3937    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3938 
3939    Not Collective
3940 
3941    Input Parameter:
3942 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3943 
3944    Output Parameter:
3945 +  node_count - no of inodes present in the matrix.
3946 .  sizes      - an array of size node_count,with sizes of each inode.
3947 -  limit      - the max size used to generate the inodes.
3948 
3949    Level: advanced
3950 
3951    Notes: This routine returns some internal storage information
3952    of the matrix, it is intended to be used by advanced users.
3953    It should be called after the matrix is assembled.
3954    The contents of the sizes[] array should not be changed.
3955    PETSC_NULL may be passed for information not requested.
3956 
3957 .keywords: matrix, seqaij, get, inode
3958 
3959 .seealso: MatGetInfo()
3960 @*/
3961 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3962 {
3963   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3964 
3965   PetscFunctionBegin;
3966   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3967   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3968   if (f) {
3969     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3970   }
3971   PetscFunctionReturn(0);
3972 }
3973 
3974 EXTERN_C_BEGIN
3975 #undef __FUNCT__
3976 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3977 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3978 {
3979   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3980 
3981   PetscFunctionBegin;
3982   if (node_count) *node_count = a->inode.node_count;
3983   if (sizes)      *sizes      = a->inode.size;
3984   if (limit)      *limit      = a->inode.limit;
3985   PetscFunctionReturn(0);
3986 }
3987 EXTERN_C_END
3988