xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 7d0a6c19129e7069c8a40e210b34ed62989173db)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include "../src/mat/impls/aij/seq/aij.h"
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
11 {
12   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
24     for(; count+1 < n; count++,i++);
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for(; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n)  {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
67 
68   PetscFunctionBegin;
69   nslim_row = a->inode.node_count;
70   m         = A->rmap->n;
71   n         = A->cmap->n;
72   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
73 
74   /* Use the row_inode as column_inode */
75   nslim_col = nslim_row;
76   ns_col    = ns_row;
77 
78   /* allocate space for reformated inode structure */
79   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
80   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
81   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
82 
83   for (i1=0,col=0; i1<nslim_col; ++i1){
84     nsz = ns_col[i1];
85     for (i2=0; i2<nsz; ++i2,++col)
86       tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     i2   = 0;
101     col  = *j++ + ishift;
102     i2   = tvc[col];
103     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
104       ia[i1+1]++;
105       ia[i2+1]++;
106       i2++;                     /* Start col of next node */
107       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
108       i2 = tvc[col];
109     }
110     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
111   }
112 
113   /* shift ia[i] to point to next row */
114   for (i1=1; i1<nslim_row+1; i1++) {
115     row        = ia[i1-1];
116     ia[i1]    += row;
117     work[i1-1] = row - oshift;
118   }
119 
120   /* allocate space for column pointers */
121   nz   = ia[nslim_row] + (!ishift);
122   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
123   *jja = ja;
124 
125  /* loop over lower triangular part putting into ja */
126   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
127     j    = aj + ai[row] + ishift;
128     jmax = aj + ai[row+1] + ishift;
129     i2   = 0;                     /* Col inode index */
130     col  = *j++ + ishift;
131     i2   = tvc[col];
132     while (i2<i1 && j<jmax) {
133       ja[work[i2]++] = i1 + oshift;
134       ja[work[i1]++] = i2 + oshift;
135       ++i2;
136       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
137       i2 = tvc[col];
138     }
139     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
140 
141   }
142   ierr = PetscFree(work);CHKERRQ(ierr);
143   ierr = PetscFree(tns);CHKERRQ(ierr);
144   ierr = PetscFree(tvc);CHKERRQ(ierr);
145   PetscFunctionReturn(0);
146 }
147 
148 /*
149       This builds nonsymmetric version of nonzero structure,
150 */
151 #undef __FUNCT__
152 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
153 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
154 {
155   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
156   PetscErrorCode ierr;
157   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
158   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
159 
160   PetscFunctionBegin;
161   nslim_row = a->inode.node_count;
162   n         = A->cmap->n;
163 
164   /* Create The column_inode for this matrix */
165   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
166 
167   /* allocate space for reformated column_inode structure */
168   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
169   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
170   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
171 
172   for (i1=0,col=0; i1<nslim_col; ++i1){
173     nsz = ns_col[i1];
174     for (i2=0; i2<nsz; ++i2,++col)
175       tvc[col] = i1;
176   }
177   /* allocate space for row pointers */
178   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
179   *iia = ia;
180   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
181   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
182 
183   /* determine the number of columns in each row */
184   ia[0] = oshift;
185   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
186     j   = aj + ai[row] + ishift;
187     col = *j++ + ishift;
188     i2  = tvc[col];
189     nz  = ai[row+1] - ai[row];
190     while (nz-- > 0) {           /* off-diagonal elemets */
191       ia[i1+1]++;
192       i2++;                     /* Start col of next node */
193       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
194       if (nz > 0) i2 = tvc[col];
195     }
196   }
197 
198   /* shift ia[i] to point to next row */
199   for (i1=1; i1<nslim_row+1; i1++) {
200     row        = ia[i1-1];
201     ia[i1]    += row;
202     work[i1-1] = row - oshift;
203   }
204 
205   /* allocate space for column pointers */
206   nz   = ia[nslim_row] + (!ishift);
207   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
208   *jja = ja;
209 
210  /* loop over matrix putting into ja */
211   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
212     j   = aj + ai[row] + ishift;
213     i2  = 0;                     /* Col inode index */
214     col = *j++ + ishift;
215     i2  = tvc[col];
216     nz  = ai[row+1] - ai[row];
217     while (nz-- > 0) {
218       ja[work[i1]++] = i2 + oshift;
219       ++i2;
220       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
221       if (nz > 0) i2 = tvc[col];
222     }
223   }
224   ierr = PetscFree(ns_col);CHKERRQ(ierr);
225   ierr = PetscFree(work);CHKERRQ(ierr);
226   ierr = PetscFree(tns);CHKERRQ(ierr);
227   ierr = PetscFree(tvc);CHKERRQ(ierr);
228   PetscFunctionReturn(0);
229 }
230 
231 #undef __FUNCT__
232 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
233 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
234 {
235   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
236   PetscErrorCode ierr;
237 
238   PetscFunctionBegin;
239   *n     = a->inode.node_count;
240   if (!ia) PetscFunctionReturn(0);
241   if (!blockcompressed) {
242     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
243   } else if (symmetric) {
244     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
245   } else {
246     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
247   }
248   PetscFunctionReturn(0);
249 }
250 
251 #undef __FUNCT__
252 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
253 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
254 {
255   PetscErrorCode ierr;
256 
257   PetscFunctionBegin;
258   if (!ia) PetscFunctionReturn(0);
259 
260   if (!blockcompressed) {
261     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
262   } else {
263     ierr = PetscFree(*ia);CHKERRQ(ierr);
264     ierr = PetscFree(*ja);CHKERRQ(ierr);
265   }
266 
267   PetscFunctionReturn(0);
268 }
269 
270 /* ----------------------------------------------------------- */
271 
272 #undef __FUNCT__
273 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
274 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
275 {
276   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
277   PetscErrorCode ierr;
278   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
279   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
280 
281   PetscFunctionBegin;
282   nslim_row = a->inode.node_count;
283   n         = A->cmap->n;
284 
285   /* Create The column_inode for this matrix */
286   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
287 
288   /* allocate space for reformated column_inode structure */
289   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
290   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
291   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
292 
293   for (i1=0,col=0; i1<nslim_col; ++i1){
294     nsz = ns_col[i1];
295     for (i2=0; i2<nsz; ++i2,++col)
296       tvc[col] = i1;
297   }
298   /* allocate space for column pointers */
299   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
300   *iia = ia;
301   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
302   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
303 
304   /* determine the number of columns in each row */
305   ia[0] = oshift;
306   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
307     j   = aj + ai[row] + ishift;
308     col = *j++ + ishift;
309     i2  = tvc[col];
310     nz  = ai[row+1] - ai[row];
311     while (nz-- > 0) {           /* off-diagonal elemets */
312       /* ia[i1+1]++; */
313       ia[i2+1]++;
314       i2++;
315       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
316       if (nz > 0) i2 = tvc[col];
317     }
318   }
319 
320   /* shift ia[i] to point to next col */
321   for (i1=1; i1<nslim_col+1; i1++) {
322     col        = ia[i1-1];
323     ia[i1]    += col;
324     work[i1-1] = col - oshift;
325   }
326 
327   /* allocate space for column pointers */
328   nz   = ia[nslim_col] + (!ishift);
329   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
330   *jja = ja;
331 
332  /* loop over matrix putting into ja */
333   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
334     j   = aj + ai[row] + ishift;
335     i2  = 0;                     /* Col inode index */
336     col = *j++ + ishift;
337     i2  = tvc[col];
338     nz  = ai[row+1] - ai[row];
339     while (nz-- > 0) {
340       /* ja[work[i1]++] = i2 + oshift; */
341       ja[work[i2]++] = i1 + oshift;
342       i2++;
343       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
344       if (nz > 0) i2 = tvc[col];
345     }
346   }
347   ierr = PetscFree(ns_col);CHKERRQ(ierr);
348   ierr = PetscFree(work);CHKERRQ(ierr);
349   ierr = PetscFree(tns);CHKERRQ(ierr);
350   ierr = PetscFree(tvc);CHKERRQ(ierr);
351   PetscFunctionReturn(0);
352 }
353 
354 #undef __FUNCT__
355 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
356 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
357 {
358   PetscErrorCode ierr;
359 
360   PetscFunctionBegin;
361   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
362   if (!ia) PetscFunctionReturn(0);
363 
364   if (!blockcompressed) {
365     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
366   } else if (symmetric) {
367     /* Since the indices are symmetric it does'nt matter */
368     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
369   } else {
370     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
371   }
372   PetscFunctionReturn(0);
373 }
374 
375 #undef __FUNCT__
376 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
377 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool  *done)
378 {
379   PetscErrorCode ierr;
380 
381   PetscFunctionBegin;
382   if (!ia) PetscFunctionReturn(0);
383   if (!blockcompressed) {
384     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
385   } else {
386     ierr = PetscFree(*ia);CHKERRQ(ierr);
387     ierr = PetscFree(*ja);CHKERRQ(ierr);
388   }
389   PetscFunctionReturn(0);
390 }
391 
392 /* ----------------------------------------------------------- */
393 
394 #undef __FUNCT__
395 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
396 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
397 {
398   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
399   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
400   PetscScalar       *y;
401   const PetscScalar *x;
402   const MatScalar   *v1,*v2,*v3,*v4,*v5;
403   PetscErrorCode    ierr;
404   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
405 
406 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
407 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
408 #endif
409 
410   PetscFunctionBegin;
411   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
412   node_max = a->inode.node_count;
413   ns       = a->inode.size;     /* Node Size array */
414   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
415   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
416   idx  = a->j;
417   v1   = a->a;
418   ii   = a->i;
419 
420   for (i = 0,row = 0; i< node_max; ++i){
421     nsz  = ns[i];
422     n    = ii[1] - ii[0];
423     nonzerorow += (n>0)*nsz;
424     ii  += nsz;
425     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
426     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
427     sz   = n;                   /* No of non zeros in this row */
428                                 /* Switch on the size of Node */
429     switch (nsz){               /* Each loop in 'case' is unrolled */
430     case 1 :
431       sum1  = 0.;
432 
433       for(n = 0; n< sz-1; n+=2) {
434         i1   = idx[0];          /* The instructions are ordered to */
435         i2   = idx[1];          /* make the compiler's job easy */
436         idx += 2;
437         tmp0 = x[i1];
438         tmp1 = x[i2];
439         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
440        }
441 
442       if (n == sz-1){          /* Take care of the last nonzero  */
443         tmp0  = x[*idx++];
444         sum1 += *v1++ * tmp0;
445       }
446       y[row++]=sum1;
447       break;
448     case 2:
449       sum1  = 0.;
450       sum2  = 0.;
451       v2    = v1 + n;
452 
453       for (n = 0; n< sz-1; n+=2) {
454         i1   = idx[0];
455         i2   = idx[1];
456         idx += 2;
457         tmp0 = x[i1];
458         tmp1 = x[i2];
459         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
460         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
461       }
462       if (n == sz-1){
463         tmp0  = x[*idx++];
464         sum1 += *v1++ * tmp0;
465         sum2 += *v2++ * tmp0;
466       }
467       y[row++]=sum1;
468       y[row++]=sum2;
469       v1      =v2;              /* Since the next block to be processed starts there*/
470       idx    +=sz;
471       break;
472     case 3:
473       sum1  = 0.;
474       sum2  = 0.;
475       sum3  = 0.;
476       v2    = v1 + n;
477       v3    = v2 + n;
478 
479       for (n = 0; n< sz-1; n+=2) {
480         i1   = idx[0];
481         i2   = idx[1];
482         idx += 2;
483         tmp0 = x[i1];
484         tmp1 = x[i2];
485         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
486         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
487         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
488       }
489       if (n == sz-1){
490         tmp0  = x[*idx++];
491         sum1 += *v1++ * tmp0;
492         sum2 += *v2++ * tmp0;
493         sum3 += *v3++ * tmp0;
494       }
495       y[row++]=sum1;
496       y[row++]=sum2;
497       y[row++]=sum3;
498       v1       =v3;             /* Since the next block to be processed starts there*/
499       idx     +=2*sz;
500       break;
501     case 4:
502       sum1  = 0.;
503       sum2  = 0.;
504       sum3  = 0.;
505       sum4  = 0.;
506       v2    = v1 + n;
507       v3    = v2 + n;
508       v4    = v3 + n;
509 
510       for (n = 0; n< sz-1; n+=2) {
511         i1   = idx[0];
512         i2   = idx[1];
513         idx += 2;
514         tmp0 = x[i1];
515         tmp1 = x[i2];
516         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
517         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
518         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
519         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
520       }
521       if (n == sz-1){
522         tmp0  = x[*idx++];
523         sum1 += *v1++ * tmp0;
524         sum2 += *v2++ * tmp0;
525         sum3 += *v3++ * tmp0;
526         sum4 += *v4++ * tmp0;
527       }
528       y[row++]=sum1;
529       y[row++]=sum2;
530       y[row++]=sum3;
531       y[row++]=sum4;
532       v1      =v4;              /* Since the next block to be processed starts there*/
533       idx    +=3*sz;
534       break;
535     case 5:
536       sum1  = 0.;
537       sum2  = 0.;
538       sum3  = 0.;
539       sum4  = 0.;
540       sum5  = 0.;
541       v2    = v1 + n;
542       v3    = v2 + n;
543       v4    = v3 + n;
544       v5    = v4 + n;
545 
546       for (n = 0; n<sz-1; n+=2) {
547         i1   = idx[0];
548         i2   = idx[1];
549         idx += 2;
550         tmp0 = x[i1];
551         tmp1 = x[i2];
552         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
553         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
554         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
555         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
556         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
557       }
558       if (n == sz-1){
559         tmp0  = x[*idx++];
560         sum1 += *v1++ * tmp0;
561         sum2 += *v2++ * tmp0;
562         sum3 += *v3++ * tmp0;
563         sum4 += *v4++ * tmp0;
564         sum5 += *v5++ * tmp0;
565       }
566       y[row++]=sum1;
567       y[row++]=sum2;
568       y[row++]=sum3;
569       y[row++]=sum4;
570       y[row++]=sum5;
571       v1      =v5;       /* Since the next block to be processed starts there */
572       idx    +=4*sz;
573       break;
574     default :
575       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
576     }
577   }
578   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
579   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
580   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
581   PetscFunctionReturn(0);
582 }
583 /* ----------------------------------------------------------- */
584 /* Almost same code as the MatMult_SeqAIJ_Inode() */
585 #undef __FUNCT__
586 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
587 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
588 {
589   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
590   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
591   MatScalar      *v1,*v2,*v3,*v4,*v5;
592   PetscScalar    *x,*y,*z,*zt;
593   PetscErrorCode ierr;
594   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
595 
596   PetscFunctionBegin;
597   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
598   node_max = a->inode.node_count;
599   ns       = a->inode.size;     /* Node Size array */
600   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
601   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
602   if (zz != yy) {
603     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
604   } else {
605     z = y;
606   }
607   zt = z;
608 
609   idx  = a->j;
610   v1   = a->a;
611   ii   = a->i;
612 
613   for (i = 0,row = 0; i< node_max; ++i){
614     nsz  = ns[i];
615     n    = ii[1] - ii[0];
616     ii  += nsz;
617     sz   = n;                   /* No of non zeros in this row */
618                                 /* Switch on the size of Node */
619     switch (nsz){               /* Each loop in 'case' is unrolled */
620     case 1 :
621       sum1  = *zt++;
622 
623       for(n = 0; n< sz-1; n+=2) {
624         i1   = idx[0];          /* The instructions are ordered to */
625         i2   = idx[1];          /* make the compiler's job easy */
626         idx += 2;
627         tmp0 = x[i1];
628         tmp1 = x[i2];
629         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
630        }
631 
632       if(n   == sz-1){          /* Take care of the last nonzero  */
633         tmp0  = x[*idx++];
634         sum1 += *v1++ * tmp0;
635       }
636       y[row++]=sum1;
637       break;
638     case 2:
639       sum1  = *zt++;
640       sum2  = *zt++;
641       v2    = v1 + n;
642 
643       for(n = 0; n< sz-1; n+=2) {
644         i1   = idx[0];
645         i2   = idx[1];
646         idx += 2;
647         tmp0 = x[i1];
648         tmp1 = x[i2];
649         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
650         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
651       }
652       if(n   == sz-1){
653         tmp0  = x[*idx++];
654         sum1 += *v1++ * tmp0;
655         sum2 += *v2++ * tmp0;
656       }
657       y[row++]=sum1;
658       y[row++]=sum2;
659       v1      =v2;              /* Since the next block to be processed starts there*/
660       idx    +=sz;
661       break;
662     case 3:
663       sum1  = *zt++;
664       sum2  = *zt++;
665       sum3  = *zt++;
666       v2    = v1 + n;
667       v3    = v2 + n;
668 
669       for (n = 0; n< sz-1; n+=2) {
670         i1   = idx[0];
671         i2   = idx[1];
672         idx += 2;
673         tmp0 = x[i1];
674         tmp1 = x[i2];
675         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
676         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
677         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
678       }
679       if (n == sz-1){
680         tmp0  = x[*idx++];
681         sum1 += *v1++ * tmp0;
682         sum2 += *v2++ * tmp0;
683         sum3 += *v3++ * tmp0;
684       }
685       y[row++]=sum1;
686       y[row++]=sum2;
687       y[row++]=sum3;
688       v1       =v3;             /* Since the next block to be processed starts there*/
689       idx     +=2*sz;
690       break;
691     case 4:
692       sum1  = *zt++;
693       sum2  = *zt++;
694       sum3  = *zt++;
695       sum4  = *zt++;
696       v2    = v1 + n;
697       v3    = v2 + n;
698       v4    = v3 + n;
699 
700       for (n = 0; n< sz-1; n+=2) {
701         i1   = idx[0];
702         i2   = idx[1];
703         idx += 2;
704         tmp0 = x[i1];
705         tmp1 = x[i2];
706         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
707         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
708         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
709         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
710       }
711       if (n == sz-1){
712         tmp0  = x[*idx++];
713         sum1 += *v1++ * tmp0;
714         sum2 += *v2++ * tmp0;
715         sum3 += *v3++ * tmp0;
716         sum4 += *v4++ * tmp0;
717       }
718       y[row++]=sum1;
719       y[row++]=sum2;
720       y[row++]=sum3;
721       y[row++]=sum4;
722       v1      =v4;              /* Since the next block to be processed starts there*/
723       idx    +=3*sz;
724       break;
725     case 5:
726       sum1  = *zt++;
727       sum2  = *zt++;
728       sum3  = *zt++;
729       sum4  = *zt++;
730       sum5  = *zt++;
731       v2    = v1 + n;
732       v3    = v2 + n;
733       v4    = v3 + n;
734       v5    = v4 + n;
735 
736       for (n = 0; n<sz-1; n+=2) {
737         i1   = idx[0];
738         i2   = idx[1];
739         idx += 2;
740         tmp0 = x[i1];
741         tmp1 = x[i2];
742         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
743         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
744         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
745         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
746         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
747       }
748       if(n   == sz-1){
749         tmp0  = x[*idx++];
750         sum1 += *v1++ * tmp0;
751         sum2 += *v2++ * tmp0;
752         sum3 += *v3++ * tmp0;
753         sum4 += *v4++ * tmp0;
754         sum5 += *v5++ * tmp0;
755       }
756       y[row++]=sum1;
757       y[row++]=sum2;
758       y[row++]=sum3;
759       y[row++]=sum4;
760       y[row++]=sum5;
761       v1      =v5;       /* Since the next block to be processed starts there */
762       idx    +=4*sz;
763       break;
764     default :
765       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
766     }
767   }
768   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
769   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
770   if (zz != yy) {
771     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
772   }
773   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
774   PetscFunctionReturn(0);
775 }
776 
777 /* ----------------------------------------------------------- */
778 #undef __FUNCT__
779 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
780 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
781 {
782   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
783   IS                iscol = a->col,isrow = a->row;
784   PetscErrorCode    ierr;
785   const PetscInt    *r,*c,*rout,*cout;
786   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
787   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
788   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
789   PetscScalar       sum1,sum2,sum3,sum4,sum5;
790   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
791   const PetscScalar *b;
792 
793   PetscFunctionBegin;
794   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
795   node_max = a->inode.node_count;
796   ns       = a->inode.size;     /* Node Size array */
797 
798   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
799   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
800   tmp  = a->solve_work;
801 
802   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
803   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
804 
805   /* forward solve the lower triangular */
806   tmps = tmp ;
807   aa   = a_a ;
808   aj   = a_j ;
809   ad   = a->diag;
810 
811   for (i = 0,row = 0; i< node_max; ++i){
812     nsz = ns[i];
813     aii = ai[row];
814     v1  = aa + aii;
815     vi  = aj + aii;
816     nz  = ad[row]- aii;
817     if (i < node_max-1) {
818       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
819       * but our indexing to determine it's size could. */
820       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
821       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
822       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
823       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
824     }
825 
826     switch (nsz){               /* Each loop in 'case' is unrolled */
827     case 1 :
828       sum1 = b[*r++];
829       for(j=0; j<nz-1; j+=2){
830         i0   = vi[0];
831         i1   = vi[1];
832         vi  +=2;
833         tmp0 = tmps[i0];
834         tmp1 = tmps[i1];
835         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
836       }
837       if(j == nz-1){
838         tmp0 = tmps[*vi++];
839         sum1 -= *v1++ *tmp0;
840       }
841       tmp[row ++]=sum1;
842       break;
843     case 2:
844       sum1 = b[*r++];
845       sum2 = b[*r++];
846       v2   = aa + ai[row+1];
847 
848       for(j=0; j<nz-1; j+=2){
849         i0   = vi[0];
850         i1   = vi[1];
851         vi  +=2;
852         tmp0 = tmps[i0];
853         tmp1 = tmps[i1];
854         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
855         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
856       }
857       if(j == nz-1){
858         tmp0 = tmps[*vi++];
859         sum1 -= *v1++ *tmp0;
860         sum2 -= *v2++ *tmp0;
861       }
862       sum2 -= *v2++ * sum1;
863       tmp[row ++]=sum1;
864       tmp[row ++]=sum2;
865       break;
866     case 3:
867       sum1 = b[*r++];
868       sum2 = b[*r++];
869       sum3 = b[*r++];
870       v2   = aa + ai[row+1];
871       v3   = aa + ai[row+2];
872 
873       for (j=0; j<nz-1; j+=2){
874         i0   = vi[0];
875         i1   = vi[1];
876         vi  +=2;
877         tmp0 = tmps[i0];
878         tmp1 = tmps[i1];
879         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
880         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
881         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
882       }
883       if (j == nz-1){
884         tmp0 = tmps[*vi++];
885         sum1 -= *v1++ *tmp0;
886         sum2 -= *v2++ *tmp0;
887         sum3 -= *v3++ *tmp0;
888       }
889       sum2 -= *v2++ * sum1;
890       sum3 -= *v3++ * sum1;
891       sum3 -= *v3++ * sum2;
892       tmp[row ++]=sum1;
893       tmp[row ++]=sum2;
894       tmp[row ++]=sum3;
895       break;
896 
897     case 4:
898       sum1 = b[*r++];
899       sum2 = b[*r++];
900       sum3 = b[*r++];
901       sum4 = b[*r++];
902       v2   = aa + ai[row+1];
903       v3   = aa + ai[row+2];
904       v4   = aa + ai[row+3];
905 
906       for (j=0; j<nz-1; j+=2){
907         i0   = vi[0];
908         i1   = vi[1];
909         vi  +=2;
910         tmp0 = tmps[i0];
911         tmp1 = tmps[i1];
912         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916       }
917       if (j == nz-1){
918         tmp0 = tmps[*vi++];
919         sum1 -= *v1++ *tmp0;
920         sum2 -= *v2++ *tmp0;
921         sum3 -= *v3++ *tmp0;
922         sum4 -= *v4++ *tmp0;
923       }
924       sum2 -= *v2++ * sum1;
925       sum3 -= *v3++ * sum1;
926       sum4 -= *v4++ * sum1;
927       sum3 -= *v3++ * sum2;
928       sum4 -= *v4++ * sum2;
929       sum4 -= *v4++ * sum3;
930 
931       tmp[row ++]=sum1;
932       tmp[row ++]=sum2;
933       tmp[row ++]=sum3;
934       tmp[row ++]=sum4;
935       break;
936     case 5:
937       sum1 = b[*r++];
938       sum2 = b[*r++];
939       sum3 = b[*r++];
940       sum4 = b[*r++];
941       sum5 = b[*r++];
942       v2   = aa + ai[row+1];
943       v3   = aa + ai[row+2];
944       v4   = aa + ai[row+3];
945       v5   = aa + ai[row+4];
946 
947       for (j=0; j<nz-1; j+=2){
948         i0   = vi[0];
949         i1   = vi[1];
950         vi  +=2;
951         tmp0 = tmps[i0];
952         tmp1 = tmps[i1];
953         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958       }
959       if (j == nz-1){
960         tmp0 = tmps[*vi++];
961         sum1 -= *v1++ *tmp0;
962         sum2 -= *v2++ *tmp0;
963         sum3 -= *v3++ *tmp0;
964         sum4 -= *v4++ *tmp0;
965         sum5 -= *v5++ *tmp0;
966       }
967 
968       sum2 -= *v2++ * sum1;
969       sum3 -= *v3++ * sum1;
970       sum4 -= *v4++ * sum1;
971       sum5 -= *v5++ * sum1;
972       sum3 -= *v3++ * sum2;
973       sum4 -= *v4++ * sum2;
974       sum5 -= *v5++ * sum2;
975       sum4 -= *v4++ * sum3;
976       sum5 -= *v5++ * sum3;
977       sum5 -= *v5++ * sum4;
978 
979       tmp[row ++]=sum1;
980       tmp[row ++]=sum2;
981       tmp[row ++]=sum3;
982       tmp[row ++]=sum4;
983       tmp[row ++]=sum5;
984       break;
985     default:
986       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987     }
988   }
989   /* backward solve the upper triangular */
990   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
991     nsz = ns[i];
992     aii = ai[row+1] -1;
993     v1  = aa + aii;
994     vi  = aj + aii;
995     nz  = aii- ad[row];
996     switch (nsz){               /* Each loop in 'case' is unrolled */
997     case 1 :
998       sum1 = tmp[row];
999 
1000       for(j=nz ; j>1; j-=2){
1001         vi  -=2;
1002         i0   = vi[2];
1003         i1   = vi[1];
1004         tmp0 = tmps[i0];
1005         tmp1 = tmps[i1];
1006         v1   -= 2;
1007         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008       }
1009       if (j==1){
1010         tmp0  = tmps[*vi--];
1011         sum1 -= *v1-- * tmp0;
1012       }
1013       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014       break;
1015     case 2 :
1016       sum1 = tmp[row];
1017       sum2 = tmp[row -1];
1018       v2   = aa + ai[row]-1;
1019       for (j=nz ; j>1; j-=2){
1020         vi  -=2;
1021         i0   = vi[2];
1022         i1   = vi[1];
1023         tmp0 = tmps[i0];
1024         tmp1 = tmps[i1];
1025         v1   -= 2;
1026         v2   -= 2;
1027         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029       }
1030       if (j==1){
1031         tmp0  = tmps[*vi--];
1032         sum1 -= *v1-- * tmp0;
1033         sum2 -= *v2-- * tmp0;
1034       }
1035 
1036       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037       sum2   -= *v2-- * tmp0;
1038       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039       break;
1040     case 3 :
1041       sum1 = tmp[row];
1042       sum2 = tmp[row -1];
1043       sum3 = tmp[row -2];
1044       v2   = aa + ai[row]-1;
1045       v3   = aa + ai[row -1]-1;
1046       for (j=nz ; j>1; j-=2){
1047         vi  -=2;
1048         i0   = vi[2];
1049         i1   = vi[1];
1050         tmp0 = tmps[i0];
1051         tmp1 = tmps[i1];
1052         v1   -= 2;
1053         v2   -= 2;
1054         v3   -= 2;
1055         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058       }
1059       if (j==1){
1060         tmp0  = tmps[*vi--];
1061         sum1 -= *v1-- * tmp0;
1062         sum2 -= *v2-- * tmp0;
1063         sum3 -= *v3-- * tmp0;
1064       }
1065       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066       sum2   -= *v2-- * tmp0;
1067       sum3   -= *v3-- * tmp0;
1068       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069       sum3   -= *v3-- * tmp0;
1070       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1071 
1072       break;
1073     case 4 :
1074       sum1 = tmp[row];
1075       sum2 = tmp[row -1];
1076       sum3 = tmp[row -2];
1077       sum4 = tmp[row -3];
1078       v2   = aa + ai[row]-1;
1079       v3   = aa + ai[row -1]-1;
1080       v4   = aa + ai[row -2]-1;
1081 
1082       for (j=nz ; j>1; j-=2){
1083         vi  -=2;
1084         i0   = vi[2];
1085         i1   = vi[1];
1086         tmp0 = tmps[i0];
1087         tmp1 = tmps[i1];
1088         v1  -= 2;
1089         v2  -= 2;
1090         v3  -= 2;
1091         v4  -= 2;
1092         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096       }
1097       if (j==1){
1098         tmp0  = tmps[*vi--];
1099         sum1 -= *v1-- * tmp0;
1100         sum2 -= *v2-- * tmp0;
1101         sum3 -= *v3-- * tmp0;
1102         sum4 -= *v4-- * tmp0;
1103       }
1104 
1105       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106       sum2   -= *v2-- * tmp0;
1107       sum3   -= *v3-- * tmp0;
1108       sum4   -= *v4-- * tmp0;
1109       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110       sum3   -= *v3-- * tmp0;
1111       sum4   -= *v4-- * tmp0;
1112       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113       sum4   -= *v4-- * tmp0;
1114       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115       break;
1116     case 5 :
1117       sum1 = tmp[row];
1118       sum2 = tmp[row -1];
1119       sum3 = tmp[row -2];
1120       sum4 = tmp[row -3];
1121       sum5 = tmp[row -4];
1122       v2   = aa + ai[row]-1;
1123       v3   = aa + ai[row -1]-1;
1124       v4   = aa + ai[row -2]-1;
1125       v5   = aa + ai[row -3]-1;
1126       for (j=nz ; j>1; j-=2){
1127         vi  -= 2;
1128         i0   = vi[2];
1129         i1   = vi[1];
1130         tmp0 = tmps[i0];
1131         tmp1 = tmps[i1];
1132         v1   -= 2;
1133         v2   -= 2;
1134         v3   -= 2;
1135         v4   -= 2;
1136         v5   -= 2;
1137         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142       }
1143       if (j==1){
1144         tmp0  = tmps[*vi--];
1145         sum1 -= *v1-- * tmp0;
1146         sum2 -= *v2-- * tmp0;
1147         sum3 -= *v3-- * tmp0;
1148         sum4 -= *v4-- * tmp0;
1149         sum5 -= *v5-- * tmp0;
1150       }
1151 
1152       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153       sum2   -= *v2-- * tmp0;
1154       sum3   -= *v3-- * tmp0;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158       sum3   -= *v3-- * tmp0;
1159       sum4   -= *v4-- * tmp0;
1160       sum5   -= *v5-- * tmp0;
1161       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162       sum4   -= *v4-- * tmp0;
1163       sum5   -= *v5-- * tmp0;
1164       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165       sum5   -= *v5-- * tmp0;
1166       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167       break;
1168     default:
1169       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170     }
1171   }
1172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1174   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1176   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #undef __FUNCT__
1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183 {
1184   Mat              C=B;
1185   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1186   IS               isrow = b->row,isicol = b->icol;
1187   PetscErrorCode   ierr;
1188   const PetscInt   *r,*ic,*ics;
1189   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190   PetscInt         i,j,k,nz,nzL,row,*pj;
1191   const PetscInt   *ajtmp,*bjtmp;
1192   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194   FactorShiftCtx   sctx;
1195   const PetscInt   *ddiag;
1196   PetscReal        rs;
1197   MatScalar        d;
1198   PetscInt         inod,nodesz,node_max,col;
1199   const PetscInt   *ns;
1200   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1201 
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++)
1216 	rs += PetscAbsScalar(v[j]);
1217       if (rs>sctx.shift_top) sctx.shift_top = rs;
1218     }
1219     sctx.shift_top   *= 1.1;
1220     sctx.nshift_max   = 5;
1221     sctx.shift_lo     = 0.;
1222     sctx.shift_hi     = 1.;
1223   }
1224 
1225   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1226   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1227 
1228   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1229   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1230   rtmp2 = rtmp1 + n;
1231   rtmp3 = rtmp2 + n;
1232   rtmp4 = rtmp3 + n;
1233   ics   = ic;
1234 
1235   node_max = a->inode.node_count;
1236   ns       = a->inode.size;
1237   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1238 
1239   /* If max inode size > 4, split it into two inodes.*/
1240   /* also map the inode sizes according to the ordering */
1241   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1242   for (i=0,j=0; i<node_max; ++i,++j){
1243     if (ns[i] > 4) {
1244       tmp_vec1[j] = 4;
1245       ++j;
1246       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1247     } else {
1248       tmp_vec1[j] = ns[i];
1249     }
1250   }
1251   /* Use the correct node_max */
1252   node_max = j;
1253 
1254   /* Now reorder the inode info based on mat re-ordering info */
1255   /* First create a row -> inode_size_array_index map */
1256   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1257   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1258   for (i=0,row=0; i<node_max; i++) {
1259     nodesz = tmp_vec1[i];
1260     for (j=0; j<nodesz; j++,row++) {
1261       nsmap[row] = i;
1262     }
1263   }
1264   /* Using nsmap, create a reordered ns structure */
1265   for (i=0,j=0; i< node_max; i++) {
1266     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1267     tmp_vec2[i]  = nodesz;
1268     j           += nodesz;
1269   }
1270   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1271   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1272 
1273   /* Now use the correct ns */
1274   ns = tmp_vec2;
1275 
1276   do {
1277     sctx.newshift = PETSC_FALSE;
1278     /* Now loop over each block-row, and do the factorization */
1279     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1280       nodesz = ns[inod];
1281 
1282       switch (nodesz){
1283       case 1:
1284       /*----------*/
1285         /* zero rtmp1 */
1286         /* L part */
1287         nz    = bi[i+1] - bi[i];
1288         bjtmp = bj + bi[i];
1289         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1290 
1291         /* U part */
1292         nz = bdiag[i]-bdiag[i+1];
1293         bjtmp = bj + bdiag[i+1]+1;
1294         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1295 
1296         /* load in initial (unfactored row) */
1297         nz    = ai[r[i]+1] - ai[r[i]];
1298         ajtmp = aj + ai[r[i]];
1299         v     = aa + ai[r[i]];
1300         for (j=0; j<nz; j++) {
1301           rtmp1[ics[ajtmp[j]]] = v[j];
1302         }
1303         /* ZeropivotApply() */
1304         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1305 
1306         /* elimination */
1307         bjtmp = bj + bi[i];
1308         row   = *bjtmp++;
1309         nzL   = bi[i+1] - bi[i];
1310         for(k=0; k < nzL;k++) {
1311           pc = rtmp1 + row;
1312           if (*pc != 0.0) {
1313             pv   = b->a + bdiag[row];
1314             mul1 = *pc * (*pv);
1315             *pc  = mul1;
1316             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1317             pv = b->a + bdiag[row+1]+1;
1318             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1319             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1320             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1321           }
1322           row = *bjtmp++;
1323         }
1324 
1325         /* finished row so stick it into b->a */
1326         rs = 0.0;
1327         /* L part */
1328         pv = b->a + bi[i] ;
1329         pj = b->j + bi[i] ;
1330         nz = bi[i+1] - bi[i];
1331         for (j=0; j<nz; j++) {
1332           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1333         }
1334 
1335         /* U part */
1336         pv = b->a + bdiag[i+1]+1;
1337         pj = b->j + bdiag[i+1]+1;
1338         nz = bdiag[i] - bdiag[i+1]-1;
1339         for (j=0; j<nz; j++) {
1340           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1341         }
1342 
1343         /* Check zero pivot */
1344         sctx.rs = rs;
1345         sctx.pv = rtmp1[i];
1346         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1347 	if(sctx.newshift) break;
1348 
1349         /* Mark diagonal and invert diagonal for simplier triangular solves */
1350         pv  = b->a + bdiag[i];
1351         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1352         break;
1353 
1354       case 2:
1355       /*----------*/
1356         /* zero rtmp1 and rtmp2 */
1357         /* L part */
1358         nz    = bi[i+1] - bi[i];
1359         bjtmp = bj + bi[i];
1360         for  (j=0; j<nz; j++) {
1361           col = bjtmp[j];
1362           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363         }
1364 
1365         /* U part */
1366         nz = bdiag[i]-bdiag[i+1];
1367         bjtmp = bj + bdiag[i+1]+1;
1368         for  (j=0; j<nz; j++) {
1369           col = bjtmp[j];
1370           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1371         }
1372 
1373         /* load in initial (unfactored row) */
1374         nz    = ai[r[i]+1] - ai[r[i]];
1375         ajtmp = aj + ai[r[i]];
1376         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1377         for (j=0; j<nz; j++) {
1378           col = ics[ajtmp[j]];
1379           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1380         }
1381         /* ZeropivotApply(): shift the diagonal of the matrix  */
1382         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1383 
1384         /* elimination */
1385         bjtmp = bj + bi[i];
1386         row   = *bjtmp++; /* pivot row */
1387         nzL   = bi[i+1] - bi[i];
1388         for(k=0; k < nzL;k++) {
1389           pc1 = rtmp1 + row;
1390           pc2 = rtmp2 + row;
1391           if (*pc1 != 0.0 || *pc2 != 0.0) {
1392             pv   = b->a + bdiag[row];
1393             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1394             *pc1 = mul1;       *pc2 = mul2;
1395 
1396             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1397             pv = b->a + bdiag[row+1]+1;
1398             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1399             for (j=0; j<nz; j++){
1400               col = pj[j];
1401               rtmp1[col] -= mul1 * pv[j];
1402               rtmp2[col] -= mul2 * pv[j];
1403             }
1404             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1405           }
1406           row = *bjtmp++;
1407         }
1408 
1409         /* finished row i; check zero pivot, then stick row i into b->a */
1410         rs  = 0.0;
1411         /* L part */
1412         pc1 = b->a + bi[i];
1413         pj  = b->j + bi[i] ;
1414         nz  = bi[i+1] - bi[i];
1415         for (j=0; j<nz; j++) {
1416           col = pj[j];
1417           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418         }
1419         /* U part */
1420         pc1 = b->a + bdiag[i+1]+1;
1421         pj  = b->j + bdiag[i+1]+1;
1422         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1423         for (j=0; j<nz; j++) {
1424           col = pj[j];
1425           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1426         }
1427 
1428         sctx.rs  = rs;
1429         sctx.pv  = rtmp1[i];
1430         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1431 	if(sctx.newshift) break;
1432         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1433         *pc1 = 1.0/sctx.pv;
1434 
1435         /* Now take care of diagonal 2x2 block. */
1436         pc2 = rtmp2 + i;
1437         if (*pc2 != 0.0){
1438           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1439           *pc2 = mul1;          /* insert L entry */
1440           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1441           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1442           for (j=0; j<nz; j++) {
1443             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1444           }
1445           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1446         }
1447 
1448         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1449         rs = 0.0;
1450         /* L part */
1451         pc2 = b->a + bi[i+1];
1452         pj  = b->j + bi[i+1] ;
1453         nz  = bi[i+2] - bi[i+1];
1454         for (j=0; j<nz; j++) {
1455           col = pj[j];
1456           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457         }
1458         /* U part */
1459         pc2 = b->a + bdiag[i+2]+1;
1460         pj  = b->j + bdiag[i+2]+1;
1461         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1462         for (j=0; j<nz; j++) {
1463           col = pj[j];
1464           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1465         }
1466 
1467         sctx.rs  = rs;
1468         sctx.pv  = rtmp2[i+1];
1469         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1470 	if(sctx.newshift) break;
1471         pc2  = b->a + bdiag[i+1];
1472         *pc2 = 1.0/sctx.pv;
1473         break;
1474 
1475       case 3:
1476       /*----------*/
1477         /* zero rtmp */
1478         /* L part */
1479         nz    = bi[i+1] - bi[i];
1480         bjtmp = bj + bi[i];
1481         for  (j=0; j<nz; j++) {
1482           col = bjtmp[j];
1483           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484         }
1485 
1486         /* U part */
1487         nz = bdiag[i]-bdiag[i+1];
1488         bjtmp = bj + bdiag[i+1]+1;
1489         for  (j=0; j<nz; j++) {
1490           col = bjtmp[j];
1491           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1492         }
1493 
1494         /* load in initial (unfactored row) */
1495         nz    = ai[r[i]+1] - ai[r[i]];
1496         ajtmp = aj + ai[r[i]];
1497         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1498         for (j=0; j<nz; j++) {
1499           col = ics[ajtmp[j]];
1500           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1501         }
1502         /* ZeropivotApply(): shift the diagonal of the matrix  */
1503         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1504 
1505         /* elimination */
1506         bjtmp = bj + bi[i];
1507         row   = *bjtmp++; /* pivot row */
1508         nzL   = bi[i+1] - bi[i];
1509         for(k=0; k < nzL;k++) {
1510           pc1 = rtmp1 + row;
1511           pc2 = rtmp2 + row;
1512           pc3 = rtmp3 + row;
1513           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1514             pv  = b->a + bdiag[row];
1515             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1516             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1517 
1518             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1519             pv = b->a + bdiag[row+1]+1;
1520             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1521             for (j=0; j<nz; j++){
1522               col = pj[j];
1523               rtmp1[col] -= mul1 * pv[j];
1524               rtmp2[col] -= mul2 * pv[j];
1525               rtmp3[col] -= mul3 * pv[j];
1526             }
1527             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1528           }
1529           row = *bjtmp++;
1530         }
1531 
1532         /* finished row i; check zero pivot, then stick row i into b->a */
1533         rs  = 0.0;
1534         /* L part */
1535         pc1 = b->a + bi[i];
1536         pj  = b->j + bi[i] ;
1537         nz  = bi[i+1] - bi[i];
1538         for (j=0; j<nz; j++) {
1539           col = pj[j];
1540           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541         }
1542         /* U part */
1543         pc1 = b->a + bdiag[i+1]+1;
1544         pj  = b->j + bdiag[i+1]+1;
1545         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1546         for (j=0; j<nz; j++) {
1547           col = pj[j];
1548           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1549         }
1550 
1551         sctx.rs  = rs;
1552         sctx.pv  = rtmp1[i];
1553         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1554 	if(sctx.newshift) break;
1555         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1556         *pc1 = 1.0/sctx.pv;
1557 
1558         /* Now take care of 1st column of diagonal 3x3 block. */
1559         pc2 = rtmp2 + i;
1560         pc3 = rtmp3 + i;
1561         if (*pc2 != 0.0 || *pc3 != 0.0){
1562           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1563           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1564           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1565           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1566           for (j=0; j<nz; j++) {
1567             col = pj[j];
1568             rtmp2[col] -= mul2 * rtmp1[col];
1569             rtmp3[col] -= mul3 * rtmp1[col];
1570           }
1571           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1572         }
1573 
1574         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1575         rs = 0.0;
1576         /* L part */
1577         pc2 = b->a + bi[i+1];
1578         pj  = b->j + bi[i+1] ;
1579         nz  = bi[i+2] - bi[i+1];
1580         for (j=0; j<nz; j++) {
1581           col = pj[j];
1582           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1583         }
1584         /* U part */
1585         pc2 = b->a + bdiag[i+2]+1;
1586         pj  = b->j + bdiag[i+2]+1;
1587         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1588         for (j=0; j<nz; j++) {
1589           col = pj[j];
1590           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1591         }
1592 
1593         sctx.rs  = rs;
1594         sctx.pv  = rtmp2[i+1];
1595         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1596 	if(sctx.newshift) break;
1597         pc2  = b->a + bdiag[i+1];
1598         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1599 
1600         /* Now take care of 2nd column of diagonal 3x3 block. */
1601         pc3 = rtmp3 + i+1;
1602         if (*pc3 != 0.0){
1603           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1604           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1605           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1606           for (j=0; j<nz; j++) {
1607             col = pj[j];
1608             rtmp3[col] -= mul3 * rtmp2[col];
1609           }
1610           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1611         }
1612 
1613         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1614         rs = 0.0;
1615         /* L part */
1616         pc3 = b->a + bi[i+2];
1617         pj  = b->j + bi[i+2] ;
1618         nz  = bi[i+3] - bi[i+2];
1619         for (j=0; j<nz; j++) {
1620           col = pj[j];
1621           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1622         }
1623         /* U part */
1624         pc3 = b->a + bdiag[i+3]+1;
1625         pj  = b->j + bdiag[i+3]+1;
1626         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1627         for (j=0; j<nz; j++) {
1628           col = pj[j];
1629           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1630         }
1631 
1632         sctx.rs  = rs;
1633         sctx.pv  = rtmp3[i+2];
1634         ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr);
1635 	if(sctx.newshift) break;
1636         pc3  = b->a + bdiag[i+2];
1637         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1638         break;
1639       case 4:
1640       /*----------*/
1641         /* zero rtmp */
1642         /* L part */
1643         nz    = bi[i+1] - bi[i];
1644         bjtmp = bj + bi[i];
1645         for  (j=0; j<nz; j++) {
1646           col = bjtmp[j];
1647           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1648         }
1649 
1650         /* U part */
1651         nz = bdiag[i]-bdiag[i+1];
1652         bjtmp = bj + bdiag[i+1]+1;
1653         for  (j=0; j<nz; j++) {
1654           col = bjtmp[j];
1655           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1656         }
1657 
1658         /* load in initial (unfactored row) */
1659         nz    = ai[r[i]+1] - ai[r[i]];
1660         ajtmp = aj + ai[r[i]];
1661         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1662         for (j=0; j<nz; j++) {
1663           col = ics[ajtmp[j]];
1664           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1665         }
1666         /* ZeropivotApply(): shift the diagonal of the matrix  */
1667         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1668 
1669         /* elimination */
1670         bjtmp = bj + bi[i];
1671         row   = *bjtmp++; /* pivot row */
1672         nzL   = bi[i+1] - bi[i];
1673         for(k=0; k < nzL;k++) {
1674           pc1 = rtmp1 + row;
1675           pc2 = rtmp2 + row;
1676           pc3 = rtmp3 + row;
1677 	  pc4 = rtmp4 + row;
1678           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1679             pv  = b->a + bdiag[row];
1680             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1681             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1682 
1683             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1684             pv = b->a + bdiag[row+1]+1;
1685             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1686             for (j=0; j<nz; j++){
1687               col = pj[j];
1688               rtmp1[col] -= mul1 * pv[j];
1689               rtmp2[col] -= mul2 * pv[j];
1690               rtmp3[col] -= mul3 * pv[j];
1691 	      rtmp4[col] -= mul4 * pv[j];
1692             }
1693             ierr = PetscLogFlops(8*nz);CHKERRQ(ierr);
1694           }
1695           row = *bjtmp++;
1696         }
1697 
1698         /* finished row i; check zero pivot, then stick row i into b->a */
1699         rs  = 0.0;
1700         /* L part */
1701         pc1 = b->a + bi[i];
1702         pj  = b->j + bi[i] ;
1703         nz  = bi[i+1] - bi[i];
1704         for (j=0; j<nz; j++) {
1705           col = pj[j];
1706           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1707         }
1708         /* U part */
1709         pc1 = b->a + bdiag[i+1]+1;
1710         pj  = b->j + bdiag[i+1]+1;
1711         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1712         for (j=0; j<nz; j++) {
1713           col = pj[j];
1714           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1715         }
1716 
1717         sctx.rs  = rs;
1718         sctx.pv  = rtmp1[i];
1719         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1720 	if(sctx.newshift) break;
1721         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1722         *pc1 = 1.0/sctx.pv;
1723 
1724         /* Now take care of 1st column of diagonal 4x4 block. */
1725         pc2 = rtmp2 + i;
1726         pc3 = rtmp3 + i;
1727 	pc4 = rtmp4 + i;
1728         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1729           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1730           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1731 	  mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1732           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1733           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1734           for (j=0; j<nz; j++) {
1735             col = pj[j];
1736             rtmp2[col] -= mul2 * rtmp1[col];
1737             rtmp3[col] -= mul3 * rtmp1[col];
1738 	    rtmp4[col] -= mul4 * rtmp1[col];
1739           }
1740           ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1741         }
1742 
1743         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1744         rs = 0.0;
1745         /* L part */
1746         pc2 = b->a + bi[i+1];
1747         pj  = b->j + bi[i+1] ;
1748         nz  = bi[i+2] - bi[i+1];
1749         for (j=0; j<nz; j++) {
1750           col = pj[j];
1751           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1752         }
1753         /* U part */
1754         pc2 = b->a + bdiag[i+2]+1;
1755         pj  = b->j + bdiag[i+2]+1;
1756         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1757         for (j=0; j<nz; j++) {
1758           col = pj[j];
1759           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1760         }
1761 
1762         sctx.rs  = rs;
1763         sctx.pv  = rtmp2[i+1];
1764         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1765 	if(sctx.newshift) break;
1766         pc2  = b->a + bdiag[i+1];
1767         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1768 
1769         /* Now take care of 2nd column of diagonal 4x4 block. */
1770         pc3 = rtmp3 + i+1;
1771 	pc4 = rtmp4 + i+1;
1772         if (*pc3 != 0.0 || *pc4 != 0.0){
1773           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1774 	  mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1775           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1776           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1777           for (j=0; j<nz; j++) {
1778             col = pj[j];
1779             rtmp3[col] -= mul3 * rtmp2[col];
1780 	    rtmp4[col] -= mul4 * rtmp2[col];
1781           }
1782           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1783         }
1784 
1785         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1786         rs = 0.0;
1787         /* L part */
1788         pc3 = b->a + bi[i+2];
1789         pj  = b->j + bi[i+2] ;
1790         nz  = bi[i+3] - bi[i+2];
1791         for (j=0; j<nz; j++) {
1792           col = pj[j];
1793           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1794         }
1795         /* U part */
1796         pc3 = b->a + bdiag[i+3]+1;
1797         pj  = b->j + bdiag[i+3]+1;
1798         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1799         for (j=0; j<nz; j++) {
1800           col = pj[j];
1801           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1802         }
1803 
1804         sctx.rs  = rs;
1805         sctx.pv  = rtmp3[i+2];
1806         ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr);
1807 	if(sctx.newshift) break;
1808         pc3  = b->a + bdiag[i+2];
1809         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1810 
1811 	/* Now take care of 3rd column of diagonal 4x4 block. */
1812 	pc4 = rtmp4 + i+2;
1813         if (*pc4 != 0.0){
1814 	  mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1815           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1816           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1817           for (j=0; j<nz; j++) {
1818             col = pj[j];
1819 	    rtmp4[col] -= mul4 * rtmp3[col];
1820           }
1821           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1822         }
1823 
1824         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1825         rs = 0.0;
1826         /* L part */
1827         pc4 = b->a + bi[i+3];
1828         pj  = b->j + bi[i+3] ;
1829         nz  = bi[i+4] - bi[i+3];
1830         for (j=0; j<nz; j++) {
1831           col = pj[j];
1832           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1833         }
1834         /* U part */
1835         pc4 = b->a + bdiag[i+4]+1;
1836         pj  = b->j + bdiag[i+4]+1;
1837         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1838         for (j=0; j<nz; j++) {
1839           col = pj[j];
1840           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1841         }
1842 
1843         sctx.rs  = rs;
1844         sctx.pv  = rtmp4[i+3];
1845         ierr = MatPivotCheck(info,&sctx,i+3);CHKERRQ(ierr);
1846 	if(sctx.newshift) break;
1847         pc4  = b->a + bdiag[i+3];
1848         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1849          break;
1850 
1851       default:
1852         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1853       }
1854       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1855       i += nodesz;                 /* Update the row */
1856     }
1857 
1858     /* MatPivotRefine() */
1859     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1860       /*
1861        * if no shift in this attempt & shifting & started shifting & can refine,
1862        * then try lower shift
1863        */
1864       sctx.shift_hi       = sctx.shift_fraction;
1865       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1866       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1867       sctx.newshift       = PETSC_TRUE;
1868       sctx.nshift++;
1869     }
1870   } while (sctx.newshift);
1871 
1872   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1873   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1874   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1875   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1876 
1877   C->ops->solve              = MatSolve_SeqAIJ;
1878   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1879   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1880   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1881   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1882   C->assembled    = PETSC_TRUE;
1883   C->preallocated = PETSC_TRUE;
1884   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1885 
1886   /* MatShiftView(A,info,&sctx) */
1887   if (sctx.nshift){
1888     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1889       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1890     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1891       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1892     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1893       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1894     }
1895   }
1896   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1897   PetscFunctionReturn(0);
1898 }
1899 
1900 #undef __FUNCT__
1901 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1902 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1903 {
1904   Mat               C = B;
1905   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1906   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1907   PetscErrorCode    ierr;
1908   const PetscInt    *r,*ic,*c,*ics;
1909   PetscInt          n = A->rmap->n,*bi = b->i;
1910   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1911   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1912   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1913   PetscScalar       mul1,mul2,mul3,tmp;
1914   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1915   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1916   PetscReal         rs=0.0;
1917   FactorShiftCtx    sctx;
1918 
1919   PetscFunctionBegin;
1920   sctx.shift_top      = 0;
1921   sctx.nshift_max     = 0;
1922   sctx.shift_lo       = 0;
1923   sctx.shift_hi       = 0;
1924   sctx.shift_fraction = 0;
1925 
1926   /* if both shift schemes are chosen by user, only use info->shiftpd */
1927   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1928     sctx.shift_top = 0;
1929     for (i=0; i<n; i++) {
1930       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1931       rs    = 0.0;
1932       ajtmp = aj + ai[i];
1933       rtmp1 = aa + ai[i];
1934       nz = ai[i+1] - ai[i];
1935       for (j=0; j<nz; j++){
1936         if (*ajtmp != i){
1937           rs += PetscAbsScalar(*rtmp1++);
1938         } else {
1939           rs -= PetscRealPart(*rtmp1++);
1940         }
1941         ajtmp++;
1942       }
1943       if (rs>sctx.shift_top) sctx.shift_top = rs;
1944     }
1945     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1946     sctx.shift_top *= 1.1;
1947     sctx.nshift_max = 5;
1948     sctx.shift_lo   = 0.;
1949     sctx.shift_hi   = 1.;
1950   }
1951   sctx.shift_amount = 0;
1952   sctx.nshift       = 0;
1953 
1954   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1955   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1956   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1957   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1958   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1959   ics   = ic ;
1960   rtmp22 = rtmp11 + n;
1961   rtmp33 = rtmp22 + n;
1962 
1963   node_max = a->inode.node_count;
1964   ns       = a->inode.size;
1965   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1966 
1967   /* If max inode size > 3, split it into two inodes.*/
1968   /* also map the inode sizes according to the ordering */
1969   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1970   for (i=0,j=0; i<node_max; ++i,++j){
1971     if (ns[i]>3) {
1972       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1973       ++j;
1974       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1975     } else {
1976       tmp_vec1[j] = ns[i];
1977     }
1978   }
1979   /* Use the correct node_max */
1980   node_max = j;
1981 
1982   /* Now reorder the inode info based on mat re-ordering info */
1983   /* First create a row -> inode_size_array_index map */
1984   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1985   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1986   for (i=0,row=0; i<node_max; i++) {
1987     nodesz = tmp_vec1[i];
1988     for (j=0; j<nodesz; j++,row++) {
1989       nsmap[row] = i;
1990     }
1991   }
1992   /* Using nsmap, create a reordered ns structure */
1993   for (i=0,j=0; i< node_max; i++) {
1994     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1995     tmp_vec2[i]  = nodesz;
1996     j           += nodesz;
1997   }
1998   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1999   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2000   /* Now use the correct ns */
2001   ns = tmp_vec2;
2002 
2003   do {
2004     sctx.newshift = PETSC_FALSE;
2005     /* Now loop over each block-row, and do the factorization */
2006     for (i=0,row=0; i<node_max; i++) {
2007       nodesz = ns[i];
2008       nz     = bi[row+1] - bi[row];
2009       bjtmp  = bj + bi[row];
2010 
2011       switch (nodesz){
2012       case 1:
2013         for  (j=0; j<nz; j++){
2014           idx        = bjtmp[j];
2015           rtmp11[idx] = 0.0;
2016         }
2017 
2018         /* load in initial (unfactored row) */
2019         idx    = r[row];
2020         nz_tmp = ai[idx+1] - ai[idx];
2021         ajtmp  = aj + ai[idx];
2022         v1     = aa + ai[idx];
2023 
2024         for (j=0; j<nz_tmp; j++) {
2025           idx        = ics[ajtmp[j]];
2026           rtmp11[idx] = v1[j];
2027         }
2028         rtmp11[ics[r[row]]] += sctx.shift_amount;
2029 
2030         prow = *bjtmp++ ;
2031         while (prow < row) {
2032           pc1 = rtmp11 + prow;
2033           if (*pc1 != 0.0){
2034             pv   = ba + bd[prow];
2035             pj   = nbj + bd[prow];
2036             mul1 = *pc1 * *pv++;
2037             *pc1 = mul1;
2038             nz_tmp = bi[prow+1] - bd[prow] - 1;
2039             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2040             for (j=0; j<nz_tmp; j++) {
2041               tmp = pv[j];
2042               idx = pj[j];
2043               rtmp11[idx] -= mul1 * tmp;
2044             }
2045           }
2046           prow = *bjtmp++ ;
2047         }
2048         pj  = bj + bi[row];
2049         pc1 = ba + bi[row];
2050 
2051         sctx.pv    = rtmp11[row];
2052         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2053         rs         = 0.0;
2054         for (j=0; j<nz; j++) {
2055           idx    = pj[j];
2056           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2057           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2058         }
2059         sctx.rs  = rs;
2060         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2061         if (sctx.newshift) goto endofwhile;
2062         break;
2063 
2064       case 2:
2065         for (j=0; j<nz; j++) {
2066           idx        = bjtmp[j];
2067           rtmp11[idx] = 0.0;
2068           rtmp22[idx] = 0.0;
2069         }
2070 
2071         /* load in initial (unfactored row) */
2072         idx    = r[row];
2073         nz_tmp = ai[idx+1] - ai[idx];
2074         ajtmp  = aj + ai[idx];
2075         v1     = aa + ai[idx];
2076         v2     = aa + ai[idx+1];
2077         for (j=0; j<nz_tmp; j++) {
2078           idx        = ics[ajtmp[j]];
2079           rtmp11[idx] = v1[j];
2080           rtmp22[idx] = v2[j];
2081         }
2082         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2083         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2084 
2085         prow = *bjtmp++ ;
2086         while (prow < row) {
2087           pc1 = rtmp11 + prow;
2088           pc2 = rtmp22 + prow;
2089           if (*pc1 != 0.0 || *pc2 != 0.0){
2090             pv   = ba + bd[prow];
2091             pj   = nbj + bd[prow];
2092             mul1 = *pc1 * *pv;
2093             mul2 = *pc2 * *pv;
2094             ++pv;
2095             *pc1 = mul1;
2096             *pc2 = mul2;
2097 
2098             nz_tmp = bi[prow+1] - bd[prow] - 1;
2099             for (j=0; j<nz_tmp; j++) {
2100               tmp = pv[j];
2101               idx = pj[j];
2102               rtmp11[idx] -= mul1 * tmp;
2103               rtmp22[idx] -= mul2 * tmp;
2104             }
2105             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2106           }
2107           prow = *bjtmp++ ;
2108         }
2109 
2110         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2111         pc1 = rtmp11 + prow;
2112         pc2 = rtmp22 + prow;
2113 
2114         sctx.pv = *pc1;
2115         pj      = bj + bi[prow];
2116         rs      = 0.0;
2117         for (j=0; j<nz; j++){
2118           idx = pj[j];
2119           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2120         }
2121         sctx.rs = rs;
2122         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2123         if (sctx.newshift) goto endofwhile;
2124 
2125         if (*pc2 != 0.0){
2126           pj     = nbj + bd[prow];
2127           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2128           *pc2   = mul2;
2129           nz_tmp = bi[prow+1] - bd[prow] - 1;
2130           for (j=0; j<nz_tmp; j++) {
2131             idx = pj[j] ;
2132             tmp = rtmp11[idx];
2133             rtmp22[idx] -= mul2 * tmp;
2134           }
2135           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2136         }
2137 
2138         pj  = bj + bi[row];
2139         pc1 = ba + bi[row];
2140         pc2 = ba + bi[row+1];
2141 
2142         sctx.pv = rtmp22[row+1];
2143         rs = 0.0;
2144         rtmp11[row]   = 1.0/rtmp11[row];
2145         rtmp22[row+1] = 1.0/rtmp22[row+1];
2146         /* copy row entries from dense representation to sparse */
2147         for (j=0; j<nz; j++) {
2148           idx    = pj[j];
2149           pc1[j] = rtmp11[idx];
2150           pc2[j] = rtmp22[idx];
2151           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2152         }
2153         sctx.rs = rs;
2154         ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr);
2155         if (sctx.newshift) goto endofwhile;
2156         break;
2157 
2158       case 3:
2159         for  (j=0; j<nz; j++) {
2160           idx        = bjtmp[j];
2161           rtmp11[idx] = 0.0;
2162           rtmp22[idx] = 0.0;
2163           rtmp33[idx] = 0.0;
2164         }
2165         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2166         idx    = r[row];
2167         nz_tmp = ai[idx+1] - ai[idx];
2168         ajtmp = aj + ai[idx];
2169         v1    = aa + ai[idx];
2170         v2    = aa + ai[idx+1];
2171         v3    = aa + ai[idx+2];
2172         for (j=0; j<nz_tmp; j++) {
2173           idx        = ics[ajtmp[j]];
2174           rtmp11[idx] = v1[j];
2175           rtmp22[idx] = v2[j];
2176           rtmp33[idx] = v3[j];
2177         }
2178         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2179         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2180         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2181 
2182         /* loop over all pivot row blocks above this row block */
2183         prow = *bjtmp++ ;
2184         while (prow < row) {
2185           pc1 = rtmp11 + prow;
2186           pc2 = rtmp22 + prow;
2187           pc3 = rtmp33 + prow;
2188           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2189             pv   = ba  + bd[prow];
2190             pj   = nbj + bd[prow];
2191             mul1 = *pc1 * *pv;
2192             mul2 = *pc2 * *pv;
2193             mul3 = *pc3 * *pv;
2194             ++pv;
2195             *pc1 = mul1;
2196             *pc2 = mul2;
2197             *pc3 = mul3;
2198 
2199             nz_tmp = bi[prow+1] - bd[prow] - 1;
2200             /* update this row based on pivot row */
2201             for (j=0; j<nz_tmp; j++) {
2202               tmp = pv[j];
2203               idx = pj[j];
2204               rtmp11[idx] -= mul1 * tmp;
2205               rtmp22[idx] -= mul2 * tmp;
2206               rtmp33[idx] -= mul3 * tmp;
2207             }
2208             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
2209           }
2210           prow = *bjtmp++ ;
2211         }
2212 
2213         /* Now take care of diagonal 3x3 block in this set of rows */
2214         /* note: prow = row here */
2215         pc1 = rtmp11 + prow;
2216         pc2 = rtmp22 + prow;
2217         pc3 = rtmp33 + prow;
2218 
2219         sctx.pv = *pc1;
2220         pj      = bj + bi[prow];
2221         rs      = 0.0;
2222         for (j=0; j<nz; j++){
2223           idx = pj[j];
2224           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2225         }
2226         sctx.rs = rs;
2227         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2228         if (sctx.newshift) goto endofwhile;
2229 
2230         if (*pc2 != 0.0 || *pc3 != 0.0){
2231           mul2 = (*pc2)/(*pc1);
2232           mul3 = (*pc3)/(*pc1);
2233           *pc2 = mul2;
2234           *pc3 = mul3;
2235           nz_tmp = bi[prow+1] - bd[prow] - 1;
2236           pj     = nbj + bd[prow];
2237           for (j=0; j<nz_tmp; j++) {
2238             idx = pj[j] ;
2239             tmp = rtmp11[idx];
2240             rtmp22[idx] -= mul2 * tmp;
2241             rtmp33[idx] -= mul3 * tmp;
2242           }
2243           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2244         }
2245         ++prow;
2246 
2247         pc2 = rtmp22 + prow;
2248         pc3 = rtmp33 + prow;
2249         sctx.pv = *pc2;
2250         pj      = bj + bi[prow];
2251         rs      = 0.0;
2252         for (j=0; j<nz; j++){
2253           idx = pj[j];
2254           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2255         }
2256         sctx.rs = rs;
2257         ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr);
2258         if (sctx.newshift) goto endofwhile;
2259 
2260         if (*pc3 != 0.0){
2261           mul3   = (*pc3)/(*pc2);
2262           *pc3   = mul3;
2263           pj     = nbj + bd[prow];
2264           nz_tmp = bi[prow+1] - bd[prow] - 1;
2265           for (j=0; j<nz_tmp; j++) {
2266             idx = pj[j] ;
2267             tmp = rtmp22[idx];
2268             rtmp33[idx] -= mul3 * tmp;
2269           }
2270           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2271         }
2272 
2273         pj  = bj + bi[row];
2274         pc1 = ba + bi[row];
2275         pc2 = ba + bi[row+1];
2276         pc3 = ba + bi[row+2];
2277 
2278         sctx.pv = rtmp33[row+2];
2279         rs = 0.0;
2280         rtmp11[row]   = 1.0/rtmp11[row];
2281         rtmp22[row+1] = 1.0/rtmp22[row+1];
2282         rtmp33[row+2] = 1.0/rtmp33[row+2];
2283         /* copy row entries from dense representation to sparse */
2284         for (j=0; j<nz; j++) {
2285           idx    = pj[j];
2286           pc1[j] = rtmp11[idx];
2287           pc2[j] = rtmp22[idx];
2288           pc3[j] = rtmp33[idx];
2289           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2290         }
2291 
2292         sctx.rs = rs;
2293         ierr = MatPivotCheck(info,&sctx,row+2);CHKERRQ(ierr);
2294         if (sctx.newshift) goto endofwhile;
2295         break;
2296 
2297       default:
2298         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2299       }
2300       row += nodesz;                 /* Update the row */
2301     }
2302     endofwhile:;
2303   } while (sctx.newshift);
2304   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2305   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2306   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2307   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2308   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2309   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2310   /* do not set solve add, since MatSolve_Inode + Add is faster */
2311   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2312   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2313   C->assembled   = PETSC_TRUE;
2314   C->preallocated = PETSC_TRUE;
2315   if (sctx.nshift) {
2316     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2317       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2318     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2319       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2320     }
2321   }
2322   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2323   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2324   PetscFunctionReturn(0);
2325 }
2326 
2327 
2328 /* ----------------------------------------------------------- */
2329 #undef __FUNCT__
2330 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2331 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2332 {
2333   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2334   IS                iscol = a->col,isrow = a->row;
2335   PetscErrorCode    ierr;
2336   const PetscInt    *r,*c,*rout,*cout;
2337   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2338   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2339   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2340   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2341   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2342   const PetscScalar *b;
2343 
2344   PetscFunctionBegin;
2345   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2346   node_max = a->inode.node_count;
2347   ns       = a->inode.size;     /* Node Size array */
2348 
2349   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2350   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2351   tmp  = a->solve_work;
2352 
2353   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2354   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2355 
2356   /* forward solve the lower triangular */
2357   tmps = tmp ;
2358   aa   = a_a ;
2359   aj   = a_j ;
2360   ad   = a->diag;
2361 
2362   for (i = 0,row = 0; i< node_max; ++i){
2363     nsz = ns[i];
2364     aii = ai[row];
2365     v1  = aa + aii;
2366     vi  = aj + aii;
2367     nz  = ai[row+1]- ai[row];
2368 
2369     if (i < node_max-1) {
2370       /* Prefetch the indices for the next block */
2371       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2372       /* Prefetch the data for the next block */
2373       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2374     }
2375 
2376     switch (nsz){               /* Each loop in 'case' is unrolled */
2377     case 1 :
2378       sum1 = b[r[row]];
2379       for(j=0; j<nz-1; j+=2){
2380         i0   = vi[j];
2381         i1   = vi[j+1];
2382         tmp0 = tmps[i0];
2383         tmp1 = tmps[i1];
2384         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2385       }
2386       if(j == nz-1){
2387         tmp0 = tmps[vi[j]];
2388         sum1 -= v1[j]*tmp0;
2389       }
2390       tmp[row++]=sum1;
2391       break;
2392     case 2:
2393       sum1 = b[r[row]];
2394       sum2 = b[r[row+1]];
2395       v2   = aa + ai[row+1];
2396 
2397       for(j=0; j<nz-1; j+=2){
2398         i0   = vi[j];
2399         i1   = vi[j+1];
2400         tmp0 = tmps[i0];
2401         tmp1 = tmps[i1];
2402         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2403         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2404       }
2405       if(j == nz-1){
2406         tmp0 = tmps[vi[j]];
2407         sum1 -= v1[j] *tmp0;
2408         sum2 -= v2[j] *tmp0;
2409       }
2410       sum2 -= v2[nz] * sum1;
2411       tmp[row ++]=sum1;
2412       tmp[row ++]=sum2;
2413       break;
2414     case 3:
2415       sum1 = b[r[row]];
2416       sum2 = b[r[row+1]];
2417       sum3 = b[r[row+2]];
2418       v2   = aa + ai[row+1];
2419       v3   = aa + ai[row+2];
2420 
2421       for (j=0; j<nz-1; j+=2){
2422         i0   = vi[j];
2423         i1   = vi[j+1];
2424         tmp0 = tmps[i0];
2425         tmp1 = tmps[i1];
2426         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2427         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2428         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2429       }
2430       if (j == nz-1){
2431         tmp0 = tmps[vi[j]];
2432         sum1 -= v1[j] *tmp0;
2433         sum2 -= v2[j] *tmp0;
2434         sum3 -= v3[j] *tmp0;
2435       }
2436       sum2 -= v2[nz] * sum1;
2437       sum3 -= v3[nz] * sum1;
2438       sum3 -= v3[nz+1] * sum2;
2439       tmp[row ++]=sum1;
2440       tmp[row ++]=sum2;
2441       tmp[row ++]=sum3;
2442       break;
2443 
2444     case 4:
2445       sum1 = b[r[row]];
2446       sum2 = b[r[row+1]];
2447       sum3 = b[r[row+2]];
2448       sum4 = b[r[row+3]];
2449       v2   = aa + ai[row+1];
2450       v3   = aa + ai[row+2];
2451       v4   = aa + ai[row+3];
2452 
2453       for (j=0; j<nz-1; j+=2){
2454         i0   = vi[j];
2455         i1   = vi[j+1];
2456         tmp0 = tmps[i0];
2457         tmp1 = tmps[i1];
2458         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2459         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2460         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2461         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2462       }
2463       if (j == nz-1){
2464         tmp0 = tmps[vi[j]];
2465         sum1 -= v1[j] *tmp0;
2466         sum2 -= v2[j] *tmp0;
2467         sum3 -= v3[j] *tmp0;
2468         sum4 -= v4[j] *tmp0;
2469       }
2470       sum2 -= v2[nz] * sum1;
2471       sum3 -= v3[nz] * sum1;
2472       sum4 -= v4[nz] * sum1;
2473       sum3 -= v3[nz+1] * sum2;
2474       sum4 -= v4[nz+1] * sum2;
2475       sum4 -= v4[nz+2] * sum3;
2476 
2477       tmp[row ++]=sum1;
2478       tmp[row ++]=sum2;
2479       tmp[row ++]=sum3;
2480       tmp[row ++]=sum4;
2481       break;
2482     case 5:
2483       sum1 = b[r[row]];
2484       sum2 = b[r[row+1]];
2485       sum3 = b[r[row+2]];
2486       sum4 = b[r[row+3]];
2487       sum5 = b[r[row+4]];
2488       v2   = aa + ai[row+1];
2489       v3   = aa + ai[row+2];
2490       v4   = aa + ai[row+3];
2491       v5   = aa + ai[row+4];
2492 
2493       for (j=0; j<nz-1; j+=2){
2494         i0   = vi[j];
2495         i1   = vi[j+1];
2496         tmp0 = tmps[i0];
2497         tmp1 = tmps[i1];
2498         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2499         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2500         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2501         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2502         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2503       }
2504       if (j == nz-1){
2505         tmp0 = tmps[vi[j]];
2506         sum1 -= v1[j] *tmp0;
2507         sum2 -= v2[j] *tmp0;
2508         sum3 -= v3[j] *tmp0;
2509         sum4 -= v4[j] *tmp0;
2510         sum5 -= v5[j] *tmp0;
2511       }
2512 
2513       sum2 -= v2[nz] * sum1;
2514       sum3 -= v3[nz] * sum1;
2515       sum4 -= v4[nz] * sum1;
2516       sum5 -= v5[nz] * sum1;
2517       sum3 -= v3[nz+1] * sum2;
2518       sum4 -= v4[nz+1] * sum2;
2519       sum5 -= v5[nz+1] * sum2;
2520       sum4 -= v4[nz+2] * sum3;
2521       sum5 -= v5[nz+2] * sum3;
2522       sum5 -= v5[nz+3] * sum4;
2523 
2524       tmp[row ++]=sum1;
2525       tmp[row ++]=sum2;
2526       tmp[row ++]=sum3;
2527       tmp[row ++]=sum4;
2528       tmp[row ++]=sum5;
2529       break;
2530     default:
2531       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2532     }
2533   }
2534   /* backward solve the upper triangular */
2535   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2536     nsz = ns[i];
2537     aii = ad[row+1] + 1;
2538     v1  = aa + aii;
2539     vi  = aj + aii;
2540     nz  = ad[row]- ad[row+1] - 1;
2541 
2542     if (i > 0) {
2543       /* Prefetch the indices for the next block */
2544       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2545       /* Prefetch the data for the next block */
2546       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2547     }
2548 
2549     switch (nsz){               /* Each loop in 'case' is unrolled */
2550     case 1 :
2551       sum1 = tmp[row];
2552 
2553       for(j=0 ; j<nz-1; j+=2){
2554         i0   = vi[j];
2555         i1   = vi[j+1];
2556         tmp0 = tmps[i0];
2557         tmp1 = tmps[i1];
2558         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2559       }
2560       if (j == nz-1){
2561         tmp0  = tmps[vi[j]];
2562         sum1 -= v1[j]*tmp0;
2563       }
2564       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2565       break;
2566     case 2 :
2567       sum1 = tmp[row];
2568       sum2 = tmp[row-1];
2569       v2   = aa + ad[row] + 1;
2570       for (j=0 ; j<nz-1; j+=2){
2571         i0   = vi[j];
2572         i1   = vi[j+1];
2573         tmp0 = tmps[i0];
2574         tmp1 = tmps[i1];
2575         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2576         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2577       }
2578       if (j == nz-1){
2579         tmp0  = tmps[vi[j]];
2580         sum1 -= v1[j]* tmp0;
2581         sum2 -= v2[j+1]* tmp0;
2582       }
2583 
2584       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2585       sum2   -= v2[0] * tmp0;
2586       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2587       break;
2588     case 3 :
2589       sum1 = tmp[row];
2590       sum2 = tmp[row -1];
2591       sum3 = tmp[row -2];
2592       v2   = aa + ad[row] + 1;
2593       v3   = aa + ad[row -1] + 1;
2594       for (j=0 ; j<nz-1; j+=2){
2595         i0   = vi[j];
2596         i1   = vi[j+1];
2597         tmp0 = tmps[i0];
2598         tmp1 = tmps[i1];
2599         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2600         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2601         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2602       }
2603       if (j== nz-1){
2604         tmp0  = tmps[vi[j]];
2605         sum1 -= v1[j] * tmp0;
2606         sum2 -= v2[j+1] * tmp0;
2607         sum3 -= v3[j+2] * tmp0;
2608       }
2609       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2610       sum2   -= v2[0]* tmp0;
2611       sum3   -= v3[1] * tmp0;
2612       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2613       sum3   -= v3[0]* tmp0;
2614       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2615 
2616       break;
2617     case 4 :
2618       sum1 = tmp[row];
2619       sum2 = tmp[row -1];
2620       sum3 = tmp[row -2];
2621       sum4 = tmp[row -3];
2622       v2   = aa + ad[row]+1;
2623       v3   = aa + ad[row -1]+1;
2624       v4   = aa + ad[row -2]+1;
2625 
2626       for (j=0 ; j<nz-1; j+=2){
2627         i0   = vi[j];
2628         i1   = vi[j+1];
2629         tmp0 = tmps[i0];
2630         tmp1 = tmps[i1];
2631         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2632         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2633         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2634         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2635       }
2636       if (j== nz-1){
2637         tmp0  = tmps[vi[j]];
2638         sum1 -= v1[j] * tmp0;
2639         sum2 -= v2[j+1] * tmp0;
2640         sum3 -= v3[j+2] * tmp0;
2641         sum4 -= v4[j+3] * tmp0;
2642       }
2643 
2644       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2645       sum2   -= v2[0] * tmp0;
2646       sum3   -= v3[1] * tmp0;
2647       sum4   -= v4[2] * tmp0;
2648       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2649       sum3   -= v3[0] * tmp0;
2650       sum4   -= v4[1] * tmp0;
2651       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2652       sum4   -= v4[0] * tmp0;
2653       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2654       break;
2655     case 5 :
2656       sum1 = tmp[row];
2657       sum2 = tmp[row -1];
2658       sum3 = tmp[row -2];
2659       sum4 = tmp[row -3];
2660       sum5 = tmp[row -4];
2661       v2   = aa + ad[row]+1;
2662       v3   = aa + ad[row -1]+1;
2663       v4   = aa + ad[row -2]+1;
2664       v5   = aa + ad[row -3]+1;
2665       for (j=0 ; j<nz-1; j+=2){
2666         i0   = vi[j];
2667         i1   = vi[j+1];
2668         tmp0 = tmps[i0];
2669         tmp1 = tmps[i1];
2670         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2671         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2672         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2673         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2674         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2675       }
2676       if (j==nz-1){
2677         tmp0  = tmps[vi[j]];
2678         sum1 -= v1[j] * tmp0;
2679         sum2 -= v2[j+1] * tmp0;
2680         sum3 -= v3[j+2] * tmp0;
2681         sum4 -= v4[j+3] * tmp0;
2682         sum5 -= v5[j+4] * tmp0;
2683       }
2684 
2685       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2686       sum2   -= v2[0] * tmp0;
2687       sum3   -= v3[1] * tmp0;
2688       sum4   -= v4[2] * tmp0;
2689       sum5   -= v5[3] * tmp0;
2690       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2691       sum3   -= v3[0] * tmp0;
2692       sum4   -= v4[1] * tmp0;
2693       sum5   -= v5[2] * tmp0;
2694       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2695       sum4   -= v4[0] * tmp0;
2696       sum5   -= v5[1] * tmp0;
2697       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2698       sum5   -= v5[0] * tmp0;
2699       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2700       break;
2701     default:
2702       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2703     }
2704   }
2705   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2706   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2707   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2708   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2709   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2710   PetscFunctionReturn(0);
2711 }
2712 
2713 
2714 /*
2715      Makes a longer coloring[] array and calls the usual code with that
2716 */
2717 #undef __FUNCT__
2718 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2719 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2720 {
2721   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2722   PetscErrorCode  ierr;
2723   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2724   PetscInt        *colorused,i;
2725   ISColoringValue *newcolor;
2726 
2727   PetscFunctionBegin;
2728   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2729   /* loop over inodes, marking a color for each column*/
2730   row = 0;
2731   for (i=0; i<m; i++){
2732     for (j=0; j<ns[i]; j++) {
2733       newcolor[row++] = coloring[i] + j*ncolors;
2734     }
2735   }
2736 
2737   /* eliminate unneeded colors */
2738   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2739   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2740   for (i=0; i<n; i++) {
2741     colorused[newcolor[i]] = 1;
2742   }
2743 
2744   for (i=1; i<5*ncolors; i++) {
2745     colorused[i] += colorused[i-1];
2746   }
2747   ncolors = colorused[5*ncolors-1];
2748   for (i=0; i<n; i++) {
2749     newcolor[i] = colorused[newcolor[i]]-1;
2750   }
2751   ierr = PetscFree(colorused);CHKERRQ(ierr);
2752   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2753   ierr = PetscFree(coloring);CHKERRQ(ierr);
2754   PetscFunctionReturn(0);
2755 }
2756 
2757 #include "../src/mat/blockinvert.h"
2758 
2759 #undef __FUNCT__
2760 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2761 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2762 {
2763   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2764   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2765   MatScalar          *ibdiag,*bdiag,work[25];
2766   PetscScalar        *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2767   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2768   const PetscScalar  *xb;
2769   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2770   PetscErrorCode     ierr;
2771   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2772   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2773 
2774   PetscFunctionBegin;
2775   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2776   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2777   if (its > 1) {
2778     /* switch to non-inode version */
2779     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2780     PetscFunctionReturn(0);
2781   }
2782 
2783   if (!a->inode.ibdiagvalid) {
2784     if (!a->inode.ibdiag) {
2785       /* calculate space needed for diagonal blocks */
2786       for (i=0; i<m; i++) {
2787 	cnt += sizes[i]*sizes[i];
2788       }
2789       a->inode.bdiagsize = cnt;
2790       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2791     }
2792 
2793     /* copy over the diagonal blocks and invert them */
2794     ibdiag = a->inode.ibdiag;
2795     bdiag  = a->inode.bdiag;
2796     cnt = 0;
2797     for (i=0, row = 0; i<m; i++) {
2798       for (j=0; j<sizes[i]; j++) {
2799         for (k=0; k<sizes[i]; k++) {
2800           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2801         }
2802       }
2803       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2804 
2805       switch(sizes[i]) {
2806         case 1:
2807           /* Create matrix data structure */
2808           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2809           ibdiag[cnt] = 1.0/ibdiag[cnt];
2810           break;
2811         case 2:
2812           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2813           break;
2814         case 3:
2815           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2816           break;
2817         case 4:
2818           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2819           break;
2820         case 5:
2821           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2822           break;
2823        default:
2824 	 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2825       }
2826       cnt += sizes[i]*sizes[i];
2827       row += sizes[i];
2828     }
2829     a->inode.ibdiagvalid = PETSC_TRUE;
2830   }
2831   ibdiag = a->inode.ibdiag;
2832   bdiag  = a->inode.bdiag;
2833 
2834 
2835   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2836   if (flag & SOR_ZERO_INITIAL_GUESS) {
2837     const PetscScalar *b;
2838     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2839     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2840     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2841 
2842       for (i=0, row=0; i<m; i++) {
2843         sz  = diag[row] - ii[row];
2844         v1  = a->a + ii[row];
2845         idx = a->j + ii[row];
2846 
2847         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2848         switch (sizes[i]){
2849           case 1:
2850 
2851             sum1  = b[row];
2852             for(n = 0; n<sz-1; n+=2) {
2853               i1   = idx[0];
2854               i2   = idx[1];
2855               idx += 2;
2856               tmp0 = x[i1];
2857               tmp1 = x[i2];
2858               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2859             }
2860 
2861             if (n == sz-1){
2862               tmp0  = x[*idx];
2863               sum1 -= *v1 * tmp0;
2864             }
2865             x[row++] = sum1*(*ibdiag++);
2866             break;
2867           case 2:
2868             v2    = a->a + ii[row+1];
2869             sum1  = b[row];
2870             sum2  = b[row+1];
2871             for(n = 0; n<sz-1; n+=2) {
2872               i1   = idx[0];
2873               i2   = idx[1];
2874               idx += 2;
2875               tmp0 = x[i1];
2876               tmp1 = x[i2];
2877               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2878               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2879             }
2880 
2881             if (n == sz-1){
2882               tmp0  = x[*idx];
2883               sum1 -= v1[0] * tmp0;
2884               sum2 -= v2[0] * tmp0;
2885             }
2886             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2887             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2888             ibdiag  += 4;
2889             break;
2890           case 3:
2891             v2    = a->a + ii[row+1];
2892             v3    = a->a + ii[row+2];
2893             sum1  = b[row];
2894             sum2  = b[row+1];
2895             sum3  = b[row+2];
2896             for(n = 0; n<sz-1; n+=2) {
2897               i1   = idx[0];
2898               i2   = idx[1];
2899               idx += 2;
2900               tmp0 = x[i1];
2901               tmp1 = x[i2];
2902               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2903               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2904               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2905             }
2906 
2907             if (n == sz-1){
2908               tmp0  = x[*idx];
2909               sum1 -= v1[0] * tmp0;
2910               sum2 -= v2[0] * tmp0;
2911               sum3 -= v3[0] * tmp0;
2912             }
2913             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2914             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2915             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2916             ibdiag  += 9;
2917             break;
2918           case 4:
2919             v2    = a->a + ii[row+1];
2920             v3    = a->a + ii[row+2];
2921             v4    = a->a + ii[row+3];
2922             sum1  = b[row];
2923             sum2  = b[row+1];
2924             sum3  = b[row+2];
2925             sum4  = b[row+3];
2926             for(n = 0; n<sz-1; n+=2) {
2927               i1   = idx[0];
2928               i2   = idx[1];
2929               idx += 2;
2930               tmp0 = x[i1];
2931               tmp1 = x[i2];
2932               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2933               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2934               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2935               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2936             }
2937 
2938             if (n == sz-1){
2939               tmp0  = x[*idx];
2940               sum1 -= v1[0] * tmp0;
2941               sum2 -= v2[0] * tmp0;
2942               sum3 -= v3[0] * tmp0;
2943               sum4 -= v4[0] * tmp0;
2944             }
2945             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2946             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2947             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2948             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2949             ibdiag  += 16;
2950             break;
2951           case 5:
2952             v2    = a->a + ii[row+1];
2953             v3    = a->a + ii[row+2];
2954             v4    = a->a + ii[row+3];
2955             v5    = a->a + ii[row+4];
2956             sum1  = b[row];
2957             sum2  = b[row+1];
2958             sum3  = b[row+2];
2959             sum4  = b[row+3];
2960             sum5  = b[row+4];
2961             for(n = 0; n<sz-1; n+=2) {
2962               i1   = idx[0];
2963               i2   = idx[1];
2964               idx += 2;
2965               tmp0 = x[i1];
2966               tmp1 = x[i2];
2967               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2968               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2969               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2970               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2971               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2972             }
2973 
2974             if (n == sz-1){
2975               tmp0  = x[*idx];
2976               sum1 -= v1[0] * tmp0;
2977               sum2 -= v2[0] * tmp0;
2978               sum3 -= v3[0] * tmp0;
2979               sum4 -= v4[0] * tmp0;
2980               sum5 -= v5[0] * tmp0;
2981             }
2982             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2983             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2984             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2985             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2986             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2987             ibdiag  += 25;
2988             break;
2989 	  default:
2990    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2991         }
2992       }
2993 
2994       xb = x;
2995       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2996     } else xb = b;
2997     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2998         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2999       cnt = 0;
3000       for (i=0, row=0; i<m; i++) {
3001 
3002         switch (sizes[i]){
3003           case 1:
3004             x[row++] *= bdiag[cnt++];
3005             break;
3006           case 2:
3007             x1   = x[row]; x2 = x[row+1];
3008             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3009             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3010             x[row++] = tmp1;
3011             x[row++] = tmp2;
3012             cnt += 4;
3013             break;
3014           case 3:
3015             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3016             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3017             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3018             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3019             x[row++] = tmp1;
3020             x[row++] = tmp2;
3021             x[row++] = tmp3;
3022             cnt += 9;
3023             break;
3024           case 4:
3025             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3026             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3027             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3028             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3029             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3030             x[row++] = tmp1;
3031             x[row++] = tmp2;
3032             x[row++] = tmp3;
3033             x[row++] = tmp4;
3034             cnt += 16;
3035             break;
3036           case 5:
3037             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3038             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3039             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3040             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3041             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3042             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3043             x[row++] = tmp1;
3044             x[row++] = tmp2;
3045             x[row++] = tmp3;
3046             x[row++] = tmp4;
3047             x[row++] = tmp5;
3048             cnt += 25;
3049             break;
3050 	  default:
3051    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3052         }
3053       }
3054       ierr = PetscLogFlops(m);CHKERRQ(ierr);
3055     }
3056     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3057 
3058       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3059       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3060         ibdiag -= sizes[i]*sizes[i];
3061         sz      = ii[row+1] - diag[row] - 1;
3062         v1      = a->a + diag[row] + 1;
3063         idx     = a->j + diag[row] + 1;
3064 
3065         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3066         switch (sizes[i]){
3067           case 1:
3068 
3069             sum1  = xb[row];
3070             for(n = 0; n<sz-1; n+=2) {
3071               i1   = idx[0];
3072               i2   = idx[1];
3073               idx += 2;
3074               tmp0 = x[i1];
3075               tmp1 = x[i2];
3076               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3077             }
3078 
3079             if (n == sz-1){
3080               tmp0  = x[*idx];
3081               sum1 -= *v1*tmp0;
3082             }
3083             x[row--] = sum1*(*ibdiag);
3084             break;
3085 
3086           case 2:
3087 
3088             sum1  = xb[row];
3089             sum2  = xb[row-1];
3090             /* note that sum1 is associated with the second of the two rows */
3091             v2    = a->a + diag[row-1] + 2;
3092             for(n = 0; n<sz-1; n+=2) {
3093               i1   = idx[0];
3094               i2   = idx[1];
3095               idx += 2;
3096               tmp0 = x[i1];
3097               tmp1 = x[i2];
3098               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3099               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3100             }
3101 
3102             if (n == sz-1){
3103               tmp0  = x[*idx];
3104               sum1 -= *v1*tmp0;
3105               sum2 -= *v2*tmp0;
3106             }
3107             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3108             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3109             break;
3110           case 3:
3111 
3112             sum1  = xb[row];
3113             sum2  = xb[row-1];
3114             sum3  = xb[row-2];
3115             v2    = a->a + diag[row-1] + 2;
3116             v3    = a->a + diag[row-2] + 3;
3117             for(n = 0; n<sz-1; n+=2) {
3118               i1   = idx[0];
3119               i2   = idx[1];
3120               idx += 2;
3121               tmp0 = x[i1];
3122               tmp1 = x[i2];
3123               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3124               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3125               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3126             }
3127 
3128             if (n == sz-1){
3129               tmp0  = x[*idx];
3130               sum1 -= *v1*tmp0;
3131               sum2 -= *v2*tmp0;
3132               sum3 -= *v3*tmp0;
3133             }
3134             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3135             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3136             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3137             break;
3138           case 4:
3139 
3140             sum1  = xb[row];
3141             sum2  = xb[row-1];
3142             sum3  = xb[row-2];
3143             sum4  = xb[row-3];
3144             v2    = a->a + diag[row-1] + 2;
3145             v3    = a->a + diag[row-2] + 3;
3146             v4    = a->a + diag[row-3] + 4;
3147             for(n = 0; n<sz-1; n+=2) {
3148               i1   = idx[0];
3149               i2   = idx[1];
3150               idx += 2;
3151               tmp0 = x[i1];
3152               tmp1 = x[i2];
3153               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3154               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3155               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3156               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3157             }
3158 
3159             if (n == sz-1){
3160               tmp0  = x[*idx];
3161               sum1 -= *v1*tmp0;
3162               sum2 -= *v2*tmp0;
3163               sum3 -= *v3*tmp0;
3164               sum4 -= *v4*tmp0;
3165             }
3166             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3167             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3168             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3169             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3170             break;
3171           case 5:
3172 
3173             sum1  = xb[row];
3174             sum2  = xb[row-1];
3175             sum3  = xb[row-2];
3176             sum4  = xb[row-3];
3177             sum5  = xb[row-4];
3178             v2    = a->a + diag[row-1] + 2;
3179             v3    = a->a + diag[row-2] + 3;
3180             v4    = a->a + diag[row-3] + 4;
3181             v5    = a->a + diag[row-4] + 5;
3182             for(n = 0; n<sz-1; n+=2) {
3183               i1   = idx[0];
3184               i2   = idx[1];
3185               idx += 2;
3186               tmp0 = x[i1];
3187               tmp1 = x[i2];
3188               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3189               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3190               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3191               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3192               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3193             }
3194 
3195             if (n == sz-1){
3196               tmp0  = x[*idx];
3197               sum1 -= *v1*tmp0;
3198               sum2 -= *v2*tmp0;
3199               sum3 -= *v3*tmp0;
3200               sum4 -= *v4*tmp0;
3201               sum5 -= *v5*tmp0;
3202             }
3203             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3204             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3205             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3206             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3207             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3208             break;
3209 	  default:
3210    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3211         }
3212       }
3213 
3214       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3215     }
3216     its--;
3217     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3218     ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3219   }
3220   if (flag & SOR_EISENSTAT) {
3221     const PetscScalar *b;
3222     MatScalar         *t = a->inode.ssor_work;
3223 
3224     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3225     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3226     /*
3227           Apply  (U + D)^-1  where D is now the block diagonal
3228     */
3229     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3230     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3231       ibdiag -= sizes[i]*sizes[i];
3232       sz      = ii[row+1] - diag[row] - 1;
3233       v1      = a->a + diag[row] + 1;
3234       idx     = a->j + diag[row] + 1;
3235       CHKMEMQ;
3236       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3237       switch (sizes[i]){
3238         case 1:
3239 
3240 	  sum1  = b[row];
3241 	  for(n = 0; n<sz-1; n+=2) {
3242 	    i1   = idx[0];
3243 	    i2   = idx[1];
3244 	    idx += 2;
3245 	    tmp0 = x[i1];
3246 	    tmp1 = x[i2];
3247 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3248 	  }
3249 
3250 	  if (n == sz-1){
3251 	    tmp0  = x[*idx];
3252 	    sum1 -= *v1*tmp0;
3253 	  }
3254 	  x[row] = sum1*(*ibdiag);row--;
3255 	  break;
3256 
3257         case 2:
3258 
3259 	  sum1  = b[row];
3260 	  sum2  = b[row-1];
3261 	  /* note that sum1 is associated with the second of the two rows */
3262 	  v2    = a->a + diag[row-1] + 2;
3263 	  for(n = 0; n<sz-1; n+=2) {
3264 	    i1   = idx[0];
3265 	    i2   = idx[1];
3266 	    idx += 2;
3267 	    tmp0 = x[i1];
3268 	    tmp1 = x[i2];
3269 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3270 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3271 	  }
3272 
3273 	  if (n == sz-1){
3274 	    tmp0  = x[*idx];
3275 	    sum1 -= *v1*tmp0;
3276 	    sum2 -= *v2*tmp0;
3277 	  }
3278 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3279 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3280           row -= 2;
3281 	  break;
3282         case 3:
3283 
3284 	  sum1  = b[row];
3285 	  sum2  = b[row-1];
3286 	  sum3  = b[row-2];
3287 	  v2    = a->a + diag[row-1] + 2;
3288 	  v3    = a->a + diag[row-2] + 3;
3289 	  for(n = 0; n<sz-1; n+=2) {
3290 	    i1   = idx[0];
3291 	    i2   = idx[1];
3292 	    idx += 2;
3293 	    tmp0 = x[i1];
3294 	    tmp1 = x[i2];
3295 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3296 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3297 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3298 	  }
3299 
3300 	  if (n == sz-1){
3301 	    tmp0  = x[*idx];
3302 	    sum1 -= *v1*tmp0;
3303 	    sum2 -= *v2*tmp0;
3304 	    sum3 -= *v3*tmp0;
3305 	  }
3306 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3307 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3308 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3309           row -= 3;
3310 	  break;
3311         case 4:
3312 
3313 	  sum1  = b[row];
3314 	  sum2  = b[row-1];
3315 	  sum3  = b[row-2];
3316 	  sum4  = b[row-3];
3317 	  v2    = a->a + diag[row-1] + 2;
3318 	  v3    = a->a + diag[row-2] + 3;
3319 	  v4    = a->a + diag[row-3] + 4;
3320 	  for(n = 0; n<sz-1; n+=2) {
3321 	    i1   = idx[0];
3322 	    i2   = idx[1];
3323 	    idx += 2;
3324 	    tmp0 = x[i1];
3325 	    tmp1 = x[i2];
3326 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3327 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3328 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3329 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3330 	  }
3331 
3332 	  if (n == sz-1){
3333 	    tmp0  = x[*idx];
3334 	    sum1 -= *v1*tmp0;
3335 	    sum2 -= *v2*tmp0;
3336 	    sum3 -= *v3*tmp0;
3337 	    sum4 -= *v4*tmp0;
3338 	  }
3339 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3340 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3341 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3342 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3343           row -= 4;
3344 	  break;
3345         case 5:
3346 
3347 	  sum1  = b[row];
3348 	  sum2  = b[row-1];
3349 	  sum3  = b[row-2];
3350 	  sum4  = b[row-3];
3351 	  sum5  = b[row-4];
3352 	  v2    = a->a + diag[row-1] + 2;
3353 	  v3    = a->a + diag[row-2] + 3;
3354 	  v4    = a->a + diag[row-3] + 4;
3355 	  v5    = a->a + diag[row-4] + 5;
3356 	  for(n = 0; n<sz-1; n+=2) {
3357 	    i1   = idx[0];
3358 	    i2   = idx[1];
3359 	    idx += 2;
3360 	    tmp0 = x[i1];
3361 	    tmp1 = x[i2];
3362 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3363 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3364 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3365 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3366 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3367 	  }
3368 
3369 	  if (n == sz-1){
3370 	    tmp0  = x[*idx];
3371 	    sum1 -= *v1*tmp0;
3372 	    sum2 -= *v2*tmp0;
3373 	    sum3 -= *v3*tmp0;
3374 	    sum4 -= *v4*tmp0;
3375 	    sum5 -= *v5*tmp0;
3376 	  }
3377 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3378 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3379 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3380 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3381 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3382           row -= 5;
3383 	  break;
3384         default:
3385 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3386       }
3387       CHKMEMQ;
3388     }
3389     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3390 
3391     /*
3392            t = b - D x    where D is the block diagonal
3393     */
3394     cnt = 0;
3395     for (i=0, row=0; i<m; i++) {
3396       CHKMEMQ;
3397       switch (sizes[i]){
3398         case 1:
3399 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3400 	  break;
3401         case 2:
3402 	  x1   = x[row]; x2 = x[row+1];
3403 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3404 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3405 	  t[row]   = b[row] - tmp1;
3406 	  t[row+1] = b[row+1] - tmp2; row += 2;
3407 	  cnt += 4;
3408 	  break;
3409         case 3:
3410 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3411 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3412 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3413 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3414 	  t[row] = b[row] - tmp1;
3415 	  t[row+1] = b[row+1] - tmp2;
3416 	  t[row+2] = b[row+2] - tmp3; row += 3;
3417 	  cnt += 9;
3418 	  break;
3419         case 4:
3420 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3421 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3422 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3423 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3424 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3425 	  t[row] = b[row] - tmp1;
3426 	  t[row+1] = b[row+1] - tmp2;
3427 	  t[row+2] = b[row+2] - tmp3;
3428 	  t[row+3] = b[row+3] - tmp4; row += 4;
3429 	  cnt += 16;
3430 	  break;
3431         case 5:
3432 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3433 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3434 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3435 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3436 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3437 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3438 	  t[row] = b[row] - tmp1;
3439 	  t[row+1] = b[row+1] - tmp2;
3440 	  t[row+2] = b[row+2] - tmp3;
3441 	  t[row+3] = b[row+3] - tmp4;
3442 	  t[row+4] = b[row+4] - tmp5;row += 5;
3443 	  cnt += 25;
3444 	  break;
3445         default:
3446 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3447       }
3448       CHKMEMQ;
3449     }
3450     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3451 
3452 
3453 
3454     /*
3455           Apply (L + D)^-1 where D is the block diagonal
3456     */
3457     for (i=0, row=0; i<m; i++) {
3458       sz  = diag[row] - ii[row];
3459       v1  = a->a + ii[row];
3460       idx = a->j + ii[row];
3461       CHKMEMQ;
3462       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3463       switch (sizes[i]){
3464         case 1:
3465 
3466 	  sum1  = t[row];
3467 	  for(n = 0; n<sz-1; n+=2) {
3468 	    i1   = idx[0];
3469 	    i2   = idx[1];
3470 	    idx += 2;
3471 	    tmp0 = t[i1];
3472 	    tmp1 = t[i2];
3473 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3474 	  }
3475 
3476 	  if (n == sz-1){
3477 	    tmp0  = t[*idx];
3478 	    sum1 -= *v1 * tmp0;
3479 	  }
3480 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3481 	  break;
3482         case 2:
3483 	  v2    = a->a + ii[row+1];
3484 	  sum1  = t[row];
3485 	  sum2  = t[row+1];
3486 	  for(n = 0; n<sz-1; n+=2) {
3487 	    i1   = idx[0];
3488 	    i2   = idx[1];
3489 	    idx += 2;
3490 	    tmp0 = t[i1];
3491 	    tmp1 = t[i2];
3492 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3493 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3494 	  }
3495 
3496 	  if (n == sz-1){
3497 	    tmp0  = t[*idx];
3498 	    sum1 -= v1[0] * tmp0;
3499 	    sum2 -= v2[0] * tmp0;
3500 	  }
3501 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3502 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3503 	  ibdiag  += 4; row += 2;
3504 	  break;
3505         case 3:
3506 	  v2    = a->a + ii[row+1];
3507 	  v3    = a->a + ii[row+2];
3508 	  sum1  = t[row];
3509 	  sum2  = t[row+1];
3510 	  sum3  = t[row+2];
3511 	  for(n = 0; n<sz-1; n+=2) {
3512 	    i1   = idx[0];
3513 	    i2   = idx[1];
3514 	    idx += 2;
3515 	    tmp0 = t[i1];
3516 	    tmp1 = t[i2];
3517 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3518 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3519 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3520 	  }
3521 
3522 	  if (n == sz-1){
3523 	    tmp0  = t[*idx];
3524 	    sum1 -= v1[0] * tmp0;
3525 	    sum2 -= v2[0] * tmp0;
3526 	    sum3 -= v3[0] * tmp0;
3527 	  }
3528 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3529 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3530 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3531 	  ibdiag  += 9; row += 3;
3532 	  break;
3533         case 4:
3534 	  v2    = a->a + ii[row+1];
3535 	  v3    = a->a + ii[row+2];
3536 	  v4    = a->a + ii[row+3];
3537 	  sum1  = t[row];
3538 	  sum2  = t[row+1];
3539 	  sum3  = t[row+2];
3540 	  sum4  = t[row+3];
3541 	  for(n = 0; n<sz-1; n+=2) {
3542 	    i1   = idx[0];
3543 	    i2   = idx[1];
3544 	    idx += 2;
3545 	    tmp0 = t[i1];
3546 	    tmp1 = t[i2];
3547 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3548 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3549 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3550 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3551 	  }
3552 
3553 	  if (n == sz-1){
3554 	    tmp0  = t[*idx];
3555 	    sum1 -= v1[0] * tmp0;
3556 	    sum2 -= v2[0] * tmp0;
3557 	    sum3 -= v3[0] * tmp0;
3558 	    sum4 -= v4[0] * tmp0;
3559 	  }
3560 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3561 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3562 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3563 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3564 	  ibdiag  += 16; row += 4;
3565 	  break;
3566         case 5:
3567 	  v2    = a->a + ii[row+1];
3568 	  v3    = a->a + ii[row+2];
3569 	  v4    = a->a + ii[row+3];
3570 	  v5    = a->a + ii[row+4];
3571 	  sum1  = t[row];
3572 	  sum2  = t[row+1];
3573 	  sum3  = t[row+2];
3574 	  sum4  = t[row+3];
3575 	  sum5  = t[row+4];
3576 	  for(n = 0; n<sz-1; n+=2) {
3577 	    i1   = idx[0];
3578 	    i2   = idx[1];
3579 	    idx += 2;
3580 	    tmp0 = t[i1];
3581 	    tmp1 = t[i2];
3582 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3583 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3584 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3585 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3586 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3587 	  }
3588 
3589 	  if (n == sz-1){
3590 	    tmp0  = t[*idx];
3591 	    sum1 -= v1[0] * tmp0;
3592 	    sum2 -= v2[0] * tmp0;
3593 	    sum3 -= v3[0] * tmp0;
3594 	    sum4 -= v4[0] * tmp0;
3595 	    sum5 -= v5[0] * tmp0;
3596 	  }
3597 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3598 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3599 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3600 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3601 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3602 	  ibdiag  += 25; row += 5;
3603 	  break;
3604         default:
3605 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3606       }
3607       CHKMEMQ;
3608     }
3609     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3610     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3611     ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3612   }
3613   PetscFunctionReturn(0);
3614 }
3615 
3616 #undef __FUNCT__
3617 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3618 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3619 {
3620   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3621   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3622   const MatScalar    *bdiag = a->inode.bdiag;
3623   const PetscScalar  *b;
3624   PetscErrorCode      ierr;
3625   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3626   const PetscInt      *sizes = a->inode.size;
3627 
3628   PetscFunctionBegin;
3629   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3630   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3631   cnt = 0;
3632   for (i=0, row=0; i<m; i++) {
3633     switch (sizes[i]){
3634       case 1:
3635 	x[row] = b[row]*bdiag[cnt++];row++;
3636 	break;
3637       case 2:
3638 	x1   = b[row]; x2 = b[row+1];
3639 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3640 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3641 	x[row++] = tmp1;
3642 	x[row++] = tmp2;
3643 	cnt += 4;
3644 	break;
3645       case 3:
3646 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3647 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3648 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3649 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3650 	x[row++] = tmp1;
3651 	x[row++] = tmp2;
3652 	x[row++] = tmp3;
3653 	cnt += 9;
3654 	break;
3655       case 4:
3656 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3657 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3658 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3659 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3660 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3661 	x[row++] = tmp1;
3662 	x[row++] = tmp2;
3663 	x[row++] = tmp3;
3664 	x[row++] = tmp4;
3665 	cnt += 16;
3666 	break;
3667       case 5:
3668 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3669 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3670 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3671 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3672 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3673 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3674 	x[row++] = tmp1;
3675 	x[row++] = tmp2;
3676 	x[row++] = tmp3;
3677 	x[row++] = tmp4;
3678 	x[row++] = tmp5;
3679 	cnt += 25;
3680 	break;
3681       default:
3682 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3683     }
3684   }
3685   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3686   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3687   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3688   PetscFunctionReturn(0);
3689 }
3690 
3691 /*
3692     samestructure indicates that the matrix has not changed its nonzero structure so we
3693     do not need to recompute the inodes
3694 */
3695 #undef __FUNCT__
3696 #define __FUNCT__ "Mat_CheckInode"
3697 PetscErrorCode Mat_CheckInode(Mat A,PetscBool  samestructure)
3698 {
3699   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3700   PetscErrorCode ierr;
3701   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3702   PetscBool      flag;
3703 
3704   PetscFunctionBegin;
3705   if (!a->inode.use)                     PetscFunctionReturn(0);
3706   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3707 
3708 
3709   m = A->rmap->n;
3710   if (a->inode.size) {ns = a->inode.size;}
3711   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3712 
3713   i          = 0;
3714   node_count = 0;
3715   idx        = a->j;
3716   ii         = a->i;
3717   while (i < m){                /* For each row */
3718     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3719     /* Limits the number of elements in a node to 'a->inode.limit' */
3720     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3721       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3722       if (nzy != nzx) break;
3723       idy  += nzx;             /* Same nonzero pattern */
3724       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3725       if (!flag) break;
3726     }
3727     ns[node_count++] = blk_size;
3728     idx += blk_size*nzx;
3729     i    = j;
3730   }
3731   /* If not enough inodes found,, do not use inode version of the routines */
3732   if (!m || node_count > .8*m) {
3733     ierr = PetscFree(ns);CHKERRQ(ierr);
3734     a->inode.node_count       = 0;
3735     a->inode.size             = PETSC_NULL;
3736     a->inode.use              = PETSC_FALSE;
3737     A->ops->mult              = MatMult_SeqAIJ;
3738     A->ops->sor               = MatSOR_SeqAIJ;
3739     A->ops->multadd           = MatMultAdd_SeqAIJ;
3740     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
3741     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
3742     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
3743     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
3744     A->ops->coloringpatch     = 0;
3745     A->ops->multdiagonalblock = 0;
3746     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3747   } else {
3748     if (!A->factortype) {
3749       A->ops->mult              = MatMult_SeqAIJ_Inode;
3750       A->ops->sor               = MatSOR_SeqAIJ_Inode;
3751       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3752       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3753       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3754       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3755       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3756       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3757       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3758     } else {
3759       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
3760     }
3761     a->inode.node_count       = node_count;
3762     a->inode.size             = ns;
3763     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3764   }
3765   a->inode.checked = PETSC_TRUE;
3766   PetscFunctionReturn(0);
3767 }
3768 
3769 #undef __FUNCT__
3770 #define __FUNCT__ "MatGetRow_FactoredLU"
3771 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
3772 {
3773   PetscInt k, *vi;
3774   PetscFunctionBegin;
3775   vi = aj + ai[row];
3776   for(k=0;k<nzl;k++) cols[k] = vi[k];
3777   vi = aj + adiag[row];
3778   cols[nzl] = vi[0];
3779   vi = aj + adiag[row+1]+1;
3780   for(k=0;k<nzu;k++) cols[nzl+1+k] = vi[k];
3781   PetscFunctionReturn(0);
3782 }
3783 /*
3784    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3785    Modified from Mat_CheckInode().
3786 
3787    Input Parameters:
3788 +  Mat A - ILU or LU matrix factor
3789 -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
3790     do not need to recompute the inodes
3791 */
3792 #undef __FUNCT__
3793 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3794 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool  samestructure)
3795 {
3796   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3797   PetscErrorCode ierr;
3798   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3799   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3800   PetscBool      flag;
3801 
3802   PetscFunctionBegin;
3803   if (!a->inode.use)                     PetscFunctionReturn(0);
3804   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3805 
3806   m = A->rmap->n;
3807   if (a->inode.size) {ns = a->inode.size;}
3808   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3809 
3810   i          = 0;
3811   node_count = 0;
3812   while (i < m){                /* For each row */
3813     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3814     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3815     nzx  = nzl1 + nzu1 + 1;
3816     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3817     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3818 
3819     /* Limits the number of elements in a node to 'a->inode.limit' */
3820     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3821       nzl2    = ai[j+1] - ai[j];
3822       nzu2    = adiag[j] - adiag[j+1] - 1;
3823       nzy     = nzl2 + nzu2 + 1;
3824       if( nzy != nzx) break;
3825       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3826       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
3827       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3828       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3829       ierr = PetscFree(cols2);CHKERRQ(ierr);
3830     }
3831     ns[node_count++] = blk_size;
3832     ierr = PetscFree(cols1);CHKERRQ(ierr);
3833     i    = j;
3834   }
3835   /* If not enough inodes found,, do not use inode version of the routines */
3836   if (!m || node_count > .8*m) {
3837     ierr = PetscFree(ns);CHKERRQ(ierr);
3838     a->inode.node_count     = 0;
3839     a->inode.size           = PETSC_NULL;
3840     a->inode.use            = PETSC_FALSE;
3841     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3842   } else {
3843     A->ops->mult              = 0;
3844     A->ops->sor               = 0;
3845     A->ops->multadd           = 0;
3846     A->ops->getrowij          = 0;
3847     A->ops->restorerowij      = 0;
3848     A->ops->getcolumnij       = 0;
3849     A->ops->restorecolumnij   = 0;
3850     A->ops->coloringpatch     = 0;
3851     A->ops->multdiagonalblock = 0;
3852     A->ops->solve             = MatSolve_SeqAIJ_Inode;
3853     a->inode.node_count       = node_count;
3854     a->inode.size             = ns;
3855     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3856   }
3857   a->inode.checked = PETSC_TRUE;
3858   PetscFunctionReturn(0);
3859 }
3860 
3861 /*
3862      This is really ugly. if inodes are used this replaces the
3863   permutations with ones that correspond to rows/cols of the matrix
3864   rather then inode blocks
3865 */
3866 #undef __FUNCT__
3867 #define __FUNCT__ "MatInodeAdjustForInodes"
3868 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3869 {
3870   PetscErrorCode ierr;
3871 
3872   PetscFunctionBegin;
3873   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
3874   PetscFunctionReturn(0);
3875 }
3876 
3877 EXTERN_C_BEGIN
3878 #undef __FUNCT__
3879 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
3880 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3881 {
3882   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3883   PetscErrorCode ierr;
3884   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3885   const PetscInt *ridx,*cidx;
3886   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3887   PetscInt       nslim_col,*ns_col;
3888   IS             ris = *rperm,cis = *cperm;
3889 
3890   PetscFunctionBegin;
3891   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3892   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3893 
3894   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3895   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3896   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3897 
3898   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3899   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3900 
3901   /* Form the inode structure for the rows of permuted matric using inv perm*/
3902   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3903 
3904   /* Construct the permutations for rows*/
3905   for (i=0,row = 0; i<nslim_row; ++i){
3906     indx      = ridx[i];
3907     start_val = tns[indx];
3908     end_val   = tns[indx + 1];
3909     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3910   }
3911 
3912   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3913   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3914 
3915  /* Construct permutations for columns */
3916   for (i=0,col=0; i<nslim_col; ++i){
3917     indx      = cidx[i];
3918     start_val = tns[indx];
3919     end_val   = tns[indx + 1];
3920     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3921   }
3922 
3923   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
3924   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3925   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
3926   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3927 
3928   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3929   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3930 
3931   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3932   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3933   ierr = ISDestroy(cis);CHKERRQ(ierr);
3934   ierr = ISDestroy(ris);CHKERRQ(ierr);
3935   ierr = PetscFree(tns);CHKERRQ(ierr);
3936   PetscFunctionReturn(0);
3937 }
3938 EXTERN_C_END
3939 
3940 #undef __FUNCT__
3941 #define __FUNCT__ "MatInodeGetInodeSizes"
3942 /*@C
3943    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3944 
3945    Not Collective
3946 
3947    Input Parameter:
3948 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3949 
3950    Output Parameter:
3951 +  node_count - no of inodes present in the matrix.
3952 .  sizes      - an array of size node_count,with sizes of each inode.
3953 -  limit      - the max size used to generate the inodes.
3954 
3955    Level: advanced
3956 
3957    Notes: This routine returns some internal storage information
3958    of the matrix, it is intended to be used by advanced users.
3959    It should be called after the matrix is assembled.
3960    The contents of the sizes[] array should not be changed.
3961    PETSC_NULL may be passed for information not requested.
3962 
3963 .keywords: matrix, seqaij, get, inode
3964 
3965 .seealso: MatGetInfo()
3966 @*/
3967 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3968 {
3969   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3970 
3971   PetscFunctionBegin;
3972   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3973   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3974   if (f) {
3975     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3976   }
3977   PetscFunctionReturn(0);
3978 }
3979 
3980 EXTERN_C_BEGIN
3981 #undef __FUNCT__
3982 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3983 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3984 {
3985   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3986 
3987   PetscFunctionBegin;
3988   if (node_count) *node_count = a->inode.node_count;
3989   if (sizes)      *sizes      = a->inode.size;
3990   if (limit)      *limit      = a->inode.limit;
3991   PetscFunctionReturn(0);
3992 }
3993 EXTERN_C_END
3994