xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 668f157ea6d169bb50bcdb9ebcdd418abd089fa7)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1195   FactorShiftCtx   sctx;
1196   const PetscInt   *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,node_max,col;
1200   const PetscInt   *ns;
1201   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap,newshift = 0;
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++)
1216 	rs += PetscAbsScalar(v[j]);
1217       if (rs>sctx.shift_top) sctx.shift_top = rs;
1218     }
1219     sctx.shift_top   *= 1.1;
1220     sctx.nshift_max   = 5;
1221     sctx.shift_lo     = 0.;
1222     sctx.shift_hi     = 1.;
1223   }
1224 
1225   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1226   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1227 
1228   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1229   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1230   rtmp2 = rtmp1 + n;
1231   rtmp3 = rtmp2 + n;
1232   rtmp4 = rtmp3 + n;
1233   ics   = ic;
1234 
1235   node_max = a->inode.node_count;
1236   ns       = a->inode.size;
1237   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1238 
1239   /* If max inode size > 4, split it into two inodes.*/
1240   /* also map the inode sizes according to the ordering */
1241   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1242   for (i=0,j=0; i<node_max; ++i,++j){
1243     if (ns[i]>3) {
1244       tmp_vec1[j] = ns[i]/2;
1245       ++j;
1246       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1247     } else {
1248       tmp_vec1[j] = ns[i];
1249     }
1250   }
1251   /* Use the correct node_max */
1252   node_max = j;
1253 
1254   /* Now reorder the inode info based on mat re-ordering info */
1255   /* First create a row -> inode_size_array_index map */
1256   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1257   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1258   for (i=0,row=0; i<node_max; i++) {
1259     nodesz = tmp_vec1[i];
1260     for (j=0; j<nodesz; j++,row++) {
1261       nsmap[row] = i;
1262     }
1263   }
1264   /* Using nsmap, create a reordered ns structure */
1265   for (i=0,j=0; i< node_max; i++) {
1266     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1267     tmp_vec2[i]  = nodesz;
1268     j           += nodesz;
1269   }
1270   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1271   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1272 
1273   /* Now use the correct ns */
1274   ns = tmp_vec2;
1275 
1276   do {
1277     sctx.useshift = PETSC_FALSE;
1278     /* Now loop over each block-row, and do the factorization */
1279     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1280       nodesz = ns[inod];
1281 
1282       switch (nodesz){
1283       case 1:
1284       /*----------*/
1285         /* zero rtmp1 */
1286         /* L part */
1287         nz    = bi[i+1] - bi[i];
1288         bjtmp = bj + bi[i];
1289         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1290 
1291         /* U part */
1292         nz = bdiag[i]-bdiag[i+1];
1293         bjtmp = bj + bdiag[i+1]+1;
1294         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1295 
1296         /* load in initial (unfactored row) */
1297         nz    = ai[r[i]+1] - ai[r[i]];
1298         ajtmp = aj + ai[r[i]];
1299         v     = aa + ai[r[i]];
1300         for (j=0; j<nz; j++) {
1301           rtmp1[ics[ajtmp[j]]] = v[j];
1302         }
1303         /* ZeropivotApply() */
1304         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1305 
1306         /* elimination */
1307         bjtmp = bj + bi[i];
1308         row   = *bjtmp++;
1309         nzL   = bi[i+1] - bi[i];
1310         for(k=0; k < nzL;k++) {
1311           pc = rtmp1 + row;
1312           if (*pc != 0.0) {
1313             pv   = b->a + bdiag[row];
1314             mul1 = *pc * (*pv);
1315             *pc  = mul1;
1316             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1317             pv = b->a + bdiag[row+1]+1;
1318             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1319             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1320             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1321           }
1322           row = *bjtmp++;
1323         }
1324 
1325         /* finished row so stick it into b->a */
1326         rs = 0.0;
1327         /* L part */
1328         pv = b->a + bi[i] ;
1329         pj = b->j + bi[i] ;
1330         nz = bi[i+1] - bi[i];
1331         for (j=0; j<nz; j++) {
1332           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1333         }
1334 
1335         /* U part */
1336         pv = b->a + bdiag[i+1]+1;
1337         pj = b->j + bdiag[i+1]+1;
1338         nz = bdiag[i] - bdiag[i+1]-1;
1339         for (j=0; j<nz; j++) {
1340           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1341         }
1342 
1343         /* Check zero pivot */
1344         sctx.rs = rs;
1345         sctx.pv = rtmp1[i];
1346         ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr);
1347 	if(newshift == 1) break;
1348 
1349         /* Mark diagonal and invert diagonal for simplier triangular solves */
1350         pv  = b->a + bdiag[i];
1351         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1352         break;
1353 
1354       case 2:
1355       /*----------*/
1356         /* zero rtmp1 and rtmp2 */
1357         /* L part */
1358         nz    = bi[i+1] - bi[i];
1359         bjtmp = bj + bi[i];
1360         for  (j=0; j<nz; j++) {
1361           col = bjtmp[j];
1362           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1363         }
1364 
1365         /* U part */
1366         nz = bdiag[i]-bdiag[i+1];
1367         bjtmp = bj + bdiag[i+1]+1;
1368         for  (j=0; j<nz; j++) {
1369           col = bjtmp[j];
1370           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1371         }
1372 
1373         /* load in initial (unfactored row) */
1374         nz    = ai[r[i]+1] - ai[r[i]];
1375         ajtmp = aj + ai[r[i]];
1376         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1377         for (j=0; j<nz; j++) {
1378           col = ics[ajtmp[j]];
1379           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1380         }
1381         /* ZeropivotApply(): shift the diagonal of the matrix  */
1382         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1383 
1384         /* elimination */
1385         bjtmp = bj + bi[i];
1386         row   = *bjtmp++; /* pivot row */
1387         nzL   = bi[i+1] - bi[i];
1388         for(k=0; k < nzL;k++) {
1389           pc1 = rtmp1 + row;
1390           pc2 = rtmp2 + row;
1391           if (*pc1 != 0.0 || *pc2 != 0.0) {
1392             pv   = b->a + bdiag[row];
1393             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1394             *pc1 = mul1;       *pc2 = mul2;
1395 
1396             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1397             pv = b->a + bdiag[row+1]+1;
1398             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1399             for (j=0; j<nz; j++){
1400               col = pj[j];
1401               rtmp1[col] -= mul1 * pv[j];
1402               rtmp2[col] -= mul2 * pv[j];
1403             }
1404             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1405           }
1406           row = *bjtmp++;
1407         }
1408 
1409         /* finished row i; check zero pivot, then stick row i into b->a */
1410         rs  = 0.0;
1411         /* L part */
1412         pc1 = b->a + bi[i];
1413         pj  = b->j + bi[i] ;
1414         nz  = bi[i+1] - bi[i];
1415         for (j=0; j<nz; j++) {
1416           col = pj[j];
1417           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1418         }
1419         /* U part */
1420         pc1 = b->a + bdiag[i+1]+1;
1421         pj  = b->j + bdiag[i+1]+1;
1422         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1423         for (j=0; j<nz; j++) {
1424           col = pj[j];
1425           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1426         }
1427 
1428         sctx.rs  = rs;
1429         sctx.pv  = rtmp1[i];
1430         ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr);
1431 	if(newshift == 1) break;
1432         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1433         *pc1 = 1.0/sctx.pv;
1434 
1435         /* Now take care of diagonal 2x2 block. */
1436         pc2 = rtmp2 + i;
1437         if (*pc2 != 0.0){
1438           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1439           *pc2 = mul1;          /* insert L entry */
1440           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1441           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1442           for (j=0; j<nz; j++) {
1443             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1444           }
1445           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1446         }
1447 
1448         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1449         rs = 0.0;
1450         /* L part */
1451         pc2 = b->a + bi[i+1];
1452         pj  = b->j + bi[i+1] ;
1453         nz  = bi[i+2] - bi[i+1];
1454         for (j=0; j<nz; j++) {
1455           col = pj[j];
1456           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1457         }
1458         /* U part */
1459         pc2 = b->a + bdiag[i+2]+1;
1460         pj  = b->j + bdiag[i+2]+1;
1461         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1462         for (j=0; j<nz; j++) {
1463           col = pj[j];
1464           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1465         }
1466 
1467         sctx.rs  = rs;
1468         sctx.pv  = rtmp2[i+1];
1469         ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr);
1470 	if(newshift == 1) break;
1471         pc2  = b->a + bdiag[i+1];
1472         *pc2 = 1.0/sctx.pv;
1473         break;
1474 
1475       case 3:
1476       /*----------*/
1477         /* zero rtmp */
1478         /* L part */
1479         nz    = bi[i+1] - bi[i];
1480         bjtmp = bj + bi[i];
1481         for  (j=0; j<nz; j++) {
1482           col = bjtmp[j];
1483           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484         }
1485 
1486         /* U part */
1487         nz = bdiag[i]-bdiag[i+1];
1488         bjtmp = bj + bdiag[i+1]+1;
1489         for  (j=0; j<nz; j++) {
1490           col = bjtmp[j];
1491           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1492         }
1493 
1494         /* load in initial (unfactored row) */
1495         nz    = ai[r[i]+1] - ai[r[i]];
1496         ajtmp = aj + ai[r[i]];
1497         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1498         for (j=0; j<nz; j++) {
1499           col = ics[ajtmp[j]];
1500           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1501         }
1502         /* ZeropivotApply(): shift the diagonal of the matrix  */
1503         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1504 
1505         /* elimination */
1506         bjtmp = bj + bi[i];
1507         row   = *bjtmp++; /* pivot row */
1508         nzL   = bi[i+1] - bi[i];
1509         for(k=0; k < nzL;k++) {
1510           pc1 = rtmp1 + row;
1511           pc2 = rtmp2 + row;
1512           pc3 = rtmp3 + row;
1513           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1514             pv  = b->a + bdiag[row];
1515             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1516             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1517 
1518             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1519             pv = b->a + bdiag[row+1]+1;
1520             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1521             for (j=0; j<nz; j++){
1522               col = pj[j];
1523               rtmp1[col] -= mul1 * pv[j];
1524               rtmp2[col] -= mul2 * pv[j];
1525               rtmp3[col] -= mul3 * pv[j];
1526             }
1527             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1528           }
1529           row = *bjtmp++;
1530         }
1531 
1532         /* finished row i; check zero pivot, then stick row i into b->a */
1533         rs  = 0.0;
1534         /* L part */
1535         pc1 = b->a + bi[i];
1536         pj  = b->j + bi[i] ;
1537         nz  = bi[i+1] - bi[i];
1538         for (j=0; j<nz; j++) {
1539           col = pj[j];
1540           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541         }
1542         /* U part */
1543         pc1 = b->a + bdiag[i+1]+1;
1544         pj  = b->j + bdiag[i+1]+1;
1545         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1546         for (j=0; j<nz; j++) {
1547           col = pj[j];
1548           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1549         }
1550 
1551         sctx.rs  = rs;
1552         sctx.pv  = rtmp1[i];
1553         ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr);
1554 	if(newshift == 1) break;
1555         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1556         *pc1 = 1.0/sctx.pv;
1557 
1558         /* Now take care of 1st column of diagonal 3x3 block. */
1559         pc2 = rtmp2 + i;
1560         pc3 = rtmp3 + i;
1561         if (*pc2 != 0.0 || *pc3 != 0.0){
1562           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1563           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1564           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1565           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1566           for (j=0; j<nz; j++) {
1567             col = pj[j];
1568             rtmp2[col] -= mul2 * rtmp1[col];
1569             rtmp3[col] -= mul3 * rtmp1[col];
1570           }
1571           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1572         }
1573 
1574         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1575         rs = 0.0;
1576         /* L part */
1577         pc2 = b->a + bi[i+1];
1578         pj  = b->j + bi[i+1] ;
1579         nz  = bi[i+2] - bi[i+1];
1580         for (j=0; j<nz; j++) {
1581           col = pj[j];
1582           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1583         }
1584         /* U part */
1585         pc2 = b->a + bdiag[i+2]+1;
1586         pj  = b->j + bdiag[i+2]+1;
1587         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1588         for (j=0; j<nz; j++) {
1589           col = pj[j];
1590           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1591         }
1592 
1593         sctx.rs  = rs;
1594         sctx.pv  = rtmp2[i+1];
1595         ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr);
1596 	if(newshift == 1) break;
1597         pc2  = b->a + bdiag[i+1];
1598         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1599 
1600         /* Now take care of 2nd column of diagonal 3x3 block. */
1601         pc3 = rtmp3 + i+1;
1602         if (*pc3 != 0.0){
1603           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1604           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1605           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1606           for (j=0; j<nz; j++) {
1607             col = pj[j];
1608             rtmp3[col] -= mul3 * rtmp2[col];
1609           }
1610           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1611         }
1612 
1613         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1614         rs = 0.0;
1615         /* L part */
1616         pc3 = b->a + bi[i+2];
1617         pj  = b->j + bi[i+2] ;
1618         nz  = bi[i+3] - bi[i+2];
1619         for (j=0; j<nz; j++) {
1620           col = pj[j];
1621           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1622         }
1623         /* U part */
1624         pc3 = b->a + bdiag[i+3]+1;
1625         pj  = b->j + bdiag[i+3]+1;
1626         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1627         for (j=0; j<nz; j++) {
1628           col = pj[j];
1629           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1630         }
1631 
1632         sctx.rs  = rs;
1633         sctx.pv  = rtmp3[i+2];
1634         ierr = MatPivotCheck(info,&sctx,i+2,&newshift);CHKERRQ(ierr);
1635 	if(newshift == 1) break;
1636         pc3  = b->a + bdiag[i+2];
1637         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1638         break;
1639       case 4:
1640       /*----------*/
1641         /* zero rtmp */
1642         /* L part */
1643         nz    = bi[i+1] - bi[i];
1644         bjtmp = bj + bi[i];
1645         for  (j=0; j<nz; j++) {
1646           col = bjtmp[j];
1647           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1648         }
1649 
1650         /* U part */
1651         nz = bdiag[i]-bdiag[i+1];
1652         bjtmp = bj + bdiag[i+1]+1;
1653         for  (j=0; j<nz; j++) {
1654           col = bjtmp[j];
1655           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1656         }
1657 
1658         /* load in initial (unfactored row) */
1659         nz    = ai[r[i]+1] - ai[r[i]];
1660         ajtmp = aj + ai[r[i]];
1661         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1662         for (j=0; j<nz; j++) {
1663           col = ics[ajtmp[j]];
1664           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1665         }
1666         /* ZeropivotApply(): shift the diagonal of the matrix  */
1667         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1668 
1669         /* elimination */
1670         bjtmp = bj + bi[i];
1671         row   = *bjtmp++; /* pivot row */
1672         nzL   = bi[i+1] - bi[i];
1673         for(k=0; k < nzL;k++) {
1674           pc1 = rtmp1 + row;
1675           pc2 = rtmp2 + row;
1676           pc3 = rtmp3 + row;
1677 	  pc4 = rtmp4 + row;
1678           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1679             pv  = b->a + bdiag[row];
1680             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1681             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1682 
1683             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1684             pv = b->a + bdiag[row+1]+1;
1685             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1686             for (j=0; j<nz; j++){
1687               col = pj[j];
1688               rtmp1[col] -= mul1 * pv[j];
1689               rtmp2[col] -= mul2 * pv[j];
1690               rtmp3[col] -= mul3 * pv[j];
1691 	      rtmp4[col] -= mul4 * pv[j];
1692             }
1693             ierr = PetscLogFlops(8*nz);CHKERRQ(ierr);
1694           }
1695           row = *bjtmp++;
1696         }
1697 
1698         /* finished row i; check zero pivot, then stick row i into b->a */
1699         rs  = 0.0;
1700         /* L part */
1701         pc1 = b->a + bi[i];
1702         pj  = b->j + bi[i] ;
1703         nz  = bi[i+1] - bi[i];
1704         for (j=0; j<nz; j++) {
1705           col = pj[j];
1706           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1707         }
1708         /* U part */
1709         pc1 = b->a + bdiag[i+1]+1;
1710         pj  = b->j + bdiag[i+1]+1;
1711         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1712         for (j=0; j<nz; j++) {
1713           col = pj[j];
1714           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1715         }
1716 
1717         sctx.rs  = rs;
1718         sctx.pv  = rtmp1[i];
1719         ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr);
1720 	if(newshift == 1) break;
1721         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1722         *pc1 = 1.0/sctx.pv;
1723 
1724         /* Now take care of 1st column of diagonal 4x4 block. */
1725         pc2 = rtmp2 + i;
1726         pc3 = rtmp3 + i;
1727 	pc4 = rtmp4 + i;
1728         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1729           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1730           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1731 	  mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1732           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1733           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1734           for (j=0; j<nz; j++) {
1735             col = pj[j];
1736             rtmp2[col] -= mul2 * rtmp1[col];
1737             rtmp3[col] -= mul3 * rtmp1[col];
1738 	    rtmp4[col] -= mul4 * rtmp1[col];
1739           }
1740           ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1741         }
1742 
1743         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1744         rs = 0.0;
1745         /* L part */
1746         pc2 = b->a + bi[i+1];
1747         pj  = b->j + bi[i+1] ;
1748         nz  = bi[i+2] - bi[i+1];
1749         for (j=0; j<nz; j++) {
1750           col = pj[j];
1751           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1752         }
1753         /* U part */
1754         pc2 = b->a + bdiag[i+2]+1;
1755         pj  = b->j + bdiag[i+2]+1;
1756         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1757         for (j=0; j<nz; j++) {
1758           col = pj[j];
1759           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1760         }
1761 
1762         sctx.rs  = rs;
1763         sctx.pv  = rtmp2[i+1];
1764         ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr);
1765 	if(newshift == 1) break;
1766         pc2  = b->a + bdiag[i+1];
1767         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1768 
1769         /* Now take care of 2nd column of diagonal 4x4 block. */
1770         pc3 = rtmp3 + i+1;
1771 	pc4 = rtmp4 + i+1;
1772         if (*pc3 != 0.0 || *pc4 != 0.0){
1773           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1774 	  mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1775           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1776           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1777           for (j=0; j<nz; j++) {
1778             col = pj[j];
1779             rtmp3[col] -= mul3 * rtmp2[col];
1780 	    rtmp4[col] -= mul4 * rtmp2[col];
1781           }
1782           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1783         }
1784 
1785         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1786         rs = 0.0;
1787         /* L part */
1788         pc3 = b->a + bi[i+2];
1789         pj  = b->j + bi[i+2] ;
1790         nz  = bi[i+3] - bi[i+2];
1791         for (j=0; j<nz; j++) {
1792           col = pj[j];
1793           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1794         }
1795         /* U part */
1796         pc3 = b->a + bdiag[i+3]+1;
1797         pj  = b->j + bdiag[i+3]+1;
1798         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1799         for (j=0; j<nz; j++) {
1800           col = pj[j];
1801           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1802         }
1803 
1804         sctx.rs  = rs;
1805         sctx.pv  = rtmp3[i+2];
1806         ierr = MatPivotCheck(info,&sctx,i+2,&newshift);CHKERRQ(ierr);
1807 	if(newshift == 1) break;
1808         pc3  = b->a + bdiag[i+2];
1809         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1810 
1811 	/* Now take care of 3rd column of diagonal 4x4 block. */
1812 	pc4 = rtmp4 + i+2;
1813         if (*pc4 != 0.0){
1814 	  mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1815           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1816           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1817           for (j=0; j<nz; j++) {
1818             col = pj[j];
1819 	    rtmp4[col] -= mul4 * rtmp3[col];
1820           }
1821           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1822         }
1823 
1824         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1825         rs = 0.0;
1826         /* L part */
1827         pc4 = b->a + bi[i+3];
1828         pj  = b->j + bi[i+3] ;
1829         nz  = bi[i+4] - bi[i+3];
1830         for (j=0; j<nz; j++) {
1831           col = pj[j];
1832           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1833         }
1834         /* U part */
1835         pc4 = b->a + bdiag[i+4]+1;
1836         pj  = b->j + bdiag[i+4]+1;
1837         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1838         for (j=0; j<nz; j++) {
1839           col = pj[j];
1840           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1841         }
1842 
1843         sctx.rs  = rs;
1844         sctx.pv  = rtmp4[i+3];
1845         ierr = MatPivotCheck(info,&sctx,i+3,&newshift);CHKERRQ(ierr);
1846 	if(newshift == 1) break;
1847         pc4  = b->a + bdiag[i+3];
1848         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1849          break;
1850 
1851       default:
1852         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1853       }
1854       i += nodesz;                 /* Update the row */
1855     }
1856 
1857     /* MatPivotRefine() */
1858     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1859       /*
1860        * if no shift in this attempt & shifting & started shifting & can refine,
1861        * then try lower shift
1862        */
1863       sctx.shift_hi       = sctx.shift_fraction;
1864       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1865       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1866       sctx.useshift       = PETSC_TRUE;
1867       sctx.nshift++;
1868     }
1869   } while (sctx.useshift);
1870 
1871   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1872   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1873   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1874   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1875 
1876   C->ops->solve              = MatSolve_SeqAIJ;
1877   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1878   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1879   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1880   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1881   C->assembled    = PETSC_TRUE;
1882   C->preallocated = PETSC_TRUE;
1883   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1884 
1885   /* MatShiftView(A,info,&sctx) */
1886   if (sctx.nshift){
1887     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1888       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1889     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1890       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1891     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1892       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1893     }
1894   }
1895   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1896   PetscFunctionReturn(0);
1897 }
1898 
1899 #undef __FUNCT__
1900 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1901 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1902 {
1903   Mat               C = B;
1904   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1905   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1906   PetscErrorCode    ierr;
1907   const PetscInt    *r,*ic,*c,*ics;
1908   PetscInt          n = A->rmap->n,*bi = b->i;
1909   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1910   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1911   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1912   PetscScalar       mul1,mul2,mul3,tmp;
1913   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1914   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1915   PetscReal         rs=0.0;
1916   FactorShiftCtx    sctx;
1917   PetscInt          newshift;
1918 
1919   PetscFunctionBegin;
1920   sctx.shift_top      = 0;
1921   sctx.nshift_max     = 0;
1922   sctx.shift_lo       = 0;
1923   sctx.shift_hi       = 0;
1924   sctx.shift_fraction = 0;
1925 
1926   /* if both shift schemes are chosen by user, only use info->shiftpd */
1927   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1928     sctx.shift_top = 0;
1929     for (i=0; i<n; i++) {
1930       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1931       rs    = 0.0;
1932       ajtmp = aj + ai[i];
1933       rtmp1 = aa + ai[i];
1934       nz = ai[i+1] - ai[i];
1935       for (j=0; j<nz; j++){
1936         if (*ajtmp != i){
1937           rs += PetscAbsScalar(*rtmp1++);
1938         } else {
1939           rs -= PetscRealPart(*rtmp1++);
1940         }
1941         ajtmp++;
1942       }
1943       if (rs>sctx.shift_top) sctx.shift_top = rs;
1944     }
1945     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1946     sctx.shift_top *= 1.1;
1947     sctx.nshift_max = 5;
1948     sctx.shift_lo   = 0.;
1949     sctx.shift_hi   = 1.;
1950   }
1951   sctx.shift_amount = 0;
1952   sctx.nshift       = 0;
1953 
1954   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1955   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1956   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1957   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1958   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1959   ics   = ic ;
1960   rtmp22 = rtmp11 + n;
1961   rtmp33 = rtmp22 + n;
1962 
1963   node_max = a->inode.node_count;
1964   ns       = a->inode.size;
1965   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1966 
1967   /* If max inode size > 3, split it into two inodes.*/
1968   /* also map the inode sizes according to the ordering */
1969   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1970   for (i=0,j=0; i<node_max; ++i,++j){
1971     if (ns[i]>3) {
1972       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1973       ++j;
1974       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1975     } else {
1976       tmp_vec1[j] = ns[i];
1977     }
1978   }
1979   /* Use the correct node_max */
1980   node_max = j;
1981 
1982   /* Now reorder the inode info based on mat re-ordering info */
1983   /* First create a row -> inode_size_array_index map */
1984   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1985   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1986   for (i=0,row=0; i<node_max; i++) {
1987     nodesz = tmp_vec1[i];
1988     for (j=0; j<nodesz; j++,row++) {
1989       nsmap[row] = i;
1990     }
1991   }
1992   /* Using nsmap, create a reordered ns structure */
1993   for (i=0,j=0; i< node_max; i++) {
1994     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1995     tmp_vec2[i]  = nodesz;
1996     j           += nodesz;
1997   }
1998   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1999   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2000   /* Now use the correct ns */
2001   ns = tmp_vec2;
2002 
2003   do {
2004     sctx.useshift = PETSC_FALSE;
2005     /* Now loop over each block-row, and do the factorization */
2006     for (i=0,row=0; i<node_max; i++) {
2007       nodesz = ns[i];
2008       nz     = bi[row+1] - bi[row];
2009       bjtmp  = bj + bi[row];
2010 
2011       switch (nodesz){
2012       case 1:
2013         for  (j=0; j<nz; j++){
2014           idx        = bjtmp[j];
2015           rtmp11[idx] = 0.0;
2016         }
2017 
2018         /* load in initial (unfactored row) */
2019         idx    = r[row];
2020         nz_tmp = ai[idx+1] - ai[idx];
2021         ajtmp  = aj + ai[idx];
2022         v1     = aa + ai[idx];
2023 
2024         for (j=0; j<nz_tmp; j++) {
2025           idx        = ics[ajtmp[j]];
2026           rtmp11[idx] = v1[j];
2027         }
2028         rtmp11[ics[r[row]]] += sctx.shift_amount;
2029 
2030         prow = *bjtmp++ ;
2031         while (prow < row) {
2032           pc1 = rtmp11 + prow;
2033           if (*pc1 != 0.0){
2034             pv   = ba + bd[prow];
2035             pj   = nbj + bd[prow];
2036             mul1 = *pc1 * *pv++;
2037             *pc1 = mul1;
2038             nz_tmp = bi[prow+1] - bd[prow] - 1;
2039             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2040             for (j=0; j<nz_tmp; j++) {
2041               tmp = pv[j];
2042               idx = pj[j];
2043               rtmp11[idx] -= mul1 * tmp;
2044             }
2045           }
2046           prow = *bjtmp++ ;
2047         }
2048         pj  = bj + bi[row];
2049         pc1 = ba + bi[row];
2050 
2051         sctx.pv    = rtmp11[row];
2052         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2053         rs         = 0.0;
2054         for (j=0; j<nz; j++) {
2055           idx    = pj[j];
2056           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2057           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2058         }
2059         sctx.rs  = rs;
2060         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2061         if (newshift == 1) goto endofwhile;
2062         break;
2063 
2064       case 2:
2065         for (j=0; j<nz; j++) {
2066           idx        = bjtmp[j];
2067           rtmp11[idx] = 0.0;
2068           rtmp22[idx] = 0.0;
2069         }
2070 
2071         /* load in initial (unfactored row) */
2072         idx    = r[row];
2073         nz_tmp = ai[idx+1] - ai[idx];
2074         ajtmp  = aj + ai[idx];
2075         v1     = aa + ai[idx];
2076         v2     = aa + ai[idx+1];
2077         for (j=0; j<nz_tmp; j++) {
2078           idx        = ics[ajtmp[j]];
2079           rtmp11[idx] = v1[j];
2080           rtmp22[idx] = v2[j];
2081         }
2082         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2083         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2084 
2085         prow = *bjtmp++ ;
2086         while (prow < row) {
2087           pc1 = rtmp11 + prow;
2088           pc2 = rtmp22 + prow;
2089           if (*pc1 != 0.0 || *pc2 != 0.0){
2090             pv   = ba + bd[prow];
2091             pj   = nbj + bd[prow];
2092             mul1 = *pc1 * *pv;
2093             mul2 = *pc2 * *pv;
2094             ++pv;
2095             *pc1 = mul1;
2096             *pc2 = mul2;
2097 
2098             nz_tmp = bi[prow+1] - bd[prow] - 1;
2099             for (j=0; j<nz_tmp; j++) {
2100               tmp = pv[j];
2101               idx = pj[j];
2102               rtmp11[idx] -= mul1 * tmp;
2103               rtmp22[idx] -= mul2 * tmp;
2104             }
2105             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2106           }
2107           prow = *bjtmp++ ;
2108         }
2109 
2110         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2111         pc1 = rtmp11 + prow;
2112         pc2 = rtmp22 + prow;
2113 
2114         sctx.pv = *pc1;
2115         pj      = bj + bi[prow];
2116         rs      = 0.0;
2117         for (j=0; j<nz; j++){
2118           idx = pj[j];
2119           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2120         }
2121         sctx.rs = rs;
2122         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2123         if (newshift == 1) goto endofwhile;
2124 
2125         if (*pc2 != 0.0){
2126           pj     = nbj + bd[prow];
2127           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2128           *pc2   = mul2;
2129           nz_tmp = bi[prow+1] - bd[prow] - 1;
2130           for (j=0; j<nz_tmp; j++) {
2131             idx = pj[j] ;
2132             tmp = rtmp11[idx];
2133             rtmp22[idx] -= mul2 * tmp;
2134           }
2135           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2136         }
2137 
2138         pj  = bj + bi[row];
2139         pc1 = ba + bi[row];
2140         pc2 = ba + bi[row+1];
2141 
2142         sctx.pv = rtmp22[row+1];
2143         rs = 0.0;
2144         rtmp11[row]   = 1.0/rtmp11[row];
2145         rtmp22[row+1] = 1.0/rtmp22[row+1];
2146         /* copy row entries from dense representation to sparse */
2147         for (j=0; j<nz; j++) {
2148           idx    = pj[j];
2149           pc1[j] = rtmp11[idx];
2150           pc2[j] = rtmp22[idx];
2151           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2152         }
2153         sctx.rs = rs;
2154         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
2155         if (newshift == 1) goto endofwhile;
2156         break;
2157 
2158       case 3:
2159         for  (j=0; j<nz; j++) {
2160           idx        = bjtmp[j];
2161           rtmp11[idx] = 0.0;
2162           rtmp22[idx] = 0.0;
2163           rtmp33[idx] = 0.0;
2164         }
2165         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2166         idx    = r[row];
2167         nz_tmp = ai[idx+1] - ai[idx];
2168         ajtmp = aj + ai[idx];
2169         v1    = aa + ai[idx];
2170         v2    = aa + ai[idx+1];
2171         v3    = aa + ai[idx+2];
2172         for (j=0; j<nz_tmp; j++) {
2173           idx        = ics[ajtmp[j]];
2174           rtmp11[idx] = v1[j];
2175           rtmp22[idx] = v2[j];
2176           rtmp33[idx] = v3[j];
2177         }
2178         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2179         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2180         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2181 
2182         /* loop over all pivot row blocks above this row block */
2183         prow = *bjtmp++ ;
2184         while (prow < row) {
2185           pc1 = rtmp11 + prow;
2186           pc2 = rtmp22 + prow;
2187           pc3 = rtmp33 + prow;
2188           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2189             pv   = ba  + bd[prow];
2190             pj   = nbj + bd[prow];
2191             mul1 = *pc1 * *pv;
2192             mul2 = *pc2 * *pv;
2193             mul3 = *pc3 * *pv;
2194             ++pv;
2195             *pc1 = mul1;
2196             *pc2 = mul2;
2197             *pc3 = mul3;
2198 
2199             nz_tmp = bi[prow+1] - bd[prow] - 1;
2200             /* update this row based on pivot row */
2201             for (j=0; j<nz_tmp; j++) {
2202               tmp = pv[j];
2203               idx = pj[j];
2204               rtmp11[idx] -= mul1 * tmp;
2205               rtmp22[idx] -= mul2 * tmp;
2206               rtmp33[idx] -= mul3 * tmp;
2207             }
2208             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
2209           }
2210           prow = *bjtmp++ ;
2211         }
2212 
2213         /* Now take care of diagonal 3x3 block in this set of rows */
2214         /* note: prow = row here */
2215         pc1 = rtmp11 + prow;
2216         pc2 = rtmp22 + prow;
2217         pc3 = rtmp33 + prow;
2218 
2219         sctx.pv = *pc1;
2220         pj      = bj + bi[prow];
2221         rs      = 0.0;
2222         for (j=0; j<nz; j++){
2223           idx = pj[j];
2224           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2225         }
2226         sctx.rs = rs;
2227         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2228         if (newshift == 1) goto endofwhile;
2229 
2230         if (*pc2 != 0.0 || *pc3 != 0.0){
2231           mul2 = (*pc2)/(*pc1);
2232           mul3 = (*pc3)/(*pc1);
2233           *pc2 = mul2;
2234           *pc3 = mul3;
2235           nz_tmp = bi[prow+1] - bd[prow] - 1;
2236           pj     = nbj + bd[prow];
2237           for (j=0; j<nz_tmp; j++) {
2238             idx = pj[j] ;
2239             tmp = rtmp11[idx];
2240             rtmp22[idx] -= mul2 * tmp;
2241             rtmp33[idx] -= mul3 * tmp;
2242           }
2243           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2244         }
2245         ++prow;
2246 
2247         pc2 = rtmp22 + prow;
2248         pc3 = rtmp33 + prow;
2249         sctx.pv = *pc2;
2250         pj      = bj + bi[prow];
2251         rs      = 0.0;
2252         for (j=0; j<nz; j++){
2253           idx = pj[j];
2254           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2255         }
2256         sctx.rs = rs;
2257         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
2258         if (newshift == 1) goto endofwhile;
2259 
2260         if (*pc3 != 0.0){
2261           mul3   = (*pc3)/(*pc2);
2262           *pc3   = mul3;
2263           pj     = nbj + bd[prow];
2264           nz_tmp = bi[prow+1] - bd[prow] - 1;
2265           for (j=0; j<nz_tmp; j++) {
2266             idx = pj[j] ;
2267             tmp = rtmp22[idx];
2268             rtmp33[idx] -= mul3 * tmp;
2269           }
2270           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2271         }
2272 
2273         pj  = bj + bi[row];
2274         pc1 = ba + bi[row];
2275         pc2 = ba + bi[row+1];
2276         pc3 = ba + bi[row+2];
2277 
2278         sctx.pv = rtmp33[row+2];
2279         rs = 0.0;
2280         rtmp11[row]   = 1.0/rtmp11[row];
2281         rtmp22[row+1] = 1.0/rtmp22[row+1];
2282         rtmp33[row+2] = 1.0/rtmp33[row+2];
2283         /* copy row entries from dense representation to sparse */
2284         for (j=0; j<nz; j++) {
2285           idx    = pj[j];
2286           pc1[j] = rtmp11[idx];
2287           pc2[j] = rtmp22[idx];
2288           pc3[j] = rtmp33[idx];
2289           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2290         }
2291 
2292         sctx.rs = rs;
2293         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
2294         if (newshift == 1) goto endofwhile;
2295         break;
2296 
2297       default:
2298         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2299       }
2300       row += nodesz;                 /* Update the row */
2301     }
2302     endofwhile:;
2303   } while (sctx.useshift);
2304   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2305   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2306   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2307   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2308   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2309   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2310   /* do not set solve add, since MatSolve_Inode + Add is faster */
2311   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2312   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2313   C->assembled   = PETSC_TRUE;
2314   C->preallocated = PETSC_TRUE;
2315   if (sctx.nshift) {
2316     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2317       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2318     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2319       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2320     }
2321   }
2322   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2323   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2324   PetscFunctionReturn(0);
2325 }
2326 
2327 
2328 /* ----------------------------------------------------------- */
2329 #undef __FUNCT__
2330 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2331 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2332 {
2333   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2334   IS                iscol = a->col,isrow = a->row;
2335   PetscErrorCode    ierr;
2336   const PetscInt    *r,*c,*rout,*cout;
2337   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2338   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2339   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2340   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2341   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2342   const PetscScalar *b;
2343 
2344   PetscFunctionBegin;
2345   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2346   node_max = a->inode.node_count;
2347   ns       = a->inode.size;     /* Node Size array */
2348 
2349   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2350   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2351   tmp  = a->solve_work;
2352 
2353   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2354   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2355 
2356   /* forward solve the lower triangular */
2357   tmps = tmp ;
2358   aa   = a_a ;
2359   aj   = a_j ;
2360   ad   = a->diag;
2361 
2362   for (i = 0,row = 0; i< node_max; ++i){
2363     nsz = ns[i];
2364     aii = ai[row];
2365     v1  = aa + aii;
2366     vi  = aj + aii;
2367     nz  = ai[row+1]- ai[row];
2368 
2369     if (i < node_max-1) {
2370       /* Prefetch the indices for the next block */
2371       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2372       /* Prefetch the data for the next block */
2373       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2374     }
2375 
2376     switch (nsz){               /* Each loop in 'case' is unrolled */
2377     case 1 :
2378       sum1 = b[r[row]];
2379       for(j=0; j<nz-1; j+=2){
2380         i0   = vi[j];
2381         i1   = vi[j+1];
2382         tmp0 = tmps[i0];
2383         tmp1 = tmps[i1];
2384         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2385       }
2386       if(j == nz-1){
2387         tmp0 = tmps[vi[j]];
2388         sum1 -= v1[j]*tmp0;
2389       }
2390       tmp[row++]=sum1;
2391       break;
2392     case 2:
2393       sum1 = b[r[row]];
2394       sum2 = b[r[row+1]];
2395       v2   = aa + ai[row+1];
2396 
2397       for(j=0; j<nz-1; j+=2){
2398         i0   = vi[j];
2399         i1   = vi[j+1];
2400         tmp0 = tmps[i0];
2401         tmp1 = tmps[i1];
2402         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2403         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2404       }
2405       if(j == nz-1){
2406         tmp0 = tmps[vi[j]];
2407         sum1 -= v1[j] *tmp0;
2408         sum2 -= v2[j] *tmp0;
2409       }
2410       sum2 -= v2[nz] * sum1;
2411       tmp[row ++]=sum1;
2412       tmp[row ++]=sum2;
2413       break;
2414     case 3:
2415       sum1 = b[r[row]];
2416       sum2 = b[r[row+1]];
2417       sum3 = b[r[row+2]];
2418       v2   = aa + ai[row+1];
2419       v3   = aa + ai[row+2];
2420 
2421       for (j=0; j<nz-1; j+=2){
2422         i0   = vi[j];
2423         i1   = vi[j+1];
2424         tmp0 = tmps[i0];
2425         tmp1 = tmps[i1];
2426         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2427         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2428         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2429       }
2430       if (j == nz-1){
2431         tmp0 = tmps[vi[j]];
2432         sum1 -= v1[j] *tmp0;
2433         sum2 -= v2[j] *tmp0;
2434         sum3 -= v3[j] *tmp0;
2435       }
2436       sum2 -= v2[nz] * sum1;
2437       sum3 -= v3[nz] * sum1;
2438       sum3 -= v3[nz+1] * sum2;
2439       tmp[row ++]=sum1;
2440       tmp[row ++]=sum2;
2441       tmp[row ++]=sum3;
2442       break;
2443 
2444     case 4:
2445       sum1 = b[r[row]];
2446       sum2 = b[r[row+1]];
2447       sum3 = b[r[row+2]];
2448       sum4 = b[r[row+3]];
2449       v2   = aa + ai[row+1];
2450       v3   = aa + ai[row+2];
2451       v4   = aa + ai[row+3];
2452 
2453       for (j=0; j<nz-1; j+=2){
2454         i0   = vi[j];
2455         i1   = vi[j+1];
2456         tmp0 = tmps[i0];
2457         tmp1 = tmps[i1];
2458         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2459         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2460         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2461         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2462       }
2463       if (j == nz-1){
2464         tmp0 = tmps[vi[j]];
2465         sum1 -= v1[j] *tmp0;
2466         sum2 -= v2[j] *tmp0;
2467         sum3 -= v3[j] *tmp0;
2468         sum4 -= v4[j] *tmp0;
2469       }
2470       sum2 -= v2[nz] * sum1;
2471       sum3 -= v3[nz] * sum1;
2472       sum4 -= v4[nz] * sum1;
2473       sum3 -= v3[nz+1] * sum2;
2474       sum4 -= v4[nz+1] * sum2;
2475       sum4 -= v4[nz+2] * sum3;
2476 
2477       tmp[row ++]=sum1;
2478       tmp[row ++]=sum2;
2479       tmp[row ++]=sum3;
2480       tmp[row ++]=sum4;
2481       break;
2482     case 5:
2483       sum1 = b[r[row]];
2484       sum2 = b[r[row+1]];
2485       sum3 = b[r[row+2]];
2486       sum4 = b[r[row+3]];
2487       sum5 = b[r[row+4]];
2488       v2   = aa + ai[row+1];
2489       v3   = aa + ai[row+2];
2490       v4   = aa + ai[row+3];
2491       v5   = aa + ai[row+4];
2492 
2493       for (j=0; j<nz-1; j+=2){
2494         i0   = vi[j];
2495         i1   = vi[j+1];
2496         tmp0 = tmps[i0];
2497         tmp1 = tmps[i1];
2498         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2499         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2500         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2501         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2502         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2503       }
2504       if (j == nz-1){
2505         tmp0 = tmps[vi[j]];
2506         sum1 -= v1[j] *tmp0;
2507         sum2 -= v2[j] *tmp0;
2508         sum3 -= v3[j] *tmp0;
2509         sum4 -= v4[j] *tmp0;
2510         sum5 -= v5[j] *tmp0;
2511       }
2512 
2513       sum2 -= v2[nz] * sum1;
2514       sum3 -= v3[nz] * sum1;
2515       sum4 -= v4[nz] * sum1;
2516       sum5 -= v5[nz] * sum1;
2517       sum3 -= v3[nz+1] * sum2;
2518       sum4 -= v4[nz+1] * sum2;
2519       sum5 -= v5[nz+1] * sum2;
2520       sum4 -= v4[nz+2] * sum3;
2521       sum5 -= v5[nz+2] * sum3;
2522       sum5 -= v5[nz+3] * sum4;
2523 
2524       tmp[row ++]=sum1;
2525       tmp[row ++]=sum2;
2526       tmp[row ++]=sum3;
2527       tmp[row ++]=sum4;
2528       tmp[row ++]=sum5;
2529       break;
2530     default:
2531       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2532     }
2533   }
2534   /* backward solve the upper triangular */
2535   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2536     nsz = ns[i];
2537     aii = ad[row+1] + 1;
2538     v1  = aa + aii;
2539     vi  = aj + aii;
2540     nz  = ad[row]- ad[row+1] - 1;
2541 
2542     if (i > 0) {
2543       /* Prefetch the indices for the next block */
2544       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2545       /* Prefetch the data for the next block */
2546       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2547     }
2548 
2549     switch (nsz){               /* Each loop in 'case' is unrolled */
2550     case 1 :
2551       sum1 = tmp[row];
2552 
2553       for(j=0 ; j<nz-1; j+=2){
2554         i0   = vi[j];
2555         i1   = vi[j+1];
2556         tmp0 = tmps[i0];
2557         tmp1 = tmps[i1];
2558         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2559       }
2560       if (j == nz-1){
2561         tmp0  = tmps[vi[j]];
2562         sum1 -= v1[j]*tmp0;
2563       }
2564       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2565       break;
2566     case 2 :
2567       sum1 = tmp[row];
2568       sum2 = tmp[row-1];
2569       v2   = aa + ad[row] + 1;
2570       for (j=0 ; j<nz-1; j+=2){
2571         i0   = vi[j];
2572         i1   = vi[j+1];
2573         tmp0 = tmps[i0];
2574         tmp1 = tmps[i1];
2575         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2576         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2577       }
2578       if (j == nz-1){
2579         tmp0  = tmps[vi[j]];
2580         sum1 -= v1[j]* tmp0;
2581         sum2 -= v2[j+1]* tmp0;
2582       }
2583 
2584       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2585       sum2   -= v2[0] * tmp0;
2586       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2587       break;
2588     case 3 :
2589       sum1 = tmp[row];
2590       sum2 = tmp[row -1];
2591       sum3 = tmp[row -2];
2592       v2   = aa + ad[row] + 1;
2593       v3   = aa + ad[row -1] + 1;
2594       for (j=0 ; j<nz-1; j+=2){
2595         i0   = vi[j];
2596         i1   = vi[j+1];
2597         tmp0 = tmps[i0];
2598         tmp1 = tmps[i1];
2599         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2600         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2601         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2602       }
2603       if (j== nz-1){
2604         tmp0  = tmps[vi[j]];
2605         sum1 -= v1[j] * tmp0;
2606         sum2 -= v2[j+1] * tmp0;
2607         sum3 -= v3[j+2] * tmp0;
2608       }
2609       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2610       sum2   -= v2[0]* tmp0;
2611       sum3   -= v3[1] * tmp0;
2612       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2613       sum3   -= v3[0]* tmp0;
2614       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2615 
2616       break;
2617     case 4 :
2618       sum1 = tmp[row];
2619       sum2 = tmp[row -1];
2620       sum3 = tmp[row -2];
2621       sum4 = tmp[row -3];
2622       v2   = aa + ad[row]+1;
2623       v3   = aa + ad[row -1]+1;
2624       v4   = aa + ad[row -2]+1;
2625 
2626       for (j=0 ; j<nz-1; j+=2){
2627         i0   = vi[j];
2628         i1   = vi[j+1];
2629         tmp0 = tmps[i0];
2630         tmp1 = tmps[i1];
2631         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2632         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2633         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2634         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2635       }
2636       if (j== nz-1){
2637         tmp0  = tmps[vi[j]];
2638         sum1 -= v1[j] * tmp0;
2639         sum2 -= v2[j+1] * tmp0;
2640         sum3 -= v3[j+2] * tmp0;
2641         sum4 -= v4[j+3] * tmp0;
2642       }
2643 
2644       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2645       sum2   -= v2[0] * tmp0;
2646       sum3   -= v3[1] * tmp0;
2647       sum4   -= v4[2] * tmp0;
2648       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2649       sum3   -= v3[0] * tmp0;
2650       sum4   -= v4[1] * tmp0;
2651       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2652       sum4   -= v4[0] * tmp0;
2653       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2654       break;
2655     case 5 :
2656       sum1 = tmp[row];
2657       sum2 = tmp[row -1];
2658       sum3 = tmp[row -2];
2659       sum4 = tmp[row -3];
2660       sum5 = tmp[row -4];
2661       v2   = aa + ad[row]+1;
2662       v3   = aa + ad[row -1]+1;
2663       v4   = aa + ad[row -2]+1;
2664       v5   = aa + ad[row -3]+1;
2665       for (j=0 ; j<nz-1; j+=2){
2666         i0   = vi[j];
2667         i1   = vi[j+1];
2668         tmp0 = tmps[i0];
2669         tmp1 = tmps[i1];
2670         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2671         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2672         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2673         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2674         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2675       }
2676       if (j==nz-1){
2677         tmp0  = tmps[vi[j]];
2678         sum1 -= v1[j] * tmp0;
2679         sum2 -= v2[j+1] * tmp0;
2680         sum3 -= v3[j+2] * tmp0;
2681         sum4 -= v4[j+3] * tmp0;
2682         sum5 -= v5[j+4] * tmp0;
2683       }
2684 
2685       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2686       sum2   -= v2[0] * tmp0;
2687       sum3   -= v3[1] * tmp0;
2688       sum4   -= v4[2] * tmp0;
2689       sum5   -= v5[3] * tmp0;
2690       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2691       sum3   -= v3[0] * tmp0;
2692       sum4   -= v4[1] * tmp0;
2693       sum5   -= v5[2] * tmp0;
2694       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2695       sum4   -= v4[0] * tmp0;
2696       sum5   -= v5[1] * tmp0;
2697       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2698       sum5   -= v5[0] * tmp0;
2699       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2700       break;
2701     default:
2702       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2703     }
2704   }
2705   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2706   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2707   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2708   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2709   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2710   PetscFunctionReturn(0);
2711 }
2712 
2713 
2714 /*
2715      Makes a longer coloring[] array and calls the usual code with that
2716 */
2717 #undef __FUNCT__
2718 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2719 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2720 {
2721   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2722   PetscErrorCode  ierr;
2723   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2724   PetscInt        *colorused,i;
2725   ISColoringValue *newcolor;
2726 
2727   PetscFunctionBegin;
2728   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2729   /* loop over inodes, marking a color for each column*/
2730   row = 0;
2731   for (i=0; i<m; i++){
2732     for (j=0; j<ns[i]; j++) {
2733       newcolor[row++] = coloring[i] + j*ncolors;
2734     }
2735   }
2736 
2737   /* eliminate unneeded colors */
2738   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2739   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2740   for (i=0; i<n; i++) {
2741     colorused[newcolor[i]] = 1;
2742   }
2743 
2744   for (i=1; i<5*ncolors; i++) {
2745     colorused[i] += colorused[i-1];
2746   }
2747   ncolors = colorused[5*ncolors-1];
2748   for (i=0; i<n; i++) {
2749     newcolor[i] = colorused[newcolor[i]]-1;
2750   }
2751   ierr = PetscFree(colorused);CHKERRQ(ierr);
2752   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2753   ierr = PetscFree(coloring);CHKERRQ(ierr);
2754   PetscFunctionReturn(0);
2755 }
2756 
2757 #include "../src/mat/blockinvert.h"
2758 
2759 #undef __FUNCT__
2760 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2761 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2762 {
2763   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2764   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2765   MatScalar          *ibdiag,*bdiag,work[25];
2766   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2767   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2768   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2769   PetscErrorCode     ierr;
2770   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2771   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2772 
2773   PetscFunctionBegin;
2774   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2775   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2776   if (its > 1) {
2777     /* switch to non-inode version */
2778     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2779     PetscFunctionReturn(0);
2780   }
2781 
2782   if (!a->inode.ibdiagvalid) {
2783     if (!a->inode.ibdiag) {
2784       /* calculate space needed for diagonal blocks */
2785       for (i=0; i<m; i++) {
2786 	cnt += sizes[i]*sizes[i];
2787       }
2788       a->inode.bdiagsize = cnt;
2789       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2790     }
2791 
2792     /* copy over the diagonal blocks and invert them */
2793     ibdiag = a->inode.ibdiag;
2794     bdiag  = a->inode.bdiag;
2795     cnt = 0;
2796     for (i=0, row = 0; i<m; i++) {
2797       for (j=0; j<sizes[i]; j++) {
2798         for (k=0; k<sizes[i]; k++) {
2799           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2800         }
2801       }
2802       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2803 
2804       switch(sizes[i]) {
2805         case 1:
2806           /* Create matrix data structure */
2807           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2808           ibdiag[cnt] = 1.0/ibdiag[cnt];
2809           break;
2810         case 2:
2811           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2812           break;
2813         case 3:
2814           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2815           break;
2816         case 4:
2817           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2818           break;
2819         case 5:
2820           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2821           break;
2822        default:
2823 	 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2824       }
2825       cnt += sizes[i]*sizes[i];
2826       row += sizes[i];
2827     }
2828     a->inode.ibdiagvalid = PETSC_TRUE;
2829   }
2830   ibdiag = a->inode.ibdiag;
2831   bdiag  = a->inode.bdiag;
2832 
2833 
2834   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2835   if (flag & SOR_ZERO_INITIAL_GUESS) {
2836     PetscScalar *b;
2837     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2838     if (xx != bb) {
2839       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2840     } else {
2841       b = x;
2842     }
2843     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2844 
2845       for (i=0, row=0; i<m; i++) {
2846         sz  = diag[row] - ii[row];
2847         v1  = a->a + ii[row];
2848         idx = a->j + ii[row];
2849 
2850         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2851         switch (sizes[i]){
2852           case 1:
2853 
2854             sum1  = b[row];
2855             for(n = 0; n<sz-1; n+=2) {
2856               i1   = idx[0];
2857               i2   = idx[1];
2858               idx += 2;
2859               tmp0 = x[i1];
2860               tmp1 = x[i2];
2861               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2862             }
2863 
2864             if (n == sz-1){
2865               tmp0  = x[*idx];
2866               sum1 -= *v1 * tmp0;
2867             }
2868             x[row++] = sum1*(*ibdiag++);
2869             break;
2870           case 2:
2871             v2    = a->a + ii[row+1];
2872             sum1  = b[row];
2873             sum2  = b[row+1];
2874             for(n = 0; n<sz-1; n+=2) {
2875               i1   = idx[0];
2876               i2   = idx[1];
2877               idx += 2;
2878               tmp0 = x[i1];
2879               tmp1 = x[i2];
2880               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2881               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2882             }
2883 
2884             if (n == sz-1){
2885               tmp0  = x[*idx];
2886               sum1 -= v1[0] * tmp0;
2887               sum2 -= v2[0] * tmp0;
2888             }
2889             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2890             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2891             ibdiag  += 4;
2892             break;
2893           case 3:
2894             v2    = a->a + ii[row+1];
2895             v3    = a->a + ii[row+2];
2896             sum1  = b[row];
2897             sum2  = b[row+1];
2898             sum3  = b[row+2];
2899             for(n = 0; n<sz-1; n+=2) {
2900               i1   = idx[0];
2901               i2   = idx[1];
2902               idx += 2;
2903               tmp0 = x[i1];
2904               tmp1 = x[i2];
2905               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2906               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2907               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2908             }
2909 
2910             if (n == sz-1){
2911               tmp0  = x[*idx];
2912               sum1 -= v1[0] * tmp0;
2913               sum2 -= v2[0] * tmp0;
2914               sum3 -= v3[0] * tmp0;
2915             }
2916             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2917             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2918             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2919             ibdiag  += 9;
2920             break;
2921           case 4:
2922             v2    = a->a + ii[row+1];
2923             v3    = a->a + ii[row+2];
2924             v4    = a->a + ii[row+3];
2925             sum1  = b[row];
2926             sum2  = b[row+1];
2927             sum3  = b[row+2];
2928             sum4  = b[row+3];
2929             for(n = 0; n<sz-1; n+=2) {
2930               i1   = idx[0];
2931               i2   = idx[1];
2932               idx += 2;
2933               tmp0 = x[i1];
2934               tmp1 = x[i2];
2935               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2936               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2937               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2938               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2939             }
2940 
2941             if (n == sz-1){
2942               tmp0  = x[*idx];
2943               sum1 -= v1[0] * tmp0;
2944               sum2 -= v2[0] * tmp0;
2945               sum3 -= v3[0] * tmp0;
2946               sum4 -= v4[0] * tmp0;
2947             }
2948             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2949             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2950             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2951             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2952             ibdiag  += 16;
2953             break;
2954           case 5:
2955             v2    = a->a + ii[row+1];
2956             v3    = a->a + ii[row+2];
2957             v4    = a->a + ii[row+3];
2958             v5    = a->a + ii[row+4];
2959             sum1  = b[row];
2960             sum2  = b[row+1];
2961             sum3  = b[row+2];
2962             sum4  = b[row+3];
2963             sum5  = b[row+4];
2964             for(n = 0; n<sz-1; n+=2) {
2965               i1   = idx[0];
2966               i2   = idx[1];
2967               idx += 2;
2968               tmp0 = x[i1];
2969               tmp1 = x[i2];
2970               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2971               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2972               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2973               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2974               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2975             }
2976 
2977             if (n == sz-1){
2978               tmp0  = x[*idx];
2979               sum1 -= v1[0] * tmp0;
2980               sum2 -= v2[0] * tmp0;
2981               sum3 -= v3[0] * tmp0;
2982               sum4 -= v4[0] * tmp0;
2983               sum5 -= v5[0] * tmp0;
2984             }
2985             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2986             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2987             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2988             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2989             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2990             ibdiag  += 25;
2991             break;
2992 	  default:
2993    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2994         }
2995       }
2996 
2997       xb = x;
2998       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2999     } else xb = b;
3000     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
3001         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
3002       cnt = 0;
3003       for (i=0, row=0; i<m; i++) {
3004 
3005         switch (sizes[i]){
3006           case 1:
3007             x[row++] *= bdiag[cnt++];
3008             break;
3009           case 2:
3010             x1   = x[row]; x2 = x[row+1];
3011             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3012             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3013             x[row++] = tmp1;
3014             x[row++] = tmp2;
3015             cnt += 4;
3016             break;
3017           case 3:
3018             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3019             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3020             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3021             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3022             x[row++] = tmp1;
3023             x[row++] = tmp2;
3024             x[row++] = tmp3;
3025             cnt += 9;
3026             break;
3027           case 4:
3028             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3029             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3030             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3031             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3032             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3033             x[row++] = tmp1;
3034             x[row++] = tmp2;
3035             x[row++] = tmp3;
3036             x[row++] = tmp4;
3037             cnt += 16;
3038             break;
3039           case 5:
3040             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3041             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3042             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3043             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3044             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3045             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3046             x[row++] = tmp1;
3047             x[row++] = tmp2;
3048             x[row++] = tmp3;
3049             x[row++] = tmp4;
3050             x[row++] = tmp5;
3051             cnt += 25;
3052             break;
3053 	  default:
3054    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3055         }
3056       }
3057       ierr = PetscLogFlops(m);CHKERRQ(ierr);
3058     }
3059     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3060 
3061       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3062       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3063         ibdiag -= sizes[i]*sizes[i];
3064         sz      = ii[row+1] - diag[row] - 1;
3065         v1      = a->a + diag[row] + 1;
3066         idx     = a->j + diag[row] + 1;
3067 
3068         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3069         switch (sizes[i]){
3070           case 1:
3071 
3072             sum1  = xb[row];
3073             for(n = 0; n<sz-1; n+=2) {
3074               i1   = idx[0];
3075               i2   = idx[1];
3076               idx += 2;
3077               tmp0 = x[i1];
3078               tmp1 = x[i2];
3079               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3080             }
3081 
3082             if (n == sz-1){
3083               tmp0  = x[*idx];
3084               sum1 -= *v1*tmp0;
3085             }
3086             x[row--] = sum1*(*ibdiag);
3087             break;
3088 
3089           case 2:
3090 
3091             sum1  = xb[row];
3092             sum2  = xb[row-1];
3093             /* note that sum1 is associated with the second of the two rows */
3094             v2    = a->a + diag[row-1] + 2;
3095             for(n = 0; n<sz-1; n+=2) {
3096               i1   = idx[0];
3097               i2   = idx[1];
3098               idx += 2;
3099               tmp0 = x[i1];
3100               tmp1 = x[i2];
3101               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3102               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3103             }
3104 
3105             if (n == sz-1){
3106               tmp0  = x[*idx];
3107               sum1 -= *v1*tmp0;
3108               sum2 -= *v2*tmp0;
3109             }
3110             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3111             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3112             break;
3113           case 3:
3114 
3115             sum1  = xb[row];
3116             sum2  = xb[row-1];
3117             sum3  = xb[row-2];
3118             v2    = a->a + diag[row-1] + 2;
3119             v3    = a->a + diag[row-2] + 3;
3120             for(n = 0; n<sz-1; n+=2) {
3121               i1   = idx[0];
3122               i2   = idx[1];
3123               idx += 2;
3124               tmp0 = x[i1];
3125               tmp1 = x[i2];
3126               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3127               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3128               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3129             }
3130 
3131             if (n == sz-1){
3132               tmp0  = x[*idx];
3133               sum1 -= *v1*tmp0;
3134               sum2 -= *v2*tmp0;
3135               sum3 -= *v3*tmp0;
3136             }
3137             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3138             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3139             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3140             break;
3141           case 4:
3142 
3143             sum1  = xb[row];
3144             sum2  = xb[row-1];
3145             sum3  = xb[row-2];
3146             sum4  = xb[row-3];
3147             v2    = a->a + diag[row-1] + 2;
3148             v3    = a->a + diag[row-2] + 3;
3149             v4    = a->a + diag[row-3] + 4;
3150             for(n = 0; n<sz-1; n+=2) {
3151               i1   = idx[0];
3152               i2   = idx[1];
3153               idx += 2;
3154               tmp0 = x[i1];
3155               tmp1 = x[i2];
3156               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3157               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3158               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3159               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3160             }
3161 
3162             if (n == sz-1){
3163               tmp0  = x[*idx];
3164               sum1 -= *v1*tmp0;
3165               sum2 -= *v2*tmp0;
3166               sum3 -= *v3*tmp0;
3167               sum4 -= *v4*tmp0;
3168             }
3169             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3170             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3171             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3172             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3173             break;
3174           case 5:
3175 
3176             sum1  = xb[row];
3177             sum2  = xb[row-1];
3178             sum3  = xb[row-2];
3179             sum4  = xb[row-3];
3180             sum5  = xb[row-4];
3181             v2    = a->a + diag[row-1] + 2;
3182             v3    = a->a + diag[row-2] + 3;
3183             v4    = a->a + diag[row-3] + 4;
3184             v5    = a->a + diag[row-4] + 5;
3185             for(n = 0; n<sz-1; n+=2) {
3186               i1   = idx[0];
3187               i2   = idx[1];
3188               idx += 2;
3189               tmp0 = x[i1];
3190               tmp1 = x[i2];
3191               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3192               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3193               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3194               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3195               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3196             }
3197 
3198             if (n == sz-1){
3199               tmp0  = x[*idx];
3200               sum1 -= *v1*tmp0;
3201               sum2 -= *v2*tmp0;
3202               sum3 -= *v3*tmp0;
3203               sum4 -= *v4*tmp0;
3204               sum5 -= *v5*tmp0;
3205             }
3206             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3207             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3208             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3209             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3210             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3211             break;
3212 	  default:
3213    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3214         }
3215       }
3216 
3217       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3218     }
3219     its--;
3220     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3221     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
3222   }
3223   if (flag & SOR_EISENSTAT) {
3224     const PetscScalar *b;
3225     MatScalar         *t = a->inode.ssor_work;
3226 
3227     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3228     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3229     /*
3230           Apply  (U + D)^-1  where D is now the block diagonal
3231     */
3232     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3233     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3234       ibdiag -= sizes[i]*sizes[i];
3235       sz      = ii[row+1] - diag[row] - 1;
3236       v1      = a->a + diag[row] + 1;
3237       idx     = a->j + diag[row] + 1;
3238       CHKMEMQ;
3239       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3240       switch (sizes[i]){
3241         case 1:
3242 
3243 	  sum1  = b[row];
3244 	  for(n = 0; n<sz-1; n+=2) {
3245 	    i1   = idx[0];
3246 	    i2   = idx[1];
3247 	    idx += 2;
3248 	    tmp0 = x[i1];
3249 	    tmp1 = x[i2];
3250 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3251 	  }
3252 
3253 	  if (n == sz-1){
3254 	    tmp0  = x[*idx];
3255 	    sum1 -= *v1*tmp0;
3256 	  }
3257 	  x[row] = sum1*(*ibdiag);row--;
3258 	  break;
3259 
3260         case 2:
3261 
3262 	  sum1  = b[row];
3263 	  sum2  = b[row-1];
3264 	  /* note that sum1 is associated with the second of the two rows */
3265 	  v2    = a->a + diag[row-1] + 2;
3266 	  for(n = 0; n<sz-1; n+=2) {
3267 	    i1   = idx[0];
3268 	    i2   = idx[1];
3269 	    idx += 2;
3270 	    tmp0 = x[i1];
3271 	    tmp1 = x[i2];
3272 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3273 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3274 	  }
3275 
3276 	  if (n == sz-1){
3277 	    tmp0  = x[*idx];
3278 	    sum1 -= *v1*tmp0;
3279 	    sum2 -= *v2*tmp0;
3280 	  }
3281 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3282 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3283           row -= 2;
3284 	  break;
3285         case 3:
3286 
3287 	  sum1  = b[row];
3288 	  sum2  = b[row-1];
3289 	  sum3  = b[row-2];
3290 	  v2    = a->a + diag[row-1] + 2;
3291 	  v3    = a->a + diag[row-2] + 3;
3292 	  for(n = 0; n<sz-1; n+=2) {
3293 	    i1   = idx[0];
3294 	    i2   = idx[1];
3295 	    idx += 2;
3296 	    tmp0 = x[i1];
3297 	    tmp1 = x[i2];
3298 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3299 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3300 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3301 	  }
3302 
3303 	  if (n == sz-1){
3304 	    tmp0  = x[*idx];
3305 	    sum1 -= *v1*tmp0;
3306 	    sum2 -= *v2*tmp0;
3307 	    sum3 -= *v3*tmp0;
3308 	  }
3309 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3310 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3311 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3312           row -= 3;
3313 	  break;
3314         case 4:
3315 
3316 	  sum1  = b[row];
3317 	  sum2  = b[row-1];
3318 	  sum3  = b[row-2];
3319 	  sum4  = b[row-3];
3320 	  v2    = a->a + diag[row-1] + 2;
3321 	  v3    = a->a + diag[row-2] + 3;
3322 	  v4    = a->a + diag[row-3] + 4;
3323 	  for(n = 0; n<sz-1; n+=2) {
3324 	    i1   = idx[0];
3325 	    i2   = idx[1];
3326 	    idx += 2;
3327 	    tmp0 = x[i1];
3328 	    tmp1 = x[i2];
3329 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3330 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3331 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3332 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3333 	  }
3334 
3335 	  if (n == sz-1){
3336 	    tmp0  = x[*idx];
3337 	    sum1 -= *v1*tmp0;
3338 	    sum2 -= *v2*tmp0;
3339 	    sum3 -= *v3*tmp0;
3340 	    sum4 -= *v4*tmp0;
3341 	  }
3342 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3343 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3344 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3345 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3346           row -= 4;
3347 	  break;
3348         case 5:
3349 
3350 	  sum1  = b[row];
3351 	  sum2  = b[row-1];
3352 	  sum3  = b[row-2];
3353 	  sum4  = b[row-3];
3354 	  sum5  = b[row-4];
3355 	  v2    = a->a + diag[row-1] + 2;
3356 	  v3    = a->a + diag[row-2] + 3;
3357 	  v4    = a->a + diag[row-3] + 4;
3358 	  v5    = a->a + diag[row-4] + 5;
3359 	  for(n = 0; n<sz-1; n+=2) {
3360 	    i1   = idx[0];
3361 	    i2   = idx[1];
3362 	    idx += 2;
3363 	    tmp0 = x[i1];
3364 	    tmp1 = x[i2];
3365 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3366 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3367 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3368 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3369 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3370 	  }
3371 
3372 	  if (n == sz-1){
3373 	    tmp0  = x[*idx];
3374 	    sum1 -= *v1*tmp0;
3375 	    sum2 -= *v2*tmp0;
3376 	    sum3 -= *v3*tmp0;
3377 	    sum4 -= *v4*tmp0;
3378 	    sum5 -= *v5*tmp0;
3379 	  }
3380 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3381 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3382 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3383 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3384 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3385           row -= 5;
3386 	  break;
3387         default:
3388 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3389       }
3390       CHKMEMQ;
3391     }
3392     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3393 
3394     /*
3395            t = b - D x    where D is the block diagonal
3396     */
3397     cnt = 0;
3398     for (i=0, row=0; i<m; i++) {
3399       CHKMEMQ;
3400       switch (sizes[i]){
3401         case 1:
3402 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3403 	  break;
3404         case 2:
3405 	  x1   = x[row]; x2 = x[row+1];
3406 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3407 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3408 	  t[row]   = b[row] - tmp1;
3409 	  t[row+1] = b[row+1] - tmp2; row += 2;
3410 	  cnt += 4;
3411 	  break;
3412         case 3:
3413 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3414 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3415 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3416 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3417 	  t[row] = b[row] - tmp1;
3418 	  t[row+1] = b[row+1] - tmp2;
3419 	  t[row+2] = b[row+2] - tmp3; row += 3;
3420 	  cnt += 9;
3421 	  break;
3422         case 4:
3423 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3424 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3425 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3426 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3427 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3428 	  t[row] = b[row] - tmp1;
3429 	  t[row+1] = b[row+1] - tmp2;
3430 	  t[row+2] = b[row+2] - tmp3;
3431 	  t[row+3] = b[row+3] - tmp4; row += 4;
3432 	  cnt += 16;
3433 	  break;
3434         case 5:
3435 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3436 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3437 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3438 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3439 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3440 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3441 	  t[row] = b[row] - tmp1;
3442 	  t[row+1] = b[row+1] - tmp2;
3443 	  t[row+2] = b[row+2] - tmp3;
3444 	  t[row+3] = b[row+3] - tmp4;
3445 	  t[row+4] = b[row+4] - tmp5;row += 5;
3446 	  cnt += 25;
3447 	  break;
3448         default:
3449 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3450       }
3451       CHKMEMQ;
3452     }
3453     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3454 
3455 
3456 
3457     /*
3458           Apply (L + D)^-1 where D is the block diagonal
3459     */
3460     for (i=0, row=0; i<m; i++) {
3461       sz  = diag[row] - ii[row];
3462       v1  = a->a + ii[row];
3463       idx = a->j + ii[row];
3464       CHKMEMQ;
3465       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3466       switch (sizes[i]){
3467         case 1:
3468 
3469 	  sum1  = t[row];
3470 	  for(n = 0; n<sz-1; n+=2) {
3471 	    i1   = idx[0];
3472 	    i2   = idx[1];
3473 	    idx += 2;
3474 	    tmp0 = t[i1];
3475 	    tmp1 = t[i2];
3476 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3477 	  }
3478 
3479 	  if (n == sz-1){
3480 	    tmp0  = t[*idx];
3481 	    sum1 -= *v1 * tmp0;
3482 	  }
3483 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3484 	  break;
3485         case 2:
3486 	  v2    = a->a + ii[row+1];
3487 	  sum1  = t[row];
3488 	  sum2  = t[row+1];
3489 	  for(n = 0; n<sz-1; n+=2) {
3490 	    i1   = idx[0];
3491 	    i2   = idx[1];
3492 	    idx += 2;
3493 	    tmp0 = t[i1];
3494 	    tmp1 = t[i2];
3495 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3496 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3497 	  }
3498 
3499 	  if (n == sz-1){
3500 	    tmp0  = t[*idx];
3501 	    sum1 -= v1[0] * tmp0;
3502 	    sum2 -= v2[0] * tmp0;
3503 	  }
3504 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3505 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3506 	  ibdiag  += 4; row += 2;
3507 	  break;
3508         case 3:
3509 	  v2    = a->a + ii[row+1];
3510 	  v3    = a->a + ii[row+2];
3511 	  sum1  = t[row];
3512 	  sum2  = t[row+1];
3513 	  sum3  = t[row+2];
3514 	  for(n = 0; n<sz-1; n+=2) {
3515 	    i1   = idx[0];
3516 	    i2   = idx[1];
3517 	    idx += 2;
3518 	    tmp0 = t[i1];
3519 	    tmp1 = t[i2];
3520 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3521 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3522 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3523 	  }
3524 
3525 	  if (n == sz-1){
3526 	    tmp0  = t[*idx];
3527 	    sum1 -= v1[0] * tmp0;
3528 	    sum2 -= v2[0] * tmp0;
3529 	    sum3 -= v3[0] * tmp0;
3530 	  }
3531 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3532 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3533 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3534 	  ibdiag  += 9; row += 3;
3535 	  break;
3536         case 4:
3537 	  v2    = a->a + ii[row+1];
3538 	  v3    = a->a + ii[row+2];
3539 	  v4    = a->a + ii[row+3];
3540 	  sum1  = t[row];
3541 	  sum2  = t[row+1];
3542 	  sum3  = t[row+2];
3543 	  sum4  = t[row+3];
3544 	  for(n = 0; n<sz-1; n+=2) {
3545 	    i1   = idx[0];
3546 	    i2   = idx[1];
3547 	    idx += 2;
3548 	    tmp0 = t[i1];
3549 	    tmp1 = t[i2];
3550 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3551 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3552 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3553 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3554 	  }
3555 
3556 	  if (n == sz-1){
3557 	    tmp0  = t[*idx];
3558 	    sum1 -= v1[0] * tmp0;
3559 	    sum2 -= v2[0] * tmp0;
3560 	    sum3 -= v3[0] * tmp0;
3561 	    sum4 -= v4[0] * tmp0;
3562 	  }
3563 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3564 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3565 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3566 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3567 	  ibdiag  += 16; row += 4;
3568 	  break;
3569         case 5:
3570 	  v2    = a->a + ii[row+1];
3571 	  v3    = a->a + ii[row+2];
3572 	  v4    = a->a + ii[row+3];
3573 	  v5    = a->a + ii[row+4];
3574 	  sum1  = t[row];
3575 	  sum2  = t[row+1];
3576 	  sum3  = t[row+2];
3577 	  sum4  = t[row+3];
3578 	  sum5  = t[row+4];
3579 	  for(n = 0; n<sz-1; n+=2) {
3580 	    i1   = idx[0];
3581 	    i2   = idx[1];
3582 	    idx += 2;
3583 	    tmp0 = t[i1];
3584 	    tmp1 = t[i2];
3585 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3586 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3587 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3588 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3589 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3590 	  }
3591 
3592 	  if (n == sz-1){
3593 	    tmp0  = t[*idx];
3594 	    sum1 -= v1[0] * tmp0;
3595 	    sum2 -= v2[0] * tmp0;
3596 	    sum3 -= v3[0] * tmp0;
3597 	    sum4 -= v4[0] * tmp0;
3598 	    sum5 -= v5[0] * tmp0;
3599 	  }
3600 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3601 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3602 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3603 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3604 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3605 	  ibdiag  += 25; row += 5;
3606 	  break;
3607         default:
3608 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3609       }
3610       CHKMEMQ;
3611     }
3612     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3613     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3614     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3615   }
3616   PetscFunctionReturn(0);
3617 }
3618 
3619 #undef __FUNCT__
3620 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3621 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3622 {
3623   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3624   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3625   const MatScalar    *bdiag = a->inode.bdiag;
3626   const PetscScalar  *b;
3627   PetscErrorCode      ierr;
3628   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3629   const PetscInt      *sizes = a->inode.size;
3630 
3631   PetscFunctionBegin;
3632   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3633   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3634   cnt = 0;
3635   for (i=0, row=0; i<m; i++) {
3636     switch (sizes[i]){
3637       case 1:
3638 	x[row] = b[row]*bdiag[cnt++];row++;
3639 	break;
3640       case 2:
3641 	x1   = b[row]; x2 = b[row+1];
3642 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3643 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3644 	x[row++] = tmp1;
3645 	x[row++] = tmp2;
3646 	cnt += 4;
3647 	break;
3648       case 3:
3649 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3650 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3651 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3652 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3653 	x[row++] = tmp1;
3654 	x[row++] = tmp2;
3655 	x[row++] = tmp3;
3656 	cnt += 9;
3657 	break;
3658       case 4:
3659 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3660 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3661 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3662 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3663 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3664 	x[row++] = tmp1;
3665 	x[row++] = tmp2;
3666 	x[row++] = tmp3;
3667 	x[row++] = tmp4;
3668 	cnt += 16;
3669 	break;
3670       case 5:
3671 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3672 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3673 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3674 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3675 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3676 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3677 	x[row++] = tmp1;
3678 	x[row++] = tmp2;
3679 	x[row++] = tmp3;
3680 	x[row++] = tmp4;
3681 	x[row++] = tmp5;
3682 	cnt += 25;
3683 	break;
3684       default:
3685 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3686     }
3687   }
3688   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3689   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3690   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3691   PetscFunctionReturn(0);
3692 }
3693 
3694 /*
3695     samestructure indicates that the matrix has not changed its nonzero structure so we
3696     do not need to recompute the inodes
3697 */
3698 #undef __FUNCT__
3699 #define __FUNCT__ "Mat_CheckInode"
3700 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3701 {
3702   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3703   PetscErrorCode ierr;
3704   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3705   PetscTruth     flag;
3706 
3707   PetscFunctionBegin;
3708   if (!a->inode.use)                     PetscFunctionReturn(0);
3709   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3710 
3711 
3712   m = A->rmap->n;
3713   if (a->inode.size) {ns = a->inode.size;}
3714   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3715 
3716   i          = 0;
3717   node_count = 0;
3718   idx        = a->j;
3719   ii         = a->i;
3720   while (i < m){                /* For each row */
3721     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3722     /* Limits the number of elements in a node to 'a->inode.limit' */
3723     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3724       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3725       if (nzy != nzx) break;
3726       idy  += nzx;             /* Same nonzero pattern */
3727       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3728       if (!flag) break;
3729     }
3730     ns[node_count++] = blk_size;
3731     idx += blk_size*nzx;
3732     i    = j;
3733   }
3734   /* If not enough inodes found,, do not use inode version of the routines */
3735   if (!a->inode.size && m && node_count > .9*m) {
3736     ierr = PetscFree(ns);CHKERRQ(ierr);
3737     a->inode.node_count     = 0;
3738     a->inode.size           = PETSC_NULL;
3739     a->inode.use            = PETSC_FALSE;
3740     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3741   } else {
3742     if (!A->factortype) {
3743       A->ops->mult              = MatMult_SeqAIJ_Inode;
3744       A->ops->sor               = MatSOR_SeqAIJ_Inode;
3745       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3746       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3747       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3748       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3749       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3750       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3751       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3752     } else {
3753       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
3754     }
3755     a->inode.node_count       = node_count;
3756     a->inode.size             = ns;
3757     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3758   }
3759   PetscFunctionReturn(0);
3760 }
3761 
3762 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3763 PetscInt __k, *__vi; \
3764 __vi = aj + ai[row];				\
3765 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3766 __vi = aj + adiag[row];				\
3767 cols[nzl] = __vi[0];\
3768 __vi = aj + adiag[row+1]+1;\
3769 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3770 
3771 
3772 /*
3773    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3774    Modified from Mat_CheckInode().
3775 
3776    Input Parameters:
3777 +  Mat A - ILU or LU matrix factor
3778 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3779     do not need to recompute the inodes
3780 */
3781 #undef __FUNCT__
3782 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3783 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3784 {
3785   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3786   PetscErrorCode ierr;
3787   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3788   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3789   PetscTruth     flag;
3790 
3791   PetscFunctionBegin;
3792   if (!a->inode.use)                     PetscFunctionReturn(0);
3793   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3794 
3795   m = A->rmap->n;
3796   if (a->inode.size) {ns = a->inode.size;}
3797   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3798 
3799   i          = 0;
3800   node_count = 0;
3801   while (i < m){                /* For each row */
3802     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3803     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3804     nzx  = nzl1 + nzu1 + 1;
3805     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3806     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3807 
3808     /* Limits the number of elements in a node to 'a->inode.limit' */
3809     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3810       nzl2    = ai[j+1] - ai[j];
3811       nzu2    = adiag[j] - adiag[j+1] - 1;
3812       nzy     = nzl2 + nzu2 + 1;
3813       if( nzy != nzx) break;
3814       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3815       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3816       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3817       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3818       ierr = PetscFree(cols2);CHKERRQ(ierr);
3819     }
3820     ns[node_count++] = blk_size;
3821     ierr = PetscFree(cols1);CHKERRQ(ierr);
3822     i    = j;
3823   }
3824   /* If not enough inodes found,, do not use inode version of the routines */
3825   if (!a->inode.size && m && node_count > .9*m) {
3826     ierr = PetscFree(ns);CHKERRQ(ierr);
3827     a->inode.node_count     = 0;
3828     a->inode.size           = PETSC_NULL;
3829     a->inode.use            = PETSC_FALSE;
3830     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3831   } else {
3832     A->ops->mult              = 0;
3833     A->ops->sor               = 0;
3834     A->ops->multadd           = 0;
3835     A->ops->getrowij          = 0;
3836     A->ops->restorerowij      = 0;
3837     A->ops->getcolumnij       = 0;
3838     A->ops->restorecolumnij   = 0;
3839     A->ops->coloringpatch     = 0;
3840     A->ops->multdiagonalblock = 0;
3841     A->ops->solve             = MatSolve_SeqAIJ_Inode;
3842     a->inode.node_count       = node_count;
3843     a->inode.size             = ns;
3844     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3845   }
3846   PetscFunctionReturn(0);
3847 }
3848 
3849 /*
3850      This is really ugly. if inodes are used this replaces the
3851   permutations with ones that correspond to rows/cols of the matrix
3852   rather then inode blocks
3853 */
3854 #undef __FUNCT__
3855 #define __FUNCT__ "MatInodeAdjustForInodes"
3856 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3857 {
3858   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3859 
3860   PetscFunctionBegin;
3861   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3862   if (f) {
3863     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3864   }
3865   PetscFunctionReturn(0);
3866 }
3867 
3868 EXTERN_C_BEGIN
3869 #undef __FUNCT__
3870 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3871 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3872 {
3873   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3874   PetscErrorCode ierr;
3875   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3876   const PetscInt *ridx,*cidx;
3877   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3878   PetscInt       nslim_col,*ns_col;
3879   IS             ris = *rperm,cis = *cperm;
3880 
3881   PetscFunctionBegin;
3882   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3883   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3884 
3885   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3886   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3887   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3888 
3889   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3890   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3891 
3892   /* Form the inode structure for the rows of permuted matric using inv perm*/
3893   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3894 
3895   /* Construct the permutations for rows*/
3896   for (i=0,row = 0; i<nslim_row; ++i){
3897     indx      = ridx[i];
3898     start_val = tns[indx];
3899     end_val   = tns[indx + 1];
3900     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3901   }
3902 
3903   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3904   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3905 
3906  /* Construct permutations for columns */
3907   for (i=0,col=0; i<nslim_col; ++i){
3908     indx      = cidx[i];
3909     start_val = tns[indx];
3910     end_val   = tns[indx + 1];
3911     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3912   }
3913 
3914   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3915   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3916   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3917   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3918 
3919   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3920   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3921 
3922   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3923   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3924   ierr = ISDestroy(cis);CHKERRQ(ierr);
3925   ierr = ISDestroy(ris);CHKERRQ(ierr);
3926   ierr = PetscFree(tns);CHKERRQ(ierr);
3927   PetscFunctionReturn(0);
3928 }
3929 EXTERN_C_END
3930 
3931 #undef __FUNCT__
3932 #define __FUNCT__ "MatInodeGetInodeSizes"
3933 /*@C
3934    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3935 
3936    Collective on Mat
3937 
3938    Input Parameter:
3939 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3940 
3941    Output Parameter:
3942 +  node_count - no of inodes present in the matrix.
3943 .  sizes      - an array of size node_count,with sizes of each inode.
3944 -  limit      - the max size used to generate the inodes.
3945 
3946    Level: advanced
3947 
3948    Notes: This routine returns some internal storage information
3949    of the matrix, it is intended to be used by advanced users.
3950    It should be called after the matrix is assembled.
3951    The contents of the sizes[] array should not be changed.
3952    PETSC_NULL may be passed for information not requested.
3953 
3954 .keywords: matrix, seqaij, get, inode
3955 
3956 .seealso: MatGetInfo()
3957 @*/
3958 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3959 {
3960   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3961 
3962   PetscFunctionBegin;
3963   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3964   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3965   if (f) {
3966     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3967   }
3968   PetscFunctionReturn(0);
3969 }
3970 
3971 EXTERN_C_BEGIN
3972 #undef __FUNCT__
3973 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3974 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3975 {
3976   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3977 
3978   PetscFunctionBegin;
3979   if (node_count) *node_count = a->inode.node_count;
3980   if (sizes)      *sizes      = a->inode.size;
3981   if (limit)      *limit      = a->inode.limit;
3982   PetscFunctionReturn(0);
3983 }
3984 EXTERN_C_END
3985