xref: /petsc/src/mat/impls/aij/seq/inode.c (revision f3fe499b4cc4d64bf04aa4f5e4963dcc4eb56541)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1195   FactorShiftCtx   sctx;
1196   const PetscInt   *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,node_max,col;
1200   const PetscInt   *ns;
1201   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1202 
1203   PetscFunctionBegin;
1204   /* MatPivotSetUp(): initialize shift context sctx */
1205   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1206 
1207   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1208     ddiag          = a->diag;
1209     sctx.shift_top = info->zeropivot;
1210     for (i=0; i<n; i++) {
1211       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1212       d  = (aa)[ddiag[i]];
1213       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1214       v  = aa+ai[i];
1215       nz = ai[i+1] - ai[i];
1216       for (j=0; j<nz; j++)
1217 	rs += PetscAbsScalar(v[j]);
1218       if (rs>sctx.shift_top) sctx.shift_top = rs;
1219     }
1220     sctx.shift_top   *= 1.1;
1221     sctx.nshift_max   = 5;
1222     sctx.shift_lo     = 0.;
1223     sctx.shift_hi     = 1.;
1224   }
1225 
1226   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1227   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1228 
1229   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1230   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1231   rtmp2 = rtmp1 + n;
1232   rtmp3 = rtmp2 + n;
1233   rtmp4 = rtmp3 + n;
1234   ics   = ic;
1235 
1236   node_max = a->inode.node_count;
1237   ns       = a->inode.size;
1238   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1239 
1240   /* If max inode size > 4, split it into two inodes.*/
1241   /* also map the inode sizes according to the ordering */
1242   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1243   for (i=0,j=0; i<node_max; ++i,++j){
1244     if (ns[i] > 4) {
1245       tmp_vec1[j] = 4;
1246       ++j;
1247       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1248     } else {
1249       tmp_vec1[j] = ns[i];
1250     }
1251   }
1252   /* Use the correct node_max */
1253   node_max = j;
1254 
1255   /* Now reorder the inode info based on mat re-ordering info */
1256   /* First create a row -> inode_size_array_index map */
1257   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1258   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1259   for (i=0,row=0; i<node_max; i++) {
1260     nodesz = tmp_vec1[i];
1261     for (j=0; j<nodesz; j++,row++) {
1262       nsmap[row] = i;
1263     }
1264   }
1265   /* Using nsmap, create a reordered ns structure */
1266   for (i=0,j=0; i< node_max; i++) {
1267     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1268     tmp_vec2[i]  = nodesz;
1269     j           += nodesz;
1270   }
1271   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1272   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1273 
1274   /* Now use the correct ns */
1275   ns = tmp_vec2;
1276 
1277   do {
1278     sctx.newshift = PETSC_FALSE;
1279     /* Now loop over each block-row, and do the factorization */
1280     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1281       nodesz = ns[inod];
1282 
1283       switch (nodesz){
1284       case 1:
1285       /*----------*/
1286         /* zero rtmp1 */
1287         /* L part */
1288         nz    = bi[i+1] - bi[i];
1289         bjtmp = bj + bi[i];
1290         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1291 
1292         /* U part */
1293         nz = bdiag[i]-bdiag[i+1];
1294         bjtmp = bj + bdiag[i+1]+1;
1295         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1296 
1297         /* load in initial (unfactored row) */
1298         nz    = ai[r[i]+1] - ai[r[i]];
1299         ajtmp = aj + ai[r[i]];
1300         v     = aa + ai[r[i]];
1301         for (j=0; j<nz; j++) {
1302           rtmp1[ics[ajtmp[j]]] = v[j];
1303         }
1304         /* ZeropivotApply() */
1305         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1306 
1307         /* elimination */
1308         bjtmp = bj + bi[i];
1309         row   = *bjtmp++;
1310         nzL   = bi[i+1] - bi[i];
1311         for(k=0; k < nzL;k++) {
1312           pc = rtmp1 + row;
1313           if (*pc != 0.0) {
1314             pv   = b->a + bdiag[row];
1315             mul1 = *pc * (*pv);
1316             *pc  = mul1;
1317             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1318             pv = b->a + bdiag[row+1]+1;
1319             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1320             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1321             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1322           }
1323           row = *bjtmp++;
1324         }
1325 
1326         /* finished row so stick it into b->a */
1327         rs = 0.0;
1328         /* L part */
1329         pv = b->a + bi[i] ;
1330         pj = b->j + bi[i] ;
1331         nz = bi[i+1] - bi[i];
1332         for (j=0; j<nz; j++) {
1333           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1334         }
1335 
1336         /* U part */
1337         pv = b->a + bdiag[i+1]+1;
1338         pj = b->j + bdiag[i+1]+1;
1339         nz = bdiag[i] - bdiag[i+1]-1;
1340         for (j=0; j<nz; j++) {
1341           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* Check zero pivot */
1345         sctx.rs = rs;
1346         sctx.pv = rtmp1[i];
1347         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1348 	if(sctx.newshift) break;
1349 
1350         /* Mark diagonal and invert diagonal for simplier triangular solves */
1351         pv  = b->a + bdiag[i];
1352         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1353         break;
1354 
1355       case 2:
1356       /*----------*/
1357         /* zero rtmp1 and rtmp2 */
1358         /* L part */
1359         nz    = bi[i+1] - bi[i];
1360         bjtmp = bj + bi[i];
1361         for  (j=0; j<nz; j++) {
1362           col = bjtmp[j];
1363           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1364         }
1365 
1366         /* U part */
1367         nz = bdiag[i]-bdiag[i+1];
1368         bjtmp = bj + bdiag[i+1]+1;
1369         for  (j=0; j<nz; j++) {
1370           col = bjtmp[j];
1371           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1372         }
1373 
1374         /* load in initial (unfactored row) */
1375         nz    = ai[r[i]+1] - ai[r[i]];
1376         ajtmp = aj + ai[r[i]];
1377         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1378         for (j=0; j<nz; j++) {
1379           col = ics[ajtmp[j]];
1380           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1381         }
1382         /* ZeropivotApply(): shift the diagonal of the matrix  */
1383         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1384 
1385         /* elimination */
1386         bjtmp = bj + bi[i];
1387         row   = *bjtmp++; /* pivot row */
1388         nzL   = bi[i+1] - bi[i];
1389         for(k=0; k < nzL;k++) {
1390           pc1 = rtmp1 + row;
1391           pc2 = rtmp2 + row;
1392           if (*pc1 != 0.0 || *pc2 != 0.0) {
1393             pv   = b->a + bdiag[row];
1394             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1395             *pc1 = mul1;       *pc2 = mul2;
1396 
1397             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1398             pv = b->a + bdiag[row+1]+1;
1399             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1400             for (j=0; j<nz; j++){
1401               col = pj[j];
1402               rtmp1[col] -= mul1 * pv[j];
1403               rtmp2[col] -= mul2 * pv[j];
1404             }
1405             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1406           }
1407           row = *bjtmp++;
1408         }
1409 
1410         /* finished row i; check zero pivot, then stick row i into b->a */
1411         rs  = 0.0;
1412         /* L part */
1413         pc1 = b->a + bi[i];
1414         pj  = b->j + bi[i] ;
1415         nz  = bi[i+1] - bi[i];
1416         for (j=0; j<nz; j++) {
1417           col = pj[j];
1418           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1419         }
1420         /* U part */
1421         pc1 = b->a + bdiag[i+1]+1;
1422         pj  = b->j + bdiag[i+1]+1;
1423         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1424         for (j=0; j<nz; j++) {
1425           col = pj[j];
1426           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1427         }
1428 
1429         sctx.rs  = rs;
1430         sctx.pv  = rtmp1[i];
1431         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1432 	if(sctx.newshift) break;
1433         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1434         *pc1 = 1.0/sctx.pv;
1435 
1436         /* Now take care of diagonal 2x2 block. */
1437         pc2 = rtmp2 + i;
1438         if (*pc2 != 0.0){
1439           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1440           *pc2 = mul1;          /* insert L entry */
1441           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1442           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1443           for (j=0; j<nz; j++) {
1444             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1445           }
1446           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1447         }
1448 
1449         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1450         rs = 0.0;
1451         /* L part */
1452         pc2 = b->a + bi[i+1];
1453         pj  = b->j + bi[i+1] ;
1454         nz  = bi[i+2] - bi[i+1];
1455         for (j=0; j<nz; j++) {
1456           col = pj[j];
1457           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1458         }
1459         /* U part */
1460         pc2 = b->a + bdiag[i+2]+1;
1461         pj  = b->j + bdiag[i+2]+1;
1462         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1463         for (j=0; j<nz; j++) {
1464           col = pj[j];
1465           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1466         }
1467 
1468         sctx.rs  = rs;
1469         sctx.pv  = rtmp2[i+1];
1470         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1471 	if(sctx.newshift) break;
1472         pc2  = b->a + bdiag[i+1];
1473         *pc2 = 1.0/sctx.pv;
1474         break;
1475 
1476       case 3:
1477       /*----------*/
1478         /* zero rtmp */
1479         /* L part */
1480         nz    = bi[i+1] - bi[i];
1481         bjtmp = bj + bi[i];
1482         for  (j=0; j<nz; j++) {
1483           col = bjtmp[j];
1484           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1485         }
1486 
1487         /* U part */
1488         nz = bdiag[i]-bdiag[i+1];
1489         bjtmp = bj + bdiag[i+1]+1;
1490         for  (j=0; j<nz; j++) {
1491           col = bjtmp[j];
1492           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1493         }
1494 
1495         /* load in initial (unfactored row) */
1496         nz    = ai[r[i]+1] - ai[r[i]];
1497         ajtmp = aj + ai[r[i]];
1498         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1499         for (j=0; j<nz; j++) {
1500           col = ics[ajtmp[j]];
1501           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1502         }
1503         /* ZeropivotApply(): shift the diagonal of the matrix  */
1504         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1505 
1506         /* elimination */
1507         bjtmp = bj + bi[i];
1508         row   = *bjtmp++; /* pivot row */
1509         nzL   = bi[i+1] - bi[i];
1510         for(k=0; k < nzL;k++) {
1511           pc1 = rtmp1 + row;
1512           pc2 = rtmp2 + row;
1513           pc3 = rtmp3 + row;
1514           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1515             pv  = b->a + bdiag[row];
1516             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1517             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1518 
1519             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1520             pv = b->a + bdiag[row+1]+1;
1521             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1522             for (j=0; j<nz; j++){
1523               col = pj[j];
1524               rtmp1[col] -= mul1 * pv[j];
1525               rtmp2[col] -= mul2 * pv[j];
1526               rtmp3[col] -= mul3 * pv[j];
1527             }
1528             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1529           }
1530           row = *bjtmp++;
1531         }
1532 
1533         /* finished row i; check zero pivot, then stick row i into b->a */
1534         rs  = 0.0;
1535         /* L part */
1536         pc1 = b->a + bi[i];
1537         pj  = b->j + bi[i] ;
1538         nz  = bi[i+1] - bi[i];
1539         for (j=0; j<nz; j++) {
1540           col = pj[j];
1541           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1542         }
1543         /* U part */
1544         pc1 = b->a + bdiag[i+1]+1;
1545         pj  = b->j + bdiag[i+1]+1;
1546         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1547         for (j=0; j<nz; j++) {
1548           col = pj[j];
1549           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1550         }
1551 
1552         sctx.rs  = rs;
1553         sctx.pv  = rtmp1[i];
1554         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1555 	if(sctx.newshift) break;
1556         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1557         *pc1 = 1.0/sctx.pv;
1558 
1559         /* Now take care of 1st column of diagonal 3x3 block. */
1560         pc2 = rtmp2 + i;
1561         pc3 = rtmp3 + i;
1562         if (*pc2 != 0.0 || *pc3 != 0.0){
1563           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1564           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1565           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1566           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1567           for (j=0; j<nz; j++) {
1568             col = pj[j];
1569             rtmp2[col] -= mul2 * rtmp1[col];
1570             rtmp3[col] -= mul3 * rtmp1[col];
1571           }
1572           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1573         }
1574 
1575         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1576         rs = 0.0;
1577         /* L part */
1578         pc2 = b->a + bi[i+1];
1579         pj  = b->j + bi[i+1] ;
1580         nz  = bi[i+2] - bi[i+1];
1581         for (j=0; j<nz; j++) {
1582           col = pj[j];
1583           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1584         }
1585         /* U part */
1586         pc2 = b->a + bdiag[i+2]+1;
1587         pj  = b->j + bdiag[i+2]+1;
1588         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1589         for (j=0; j<nz; j++) {
1590           col = pj[j];
1591           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1592         }
1593 
1594         sctx.rs  = rs;
1595         sctx.pv  = rtmp2[i+1];
1596         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1597 	if(sctx.newshift) break;
1598         pc2  = b->a + bdiag[i+1];
1599         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1600 
1601         /* Now take care of 2nd column of diagonal 3x3 block. */
1602         pc3 = rtmp3 + i+1;
1603         if (*pc3 != 0.0){
1604           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1605           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1606           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1607           for (j=0; j<nz; j++) {
1608             col = pj[j];
1609             rtmp3[col] -= mul3 * rtmp2[col];
1610           }
1611           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1612         }
1613 
1614         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1615         rs = 0.0;
1616         /* L part */
1617         pc3 = b->a + bi[i+2];
1618         pj  = b->j + bi[i+2] ;
1619         nz  = bi[i+3] - bi[i+2];
1620         for (j=0; j<nz; j++) {
1621           col = pj[j];
1622           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1623         }
1624         /* U part */
1625         pc3 = b->a + bdiag[i+3]+1;
1626         pj  = b->j + bdiag[i+3]+1;
1627         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1628         for (j=0; j<nz; j++) {
1629           col = pj[j];
1630           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1631         }
1632 
1633         sctx.rs  = rs;
1634         sctx.pv  = rtmp3[i+2];
1635         ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr);
1636 	if(sctx.newshift) break;
1637         pc3  = b->a + bdiag[i+2];
1638         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1639         break;
1640       case 4:
1641       /*----------*/
1642         /* zero rtmp */
1643         /* L part */
1644         nz    = bi[i+1] - bi[i];
1645         bjtmp = bj + bi[i];
1646         for  (j=0; j<nz; j++) {
1647           col = bjtmp[j];
1648           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1649         }
1650 
1651         /* U part */
1652         nz = bdiag[i]-bdiag[i+1];
1653         bjtmp = bj + bdiag[i+1]+1;
1654         for  (j=0; j<nz; j++) {
1655           col = bjtmp[j];
1656           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1657         }
1658 
1659         /* load in initial (unfactored row) */
1660         nz    = ai[r[i]+1] - ai[r[i]];
1661         ajtmp = aj + ai[r[i]];
1662         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1663         for (j=0; j<nz; j++) {
1664           col = ics[ajtmp[j]];
1665           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1666         }
1667         /* ZeropivotApply(): shift the diagonal of the matrix  */
1668         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1669 
1670         /* elimination */
1671         bjtmp = bj + bi[i];
1672         row   = *bjtmp++; /* pivot row */
1673         nzL   = bi[i+1] - bi[i];
1674         for(k=0; k < nzL;k++) {
1675           pc1 = rtmp1 + row;
1676           pc2 = rtmp2 + row;
1677           pc3 = rtmp3 + row;
1678 	  pc4 = rtmp4 + row;
1679           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1680             pv  = b->a + bdiag[row];
1681             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1682             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1683 
1684             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1685             pv = b->a + bdiag[row+1]+1;
1686             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1687             for (j=0; j<nz; j++){
1688               col = pj[j];
1689               rtmp1[col] -= mul1 * pv[j];
1690               rtmp2[col] -= mul2 * pv[j];
1691               rtmp3[col] -= mul3 * pv[j];
1692 	      rtmp4[col] -= mul4 * pv[j];
1693             }
1694             ierr = PetscLogFlops(8*nz);CHKERRQ(ierr);
1695           }
1696           row = *bjtmp++;
1697         }
1698 
1699         /* finished row i; check zero pivot, then stick row i into b->a */
1700         rs  = 0.0;
1701         /* L part */
1702         pc1 = b->a + bi[i];
1703         pj  = b->j + bi[i] ;
1704         nz  = bi[i+1] - bi[i];
1705         for (j=0; j<nz; j++) {
1706           col = pj[j];
1707           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1708         }
1709         /* U part */
1710         pc1 = b->a + bdiag[i+1]+1;
1711         pj  = b->j + bdiag[i+1]+1;
1712         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1713         for (j=0; j<nz; j++) {
1714           col = pj[j];
1715           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1716         }
1717 
1718         sctx.rs  = rs;
1719         sctx.pv  = rtmp1[i];
1720         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1721 	if(sctx.newshift) break;
1722         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1723         *pc1 = 1.0/sctx.pv;
1724 
1725         /* Now take care of 1st column of diagonal 4x4 block. */
1726         pc2 = rtmp2 + i;
1727         pc3 = rtmp3 + i;
1728 	pc4 = rtmp4 + i;
1729         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1730           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1731           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1732 	  mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1733           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1734           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1735           for (j=0; j<nz; j++) {
1736             col = pj[j];
1737             rtmp2[col] -= mul2 * rtmp1[col];
1738             rtmp3[col] -= mul3 * rtmp1[col];
1739 	    rtmp4[col] -= mul4 * rtmp1[col];
1740           }
1741           ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1742         }
1743 
1744         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1745         rs = 0.0;
1746         /* L part */
1747         pc2 = b->a + bi[i+1];
1748         pj  = b->j + bi[i+1] ;
1749         nz  = bi[i+2] - bi[i+1];
1750         for (j=0; j<nz; j++) {
1751           col = pj[j];
1752           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1753         }
1754         /* U part */
1755         pc2 = b->a + bdiag[i+2]+1;
1756         pj  = b->j + bdiag[i+2]+1;
1757         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1758         for (j=0; j<nz; j++) {
1759           col = pj[j];
1760           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1761         }
1762 
1763         sctx.rs  = rs;
1764         sctx.pv  = rtmp2[i+1];
1765         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1766 	if(sctx.newshift) break;
1767         pc2  = b->a + bdiag[i+1];
1768         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1769 
1770         /* Now take care of 2nd column of diagonal 4x4 block. */
1771         pc3 = rtmp3 + i+1;
1772 	pc4 = rtmp4 + i+1;
1773         if (*pc3 != 0.0 || *pc4 != 0.0){
1774           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1775 	  mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1776           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1777           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1778           for (j=0; j<nz; j++) {
1779             col = pj[j];
1780             rtmp3[col] -= mul3 * rtmp2[col];
1781 	    rtmp4[col] -= mul4 * rtmp2[col];
1782           }
1783           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1784         }
1785 
1786         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1787         rs = 0.0;
1788         /* L part */
1789         pc3 = b->a + bi[i+2];
1790         pj  = b->j + bi[i+2] ;
1791         nz  = bi[i+3] - bi[i+2];
1792         for (j=0; j<nz; j++) {
1793           col = pj[j];
1794           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1795         }
1796         /* U part */
1797         pc3 = b->a + bdiag[i+3]+1;
1798         pj  = b->j + bdiag[i+3]+1;
1799         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1800         for (j=0; j<nz; j++) {
1801           col = pj[j];
1802           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1803         }
1804 
1805         sctx.rs  = rs;
1806         sctx.pv  = rtmp3[i+2];
1807         ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr);
1808 	if(sctx.newshift) break;
1809         pc3  = b->a + bdiag[i+2];
1810         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1811 
1812 	/* Now take care of 3rd column of diagonal 4x4 block. */
1813 	pc4 = rtmp4 + i+2;
1814         if (*pc4 != 0.0){
1815 	  mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1816           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1817           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1818           for (j=0; j<nz; j++) {
1819             col = pj[j];
1820 	    rtmp4[col] -= mul4 * rtmp3[col];
1821           }
1822           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1823         }
1824 
1825         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1826         rs = 0.0;
1827         /* L part */
1828         pc4 = b->a + bi[i+3];
1829         pj  = b->j + bi[i+3] ;
1830         nz  = bi[i+4] - bi[i+3];
1831         for (j=0; j<nz; j++) {
1832           col = pj[j];
1833           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1834         }
1835         /* U part */
1836         pc4 = b->a + bdiag[i+4]+1;
1837         pj  = b->j + bdiag[i+4]+1;
1838         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1839         for (j=0; j<nz; j++) {
1840           col = pj[j];
1841           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1842         }
1843 
1844         sctx.rs  = rs;
1845         sctx.pv  = rtmp4[i+3];
1846         ierr = MatPivotCheck(info,&sctx,i+3);CHKERRQ(ierr);
1847 	if(sctx.newshift) break;
1848         pc4  = b->a + bdiag[i+3];
1849         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1850          break;
1851 
1852       default:
1853         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1854       }
1855       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1856       i += nodesz;                 /* Update the row */
1857     }
1858 
1859     /* MatPivotRefine() */
1860     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1861       /*
1862        * if no shift in this attempt & shifting & started shifting & can refine,
1863        * then try lower shift
1864        */
1865       sctx.shift_hi       = sctx.shift_fraction;
1866       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1867       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1868       sctx.newshift       = PETSC_TRUE;
1869       sctx.nshift++;
1870     }
1871   } while (sctx.newshift);
1872 
1873   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1874   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1875   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1876   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1877 
1878   C->ops->solve              = MatSolve_SeqAIJ;
1879   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1880   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1881   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1882   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1883   C->assembled    = PETSC_TRUE;
1884   C->preallocated = PETSC_TRUE;
1885   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1886 
1887   /* MatShiftView(A,info,&sctx) */
1888   if (sctx.nshift){
1889     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1890       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1891     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1892       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1893     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1894       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1895     }
1896   }
1897   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1898   PetscFunctionReturn(0);
1899 }
1900 
1901 #undef __FUNCT__
1902 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1903 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1904 {
1905   Mat               C = B;
1906   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1907   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1908   PetscErrorCode    ierr;
1909   const PetscInt    *r,*ic,*c,*ics;
1910   PetscInt          n = A->rmap->n,*bi = b->i;
1911   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1912   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1913   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1914   PetscScalar       mul1,mul2,mul3,tmp;
1915   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1916   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1917   PetscReal         rs=0.0;
1918   FactorShiftCtx    sctx;
1919 
1920   PetscFunctionBegin;
1921   sctx.shift_top      = 0;
1922   sctx.nshift_max     = 0;
1923   sctx.shift_lo       = 0;
1924   sctx.shift_hi       = 0;
1925   sctx.shift_fraction = 0;
1926 
1927   /* if both shift schemes are chosen by user, only use info->shiftpd */
1928   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1929     sctx.shift_top = 0;
1930     for (i=0; i<n; i++) {
1931       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1932       rs    = 0.0;
1933       ajtmp = aj + ai[i];
1934       rtmp1 = aa + ai[i];
1935       nz = ai[i+1] - ai[i];
1936       for (j=0; j<nz; j++){
1937         if (*ajtmp != i){
1938           rs += PetscAbsScalar(*rtmp1++);
1939         } else {
1940           rs -= PetscRealPart(*rtmp1++);
1941         }
1942         ajtmp++;
1943       }
1944       if (rs>sctx.shift_top) sctx.shift_top = rs;
1945     }
1946     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1947     sctx.shift_top *= 1.1;
1948     sctx.nshift_max = 5;
1949     sctx.shift_lo   = 0.;
1950     sctx.shift_hi   = 1.;
1951   }
1952   sctx.shift_amount = 0;
1953   sctx.nshift       = 0;
1954 
1955   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1956   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1957   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1958   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1959   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1960   ics   = ic ;
1961   rtmp22 = rtmp11 + n;
1962   rtmp33 = rtmp22 + n;
1963 
1964   node_max = a->inode.node_count;
1965   ns       = a->inode.size;
1966   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1967 
1968   /* If max inode size > 3, split it into two inodes.*/
1969   /* also map the inode sizes according to the ordering */
1970   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1971   for (i=0,j=0; i<node_max; ++i,++j){
1972     if (ns[i]>3) {
1973       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1974       ++j;
1975       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1976     } else {
1977       tmp_vec1[j] = ns[i];
1978     }
1979   }
1980   /* Use the correct node_max */
1981   node_max = j;
1982 
1983   /* Now reorder the inode info based on mat re-ordering info */
1984   /* First create a row -> inode_size_array_index map */
1985   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1986   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1987   for (i=0,row=0; i<node_max; i++) {
1988     nodesz = tmp_vec1[i];
1989     for (j=0; j<nodesz; j++,row++) {
1990       nsmap[row] = i;
1991     }
1992   }
1993   /* Using nsmap, create a reordered ns structure */
1994   for (i=0,j=0; i< node_max; i++) {
1995     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1996     tmp_vec2[i]  = nodesz;
1997     j           += nodesz;
1998   }
1999   ierr = PetscFree(nsmap);CHKERRQ(ierr);
2000   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2001   /* Now use the correct ns */
2002   ns = tmp_vec2;
2003 
2004   do {
2005     sctx.newshift = PETSC_FALSE;
2006     /* Now loop over each block-row, and do the factorization */
2007     for (i=0,row=0; i<node_max; i++) {
2008       nodesz = ns[i];
2009       nz     = bi[row+1] - bi[row];
2010       bjtmp  = bj + bi[row];
2011 
2012       switch (nodesz){
2013       case 1:
2014         for  (j=0; j<nz; j++){
2015           idx        = bjtmp[j];
2016           rtmp11[idx] = 0.0;
2017         }
2018 
2019         /* load in initial (unfactored row) */
2020         idx    = r[row];
2021         nz_tmp = ai[idx+1] - ai[idx];
2022         ajtmp  = aj + ai[idx];
2023         v1     = aa + ai[idx];
2024 
2025         for (j=0; j<nz_tmp; j++) {
2026           idx        = ics[ajtmp[j]];
2027           rtmp11[idx] = v1[j];
2028         }
2029         rtmp11[ics[r[row]]] += sctx.shift_amount;
2030 
2031         prow = *bjtmp++ ;
2032         while (prow < row) {
2033           pc1 = rtmp11 + prow;
2034           if (*pc1 != 0.0){
2035             pv   = ba + bd[prow];
2036             pj   = nbj + bd[prow];
2037             mul1 = *pc1 * *pv++;
2038             *pc1 = mul1;
2039             nz_tmp = bi[prow+1] - bd[prow] - 1;
2040             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2041             for (j=0; j<nz_tmp; j++) {
2042               tmp = pv[j];
2043               idx = pj[j];
2044               rtmp11[idx] -= mul1 * tmp;
2045             }
2046           }
2047           prow = *bjtmp++ ;
2048         }
2049         pj  = bj + bi[row];
2050         pc1 = ba + bi[row];
2051 
2052         sctx.pv    = rtmp11[row];
2053         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2054         rs         = 0.0;
2055         for (j=0; j<nz; j++) {
2056           idx    = pj[j];
2057           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2058           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2059         }
2060         sctx.rs  = rs;
2061         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2062         if (sctx.newshift) goto endofwhile;
2063         break;
2064 
2065       case 2:
2066         for (j=0; j<nz; j++) {
2067           idx        = bjtmp[j];
2068           rtmp11[idx] = 0.0;
2069           rtmp22[idx] = 0.0;
2070         }
2071 
2072         /* load in initial (unfactored row) */
2073         idx    = r[row];
2074         nz_tmp = ai[idx+1] - ai[idx];
2075         ajtmp  = aj + ai[idx];
2076         v1     = aa + ai[idx];
2077         v2     = aa + ai[idx+1];
2078         for (j=0; j<nz_tmp; j++) {
2079           idx        = ics[ajtmp[j]];
2080           rtmp11[idx] = v1[j];
2081           rtmp22[idx] = v2[j];
2082         }
2083         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2084         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2085 
2086         prow = *bjtmp++ ;
2087         while (prow < row) {
2088           pc1 = rtmp11 + prow;
2089           pc2 = rtmp22 + prow;
2090           if (*pc1 != 0.0 || *pc2 != 0.0){
2091             pv   = ba + bd[prow];
2092             pj   = nbj + bd[prow];
2093             mul1 = *pc1 * *pv;
2094             mul2 = *pc2 * *pv;
2095             ++pv;
2096             *pc1 = mul1;
2097             *pc2 = mul2;
2098 
2099             nz_tmp = bi[prow+1] - bd[prow] - 1;
2100             for (j=0; j<nz_tmp; j++) {
2101               tmp = pv[j];
2102               idx = pj[j];
2103               rtmp11[idx] -= mul1 * tmp;
2104               rtmp22[idx] -= mul2 * tmp;
2105             }
2106             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2107           }
2108           prow = *bjtmp++ ;
2109         }
2110 
2111         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2112         pc1 = rtmp11 + prow;
2113         pc2 = rtmp22 + prow;
2114 
2115         sctx.pv = *pc1;
2116         pj      = bj + bi[prow];
2117         rs      = 0.0;
2118         for (j=0; j<nz; j++){
2119           idx = pj[j];
2120           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2121         }
2122         sctx.rs = rs;
2123         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2124         if (sctx.newshift) goto endofwhile;
2125 
2126         if (*pc2 != 0.0){
2127           pj     = nbj + bd[prow];
2128           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2129           *pc2   = mul2;
2130           nz_tmp = bi[prow+1] - bd[prow] - 1;
2131           for (j=0; j<nz_tmp; j++) {
2132             idx = pj[j] ;
2133             tmp = rtmp11[idx];
2134             rtmp22[idx] -= mul2 * tmp;
2135           }
2136           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2137         }
2138 
2139         pj  = bj + bi[row];
2140         pc1 = ba + bi[row];
2141         pc2 = ba + bi[row+1];
2142 
2143         sctx.pv = rtmp22[row+1];
2144         rs = 0.0;
2145         rtmp11[row]   = 1.0/rtmp11[row];
2146         rtmp22[row+1] = 1.0/rtmp22[row+1];
2147         /* copy row entries from dense representation to sparse */
2148         for (j=0; j<nz; j++) {
2149           idx    = pj[j];
2150           pc1[j] = rtmp11[idx];
2151           pc2[j] = rtmp22[idx];
2152           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2153         }
2154         sctx.rs = rs;
2155         ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr);
2156         if (sctx.newshift) goto endofwhile;
2157         break;
2158 
2159       case 3:
2160         for  (j=0; j<nz; j++) {
2161           idx        = bjtmp[j];
2162           rtmp11[idx] = 0.0;
2163           rtmp22[idx] = 0.0;
2164           rtmp33[idx] = 0.0;
2165         }
2166         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2167         idx    = r[row];
2168         nz_tmp = ai[idx+1] - ai[idx];
2169         ajtmp = aj + ai[idx];
2170         v1    = aa + ai[idx];
2171         v2    = aa + ai[idx+1];
2172         v3    = aa + ai[idx+2];
2173         for (j=0; j<nz_tmp; j++) {
2174           idx        = ics[ajtmp[j]];
2175           rtmp11[idx] = v1[j];
2176           rtmp22[idx] = v2[j];
2177           rtmp33[idx] = v3[j];
2178         }
2179         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2180         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2181         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2182 
2183         /* loop over all pivot row blocks above this row block */
2184         prow = *bjtmp++ ;
2185         while (prow < row) {
2186           pc1 = rtmp11 + prow;
2187           pc2 = rtmp22 + prow;
2188           pc3 = rtmp33 + prow;
2189           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2190             pv   = ba  + bd[prow];
2191             pj   = nbj + bd[prow];
2192             mul1 = *pc1 * *pv;
2193             mul2 = *pc2 * *pv;
2194             mul3 = *pc3 * *pv;
2195             ++pv;
2196             *pc1 = mul1;
2197             *pc2 = mul2;
2198             *pc3 = mul3;
2199 
2200             nz_tmp = bi[prow+1] - bd[prow] - 1;
2201             /* update this row based on pivot row */
2202             for (j=0; j<nz_tmp; j++) {
2203               tmp = pv[j];
2204               idx = pj[j];
2205               rtmp11[idx] -= mul1 * tmp;
2206               rtmp22[idx] -= mul2 * tmp;
2207               rtmp33[idx] -= mul3 * tmp;
2208             }
2209             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
2210           }
2211           prow = *bjtmp++ ;
2212         }
2213 
2214         /* Now take care of diagonal 3x3 block in this set of rows */
2215         /* note: prow = row here */
2216         pc1 = rtmp11 + prow;
2217         pc2 = rtmp22 + prow;
2218         pc3 = rtmp33 + prow;
2219 
2220         sctx.pv = *pc1;
2221         pj      = bj + bi[prow];
2222         rs      = 0.0;
2223         for (j=0; j<nz; j++){
2224           idx = pj[j];
2225           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2226         }
2227         sctx.rs = rs;
2228         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2229         if (sctx.newshift) goto endofwhile;
2230 
2231         if (*pc2 != 0.0 || *pc3 != 0.0){
2232           mul2 = (*pc2)/(*pc1);
2233           mul3 = (*pc3)/(*pc1);
2234           *pc2 = mul2;
2235           *pc3 = mul3;
2236           nz_tmp = bi[prow+1] - bd[prow] - 1;
2237           pj     = nbj + bd[prow];
2238           for (j=0; j<nz_tmp; j++) {
2239             idx = pj[j] ;
2240             tmp = rtmp11[idx];
2241             rtmp22[idx] -= mul2 * tmp;
2242             rtmp33[idx] -= mul3 * tmp;
2243           }
2244           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2245         }
2246         ++prow;
2247 
2248         pc2 = rtmp22 + prow;
2249         pc3 = rtmp33 + prow;
2250         sctx.pv = *pc2;
2251         pj      = bj + bi[prow];
2252         rs      = 0.0;
2253         for (j=0; j<nz; j++){
2254           idx = pj[j];
2255           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2256         }
2257         sctx.rs = rs;
2258         ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr);
2259         if (sctx.newshift) goto endofwhile;
2260 
2261         if (*pc3 != 0.0){
2262           mul3   = (*pc3)/(*pc2);
2263           *pc3   = mul3;
2264           pj     = nbj + bd[prow];
2265           nz_tmp = bi[prow+1] - bd[prow] - 1;
2266           for (j=0; j<nz_tmp; j++) {
2267             idx = pj[j] ;
2268             tmp = rtmp22[idx];
2269             rtmp33[idx] -= mul3 * tmp;
2270           }
2271           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2272         }
2273 
2274         pj  = bj + bi[row];
2275         pc1 = ba + bi[row];
2276         pc2 = ba + bi[row+1];
2277         pc3 = ba + bi[row+2];
2278 
2279         sctx.pv = rtmp33[row+2];
2280         rs = 0.0;
2281         rtmp11[row]   = 1.0/rtmp11[row];
2282         rtmp22[row+1] = 1.0/rtmp22[row+1];
2283         rtmp33[row+2] = 1.0/rtmp33[row+2];
2284         /* copy row entries from dense representation to sparse */
2285         for (j=0; j<nz; j++) {
2286           idx    = pj[j];
2287           pc1[j] = rtmp11[idx];
2288           pc2[j] = rtmp22[idx];
2289           pc3[j] = rtmp33[idx];
2290           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2291         }
2292 
2293         sctx.rs = rs;
2294         ierr = MatPivotCheck(info,&sctx,row+2);CHKERRQ(ierr);
2295         if (sctx.newshift) goto endofwhile;
2296         break;
2297 
2298       default:
2299         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2300       }
2301       row += nodesz;                 /* Update the row */
2302     }
2303     endofwhile:;
2304   } while (sctx.newshift);
2305   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2306   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2307   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2308   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2309   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2310   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2311   /* do not set solve add, since MatSolve_Inode + Add is faster */
2312   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2313   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2314   C->assembled   = PETSC_TRUE;
2315   C->preallocated = PETSC_TRUE;
2316   if (sctx.nshift) {
2317     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2318       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2319     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2320       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2321     }
2322   }
2323   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2324   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2325   PetscFunctionReturn(0);
2326 }
2327 
2328 
2329 /* ----------------------------------------------------------- */
2330 #undef __FUNCT__
2331 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2332 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2333 {
2334   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2335   IS                iscol = a->col,isrow = a->row;
2336   PetscErrorCode    ierr;
2337   const PetscInt    *r,*c,*rout,*cout;
2338   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2339   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2340   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2341   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2342   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2343   const PetscScalar *b;
2344 
2345   PetscFunctionBegin;
2346   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2347   node_max = a->inode.node_count;
2348   ns       = a->inode.size;     /* Node Size array */
2349 
2350   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2351   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2352   tmp  = a->solve_work;
2353 
2354   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2355   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2356 
2357   /* forward solve the lower triangular */
2358   tmps = tmp ;
2359   aa   = a_a ;
2360   aj   = a_j ;
2361   ad   = a->diag;
2362 
2363   for (i = 0,row = 0; i< node_max; ++i){
2364     nsz = ns[i];
2365     aii = ai[row];
2366     v1  = aa + aii;
2367     vi  = aj + aii;
2368     nz  = ai[row+1]- ai[row];
2369 
2370     if (i < node_max-1) {
2371       /* Prefetch the indices for the next block */
2372       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2373       /* Prefetch the data for the next block */
2374       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2375     }
2376 
2377     switch (nsz){               /* Each loop in 'case' is unrolled */
2378     case 1 :
2379       sum1 = b[r[row]];
2380       for(j=0; j<nz-1; j+=2){
2381         i0   = vi[j];
2382         i1   = vi[j+1];
2383         tmp0 = tmps[i0];
2384         tmp1 = tmps[i1];
2385         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2386       }
2387       if(j == nz-1){
2388         tmp0 = tmps[vi[j]];
2389         sum1 -= v1[j]*tmp0;
2390       }
2391       tmp[row++]=sum1;
2392       break;
2393     case 2:
2394       sum1 = b[r[row]];
2395       sum2 = b[r[row+1]];
2396       v2   = aa + ai[row+1];
2397 
2398       for(j=0; j<nz-1; j+=2){
2399         i0   = vi[j];
2400         i1   = vi[j+1];
2401         tmp0 = tmps[i0];
2402         tmp1 = tmps[i1];
2403         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2404         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2405       }
2406       if(j == nz-1){
2407         tmp0 = tmps[vi[j]];
2408         sum1 -= v1[j] *tmp0;
2409         sum2 -= v2[j] *tmp0;
2410       }
2411       sum2 -= v2[nz] * sum1;
2412       tmp[row ++]=sum1;
2413       tmp[row ++]=sum2;
2414       break;
2415     case 3:
2416       sum1 = b[r[row]];
2417       sum2 = b[r[row+1]];
2418       sum3 = b[r[row+2]];
2419       v2   = aa + ai[row+1];
2420       v3   = aa + ai[row+2];
2421 
2422       for (j=0; j<nz-1; j+=2){
2423         i0   = vi[j];
2424         i1   = vi[j+1];
2425         tmp0 = tmps[i0];
2426         tmp1 = tmps[i1];
2427         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2428         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2429         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2430       }
2431       if (j == nz-1){
2432         tmp0 = tmps[vi[j]];
2433         sum1 -= v1[j] *tmp0;
2434         sum2 -= v2[j] *tmp0;
2435         sum3 -= v3[j] *tmp0;
2436       }
2437       sum2 -= v2[nz] * sum1;
2438       sum3 -= v3[nz] * sum1;
2439       sum3 -= v3[nz+1] * sum2;
2440       tmp[row ++]=sum1;
2441       tmp[row ++]=sum2;
2442       tmp[row ++]=sum3;
2443       break;
2444 
2445     case 4:
2446       sum1 = b[r[row]];
2447       sum2 = b[r[row+1]];
2448       sum3 = b[r[row+2]];
2449       sum4 = b[r[row+3]];
2450       v2   = aa + ai[row+1];
2451       v3   = aa + ai[row+2];
2452       v4   = aa + ai[row+3];
2453 
2454       for (j=0; j<nz-1; j+=2){
2455         i0   = vi[j];
2456         i1   = vi[j+1];
2457         tmp0 = tmps[i0];
2458         tmp1 = tmps[i1];
2459         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2460         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2461         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2462         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2463       }
2464       if (j == nz-1){
2465         tmp0 = tmps[vi[j]];
2466         sum1 -= v1[j] *tmp0;
2467         sum2 -= v2[j] *tmp0;
2468         sum3 -= v3[j] *tmp0;
2469         sum4 -= v4[j] *tmp0;
2470       }
2471       sum2 -= v2[nz] * sum1;
2472       sum3 -= v3[nz] * sum1;
2473       sum4 -= v4[nz] * sum1;
2474       sum3 -= v3[nz+1] * sum2;
2475       sum4 -= v4[nz+1] * sum2;
2476       sum4 -= v4[nz+2] * sum3;
2477 
2478       tmp[row ++]=sum1;
2479       tmp[row ++]=sum2;
2480       tmp[row ++]=sum3;
2481       tmp[row ++]=sum4;
2482       break;
2483     case 5:
2484       sum1 = b[r[row]];
2485       sum2 = b[r[row+1]];
2486       sum3 = b[r[row+2]];
2487       sum4 = b[r[row+3]];
2488       sum5 = b[r[row+4]];
2489       v2   = aa + ai[row+1];
2490       v3   = aa + ai[row+2];
2491       v4   = aa + ai[row+3];
2492       v5   = aa + ai[row+4];
2493 
2494       for (j=0; j<nz-1; j+=2){
2495         i0   = vi[j];
2496         i1   = vi[j+1];
2497         tmp0 = tmps[i0];
2498         tmp1 = tmps[i1];
2499         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2500         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2501         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2502         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2503         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2504       }
2505       if (j == nz-1){
2506         tmp0 = tmps[vi[j]];
2507         sum1 -= v1[j] *tmp0;
2508         sum2 -= v2[j] *tmp0;
2509         sum3 -= v3[j] *tmp0;
2510         sum4 -= v4[j] *tmp0;
2511         sum5 -= v5[j] *tmp0;
2512       }
2513 
2514       sum2 -= v2[nz] * sum1;
2515       sum3 -= v3[nz] * sum1;
2516       sum4 -= v4[nz] * sum1;
2517       sum5 -= v5[nz] * sum1;
2518       sum3 -= v3[nz+1] * sum2;
2519       sum4 -= v4[nz+1] * sum2;
2520       sum5 -= v5[nz+1] * sum2;
2521       sum4 -= v4[nz+2] * sum3;
2522       sum5 -= v5[nz+2] * sum3;
2523       sum5 -= v5[nz+3] * sum4;
2524 
2525       tmp[row ++]=sum1;
2526       tmp[row ++]=sum2;
2527       tmp[row ++]=sum3;
2528       tmp[row ++]=sum4;
2529       tmp[row ++]=sum5;
2530       break;
2531     default:
2532       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2533     }
2534   }
2535   /* backward solve the upper triangular */
2536   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2537     nsz = ns[i];
2538     aii = ad[row+1] + 1;
2539     v1  = aa + aii;
2540     vi  = aj + aii;
2541     nz  = ad[row]- ad[row+1] - 1;
2542 
2543     if (i > 0) {
2544       /* Prefetch the indices for the next block */
2545       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2546       /* Prefetch the data for the next block */
2547       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2548     }
2549 
2550     switch (nsz){               /* Each loop in 'case' is unrolled */
2551     case 1 :
2552       sum1 = tmp[row];
2553 
2554       for(j=0 ; j<nz-1; j+=2){
2555         i0   = vi[j];
2556         i1   = vi[j+1];
2557         tmp0 = tmps[i0];
2558         tmp1 = tmps[i1];
2559         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2560       }
2561       if (j == nz-1){
2562         tmp0  = tmps[vi[j]];
2563         sum1 -= v1[j]*tmp0;
2564       }
2565       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2566       break;
2567     case 2 :
2568       sum1 = tmp[row];
2569       sum2 = tmp[row-1];
2570       v2   = aa + ad[row] + 1;
2571       for (j=0 ; j<nz-1; j+=2){
2572         i0   = vi[j];
2573         i1   = vi[j+1];
2574         tmp0 = tmps[i0];
2575         tmp1 = tmps[i1];
2576         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2577         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2578       }
2579       if (j == nz-1){
2580         tmp0  = tmps[vi[j]];
2581         sum1 -= v1[j]* tmp0;
2582         sum2 -= v2[j+1]* tmp0;
2583       }
2584 
2585       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2586       sum2   -= v2[0] * tmp0;
2587       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2588       break;
2589     case 3 :
2590       sum1 = tmp[row];
2591       sum2 = tmp[row -1];
2592       sum3 = tmp[row -2];
2593       v2   = aa + ad[row] + 1;
2594       v3   = aa + ad[row -1] + 1;
2595       for (j=0 ; j<nz-1; j+=2){
2596         i0   = vi[j];
2597         i1   = vi[j+1];
2598         tmp0 = tmps[i0];
2599         tmp1 = tmps[i1];
2600         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2601         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2602         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2603       }
2604       if (j== nz-1){
2605         tmp0  = tmps[vi[j]];
2606         sum1 -= v1[j] * tmp0;
2607         sum2 -= v2[j+1] * tmp0;
2608         sum3 -= v3[j+2] * tmp0;
2609       }
2610       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2611       sum2   -= v2[0]* tmp0;
2612       sum3   -= v3[1] * tmp0;
2613       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2614       sum3   -= v3[0]* tmp0;
2615       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2616 
2617       break;
2618     case 4 :
2619       sum1 = tmp[row];
2620       sum2 = tmp[row -1];
2621       sum3 = tmp[row -2];
2622       sum4 = tmp[row -3];
2623       v2   = aa + ad[row]+1;
2624       v3   = aa + ad[row -1]+1;
2625       v4   = aa + ad[row -2]+1;
2626 
2627       for (j=0 ; j<nz-1; j+=2){
2628         i0   = vi[j];
2629         i1   = vi[j+1];
2630         tmp0 = tmps[i0];
2631         tmp1 = tmps[i1];
2632         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2633         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2634         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2635         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2636       }
2637       if (j== nz-1){
2638         tmp0  = tmps[vi[j]];
2639         sum1 -= v1[j] * tmp0;
2640         sum2 -= v2[j+1] * tmp0;
2641         sum3 -= v3[j+2] * tmp0;
2642         sum4 -= v4[j+3] * tmp0;
2643       }
2644 
2645       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2646       sum2   -= v2[0] * tmp0;
2647       sum3   -= v3[1] * tmp0;
2648       sum4   -= v4[2] * tmp0;
2649       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2650       sum3   -= v3[0] * tmp0;
2651       sum4   -= v4[1] * tmp0;
2652       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2653       sum4   -= v4[0] * tmp0;
2654       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2655       break;
2656     case 5 :
2657       sum1 = tmp[row];
2658       sum2 = tmp[row -1];
2659       sum3 = tmp[row -2];
2660       sum4 = tmp[row -3];
2661       sum5 = tmp[row -4];
2662       v2   = aa + ad[row]+1;
2663       v3   = aa + ad[row -1]+1;
2664       v4   = aa + ad[row -2]+1;
2665       v5   = aa + ad[row -3]+1;
2666       for (j=0 ; j<nz-1; j+=2){
2667         i0   = vi[j];
2668         i1   = vi[j+1];
2669         tmp0 = tmps[i0];
2670         tmp1 = tmps[i1];
2671         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2672         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2673         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2674         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2675         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2676       }
2677       if (j==nz-1){
2678         tmp0  = tmps[vi[j]];
2679         sum1 -= v1[j] * tmp0;
2680         sum2 -= v2[j+1] * tmp0;
2681         sum3 -= v3[j+2] * tmp0;
2682         sum4 -= v4[j+3] * tmp0;
2683         sum5 -= v5[j+4] * tmp0;
2684       }
2685 
2686       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2687       sum2   -= v2[0] * tmp0;
2688       sum3   -= v3[1] * tmp0;
2689       sum4   -= v4[2] * tmp0;
2690       sum5   -= v5[3] * tmp0;
2691       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2692       sum3   -= v3[0] * tmp0;
2693       sum4   -= v4[1] * tmp0;
2694       sum5   -= v5[2] * tmp0;
2695       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2696       sum4   -= v4[0] * tmp0;
2697       sum5   -= v5[1] * tmp0;
2698       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2699       sum5   -= v5[0] * tmp0;
2700       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2701       break;
2702     default:
2703       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2704     }
2705   }
2706   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2707   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2708   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2709   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2710   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2711   PetscFunctionReturn(0);
2712 }
2713 
2714 
2715 /*
2716      Makes a longer coloring[] array and calls the usual code with that
2717 */
2718 #undef __FUNCT__
2719 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2720 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2721 {
2722   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2723   PetscErrorCode  ierr;
2724   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2725   PetscInt        *colorused,i;
2726   ISColoringValue *newcolor;
2727 
2728   PetscFunctionBegin;
2729   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2730   /* loop over inodes, marking a color for each column*/
2731   row = 0;
2732   for (i=0; i<m; i++){
2733     for (j=0; j<ns[i]; j++) {
2734       newcolor[row++] = coloring[i] + j*ncolors;
2735     }
2736   }
2737 
2738   /* eliminate unneeded colors */
2739   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2740   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2741   for (i=0; i<n; i++) {
2742     colorused[newcolor[i]] = 1;
2743   }
2744 
2745   for (i=1; i<5*ncolors; i++) {
2746     colorused[i] += colorused[i-1];
2747   }
2748   ncolors = colorused[5*ncolors-1];
2749   for (i=0; i<n; i++) {
2750     newcolor[i] = colorused[newcolor[i]]-1;
2751   }
2752   ierr = PetscFree(colorused);CHKERRQ(ierr);
2753   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2754   ierr = PetscFree(coloring);CHKERRQ(ierr);
2755   PetscFunctionReturn(0);
2756 }
2757 
2758 #include "../src/mat/blockinvert.h"
2759 
2760 #undef __FUNCT__
2761 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2762 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2763 {
2764   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2765   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2766   MatScalar          *ibdiag,*bdiag,work[25];
2767   PetscScalar        *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2768   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2769   const PetscScalar  *xb;
2770   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2771   PetscErrorCode     ierr;
2772   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2773   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2774 
2775   PetscFunctionBegin;
2776   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2777   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2778   if (its > 1) {
2779     /* switch to non-inode version */
2780     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2781     PetscFunctionReturn(0);
2782   }
2783 
2784   if (!a->inode.ibdiagvalid) {
2785     if (!a->inode.ibdiag) {
2786       /* calculate space needed for diagonal blocks */
2787       for (i=0; i<m; i++) {
2788 	cnt += sizes[i]*sizes[i];
2789       }
2790       a->inode.bdiagsize = cnt;
2791       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2792     }
2793 
2794     /* copy over the diagonal blocks and invert them */
2795     ibdiag = a->inode.ibdiag;
2796     bdiag  = a->inode.bdiag;
2797     cnt = 0;
2798     for (i=0, row = 0; i<m; i++) {
2799       for (j=0; j<sizes[i]; j++) {
2800         for (k=0; k<sizes[i]; k++) {
2801           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2802         }
2803       }
2804       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2805 
2806       switch(sizes[i]) {
2807         case 1:
2808           /* Create matrix data structure */
2809           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2810           ibdiag[cnt] = 1.0/ibdiag[cnt];
2811           break;
2812         case 2:
2813           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2814           break;
2815         case 3:
2816           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2817           break;
2818         case 4:
2819           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2820           break;
2821         case 5:
2822           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2823           break;
2824        default:
2825 	 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2826       }
2827       cnt += sizes[i]*sizes[i];
2828       row += sizes[i];
2829     }
2830     a->inode.ibdiagvalid = PETSC_TRUE;
2831   }
2832   ibdiag = a->inode.ibdiag;
2833   bdiag  = a->inode.bdiag;
2834 
2835 
2836   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2837   if (flag & SOR_ZERO_INITIAL_GUESS) {
2838     const PetscScalar *b;
2839     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2840     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2841     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2842 
2843       for (i=0, row=0; i<m; i++) {
2844         sz  = diag[row] - ii[row];
2845         v1  = a->a + ii[row];
2846         idx = a->j + ii[row];
2847 
2848         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2849         switch (sizes[i]){
2850           case 1:
2851 
2852             sum1  = b[row];
2853             for(n = 0; n<sz-1; n+=2) {
2854               i1   = idx[0];
2855               i2   = idx[1];
2856               idx += 2;
2857               tmp0 = x[i1];
2858               tmp1 = x[i2];
2859               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2860             }
2861 
2862             if (n == sz-1){
2863               tmp0  = x[*idx];
2864               sum1 -= *v1 * tmp0;
2865             }
2866             x[row++] = sum1*(*ibdiag++);
2867             break;
2868           case 2:
2869             v2    = a->a + ii[row+1];
2870             sum1  = b[row];
2871             sum2  = b[row+1];
2872             for(n = 0; n<sz-1; n+=2) {
2873               i1   = idx[0];
2874               i2   = idx[1];
2875               idx += 2;
2876               tmp0 = x[i1];
2877               tmp1 = x[i2];
2878               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2879               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2880             }
2881 
2882             if (n == sz-1){
2883               tmp0  = x[*idx];
2884               sum1 -= v1[0] * tmp0;
2885               sum2 -= v2[0] * tmp0;
2886             }
2887             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2888             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2889             ibdiag  += 4;
2890             break;
2891           case 3:
2892             v2    = a->a + ii[row+1];
2893             v3    = a->a + ii[row+2];
2894             sum1  = b[row];
2895             sum2  = b[row+1];
2896             sum3  = b[row+2];
2897             for(n = 0; n<sz-1; n+=2) {
2898               i1   = idx[0];
2899               i2   = idx[1];
2900               idx += 2;
2901               tmp0 = x[i1];
2902               tmp1 = x[i2];
2903               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2904               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2905               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2906             }
2907 
2908             if (n == sz-1){
2909               tmp0  = x[*idx];
2910               sum1 -= v1[0] * tmp0;
2911               sum2 -= v2[0] * tmp0;
2912               sum3 -= v3[0] * tmp0;
2913             }
2914             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2915             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2916             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2917             ibdiag  += 9;
2918             break;
2919           case 4:
2920             v2    = a->a + ii[row+1];
2921             v3    = a->a + ii[row+2];
2922             v4    = a->a + ii[row+3];
2923             sum1  = b[row];
2924             sum2  = b[row+1];
2925             sum3  = b[row+2];
2926             sum4  = b[row+3];
2927             for(n = 0; n<sz-1; n+=2) {
2928               i1   = idx[0];
2929               i2   = idx[1];
2930               idx += 2;
2931               tmp0 = x[i1];
2932               tmp1 = x[i2];
2933               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2934               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2935               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2936               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2937             }
2938 
2939             if (n == sz-1){
2940               tmp0  = x[*idx];
2941               sum1 -= v1[0] * tmp0;
2942               sum2 -= v2[0] * tmp0;
2943               sum3 -= v3[0] * tmp0;
2944               sum4 -= v4[0] * tmp0;
2945             }
2946             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2947             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2948             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2949             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2950             ibdiag  += 16;
2951             break;
2952           case 5:
2953             v2    = a->a + ii[row+1];
2954             v3    = a->a + ii[row+2];
2955             v4    = a->a + ii[row+3];
2956             v5    = a->a + ii[row+4];
2957             sum1  = b[row];
2958             sum2  = b[row+1];
2959             sum3  = b[row+2];
2960             sum4  = b[row+3];
2961             sum5  = b[row+4];
2962             for(n = 0; n<sz-1; n+=2) {
2963               i1   = idx[0];
2964               i2   = idx[1];
2965               idx += 2;
2966               tmp0 = x[i1];
2967               tmp1 = x[i2];
2968               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2969               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2970               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2971               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2972               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2973             }
2974 
2975             if (n == sz-1){
2976               tmp0  = x[*idx];
2977               sum1 -= v1[0] * tmp0;
2978               sum2 -= v2[0] * tmp0;
2979               sum3 -= v3[0] * tmp0;
2980               sum4 -= v4[0] * tmp0;
2981               sum5 -= v5[0] * tmp0;
2982             }
2983             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2984             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2985             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2986             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2987             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2988             ibdiag  += 25;
2989             break;
2990 	  default:
2991    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2992         }
2993       }
2994 
2995       xb = x;
2996       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2997     } else xb = b;
2998     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2999         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
3000       cnt = 0;
3001       for (i=0, row=0; i<m; i++) {
3002 
3003         switch (sizes[i]){
3004           case 1:
3005             x[row++] *= bdiag[cnt++];
3006             break;
3007           case 2:
3008             x1   = x[row]; x2 = x[row+1];
3009             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3010             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3011             x[row++] = tmp1;
3012             x[row++] = tmp2;
3013             cnt += 4;
3014             break;
3015           case 3:
3016             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3017             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3018             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3019             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3020             x[row++] = tmp1;
3021             x[row++] = tmp2;
3022             x[row++] = tmp3;
3023             cnt += 9;
3024             break;
3025           case 4:
3026             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3027             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3028             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3029             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3030             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3031             x[row++] = tmp1;
3032             x[row++] = tmp2;
3033             x[row++] = tmp3;
3034             x[row++] = tmp4;
3035             cnt += 16;
3036             break;
3037           case 5:
3038             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3039             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3040             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3041             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3042             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3043             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3044             x[row++] = tmp1;
3045             x[row++] = tmp2;
3046             x[row++] = tmp3;
3047             x[row++] = tmp4;
3048             x[row++] = tmp5;
3049             cnt += 25;
3050             break;
3051 	  default:
3052    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3053         }
3054       }
3055       ierr = PetscLogFlops(m);CHKERRQ(ierr);
3056     }
3057     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3058 
3059       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3060       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3061         ibdiag -= sizes[i]*sizes[i];
3062         sz      = ii[row+1] - diag[row] - 1;
3063         v1      = a->a + diag[row] + 1;
3064         idx     = a->j + diag[row] + 1;
3065 
3066         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3067         switch (sizes[i]){
3068           case 1:
3069 
3070             sum1  = xb[row];
3071             for(n = 0; n<sz-1; n+=2) {
3072               i1   = idx[0];
3073               i2   = idx[1];
3074               idx += 2;
3075               tmp0 = x[i1];
3076               tmp1 = x[i2];
3077               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3078             }
3079 
3080             if (n == sz-1){
3081               tmp0  = x[*idx];
3082               sum1 -= *v1*tmp0;
3083             }
3084             x[row--] = sum1*(*ibdiag);
3085             break;
3086 
3087           case 2:
3088 
3089             sum1  = xb[row];
3090             sum2  = xb[row-1];
3091             /* note that sum1 is associated with the second of the two rows */
3092             v2    = a->a + diag[row-1] + 2;
3093             for(n = 0; n<sz-1; n+=2) {
3094               i1   = idx[0];
3095               i2   = idx[1];
3096               idx += 2;
3097               tmp0 = x[i1];
3098               tmp1 = x[i2];
3099               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3100               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3101             }
3102 
3103             if (n == sz-1){
3104               tmp0  = x[*idx];
3105               sum1 -= *v1*tmp0;
3106               sum2 -= *v2*tmp0;
3107             }
3108             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3109             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3110             break;
3111           case 3:
3112 
3113             sum1  = xb[row];
3114             sum2  = xb[row-1];
3115             sum3  = xb[row-2];
3116             v2    = a->a + diag[row-1] + 2;
3117             v3    = a->a + diag[row-2] + 3;
3118             for(n = 0; n<sz-1; n+=2) {
3119               i1   = idx[0];
3120               i2   = idx[1];
3121               idx += 2;
3122               tmp0 = x[i1];
3123               tmp1 = x[i2];
3124               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3125               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3126               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3127             }
3128 
3129             if (n == sz-1){
3130               tmp0  = x[*idx];
3131               sum1 -= *v1*tmp0;
3132               sum2 -= *v2*tmp0;
3133               sum3 -= *v3*tmp0;
3134             }
3135             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3136             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3137             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3138             break;
3139           case 4:
3140 
3141             sum1  = xb[row];
3142             sum2  = xb[row-1];
3143             sum3  = xb[row-2];
3144             sum4  = xb[row-3];
3145             v2    = a->a + diag[row-1] + 2;
3146             v3    = a->a + diag[row-2] + 3;
3147             v4    = a->a + diag[row-3] + 4;
3148             for(n = 0; n<sz-1; n+=2) {
3149               i1   = idx[0];
3150               i2   = idx[1];
3151               idx += 2;
3152               tmp0 = x[i1];
3153               tmp1 = x[i2];
3154               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3155               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3156               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3157               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3158             }
3159 
3160             if (n == sz-1){
3161               tmp0  = x[*idx];
3162               sum1 -= *v1*tmp0;
3163               sum2 -= *v2*tmp0;
3164               sum3 -= *v3*tmp0;
3165               sum4 -= *v4*tmp0;
3166             }
3167             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3168             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3169             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3170             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3171             break;
3172           case 5:
3173 
3174             sum1  = xb[row];
3175             sum2  = xb[row-1];
3176             sum3  = xb[row-2];
3177             sum4  = xb[row-3];
3178             sum5  = xb[row-4];
3179             v2    = a->a + diag[row-1] + 2;
3180             v3    = a->a + diag[row-2] + 3;
3181             v4    = a->a + diag[row-3] + 4;
3182             v5    = a->a + diag[row-4] + 5;
3183             for(n = 0; n<sz-1; n+=2) {
3184               i1   = idx[0];
3185               i2   = idx[1];
3186               idx += 2;
3187               tmp0 = x[i1];
3188               tmp1 = x[i2];
3189               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3190               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3191               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3192               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3193               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3194             }
3195 
3196             if (n == sz-1){
3197               tmp0  = x[*idx];
3198               sum1 -= *v1*tmp0;
3199               sum2 -= *v2*tmp0;
3200               sum3 -= *v3*tmp0;
3201               sum4 -= *v4*tmp0;
3202               sum5 -= *v5*tmp0;
3203             }
3204             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3205             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3206             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3207             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3208             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3209             break;
3210 	  default:
3211    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3212         }
3213       }
3214 
3215       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3216     }
3217     its--;
3218     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3219     ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3220   }
3221   if (flag & SOR_EISENSTAT) {
3222     const PetscScalar *b;
3223     MatScalar         *t = a->inode.ssor_work;
3224 
3225     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3226     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3227     /*
3228           Apply  (U + D)^-1  where D is now the block diagonal
3229     */
3230     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3231     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3232       ibdiag -= sizes[i]*sizes[i];
3233       sz      = ii[row+1] - diag[row] - 1;
3234       v1      = a->a + diag[row] + 1;
3235       idx     = a->j + diag[row] + 1;
3236       CHKMEMQ;
3237       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3238       switch (sizes[i]){
3239         case 1:
3240 
3241 	  sum1  = b[row];
3242 	  for(n = 0; n<sz-1; n+=2) {
3243 	    i1   = idx[0];
3244 	    i2   = idx[1];
3245 	    idx += 2;
3246 	    tmp0 = x[i1];
3247 	    tmp1 = x[i2];
3248 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3249 	  }
3250 
3251 	  if (n == sz-1){
3252 	    tmp0  = x[*idx];
3253 	    sum1 -= *v1*tmp0;
3254 	  }
3255 	  x[row] = sum1*(*ibdiag);row--;
3256 	  break;
3257 
3258         case 2:
3259 
3260 	  sum1  = b[row];
3261 	  sum2  = b[row-1];
3262 	  /* note that sum1 is associated with the second of the two rows */
3263 	  v2    = a->a + diag[row-1] + 2;
3264 	  for(n = 0; n<sz-1; n+=2) {
3265 	    i1   = idx[0];
3266 	    i2   = idx[1];
3267 	    idx += 2;
3268 	    tmp0 = x[i1];
3269 	    tmp1 = x[i2];
3270 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3271 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3272 	  }
3273 
3274 	  if (n == sz-1){
3275 	    tmp0  = x[*idx];
3276 	    sum1 -= *v1*tmp0;
3277 	    sum2 -= *v2*tmp0;
3278 	  }
3279 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3280 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3281           row -= 2;
3282 	  break;
3283         case 3:
3284 
3285 	  sum1  = b[row];
3286 	  sum2  = b[row-1];
3287 	  sum3  = b[row-2];
3288 	  v2    = a->a + diag[row-1] + 2;
3289 	  v3    = a->a + diag[row-2] + 3;
3290 	  for(n = 0; n<sz-1; n+=2) {
3291 	    i1   = idx[0];
3292 	    i2   = idx[1];
3293 	    idx += 2;
3294 	    tmp0 = x[i1];
3295 	    tmp1 = x[i2];
3296 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3297 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3298 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3299 	  }
3300 
3301 	  if (n == sz-1){
3302 	    tmp0  = x[*idx];
3303 	    sum1 -= *v1*tmp0;
3304 	    sum2 -= *v2*tmp0;
3305 	    sum3 -= *v3*tmp0;
3306 	  }
3307 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3308 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3309 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3310           row -= 3;
3311 	  break;
3312         case 4:
3313 
3314 	  sum1  = b[row];
3315 	  sum2  = b[row-1];
3316 	  sum3  = b[row-2];
3317 	  sum4  = b[row-3];
3318 	  v2    = a->a + diag[row-1] + 2;
3319 	  v3    = a->a + diag[row-2] + 3;
3320 	  v4    = a->a + diag[row-3] + 4;
3321 	  for(n = 0; n<sz-1; n+=2) {
3322 	    i1   = idx[0];
3323 	    i2   = idx[1];
3324 	    idx += 2;
3325 	    tmp0 = x[i1];
3326 	    tmp1 = x[i2];
3327 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3328 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3329 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3330 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3331 	  }
3332 
3333 	  if (n == sz-1){
3334 	    tmp0  = x[*idx];
3335 	    sum1 -= *v1*tmp0;
3336 	    sum2 -= *v2*tmp0;
3337 	    sum3 -= *v3*tmp0;
3338 	    sum4 -= *v4*tmp0;
3339 	  }
3340 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3341 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3342 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3343 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3344           row -= 4;
3345 	  break;
3346         case 5:
3347 
3348 	  sum1  = b[row];
3349 	  sum2  = b[row-1];
3350 	  sum3  = b[row-2];
3351 	  sum4  = b[row-3];
3352 	  sum5  = b[row-4];
3353 	  v2    = a->a + diag[row-1] + 2;
3354 	  v3    = a->a + diag[row-2] + 3;
3355 	  v4    = a->a + diag[row-3] + 4;
3356 	  v5    = a->a + diag[row-4] + 5;
3357 	  for(n = 0; n<sz-1; n+=2) {
3358 	    i1   = idx[0];
3359 	    i2   = idx[1];
3360 	    idx += 2;
3361 	    tmp0 = x[i1];
3362 	    tmp1 = x[i2];
3363 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3364 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3365 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3366 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3367 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3368 	  }
3369 
3370 	  if (n == sz-1){
3371 	    tmp0  = x[*idx];
3372 	    sum1 -= *v1*tmp0;
3373 	    sum2 -= *v2*tmp0;
3374 	    sum3 -= *v3*tmp0;
3375 	    sum4 -= *v4*tmp0;
3376 	    sum5 -= *v5*tmp0;
3377 	  }
3378 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3379 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3380 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3381 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3382 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3383           row -= 5;
3384 	  break;
3385         default:
3386 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3387       }
3388       CHKMEMQ;
3389     }
3390     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3391 
3392     /*
3393            t = b - D x    where D is the block diagonal
3394     */
3395     cnt = 0;
3396     for (i=0, row=0; i<m; i++) {
3397       CHKMEMQ;
3398       switch (sizes[i]){
3399         case 1:
3400 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3401 	  break;
3402         case 2:
3403 	  x1   = x[row]; x2 = x[row+1];
3404 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3405 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3406 	  t[row]   = b[row] - tmp1;
3407 	  t[row+1] = b[row+1] - tmp2; row += 2;
3408 	  cnt += 4;
3409 	  break;
3410         case 3:
3411 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3412 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3413 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3414 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3415 	  t[row] = b[row] - tmp1;
3416 	  t[row+1] = b[row+1] - tmp2;
3417 	  t[row+2] = b[row+2] - tmp3; row += 3;
3418 	  cnt += 9;
3419 	  break;
3420         case 4:
3421 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3422 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3423 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3424 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3425 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3426 	  t[row] = b[row] - tmp1;
3427 	  t[row+1] = b[row+1] - tmp2;
3428 	  t[row+2] = b[row+2] - tmp3;
3429 	  t[row+3] = b[row+3] - tmp4; row += 4;
3430 	  cnt += 16;
3431 	  break;
3432         case 5:
3433 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3434 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3435 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3436 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3437 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3438 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3439 	  t[row] = b[row] - tmp1;
3440 	  t[row+1] = b[row+1] - tmp2;
3441 	  t[row+2] = b[row+2] - tmp3;
3442 	  t[row+3] = b[row+3] - tmp4;
3443 	  t[row+4] = b[row+4] - tmp5;row += 5;
3444 	  cnt += 25;
3445 	  break;
3446         default:
3447 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3448       }
3449       CHKMEMQ;
3450     }
3451     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3452 
3453 
3454 
3455     /*
3456           Apply (L + D)^-1 where D is the block diagonal
3457     */
3458     for (i=0, row=0; i<m; i++) {
3459       sz  = diag[row] - ii[row];
3460       v1  = a->a + ii[row];
3461       idx = a->j + ii[row];
3462       CHKMEMQ;
3463       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3464       switch (sizes[i]){
3465         case 1:
3466 
3467 	  sum1  = t[row];
3468 	  for(n = 0; n<sz-1; n+=2) {
3469 	    i1   = idx[0];
3470 	    i2   = idx[1];
3471 	    idx += 2;
3472 	    tmp0 = t[i1];
3473 	    tmp1 = t[i2];
3474 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3475 	  }
3476 
3477 	  if (n == sz-1){
3478 	    tmp0  = t[*idx];
3479 	    sum1 -= *v1 * tmp0;
3480 	  }
3481 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3482 	  break;
3483         case 2:
3484 	  v2    = a->a + ii[row+1];
3485 	  sum1  = t[row];
3486 	  sum2  = t[row+1];
3487 	  for(n = 0; n<sz-1; n+=2) {
3488 	    i1   = idx[0];
3489 	    i2   = idx[1];
3490 	    idx += 2;
3491 	    tmp0 = t[i1];
3492 	    tmp1 = t[i2];
3493 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3494 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3495 	  }
3496 
3497 	  if (n == sz-1){
3498 	    tmp0  = t[*idx];
3499 	    sum1 -= v1[0] * tmp0;
3500 	    sum2 -= v2[0] * tmp0;
3501 	  }
3502 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3503 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3504 	  ibdiag  += 4; row += 2;
3505 	  break;
3506         case 3:
3507 	  v2    = a->a + ii[row+1];
3508 	  v3    = a->a + ii[row+2];
3509 	  sum1  = t[row];
3510 	  sum2  = t[row+1];
3511 	  sum3  = t[row+2];
3512 	  for(n = 0; n<sz-1; n+=2) {
3513 	    i1   = idx[0];
3514 	    i2   = idx[1];
3515 	    idx += 2;
3516 	    tmp0 = t[i1];
3517 	    tmp1 = t[i2];
3518 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3519 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3520 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3521 	  }
3522 
3523 	  if (n == sz-1){
3524 	    tmp0  = t[*idx];
3525 	    sum1 -= v1[0] * tmp0;
3526 	    sum2 -= v2[0] * tmp0;
3527 	    sum3 -= v3[0] * tmp0;
3528 	  }
3529 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3530 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3531 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3532 	  ibdiag  += 9; row += 3;
3533 	  break;
3534         case 4:
3535 	  v2    = a->a + ii[row+1];
3536 	  v3    = a->a + ii[row+2];
3537 	  v4    = a->a + ii[row+3];
3538 	  sum1  = t[row];
3539 	  sum2  = t[row+1];
3540 	  sum3  = t[row+2];
3541 	  sum4  = t[row+3];
3542 	  for(n = 0; n<sz-1; n+=2) {
3543 	    i1   = idx[0];
3544 	    i2   = idx[1];
3545 	    idx += 2;
3546 	    tmp0 = t[i1];
3547 	    tmp1 = t[i2];
3548 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3549 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3550 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3551 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3552 	  }
3553 
3554 	  if (n == sz-1){
3555 	    tmp0  = t[*idx];
3556 	    sum1 -= v1[0] * tmp0;
3557 	    sum2 -= v2[0] * tmp0;
3558 	    sum3 -= v3[0] * tmp0;
3559 	    sum4 -= v4[0] * tmp0;
3560 	  }
3561 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3562 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3563 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3564 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3565 	  ibdiag  += 16; row += 4;
3566 	  break;
3567         case 5:
3568 	  v2    = a->a + ii[row+1];
3569 	  v3    = a->a + ii[row+2];
3570 	  v4    = a->a + ii[row+3];
3571 	  v5    = a->a + ii[row+4];
3572 	  sum1  = t[row];
3573 	  sum2  = t[row+1];
3574 	  sum3  = t[row+2];
3575 	  sum4  = t[row+3];
3576 	  sum5  = t[row+4];
3577 	  for(n = 0; n<sz-1; n+=2) {
3578 	    i1   = idx[0];
3579 	    i2   = idx[1];
3580 	    idx += 2;
3581 	    tmp0 = t[i1];
3582 	    tmp1 = t[i2];
3583 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3584 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3585 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3586 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3587 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3588 	  }
3589 
3590 	  if (n == sz-1){
3591 	    tmp0  = t[*idx];
3592 	    sum1 -= v1[0] * tmp0;
3593 	    sum2 -= v2[0] * tmp0;
3594 	    sum3 -= v3[0] * tmp0;
3595 	    sum4 -= v4[0] * tmp0;
3596 	    sum5 -= v5[0] * tmp0;
3597 	  }
3598 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3599 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3600 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3601 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3602 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3603 	  ibdiag  += 25; row += 5;
3604 	  break;
3605         default:
3606 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3607       }
3608       CHKMEMQ;
3609     }
3610     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3611     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3612     ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3613   }
3614   PetscFunctionReturn(0);
3615 }
3616 
3617 #undef __FUNCT__
3618 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3619 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3620 {
3621   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3622   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3623   const MatScalar    *bdiag = a->inode.bdiag;
3624   const PetscScalar  *b;
3625   PetscErrorCode      ierr;
3626   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3627   const PetscInt      *sizes = a->inode.size;
3628 
3629   PetscFunctionBegin;
3630   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3631   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3632   cnt = 0;
3633   for (i=0, row=0; i<m; i++) {
3634     switch (sizes[i]){
3635       case 1:
3636 	x[row] = b[row]*bdiag[cnt++];row++;
3637 	break;
3638       case 2:
3639 	x1   = b[row]; x2 = b[row+1];
3640 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3641 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3642 	x[row++] = tmp1;
3643 	x[row++] = tmp2;
3644 	cnt += 4;
3645 	break;
3646       case 3:
3647 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3648 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3649 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3650 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3651 	x[row++] = tmp1;
3652 	x[row++] = tmp2;
3653 	x[row++] = tmp3;
3654 	cnt += 9;
3655 	break;
3656       case 4:
3657 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3658 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3659 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3660 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3661 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3662 	x[row++] = tmp1;
3663 	x[row++] = tmp2;
3664 	x[row++] = tmp3;
3665 	x[row++] = tmp4;
3666 	cnt += 16;
3667 	break;
3668       case 5:
3669 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3670 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3671 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3672 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3673 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3674 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3675 	x[row++] = tmp1;
3676 	x[row++] = tmp2;
3677 	x[row++] = tmp3;
3678 	x[row++] = tmp4;
3679 	x[row++] = tmp5;
3680 	cnt += 25;
3681 	break;
3682       default:
3683 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3684     }
3685   }
3686   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3687   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3688   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3689   PetscFunctionReturn(0);
3690 }
3691 
3692 /*
3693     samestructure indicates that the matrix has not changed its nonzero structure so we
3694     do not need to recompute the inodes
3695 */
3696 #undef __FUNCT__
3697 #define __FUNCT__ "Mat_CheckInode"
3698 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3699 {
3700   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3701   PetscErrorCode ierr;
3702   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3703   PetscTruth     flag;
3704 
3705   PetscFunctionBegin;
3706   if (!a->inode.use)                     PetscFunctionReturn(0);
3707   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3708 
3709 
3710   m = A->rmap->n;
3711   if (a->inode.size) {ns = a->inode.size;}
3712   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3713 
3714   i          = 0;
3715   node_count = 0;
3716   idx        = a->j;
3717   ii         = a->i;
3718   while (i < m){                /* For each row */
3719     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3720     /* Limits the number of elements in a node to 'a->inode.limit' */
3721     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3722       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3723       if (nzy != nzx) break;
3724       idy  += nzx;             /* Same nonzero pattern */
3725       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3726       if (!flag) break;
3727     }
3728     ns[node_count++] = blk_size;
3729     idx += blk_size*nzx;
3730     i    = j;
3731   }
3732   /* If not enough inodes found,, do not use inode version of the routines */
3733   if (!m || node_count > .9*m) {
3734     ierr = PetscFree(ns);CHKERRQ(ierr);
3735     a->inode.node_count       = 0;
3736     a->inode.size             = PETSC_NULL;
3737     a->inode.use              = PETSC_FALSE;
3738     A->ops->mult              = MatMult_SeqAIJ;
3739     A->ops->sor               = MatSOR_SeqAIJ;
3740     A->ops->multadd           = MatMultAdd_SeqAIJ;
3741     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
3742     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
3743     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
3744     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
3745     A->ops->coloringpatch     = 0;
3746     A->ops->multdiagonalblock = 0;
3747     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3748   } else {
3749     if (!A->factortype) {
3750       A->ops->mult              = MatMult_SeqAIJ_Inode;
3751       A->ops->sor               = MatSOR_SeqAIJ_Inode;
3752       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3753       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3754       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3755       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3756       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3757       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3758       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3759     } else {
3760       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
3761     }
3762     a->inode.node_count       = node_count;
3763     a->inode.size             = ns;
3764     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3765   }
3766   PetscFunctionReturn(0);
3767 }
3768 
3769 #undef __FUNCT__
3770 #define __FUNCT__ "MatGetRow_FactoredLU"
3771 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
3772 {
3773   PetscInt k, *vi;
3774   PetscFunctionBegin;
3775   vi = aj + ai[row];
3776   for(k=0;k<nzl;k++) cols[k] = vi[k];
3777   vi = aj + adiag[row];
3778   cols[nzl] = vi[0];
3779   vi = aj + adiag[row+1]+1;
3780   for(k=0;k<nzu;k++) cols[nzl+1+k] = vi[k];
3781   PetscFunctionReturn(0);
3782 }
3783 /*
3784    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3785    Modified from Mat_CheckInode().
3786 
3787    Input Parameters:
3788 +  Mat A - ILU or LU matrix factor
3789 -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
3790     do not need to recompute the inodes
3791 */
3792 #undef __FUNCT__
3793 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3794 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3795 {
3796   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3797   PetscErrorCode ierr;
3798   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3799   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3800   PetscTruth     flag;
3801 
3802   PetscFunctionBegin;
3803   if (!a->inode.use)                     PetscFunctionReturn(0);
3804   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3805 
3806   m = A->rmap->n;
3807   if (a->inode.size) {ns = a->inode.size;}
3808   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3809 
3810   i          = 0;
3811   node_count = 0;
3812   while (i < m){                /* For each row */
3813     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3814     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3815     nzx  = nzl1 + nzu1 + 1;
3816     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3817     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3818 
3819     /* Limits the number of elements in a node to 'a->inode.limit' */
3820     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3821       nzl2    = ai[j+1] - ai[j];
3822       nzu2    = adiag[j] - adiag[j+1] - 1;
3823       nzy     = nzl2 + nzu2 + 1;
3824       if( nzy != nzx) break;
3825       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3826       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
3827       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3828       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3829       ierr = PetscFree(cols2);CHKERRQ(ierr);
3830     }
3831     ns[node_count++] = blk_size;
3832     ierr = PetscFree(cols1);CHKERRQ(ierr);
3833     i    = j;
3834   }
3835   /* If not enough inodes found,, do not use inode version of the routines */
3836   if (!m || node_count > .9*m) {
3837     ierr = PetscFree(ns);CHKERRQ(ierr);
3838     a->inode.node_count     = 0;
3839     a->inode.size           = PETSC_NULL;
3840     a->inode.use            = PETSC_FALSE;
3841     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3842   } else {
3843     A->ops->mult              = 0;
3844     A->ops->sor               = 0;
3845     A->ops->multadd           = 0;
3846     A->ops->getrowij          = 0;
3847     A->ops->restorerowij      = 0;
3848     A->ops->getcolumnij       = 0;
3849     A->ops->restorecolumnij   = 0;
3850     A->ops->coloringpatch     = 0;
3851     A->ops->multdiagonalblock = 0;
3852     A->ops->solve             = MatSolve_SeqAIJ_Inode;
3853     a->inode.node_count       = node_count;
3854     a->inode.size             = ns;
3855     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3856   }
3857   PetscFunctionReturn(0);
3858 }
3859 
3860 /*
3861      This is really ugly. if inodes are used this replaces the
3862   permutations with ones that correspond to rows/cols of the matrix
3863   rather then inode blocks
3864 */
3865 #undef __FUNCT__
3866 #define __FUNCT__ "MatInodeAdjustForInodes"
3867 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3868 {
3869   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3870 
3871   PetscFunctionBegin;
3872   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3873   if (f) {
3874     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3875   }
3876   PetscFunctionReturn(0);
3877 }
3878 
3879 EXTERN_C_BEGIN
3880 #undef __FUNCT__
3881 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3882 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3883 {
3884   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3885   PetscErrorCode ierr;
3886   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3887   const PetscInt *ridx,*cidx;
3888   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3889   PetscInt       nslim_col,*ns_col;
3890   IS             ris = *rperm,cis = *cperm;
3891 
3892   PetscFunctionBegin;
3893   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3894   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3895 
3896   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3897   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3898   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3899 
3900   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3901   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3902 
3903   /* Form the inode structure for the rows of permuted matric using inv perm*/
3904   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3905 
3906   /* Construct the permutations for rows*/
3907   for (i=0,row = 0; i<nslim_row; ++i){
3908     indx      = ridx[i];
3909     start_val = tns[indx];
3910     end_val   = tns[indx + 1];
3911     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3912   }
3913 
3914   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3915   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3916 
3917  /* Construct permutations for columns */
3918   for (i=0,col=0; i<nslim_col; ++i){
3919     indx      = cidx[i];
3920     start_val = tns[indx];
3921     end_val   = tns[indx + 1];
3922     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3923   }
3924 
3925   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3926   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3927   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3928   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3929 
3930   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3931   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3932 
3933   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3934   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3935   ierr = ISDestroy(cis);CHKERRQ(ierr);
3936   ierr = ISDestroy(ris);CHKERRQ(ierr);
3937   ierr = PetscFree(tns);CHKERRQ(ierr);
3938   PetscFunctionReturn(0);
3939 }
3940 EXTERN_C_END
3941 
3942 #undef __FUNCT__
3943 #define __FUNCT__ "MatInodeGetInodeSizes"
3944 /*@C
3945    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3946 
3947    Not Collective
3948 
3949    Input Parameter:
3950 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3951 
3952    Output Parameter:
3953 +  node_count - no of inodes present in the matrix.
3954 .  sizes      - an array of size node_count,with sizes of each inode.
3955 -  limit      - the max size used to generate the inodes.
3956 
3957    Level: advanced
3958 
3959    Notes: This routine returns some internal storage information
3960    of the matrix, it is intended to be used by advanced users.
3961    It should be called after the matrix is assembled.
3962    The contents of the sizes[] array should not be changed.
3963    PETSC_NULL may be passed for information not requested.
3964 
3965 .keywords: matrix, seqaij, get, inode
3966 
3967 .seealso: MatGetInfo()
3968 @*/
3969 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3970 {
3971   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3972 
3973   PetscFunctionBegin;
3974   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3975   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3976   if (f) {
3977     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3978   }
3979   PetscFunctionReturn(0);
3980 }
3981 
3982 EXTERN_C_BEGIN
3983 #undef __FUNCT__
3984 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3985 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3986 {
3987   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3988 
3989   PetscFunctionBegin;
3990   if (node_count) *node_count = a->inode.node_count;
3991   if (sizes)      *sizes      = a->inode.size;
3992   if (limit)      *limit      = a->inode.limit;
3993   PetscFunctionReturn(0);
3994 }
3995 EXTERN_C_END
3996