xref: /petsc/src/mat/impls/aij/seq/inode.c (revision b79b07cfcefae6db9a29cf138c901e6c3b8d1456)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1195   FactorShiftCtx   sctx;
1196   const PetscInt   *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,node_max,col;
1200   const PetscInt   *ns;
1201   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1202 
1203   PetscFunctionBegin;
1204   /* MatPivotSetUp(): initialize shift context sctx */
1205   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1206 
1207   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1208     ddiag          = a->diag;
1209     sctx.shift_top = info->zeropivot;
1210     for (i=0; i<n; i++) {
1211       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1212       d  = (aa)[ddiag[i]];
1213       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1214       v  = aa+ai[i];
1215       nz = ai[i+1] - ai[i];
1216       for (j=0; j<nz; j++)
1217 	rs += PetscAbsScalar(v[j]);
1218       if (rs>sctx.shift_top) sctx.shift_top = rs;
1219     }
1220     sctx.shift_top   *= 1.1;
1221     sctx.nshift_max   = 5;
1222     sctx.shift_lo     = 0.;
1223     sctx.shift_hi     = 1.;
1224   }
1225 
1226   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1227   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1228 
1229   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1230   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1231   rtmp2 = rtmp1 + n;
1232   rtmp3 = rtmp2 + n;
1233   rtmp4 = rtmp3 + n;
1234   ics   = ic;
1235 
1236   node_max = a->inode.node_count;
1237   ns       = a->inode.size;
1238   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1239 
1240   /* If max inode size > 4, split it into two inodes.*/
1241   /* also map the inode sizes according to the ordering */
1242   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1243   for (i=0,j=0; i<node_max; ++i,++j){
1244     if (ns[i] > 4) {
1245       tmp_vec1[j] = 4;
1246       ++j;
1247       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1248     } else {
1249       tmp_vec1[j] = ns[i];
1250     }
1251   }
1252   /* Use the correct node_max */
1253   node_max = j;
1254 
1255   /* Now reorder the inode info based on mat re-ordering info */
1256   /* First create a row -> inode_size_array_index map */
1257   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1258   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1259   for (i=0,row=0; i<node_max; i++) {
1260     nodesz = tmp_vec1[i];
1261     for (j=0; j<nodesz; j++,row++) {
1262       nsmap[row] = i;
1263     }
1264   }
1265   /* Using nsmap, create a reordered ns structure */
1266   for (i=0,j=0; i< node_max; i++) {
1267     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1268     tmp_vec2[i]  = nodesz;
1269     j           += nodesz;
1270   }
1271   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1272   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1273 
1274   /* Now use the correct ns */
1275   ns = tmp_vec2;
1276 
1277   do {
1278     sctx.newshift = PETSC_FALSE;
1279     /* Now loop over each block-row, and do the factorization */
1280     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1281       nodesz = ns[inod];
1282 
1283       switch (nodesz){
1284       case 1:
1285       /*----------*/
1286         /* zero rtmp1 */
1287         /* L part */
1288         nz    = bi[i+1] - bi[i];
1289         bjtmp = bj + bi[i];
1290         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1291 
1292         /* U part */
1293         nz = bdiag[i]-bdiag[i+1];
1294         bjtmp = bj + bdiag[i+1]+1;
1295         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1296 
1297         /* load in initial (unfactored row) */
1298         nz    = ai[r[i]+1] - ai[r[i]];
1299         ajtmp = aj + ai[r[i]];
1300         v     = aa + ai[r[i]];
1301         for (j=0; j<nz; j++) {
1302           rtmp1[ics[ajtmp[j]]] = v[j];
1303         }
1304         /* ZeropivotApply() */
1305         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1306 
1307         /* elimination */
1308         bjtmp = bj + bi[i];
1309         row   = *bjtmp++;
1310         nzL   = bi[i+1] - bi[i];
1311         for(k=0; k < nzL;k++) {
1312           pc = rtmp1 + row;
1313           if (*pc != 0.0) {
1314             pv   = b->a + bdiag[row];
1315             mul1 = *pc * (*pv);
1316             *pc  = mul1;
1317             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1318             pv = b->a + bdiag[row+1]+1;
1319             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1320             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1321             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1322           }
1323           row = *bjtmp++;
1324         }
1325 
1326         /* finished row so stick it into b->a */
1327         rs = 0.0;
1328         /* L part */
1329         pv = b->a + bi[i] ;
1330         pj = b->j + bi[i] ;
1331         nz = bi[i+1] - bi[i];
1332         for (j=0; j<nz; j++) {
1333           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1334         }
1335 
1336         /* U part */
1337         pv = b->a + bdiag[i+1]+1;
1338         pj = b->j + bdiag[i+1]+1;
1339         nz = bdiag[i] - bdiag[i+1]-1;
1340         for (j=0; j<nz; j++) {
1341           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* Check zero pivot */
1345         sctx.rs = rs;
1346         sctx.pv = rtmp1[i];
1347         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1348 	if(sctx.newshift) break;
1349 
1350         /* Mark diagonal and invert diagonal for simplier triangular solves */
1351         pv  = b->a + bdiag[i];
1352         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1353         break;
1354 
1355       case 2:
1356       /*----------*/
1357         /* zero rtmp1 and rtmp2 */
1358         /* L part */
1359         nz    = bi[i+1] - bi[i];
1360         bjtmp = bj + bi[i];
1361         for  (j=0; j<nz; j++) {
1362           col = bjtmp[j];
1363           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1364         }
1365 
1366         /* U part */
1367         nz = bdiag[i]-bdiag[i+1];
1368         bjtmp = bj + bdiag[i+1]+1;
1369         for  (j=0; j<nz; j++) {
1370           col = bjtmp[j];
1371           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1372         }
1373 
1374         /* load in initial (unfactored row) */
1375         nz    = ai[r[i]+1] - ai[r[i]];
1376         ajtmp = aj + ai[r[i]];
1377         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1378         for (j=0; j<nz; j++) {
1379           col = ics[ajtmp[j]];
1380           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1381         }
1382         /* ZeropivotApply(): shift the diagonal of the matrix  */
1383         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1384 
1385         /* elimination */
1386         bjtmp = bj + bi[i];
1387         row   = *bjtmp++; /* pivot row */
1388         nzL   = bi[i+1] - bi[i];
1389         for(k=0; k < nzL;k++) {
1390           pc1 = rtmp1 + row;
1391           pc2 = rtmp2 + row;
1392           if (*pc1 != 0.0 || *pc2 != 0.0) {
1393             pv   = b->a + bdiag[row];
1394             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1395             *pc1 = mul1;       *pc2 = mul2;
1396 
1397             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1398             pv = b->a + bdiag[row+1]+1;
1399             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1400             for (j=0; j<nz; j++){
1401               col = pj[j];
1402               rtmp1[col] -= mul1 * pv[j];
1403               rtmp2[col] -= mul2 * pv[j];
1404             }
1405             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1406           }
1407           row = *bjtmp++;
1408         }
1409 
1410         /* finished row i; check zero pivot, then stick row i into b->a */
1411         rs  = 0.0;
1412         /* L part */
1413         pc1 = b->a + bi[i];
1414         pj  = b->j + bi[i] ;
1415         nz  = bi[i+1] - bi[i];
1416         for (j=0; j<nz; j++) {
1417           col = pj[j];
1418           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1419         }
1420         /* U part */
1421         pc1 = b->a + bdiag[i+1]+1;
1422         pj  = b->j + bdiag[i+1]+1;
1423         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1424         for (j=0; j<nz; j++) {
1425           col = pj[j];
1426           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1427         }
1428 
1429         sctx.rs  = rs;
1430         sctx.pv  = rtmp1[i];
1431         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1432 	if(sctx.newshift) break;
1433         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1434         *pc1 = 1.0/sctx.pv;
1435 
1436         /* Now take care of diagonal 2x2 block. */
1437         pc2 = rtmp2 + i;
1438         if (*pc2 != 0.0){
1439           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1440           *pc2 = mul1;          /* insert L entry */
1441           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1442           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1443           for (j=0; j<nz; j++) {
1444             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1445           }
1446           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1447         }
1448 
1449         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1450         rs = 0.0;
1451         /* L part */
1452         pc2 = b->a + bi[i+1];
1453         pj  = b->j + bi[i+1] ;
1454         nz  = bi[i+2] - bi[i+1];
1455         for (j=0; j<nz; j++) {
1456           col = pj[j];
1457           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1458         }
1459         /* U part */
1460         pc2 = b->a + bdiag[i+2]+1;
1461         pj  = b->j + bdiag[i+2]+1;
1462         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1463         for (j=0; j<nz; j++) {
1464           col = pj[j];
1465           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1466         }
1467 
1468         sctx.rs  = rs;
1469         sctx.pv  = rtmp2[i+1];
1470         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1471 	if(sctx.newshift) break;
1472         pc2  = b->a + bdiag[i+1];
1473         *pc2 = 1.0/sctx.pv;
1474         break;
1475 
1476       case 3:
1477       /*----------*/
1478         /* zero rtmp */
1479         /* L part */
1480         nz    = bi[i+1] - bi[i];
1481         bjtmp = bj + bi[i];
1482         for  (j=0; j<nz; j++) {
1483           col = bjtmp[j];
1484           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1485         }
1486 
1487         /* U part */
1488         nz = bdiag[i]-bdiag[i+1];
1489         bjtmp = bj + bdiag[i+1]+1;
1490         for  (j=0; j<nz; j++) {
1491           col = bjtmp[j];
1492           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1493         }
1494 
1495         /* load in initial (unfactored row) */
1496         nz    = ai[r[i]+1] - ai[r[i]];
1497         ajtmp = aj + ai[r[i]];
1498         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1499         for (j=0; j<nz; j++) {
1500           col = ics[ajtmp[j]];
1501           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1502         }
1503         /* ZeropivotApply(): shift the diagonal of the matrix  */
1504         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1505 
1506         /* elimination */
1507         bjtmp = bj + bi[i];
1508         row   = *bjtmp++; /* pivot row */
1509         nzL   = bi[i+1] - bi[i];
1510         for(k=0; k < nzL;k++) {
1511           pc1 = rtmp1 + row;
1512           pc2 = rtmp2 + row;
1513           pc3 = rtmp3 + row;
1514           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1515             pv  = b->a + bdiag[row];
1516             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1517             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1518 
1519             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1520             pv = b->a + bdiag[row+1]+1;
1521             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1522             for (j=0; j<nz; j++){
1523               col = pj[j];
1524               rtmp1[col] -= mul1 * pv[j];
1525               rtmp2[col] -= mul2 * pv[j];
1526               rtmp3[col] -= mul3 * pv[j];
1527             }
1528             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1529           }
1530           row = *bjtmp++;
1531         }
1532 
1533         /* finished row i; check zero pivot, then stick row i into b->a */
1534         rs  = 0.0;
1535         /* L part */
1536         pc1 = b->a + bi[i];
1537         pj  = b->j + bi[i] ;
1538         nz  = bi[i+1] - bi[i];
1539         for (j=0; j<nz; j++) {
1540           col = pj[j];
1541           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1542         }
1543         /* U part */
1544         pc1 = b->a + bdiag[i+1]+1;
1545         pj  = b->j + bdiag[i+1]+1;
1546         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1547         for (j=0; j<nz; j++) {
1548           col = pj[j];
1549           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1550         }
1551 
1552         sctx.rs  = rs;
1553         sctx.pv  = rtmp1[i];
1554         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1555 	if(sctx.newshift) break;
1556         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1557         *pc1 = 1.0/sctx.pv;
1558 
1559         /* Now take care of 1st column of diagonal 3x3 block. */
1560         pc2 = rtmp2 + i;
1561         pc3 = rtmp3 + i;
1562         if (*pc2 != 0.0 || *pc3 != 0.0){
1563           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1564           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1565           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1566           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1567           for (j=0; j<nz; j++) {
1568             col = pj[j];
1569             rtmp2[col] -= mul2 * rtmp1[col];
1570             rtmp3[col] -= mul3 * rtmp1[col];
1571           }
1572           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1573         }
1574 
1575         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1576         rs = 0.0;
1577         /* L part */
1578         pc2 = b->a + bi[i+1];
1579         pj  = b->j + bi[i+1] ;
1580         nz  = bi[i+2] - bi[i+1];
1581         for (j=0; j<nz; j++) {
1582           col = pj[j];
1583           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1584         }
1585         /* U part */
1586         pc2 = b->a + bdiag[i+2]+1;
1587         pj  = b->j + bdiag[i+2]+1;
1588         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1589         for (j=0; j<nz; j++) {
1590           col = pj[j];
1591           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1592         }
1593 
1594         sctx.rs  = rs;
1595         sctx.pv  = rtmp2[i+1];
1596         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1597 	if(sctx.newshift) break;
1598         pc2  = b->a + bdiag[i+1];
1599         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1600 
1601         /* Now take care of 2nd column of diagonal 3x3 block. */
1602         pc3 = rtmp3 + i+1;
1603         if (*pc3 != 0.0){
1604           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1605           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1606           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1607           for (j=0; j<nz; j++) {
1608             col = pj[j];
1609             rtmp3[col] -= mul3 * rtmp2[col];
1610           }
1611           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1612         }
1613 
1614         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1615         rs = 0.0;
1616         /* L part */
1617         pc3 = b->a + bi[i+2];
1618         pj  = b->j + bi[i+2] ;
1619         nz  = bi[i+3] - bi[i+2];
1620         for (j=0; j<nz; j++) {
1621           col = pj[j];
1622           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1623         }
1624         /* U part */
1625         pc3 = b->a + bdiag[i+3]+1;
1626         pj  = b->j + bdiag[i+3]+1;
1627         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1628         for (j=0; j<nz; j++) {
1629           col = pj[j];
1630           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1631         }
1632 
1633         sctx.rs  = rs;
1634         sctx.pv  = rtmp3[i+2];
1635         ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr);
1636 	if(sctx.newshift) break;
1637         pc3  = b->a + bdiag[i+2];
1638         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1639         break;
1640       case 4:
1641       /*----------*/
1642         /* zero rtmp */
1643         /* L part */
1644         nz    = bi[i+1] - bi[i];
1645         bjtmp = bj + bi[i];
1646         for  (j=0; j<nz; j++) {
1647           col = bjtmp[j];
1648           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1649         }
1650 
1651         /* U part */
1652         nz = bdiag[i]-bdiag[i+1];
1653         bjtmp = bj + bdiag[i+1]+1;
1654         for  (j=0; j<nz; j++) {
1655           col = bjtmp[j];
1656           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1657         }
1658 
1659         /* load in initial (unfactored row) */
1660         nz    = ai[r[i]+1] - ai[r[i]];
1661         ajtmp = aj + ai[r[i]];
1662         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1663         for (j=0; j<nz; j++) {
1664           col = ics[ajtmp[j]];
1665           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1666         }
1667         /* ZeropivotApply(): shift the diagonal of the matrix  */
1668         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1669 
1670         /* elimination */
1671         bjtmp = bj + bi[i];
1672         row   = *bjtmp++; /* pivot row */
1673         nzL   = bi[i+1] - bi[i];
1674         for(k=0; k < nzL;k++) {
1675           pc1 = rtmp1 + row;
1676           pc2 = rtmp2 + row;
1677           pc3 = rtmp3 + row;
1678 	  pc4 = rtmp4 + row;
1679           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1680             pv  = b->a + bdiag[row];
1681             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1682             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1683 
1684             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1685             pv = b->a + bdiag[row+1]+1;
1686             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1687             for (j=0; j<nz; j++){
1688               col = pj[j];
1689               rtmp1[col] -= mul1 * pv[j];
1690               rtmp2[col] -= mul2 * pv[j];
1691               rtmp3[col] -= mul3 * pv[j];
1692 	      rtmp4[col] -= mul4 * pv[j];
1693             }
1694             ierr = PetscLogFlops(8*nz);CHKERRQ(ierr);
1695           }
1696           row = *bjtmp++;
1697         }
1698 
1699         /* finished row i; check zero pivot, then stick row i into b->a */
1700         rs  = 0.0;
1701         /* L part */
1702         pc1 = b->a + bi[i];
1703         pj  = b->j + bi[i] ;
1704         nz  = bi[i+1] - bi[i];
1705         for (j=0; j<nz; j++) {
1706           col = pj[j];
1707           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1708         }
1709         /* U part */
1710         pc1 = b->a + bdiag[i+1]+1;
1711         pj  = b->j + bdiag[i+1]+1;
1712         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1713         for (j=0; j<nz; j++) {
1714           col = pj[j];
1715           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1716         }
1717 
1718         sctx.rs  = rs;
1719         sctx.pv  = rtmp1[i];
1720         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1721 	if(sctx.newshift) break;
1722         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1723         *pc1 = 1.0/sctx.pv;
1724 
1725         /* Now take care of 1st column of diagonal 4x4 block. */
1726         pc2 = rtmp2 + i;
1727         pc3 = rtmp3 + i;
1728 	pc4 = rtmp4 + i;
1729         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1730           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1731           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1732 	  mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1733           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1734           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1735           for (j=0; j<nz; j++) {
1736             col = pj[j];
1737             rtmp2[col] -= mul2 * rtmp1[col];
1738             rtmp3[col] -= mul3 * rtmp1[col];
1739 	    rtmp4[col] -= mul4 * rtmp1[col];
1740           }
1741           ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1742         }
1743 
1744         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1745         rs = 0.0;
1746         /* L part */
1747         pc2 = b->a + bi[i+1];
1748         pj  = b->j + bi[i+1] ;
1749         nz  = bi[i+2] - bi[i+1];
1750         for (j=0; j<nz; j++) {
1751           col = pj[j];
1752           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1753         }
1754         /* U part */
1755         pc2 = b->a + bdiag[i+2]+1;
1756         pj  = b->j + bdiag[i+2]+1;
1757         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1758         for (j=0; j<nz; j++) {
1759           col = pj[j];
1760           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1761         }
1762 
1763         sctx.rs  = rs;
1764         sctx.pv  = rtmp2[i+1];
1765         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1766 	if(sctx.newshift) break;
1767         pc2  = b->a + bdiag[i+1];
1768         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1769 
1770         /* Now take care of 2nd column of diagonal 4x4 block. */
1771         pc3 = rtmp3 + i+1;
1772 	pc4 = rtmp4 + i+1;
1773         if (*pc3 != 0.0 || *pc4 != 0.0){
1774           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1775 	  mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1776           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1777           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1778           for (j=0; j<nz; j++) {
1779             col = pj[j];
1780             rtmp3[col] -= mul3 * rtmp2[col];
1781 	    rtmp4[col] -= mul4 * rtmp2[col];
1782           }
1783           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1784         }
1785 
1786         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1787         rs = 0.0;
1788         /* L part */
1789         pc3 = b->a + bi[i+2];
1790         pj  = b->j + bi[i+2] ;
1791         nz  = bi[i+3] - bi[i+2];
1792         for (j=0; j<nz; j++) {
1793           col = pj[j];
1794           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1795         }
1796         /* U part */
1797         pc3 = b->a + bdiag[i+3]+1;
1798         pj  = b->j + bdiag[i+3]+1;
1799         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1800         for (j=0; j<nz; j++) {
1801           col = pj[j];
1802           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1803         }
1804 
1805         sctx.rs  = rs;
1806         sctx.pv  = rtmp3[i+2];
1807         ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr);
1808 	if(sctx.newshift) break;
1809         pc3  = b->a + bdiag[i+2];
1810         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1811 
1812 	/* Now take care of 3rd column of diagonal 4x4 block. */
1813 	pc4 = rtmp4 + i+2;
1814         if (*pc4 != 0.0){
1815 	  mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1816           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1817           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1818           for (j=0; j<nz; j++) {
1819             col = pj[j];
1820 	    rtmp4[col] -= mul4 * rtmp3[col];
1821           }
1822           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1823         }
1824 
1825         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1826         rs = 0.0;
1827         /* L part */
1828         pc4 = b->a + bi[i+3];
1829         pj  = b->j + bi[i+3] ;
1830         nz  = bi[i+4] - bi[i+3];
1831         for (j=0; j<nz; j++) {
1832           col = pj[j];
1833           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1834         }
1835         /* U part */
1836         pc4 = b->a + bdiag[i+4]+1;
1837         pj  = b->j + bdiag[i+4]+1;
1838         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1839         for (j=0; j<nz; j++) {
1840           col = pj[j];
1841           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1842         }
1843 
1844         sctx.rs  = rs;
1845         sctx.pv  = rtmp4[i+3];
1846         ierr = MatPivotCheck(info,&sctx,i+3);CHKERRQ(ierr);
1847 	if(sctx.newshift) break;
1848         pc4  = b->a + bdiag[i+3];
1849         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1850          break;
1851 
1852       default:
1853         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1854       }
1855       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1856       i += nodesz;                 /* Update the row */
1857     }
1858 
1859     /* MatPivotRefine() */
1860     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1861       /*
1862        * if no shift in this attempt & shifting & started shifting & can refine,
1863        * then try lower shift
1864        */
1865       sctx.shift_hi       = sctx.shift_fraction;
1866       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1867       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1868       sctx.newshift       = PETSC_TRUE;
1869       sctx.nshift++;
1870     }
1871   } while (sctx.newshift);
1872 
1873   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1874   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1875   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1876   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1877 
1878   C->ops->solve              = MatSolve_SeqAIJ;
1879   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1880   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1881   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1882   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1883   C->assembled    = PETSC_TRUE;
1884   C->preallocated = PETSC_TRUE;
1885   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1886 
1887   /* MatShiftView(A,info,&sctx) */
1888   if (sctx.nshift){
1889     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1890       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1891     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1892       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1893     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1894       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1895     }
1896   }
1897   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1898   PetscFunctionReturn(0);
1899 }
1900 
1901 #undef __FUNCT__
1902 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1903 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1904 {
1905   Mat               C = B;
1906   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1907   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1908   PetscErrorCode    ierr;
1909   const PetscInt    *r,*ic,*c,*ics;
1910   PetscInt          n = A->rmap->n,*bi = b->i;
1911   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1912   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1913   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1914   PetscScalar       mul1,mul2,mul3,tmp;
1915   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1916   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1917   PetscReal         rs=0.0;
1918   FactorShiftCtx    sctx;
1919 
1920   PetscFunctionBegin;
1921   sctx.shift_top      = 0;
1922   sctx.nshift_max     = 0;
1923   sctx.shift_lo       = 0;
1924   sctx.shift_hi       = 0;
1925   sctx.shift_fraction = 0;
1926 
1927   /* if both shift schemes are chosen by user, only use info->shiftpd */
1928   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1929     sctx.shift_top = 0;
1930     for (i=0; i<n; i++) {
1931       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1932       rs    = 0.0;
1933       ajtmp = aj + ai[i];
1934       rtmp1 = aa + ai[i];
1935       nz = ai[i+1] - ai[i];
1936       for (j=0; j<nz; j++){
1937         if (*ajtmp != i){
1938           rs += PetscAbsScalar(*rtmp1++);
1939         } else {
1940           rs -= PetscRealPart(*rtmp1++);
1941         }
1942         ajtmp++;
1943       }
1944       if (rs>sctx.shift_top) sctx.shift_top = rs;
1945     }
1946     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1947     sctx.shift_top *= 1.1;
1948     sctx.nshift_max = 5;
1949     sctx.shift_lo   = 0.;
1950     sctx.shift_hi   = 1.;
1951   }
1952   sctx.shift_amount = 0;
1953   sctx.nshift       = 0;
1954 
1955   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1956   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1957   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1958   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1959   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1960   ics   = ic ;
1961   rtmp22 = rtmp11 + n;
1962   rtmp33 = rtmp22 + n;
1963 
1964   node_max = a->inode.node_count;
1965   ns       = a->inode.size;
1966   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1967 
1968   /* If max inode size > 3, split it into two inodes.*/
1969   /* also map the inode sizes according to the ordering */
1970   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1971   for (i=0,j=0; i<node_max; ++i,++j){
1972     if (ns[i]>3) {
1973       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1974       ++j;
1975       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1976     } else {
1977       tmp_vec1[j] = ns[i];
1978     }
1979   }
1980   /* Use the correct node_max */
1981   node_max = j;
1982 
1983   /* Now reorder the inode info based on mat re-ordering info */
1984   /* First create a row -> inode_size_array_index map */
1985   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1986   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1987   for (i=0,row=0; i<node_max; i++) {
1988     nodesz = tmp_vec1[i];
1989     for (j=0; j<nodesz; j++,row++) {
1990       nsmap[row] = i;
1991     }
1992   }
1993   /* Using nsmap, create a reordered ns structure */
1994   for (i=0,j=0; i< node_max; i++) {
1995     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1996     tmp_vec2[i]  = nodesz;
1997     j           += nodesz;
1998   }
1999   ierr = PetscFree(nsmap);CHKERRQ(ierr);
2000   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2001   /* Now use the correct ns */
2002   ns = tmp_vec2;
2003 
2004   do {
2005     sctx.newshift = PETSC_FALSE;
2006     /* Now loop over each block-row, and do the factorization */
2007     for (i=0,row=0; i<node_max; i++) {
2008       nodesz = ns[i];
2009       nz     = bi[row+1] - bi[row];
2010       bjtmp  = bj + bi[row];
2011 
2012       switch (nodesz){
2013       case 1:
2014         for  (j=0; j<nz; j++){
2015           idx        = bjtmp[j];
2016           rtmp11[idx] = 0.0;
2017         }
2018 
2019         /* load in initial (unfactored row) */
2020         idx    = r[row];
2021         nz_tmp = ai[idx+1] - ai[idx];
2022         ajtmp  = aj + ai[idx];
2023         v1     = aa + ai[idx];
2024 
2025         for (j=0; j<nz_tmp; j++) {
2026           idx        = ics[ajtmp[j]];
2027           rtmp11[idx] = v1[j];
2028         }
2029         rtmp11[ics[r[row]]] += sctx.shift_amount;
2030 
2031         prow = *bjtmp++ ;
2032         while (prow < row) {
2033           pc1 = rtmp11 + prow;
2034           if (*pc1 != 0.0){
2035             pv   = ba + bd[prow];
2036             pj   = nbj + bd[prow];
2037             mul1 = *pc1 * *pv++;
2038             *pc1 = mul1;
2039             nz_tmp = bi[prow+1] - bd[prow] - 1;
2040             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2041             for (j=0; j<nz_tmp; j++) {
2042               tmp = pv[j];
2043               idx = pj[j];
2044               rtmp11[idx] -= mul1 * tmp;
2045             }
2046           }
2047           prow = *bjtmp++ ;
2048         }
2049         pj  = bj + bi[row];
2050         pc1 = ba + bi[row];
2051 
2052         sctx.pv    = rtmp11[row];
2053         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2054         rs         = 0.0;
2055         for (j=0; j<nz; j++) {
2056           idx    = pj[j];
2057           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2058           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2059         }
2060         sctx.rs  = rs;
2061         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2062         if (sctx.newshift) goto endofwhile;
2063         break;
2064 
2065       case 2:
2066         for (j=0; j<nz; j++) {
2067           idx        = bjtmp[j];
2068           rtmp11[idx] = 0.0;
2069           rtmp22[idx] = 0.0;
2070         }
2071 
2072         /* load in initial (unfactored row) */
2073         idx    = r[row];
2074         nz_tmp = ai[idx+1] - ai[idx];
2075         ajtmp  = aj + ai[idx];
2076         v1     = aa + ai[idx];
2077         v2     = aa + ai[idx+1];
2078         for (j=0; j<nz_tmp; j++) {
2079           idx        = ics[ajtmp[j]];
2080           rtmp11[idx] = v1[j];
2081           rtmp22[idx] = v2[j];
2082         }
2083         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2084         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2085 
2086         prow = *bjtmp++ ;
2087         while (prow < row) {
2088           pc1 = rtmp11 + prow;
2089           pc2 = rtmp22 + prow;
2090           if (*pc1 != 0.0 || *pc2 != 0.0){
2091             pv   = ba + bd[prow];
2092             pj   = nbj + bd[prow];
2093             mul1 = *pc1 * *pv;
2094             mul2 = *pc2 * *pv;
2095             ++pv;
2096             *pc1 = mul1;
2097             *pc2 = mul2;
2098 
2099             nz_tmp = bi[prow+1] - bd[prow] - 1;
2100             for (j=0; j<nz_tmp; j++) {
2101               tmp = pv[j];
2102               idx = pj[j];
2103               rtmp11[idx] -= mul1 * tmp;
2104               rtmp22[idx] -= mul2 * tmp;
2105             }
2106             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2107           }
2108           prow = *bjtmp++ ;
2109         }
2110 
2111         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2112         pc1 = rtmp11 + prow;
2113         pc2 = rtmp22 + prow;
2114 
2115         sctx.pv = *pc1;
2116         pj      = bj + bi[prow];
2117         rs      = 0.0;
2118         for (j=0; j<nz; j++){
2119           idx = pj[j];
2120           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2121         }
2122         sctx.rs = rs;
2123         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2124         if (sctx.newshift) goto endofwhile;
2125 
2126         if (*pc2 != 0.0){
2127           pj     = nbj + bd[prow];
2128           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2129           *pc2   = mul2;
2130           nz_tmp = bi[prow+1] - bd[prow] - 1;
2131           for (j=0; j<nz_tmp; j++) {
2132             idx = pj[j] ;
2133             tmp = rtmp11[idx];
2134             rtmp22[idx] -= mul2 * tmp;
2135           }
2136           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2137         }
2138 
2139         pj  = bj + bi[row];
2140         pc1 = ba + bi[row];
2141         pc2 = ba + bi[row+1];
2142 
2143         sctx.pv = rtmp22[row+1];
2144         rs = 0.0;
2145         rtmp11[row]   = 1.0/rtmp11[row];
2146         rtmp22[row+1] = 1.0/rtmp22[row+1];
2147         /* copy row entries from dense representation to sparse */
2148         for (j=0; j<nz; j++) {
2149           idx    = pj[j];
2150           pc1[j] = rtmp11[idx];
2151           pc2[j] = rtmp22[idx];
2152           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2153         }
2154         sctx.rs = rs;
2155         ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr);
2156         if (sctx.newshift) goto endofwhile;
2157         break;
2158 
2159       case 3:
2160         for  (j=0; j<nz; j++) {
2161           idx        = bjtmp[j];
2162           rtmp11[idx] = 0.0;
2163           rtmp22[idx] = 0.0;
2164           rtmp33[idx] = 0.0;
2165         }
2166         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2167         idx    = r[row];
2168         nz_tmp = ai[idx+1] - ai[idx];
2169         ajtmp = aj + ai[idx];
2170         v1    = aa + ai[idx];
2171         v2    = aa + ai[idx+1];
2172         v3    = aa + ai[idx+2];
2173         for (j=0; j<nz_tmp; j++) {
2174           idx        = ics[ajtmp[j]];
2175           rtmp11[idx] = v1[j];
2176           rtmp22[idx] = v2[j];
2177           rtmp33[idx] = v3[j];
2178         }
2179         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2180         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2181         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2182 
2183         /* loop over all pivot row blocks above this row block */
2184         prow = *bjtmp++ ;
2185         while (prow < row) {
2186           pc1 = rtmp11 + prow;
2187           pc2 = rtmp22 + prow;
2188           pc3 = rtmp33 + prow;
2189           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2190             pv   = ba  + bd[prow];
2191             pj   = nbj + bd[prow];
2192             mul1 = *pc1 * *pv;
2193             mul2 = *pc2 * *pv;
2194             mul3 = *pc3 * *pv;
2195             ++pv;
2196             *pc1 = mul1;
2197             *pc2 = mul2;
2198             *pc3 = mul3;
2199 
2200             nz_tmp = bi[prow+1] - bd[prow] - 1;
2201             /* update this row based on pivot row */
2202             for (j=0; j<nz_tmp; j++) {
2203               tmp = pv[j];
2204               idx = pj[j];
2205               rtmp11[idx] -= mul1 * tmp;
2206               rtmp22[idx] -= mul2 * tmp;
2207               rtmp33[idx] -= mul3 * tmp;
2208             }
2209             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
2210           }
2211           prow = *bjtmp++ ;
2212         }
2213 
2214         /* Now take care of diagonal 3x3 block in this set of rows */
2215         /* note: prow = row here */
2216         pc1 = rtmp11 + prow;
2217         pc2 = rtmp22 + prow;
2218         pc3 = rtmp33 + prow;
2219 
2220         sctx.pv = *pc1;
2221         pj      = bj + bi[prow];
2222         rs      = 0.0;
2223         for (j=0; j<nz; j++){
2224           idx = pj[j];
2225           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2226         }
2227         sctx.rs = rs;
2228         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2229         if (sctx.newshift) goto endofwhile;
2230 
2231         if (*pc2 != 0.0 || *pc3 != 0.0){
2232           mul2 = (*pc2)/(*pc1);
2233           mul3 = (*pc3)/(*pc1);
2234           *pc2 = mul2;
2235           *pc3 = mul3;
2236           nz_tmp = bi[prow+1] - bd[prow] - 1;
2237           pj     = nbj + bd[prow];
2238           for (j=0; j<nz_tmp; j++) {
2239             idx = pj[j] ;
2240             tmp = rtmp11[idx];
2241             rtmp22[idx] -= mul2 * tmp;
2242             rtmp33[idx] -= mul3 * tmp;
2243           }
2244           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2245         }
2246         ++prow;
2247 
2248         pc2 = rtmp22 + prow;
2249         pc3 = rtmp33 + prow;
2250         sctx.pv = *pc2;
2251         pj      = bj + bi[prow];
2252         rs      = 0.0;
2253         for (j=0; j<nz; j++){
2254           idx = pj[j];
2255           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2256         }
2257         sctx.rs = rs;
2258         ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr);
2259         if (sctx.newshift) goto endofwhile;
2260 
2261         if (*pc3 != 0.0){
2262           mul3   = (*pc3)/(*pc2);
2263           *pc3   = mul3;
2264           pj     = nbj + bd[prow];
2265           nz_tmp = bi[prow+1] - bd[prow] - 1;
2266           for (j=0; j<nz_tmp; j++) {
2267             idx = pj[j] ;
2268             tmp = rtmp22[idx];
2269             rtmp33[idx] -= mul3 * tmp;
2270           }
2271           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2272         }
2273 
2274         pj  = bj + bi[row];
2275         pc1 = ba + bi[row];
2276         pc2 = ba + bi[row+1];
2277         pc3 = ba + bi[row+2];
2278 
2279         sctx.pv = rtmp33[row+2];
2280         rs = 0.0;
2281         rtmp11[row]   = 1.0/rtmp11[row];
2282         rtmp22[row+1] = 1.0/rtmp22[row+1];
2283         rtmp33[row+2] = 1.0/rtmp33[row+2];
2284         /* copy row entries from dense representation to sparse */
2285         for (j=0; j<nz; j++) {
2286           idx    = pj[j];
2287           pc1[j] = rtmp11[idx];
2288           pc2[j] = rtmp22[idx];
2289           pc3[j] = rtmp33[idx];
2290           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2291         }
2292 
2293         sctx.rs = rs;
2294         ierr = MatPivotCheck(info,&sctx,row+2);CHKERRQ(ierr);
2295         if (sctx.newshift) goto endofwhile;
2296         break;
2297 
2298       default:
2299         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2300       }
2301       row += nodesz;                 /* Update the row */
2302     }
2303     endofwhile:;
2304   } while (sctx.newshift);
2305   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2306   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2307   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2308   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2309   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2310   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2311   /* do not set solve add, since MatSolve_Inode + Add is faster */
2312   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2313   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2314   C->assembled   = PETSC_TRUE;
2315   C->preallocated = PETSC_TRUE;
2316   if (sctx.nshift) {
2317     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2318       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2319     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2320       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2321     }
2322   }
2323   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2324   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2325   PetscFunctionReturn(0);
2326 }
2327 
2328 
2329 /* ----------------------------------------------------------- */
2330 #undef __FUNCT__
2331 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2332 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2333 {
2334   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2335   IS                iscol = a->col,isrow = a->row;
2336   PetscErrorCode    ierr;
2337   const PetscInt    *r,*c,*rout,*cout;
2338   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2339   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2340   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2341   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2342   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2343   const PetscScalar *b;
2344 
2345   PetscFunctionBegin;
2346   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2347   node_max = a->inode.node_count;
2348   ns       = a->inode.size;     /* Node Size array */
2349 
2350   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2351   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2352   tmp  = a->solve_work;
2353 
2354   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2355   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2356 
2357   /* forward solve the lower triangular */
2358   tmps = tmp ;
2359   aa   = a_a ;
2360   aj   = a_j ;
2361   ad   = a->diag;
2362 
2363   for (i = 0,row = 0; i< node_max; ++i){
2364     nsz = ns[i];
2365     aii = ai[row];
2366     v1  = aa + aii;
2367     vi  = aj + aii;
2368     nz  = ai[row+1]- ai[row];
2369 
2370     if (i < node_max-1) {
2371       /* Prefetch the indices for the next block */
2372       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2373       /* Prefetch the data for the next block */
2374       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2375     }
2376 
2377     switch (nsz){               /* Each loop in 'case' is unrolled */
2378     case 1 :
2379       sum1 = b[r[row]];
2380       for(j=0; j<nz-1; j+=2){
2381         i0   = vi[j];
2382         i1   = vi[j+1];
2383         tmp0 = tmps[i0];
2384         tmp1 = tmps[i1];
2385         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2386       }
2387       if(j == nz-1){
2388         tmp0 = tmps[vi[j]];
2389         sum1 -= v1[j]*tmp0;
2390       }
2391       tmp[row++]=sum1;
2392       break;
2393     case 2:
2394       sum1 = b[r[row]];
2395       sum2 = b[r[row+1]];
2396       v2   = aa + ai[row+1];
2397 
2398       for(j=0; j<nz-1; j+=2){
2399         i0   = vi[j];
2400         i1   = vi[j+1];
2401         tmp0 = tmps[i0];
2402         tmp1 = tmps[i1];
2403         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2404         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2405       }
2406       if(j == nz-1){
2407         tmp0 = tmps[vi[j]];
2408         sum1 -= v1[j] *tmp0;
2409         sum2 -= v2[j] *tmp0;
2410       }
2411       sum2 -= v2[nz] * sum1;
2412       tmp[row ++]=sum1;
2413       tmp[row ++]=sum2;
2414       break;
2415     case 3:
2416       sum1 = b[r[row]];
2417       sum2 = b[r[row+1]];
2418       sum3 = b[r[row+2]];
2419       v2   = aa + ai[row+1];
2420       v3   = aa + ai[row+2];
2421 
2422       for (j=0; j<nz-1; j+=2){
2423         i0   = vi[j];
2424         i1   = vi[j+1];
2425         tmp0 = tmps[i0];
2426         tmp1 = tmps[i1];
2427         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2428         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2429         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2430       }
2431       if (j == nz-1){
2432         tmp0 = tmps[vi[j]];
2433         sum1 -= v1[j] *tmp0;
2434         sum2 -= v2[j] *tmp0;
2435         sum3 -= v3[j] *tmp0;
2436       }
2437       sum2 -= v2[nz] * sum1;
2438       sum3 -= v3[nz] * sum1;
2439       sum3 -= v3[nz+1] * sum2;
2440       tmp[row ++]=sum1;
2441       tmp[row ++]=sum2;
2442       tmp[row ++]=sum3;
2443       break;
2444 
2445     case 4:
2446       sum1 = b[r[row]];
2447       sum2 = b[r[row+1]];
2448       sum3 = b[r[row+2]];
2449       sum4 = b[r[row+3]];
2450       v2   = aa + ai[row+1];
2451       v3   = aa + ai[row+2];
2452       v4   = aa + ai[row+3];
2453 
2454       for (j=0; j<nz-1; j+=2){
2455         i0   = vi[j];
2456         i1   = vi[j+1];
2457         tmp0 = tmps[i0];
2458         tmp1 = tmps[i1];
2459         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2460         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2461         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2462         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2463       }
2464       if (j == nz-1){
2465         tmp0 = tmps[vi[j]];
2466         sum1 -= v1[j] *tmp0;
2467         sum2 -= v2[j] *tmp0;
2468         sum3 -= v3[j] *tmp0;
2469         sum4 -= v4[j] *tmp0;
2470       }
2471       sum2 -= v2[nz] * sum1;
2472       sum3 -= v3[nz] * sum1;
2473       sum4 -= v4[nz] * sum1;
2474       sum3 -= v3[nz+1] * sum2;
2475       sum4 -= v4[nz+1] * sum2;
2476       sum4 -= v4[nz+2] * sum3;
2477 
2478       tmp[row ++]=sum1;
2479       tmp[row ++]=sum2;
2480       tmp[row ++]=sum3;
2481       tmp[row ++]=sum4;
2482       break;
2483     case 5:
2484       sum1 = b[r[row]];
2485       sum2 = b[r[row+1]];
2486       sum3 = b[r[row+2]];
2487       sum4 = b[r[row+3]];
2488       sum5 = b[r[row+4]];
2489       v2   = aa + ai[row+1];
2490       v3   = aa + ai[row+2];
2491       v4   = aa + ai[row+3];
2492       v5   = aa + ai[row+4];
2493 
2494       for (j=0; j<nz-1; j+=2){
2495         i0   = vi[j];
2496         i1   = vi[j+1];
2497         tmp0 = tmps[i0];
2498         tmp1 = tmps[i1];
2499         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2500         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2501         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2502         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2503         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2504       }
2505       if (j == nz-1){
2506         tmp0 = tmps[vi[j]];
2507         sum1 -= v1[j] *tmp0;
2508         sum2 -= v2[j] *tmp0;
2509         sum3 -= v3[j] *tmp0;
2510         sum4 -= v4[j] *tmp0;
2511         sum5 -= v5[j] *tmp0;
2512       }
2513 
2514       sum2 -= v2[nz] * sum1;
2515       sum3 -= v3[nz] * sum1;
2516       sum4 -= v4[nz] * sum1;
2517       sum5 -= v5[nz] * sum1;
2518       sum3 -= v3[nz+1] * sum2;
2519       sum4 -= v4[nz+1] * sum2;
2520       sum5 -= v5[nz+1] * sum2;
2521       sum4 -= v4[nz+2] * sum3;
2522       sum5 -= v5[nz+2] * sum3;
2523       sum5 -= v5[nz+3] * sum4;
2524 
2525       tmp[row ++]=sum1;
2526       tmp[row ++]=sum2;
2527       tmp[row ++]=sum3;
2528       tmp[row ++]=sum4;
2529       tmp[row ++]=sum5;
2530       break;
2531     default:
2532       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2533     }
2534   }
2535   /* backward solve the upper triangular */
2536   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2537     nsz = ns[i];
2538     aii = ad[row+1] + 1;
2539     v1  = aa + aii;
2540     vi  = aj + aii;
2541     nz  = ad[row]- ad[row+1] - 1;
2542 
2543     if (i > 0) {
2544       /* Prefetch the indices for the next block */
2545       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2546       /* Prefetch the data for the next block */
2547       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2548     }
2549 
2550     switch (nsz){               /* Each loop in 'case' is unrolled */
2551     case 1 :
2552       sum1 = tmp[row];
2553 
2554       for(j=0 ; j<nz-1; j+=2){
2555         i0   = vi[j];
2556         i1   = vi[j+1];
2557         tmp0 = tmps[i0];
2558         tmp1 = tmps[i1];
2559         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2560       }
2561       if (j == nz-1){
2562         tmp0  = tmps[vi[j]];
2563         sum1 -= v1[j]*tmp0;
2564       }
2565       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2566       break;
2567     case 2 :
2568       sum1 = tmp[row];
2569       sum2 = tmp[row-1];
2570       v2   = aa + ad[row] + 1;
2571       for (j=0 ; j<nz-1; j+=2){
2572         i0   = vi[j];
2573         i1   = vi[j+1];
2574         tmp0 = tmps[i0];
2575         tmp1 = tmps[i1];
2576         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2577         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2578       }
2579       if (j == nz-1){
2580         tmp0  = tmps[vi[j]];
2581         sum1 -= v1[j]* tmp0;
2582         sum2 -= v2[j+1]* tmp0;
2583       }
2584 
2585       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2586       sum2   -= v2[0] * tmp0;
2587       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2588       break;
2589     case 3 :
2590       sum1 = tmp[row];
2591       sum2 = tmp[row -1];
2592       sum3 = tmp[row -2];
2593       v2   = aa + ad[row] + 1;
2594       v3   = aa + ad[row -1] + 1;
2595       for (j=0 ; j<nz-1; j+=2){
2596         i0   = vi[j];
2597         i1   = vi[j+1];
2598         tmp0 = tmps[i0];
2599         tmp1 = tmps[i1];
2600         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2601         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2602         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2603       }
2604       if (j== nz-1){
2605         tmp0  = tmps[vi[j]];
2606         sum1 -= v1[j] * tmp0;
2607         sum2 -= v2[j+1] * tmp0;
2608         sum3 -= v3[j+2] * tmp0;
2609       }
2610       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2611       sum2   -= v2[0]* tmp0;
2612       sum3   -= v3[1] * tmp0;
2613       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2614       sum3   -= v3[0]* tmp0;
2615       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2616 
2617       break;
2618     case 4 :
2619       sum1 = tmp[row];
2620       sum2 = tmp[row -1];
2621       sum3 = tmp[row -2];
2622       sum4 = tmp[row -3];
2623       v2   = aa + ad[row]+1;
2624       v3   = aa + ad[row -1]+1;
2625       v4   = aa + ad[row -2]+1;
2626 
2627       for (j=0 ; j<nz-1; j+=2){
2628         i0   = vi[j];
2629         i1   = vi[j+1];
2630         tmp0 = tmps[i0];
2631         tmp1 = tmps[i1];
2632         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2633         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2634         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2635         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2636       }
2637       if (j== nz-1){
2638         tmp0  = tmps[vi[j]];
2639         sum1 -= v1[j] * tmp0;
2640         sum2 -= v2[j+1] * tmp0;
2641         sum3 -= v3[j+2] * tmp0;
2642         sum4 -= v4[j+3] * tmp0;
2643       }
2644 
2645       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2646       sum2   -= v2[0] * tmp0;
2647       sum3   -= v3[1] * tmp0;
2648       sum4   -= v4[2] * tmp0;
2649       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2650       sum3   -= v3[0] * tmp0;
2651       sum4   -= v4[1] * tmp0;
2652       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2653       sum4   -= v4[0] * tmp0;
2654       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2655       break;
2656     case 5 :
2657       sum1 = tmp[row];
2658       sum2 = tmp[row -1];
2659       sum3 = tmp[row -2];
2660       sum4 = tmp[row -3];
2661       sum5 = tmp[row -4];
2662       v2   = aa + ad[row]+1;
2663       v3   = aa + ad[row -1]+1;
2664       v4   = aa + ad[row -2]+1;
2665       v5   = aa + ad[row -3]+1;
2666       for (j=0 ; j<nz-1; j+=2){
2667         i0   = vi[j];
2668         i1   = vi[j+1];
2669         tmp0 = tmps[i0];
2670         tmp1 = tmps[i1];
2671         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2672         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2673         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2674         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2675         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2676       }
2677       if (j==nz-1){
2678         tmp0  = tmps[vi[j]];
2679         sum1 -= v1[j] * tmp0;
2680         sum2 -= v2[j+1] * tmp0;
2681         sum3 -= v3[j+2] * tmp0;
2682         sum4 -= v4[j+3] * tmp0;
2683         sum5 -= v5[j+4] * tmp0;
2684       }
2685 
2686       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2687       sum2   -= v2[0] * tmp0;
2688       sum3   -= v3[1] * tmp0;
2689       sum4   -= v4[2] * tmp0;
2690       sum5   -= v5[3] * tmp0;
2691       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2692       sum3   -= v3[0] * tmp0;
2693       sum4   -= v4[1] * tmp0;
2694       sum5   -= v5[2] * tmp0;
2695       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2696       sum4   -= v4[0] * tmp0;
2697       sum5   -= v5[1] * tmp0;
2698       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2699       sum5   -= v5[0] * tmp0;
2700       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2701       break;
2702     default:
2703       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2704     }
2705   }
2706   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2707   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2708   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2709   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2710   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2711   PetscFunctionReturn(0);
2712 }
2713 
2714 
2715 /*
2716      Makes a longer coloring[] array and calls the usual code with that
2717 */
2718 #undef __FUNCT__
2719 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2720 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2721 {
2722   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2723   PetscErrorCode  ierr;
2724   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2725   PetscInt        *colorused,i;
2726   ISColoringValue *newcolor;
2727 
2728   PetscFunctionBegin;
2729   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2730   /* loop over inodes, marking a color for each column*/
2731   row = 0;
2732   for (i=0; i<m; i++){
2733     for (j=0; j<ns[i]; j++) {
2734       newcolor[row++] = coloring[i] + j*ncolors;
2735     }
2736   }
2737 
2738   /* eliminate unneeded colors */
2739   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2740   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2741   for (i=0; i<n; i++) {
2742     colorused[newcolor[i]] = 1;
2743   }
2744 
2745   for (i=1; i<5*ncolors; i++) {
2746     colorused[i] += colorused[i-1];
2747   }
2748   ncolors = colorused[5*ncolors-1];
2749   for (i=0; i<n; i++) {
2750     newcolor[i] = colorused[newcolor[i]]-1;
2751   }
2752   ierr = PetscFree(colorused);CHKERRQ(ierr);
2753   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2754   ierr = PetscFree(coloring);CHKERRQ(ierr);
2755   PetscFunctionReturn(0);
2756 }
2757 
2758 #include "../src/mat/blockinvert.h"
2759 
2760 #undef __FUNCT__
2761 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2762 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2763 {
2764   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2765   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2766   MatScalar          *ibdiag,*bdiag,work[25];
2767   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2768   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2769   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2770   PetscErrorCode     ierr;
2771   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2772   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2773 
2774   PetscFunctionBegin;
2775   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2776   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2777   if (its > 1) {
2778     /* switch to non-inode version */
2779     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2780     PetscFunctionReturn(0);
2781   }
2782 
2783   if (!a->inode.ibdiagvalid) {
2784     if (!a->inode.ibdiag) {
2785       /* calculate space needed for diagonal blocks */
2786       for (i=0; i<m; i++) {
2787 	cnt += sizes[i]*sizes[i];
2788       }
2789       a->inode.bdiagsize = cnt;
2790       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2791     }
2792 
2793     /* copy over the diagonal blocks and invert them */
2794     ibdiag = a->inode.ibdiag;
2795     bdiag  = a->inode.bdiag;
2796     cnt = 0;
2797     for (i=0, row = 0; i<m; i++) {
2798       for (j=0; j<sizes[i]; j++) {
2799         for (k=0; k<sizes[i]; k++) {
2800           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2801         }
2802       }
2803       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2804 
2805       switch(sizes[i]) {
2806         case 1:
2807           /* Create matrix data structure */
2808           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2809           ibdiag[cnt] = 1.0/ibdiag[cnt];
2810           break;
2811         case 2:
2812           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2813           break;
2814         case 3:
2815           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2816           break;
2817         case 4:
2818           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2819           break;
2820         case 5:
2821           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2822           break;
2823        default:
2824 	 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2825       }
2826       cnt += sizes[i]*sizes[i];
2827       row += sizes[i];
2828     }
2829     a->inode.ibdiagvalid = PETSC_TRUE;
2830   }
2831   ibdiag = a->inode.ibdiag;
2832   bdiag  = a->inode.bdiag;
2833 
2834 
2835   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2836   if (flag & SOR_ZERO_INITIAL_GUESS) {
2837     PetscScalar *b;
2838     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2839     if (xx != bb) {
2840       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2841     } else {
2842       b = x;
2843     }
2844     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2845 
2846       for (i=0, row=0; i<m; i++) {
2847         sz  = diag[row] - ii[row];
2848         v1  = a->a + ii[row];
2849         idx = a->j + ii[row];
2850 
2851         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2852         switch (sizes[i]){
2853           case 1:
2854 
2855             sum1  = b[row];
2856             for(n = 0; n<sz-1; n+=2) {
2857               i1   = idx[0];
2858               i2   = idx[1];
2859               idx += 2;
2860               tmp0 = x[i1];
2861               tmp1 = x[i2];
2862               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2863             }
2864 
2865             if (n == sz-1){
2866               tmp0  = x[*idx];
2867               sum1 -= *v1 * tmp0;
2868             }
2869             x[row++] = sum1*(*ibdiag++);
2870             break;
2871           case 2:
2872             v2    = a->a + ii[row+1];
2873             sum1  = b[row];
2874             sum2  = b[row+1];
2875             for(n = 0; n<sz-1; n+=2) {
2876               i1   = idx[0];
2877               i2   = idx[1];
2878               idx += 2;
2879               tmp0 = x[i1];
2880               tmp1 = x[i2];
2881               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2882               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2883             }
2884 
2885             if (n == sz-1){
2886               tmp0  = x[*idx];
2887               sum1 -= v1[0] * tmp0;
2888               sum2 -= v2[0] * tmp0;
2889             }
2890             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2891             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2892             ibdiag  += 4;
2893             break;
2894           case 3:
2895             v2    = a->a + ii[row+1];
2896             v3    = a->a + ii[row+2];
2897             sum1  = b[row];
2898             sum2  = b[row+1];
2899             sum3  = b[row+2];
2900             for(n = 0; n<sz-1; n+=2) {
2901               i1   = idx[0];
2902               i2   = idx[1];
2903               idx += 2;
2904               tmp0 = x[i1];
2905               tmp1 = x[i2];
2906               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2907               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2908               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2909             }
2910 
2911             if (n == sz-1){
2912               tmp0  = x[*idx];
2913               sum1 -= v1[0] * tmp0;
2914               sum2 -= v2[0] * tmp0;
2915               sum3 -= v3[0] * tmp0;
2916             }
2917             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2918             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2919             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2920             ibdiag  += 9;
2921             break;
2922           case 4:
2923             v2    = a->a + ii[row+1];
2924             v3    = a->a + ii[row+2];
2925             v4    = a->a + ii[row+3];
2926             sum1  = b[row];
2927             sum2  = b[row+1];
2928             sum3  = b[row+2];
2929             sum4  = b[row+3];
2930             for(n = 0; n<sz-1; n+=2) {
2931               i1   = idx[0];
2932               i2   = idx[1];
2933               idx += 2;
2934               tmp0 = x[i1];
2935               tmp1 = x[i2];
2936               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2937               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2938               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2939               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2940             }
2941 
2942             if (n == sz-1){
2943               tmp0  = x[*idx];
2944               sum1 -= v1[0] * tmp0;
2945               sum2 -= v2[0] * tmp0;
2946               sum3 -= v3[0] * tmp0;
2947               sum4 -= v4[0] * tmp0;
2948             }
2949             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2950             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2951             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2952             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2953             ibdiag  += 16;
2954             break;
2955           case 5:
2956             v2    = a->a + ii[row+1];
2957             v3    = a->a + ii[row+2];
2958             v4    = a->a + ii[row+3];
2959             v5    = a->a + ii[row+4];
2960             sum1  = b[row];
2961             sum2  = b[row+1];
2962             sum3  = b[row+2];
2963             sum4  = b[row+3];
2964             sum5  = b[row+4];
2965             for(n = 0; n<sz-1; n+=2) {
2966               i1   = idx[0];
2967               i2   = idx[1];
2968               idx += 2;
2969               tmp0 = x[i1];
2970               tmp1 = x[i2];
2971               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2972               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2973               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2974               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2975               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2976             }
2977 
2978             if (n == sz-1){
2979               tmp0  = x[*idx];
2980               sum1 -= v1[0] * tmp0;
2981               sum2 -= v2[0] * tmp0;
2982               sum3 -= v3[0] * tmp0;
2983               sum4 -= v4[0] * tmp0;
2984               sum5 -= v5[0] * tmp0;
2985             }
2986             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2987             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2988             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2989             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2990             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2991             ibdiag  += 25;
2992             break;
2993 	  default:
2994    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2995         }
2996       }
2997 
2998       xb = x;
2999       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3000     } else xb = b;
3001     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
3002         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
3003       cnt = 0;
3004       for (i=0, row=0; i<m; i++) {
3005 
3006         switch (sizes[i]){
3007           case 1:
3008             x[row++] *= bdiag[cnt++];
3009             break;
3010           case 2:
3011             x1   = x[row]; x2 = x[row+1];
3012             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3013             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3014             x[row++] = tmp1;
3015             x[row++] = tmp2;
3016             cnt += 4;
3017             break;
3018           case 3:
3019             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3020             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3021             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3022             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3023             x[row++] = tmp1;
3024             x[row++] = tmp2;
3025             x[row++] = tmp3;
3026             cnt += 9;
3027             break;
3028           case 4:
3029             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3030             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3031             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3032             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3033             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3034             x[row++] = tmp1;
3035             x[row++] = tmp2;
3036             x[row++] = tmp3;
3037             x[row++] = tmp4;
3038             cnt += 16;
3039             break;
3040           case 5:
3041             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3042             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3043             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3044             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3045             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3046             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3047             x[row++] = tmp1;
3048             x[row++] = tmp2;
3049             x[row++] = tmp3;
3050             x[row++] = tmp4;
3051             x[row++] = tmp5;
3052             cnt += 25;
3053             break;
3054 	  default:
3055    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3056         }
3057       }
3058       ierr = PetscLogFlops(m);CHKERRQ(ierr);
3059     }
3060     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3061 
3062       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3063       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3064         ibdiag -= sizes[i]*sizes[i];
3065         sz      = ii[row+1] - diag[row] - 1;
3066         v1      = a->a + diag[row] + 1;
3067         idx     = a->j + diag[row] + 1;
3068 
3069         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3070         switch (sizes[i]){
3071           case 1:
3072 
3073             sum1  = xb[row];
3074             for(n = 0; n<sz-1; n+=2) {
3075               i1   = idx[0];
3076               i2   = idx[1];
3077               idx += 2;
3078               tmp0 = x[i1];
3079               tmp1 = x[i2];
3080               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3081             }
3082 
3083             if (n == sz-1){
3084               tmp0  = x[*idx];
3085               sum1 -= *v1*tmp0;
3086             }
3087             x[row--] = sum1*(*ibdiag);
3088             break;
3089 
3090           case 2:
3091 
3092             sum1  = xb[row];
3093             sum2  = xb[row-1];
3094             /* note that sum1 is associated with the second of the two rows */
3095             v2    = a->a + diag[row-1] + 2;
3096             for(n = 0; n<sz-1; n+=2) {
3097               i1   = idx[0];
3098               i2   = idx[1];
3099               idx += 2;
3100               tmp0 = x[i1];
3101               tmp1 = x[i2];
3102               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3103               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3104             }
3105 
3106             if (n == sz-1){
3107               tmp0  = x[*idx];
3108               sum1 -= *v1*tmp0;
3109               sum2 -= *v2*tmp0;
3110             }
3111             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3112             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3113             break;
3114           case 3:
3115 
3116             sum1  = xb[row];
3117             sum2  = xb[row-1];
3118             sum3  = xb[row-2];
3119             v2    = a->a + diag[row-1] + 2;
3120             v3    = a->a + diag[row-2] + 3;
3121             for(n = 0; n<sz-1; n+=2) {
3122               i1   = idx[0];
3123               i2   = idx[1];
3124               idx += 2;
3125               tmp0 = x[i1];
3126               tmp1 = x[i2];
3127               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3128               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3129               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3130             }
3131 
3132             if (n == sz-1){
3133               tmp0  = x[*idx];
3134               sum1 -= *v1*tmp0;
3135               sum2 -= *v2*tmp0;
3136               sum3 -= *v3*tmp0;
3137             }
3138             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3139             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3140             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3141             break;
3142           case 4:
3143 
3144             sum1  = xb[row];
3145             sum2  = xb[row-1];
3146             sum3  = xb[row-2];
3147             sum4  = xb[row-3];
3148             v2    = a->a + diag[row-1] + 2;
3149             v3    = a->a + diag[row-2] + 3;
3150             v4    = a->a + diag[row-3] + 4;
3151             for(n = 0; n<sz-1; n+=2) {
3152               i1   = idx[0];
3153               i2   = idx[1];
3154               idx += 2;
3155               tmp0 = x[i1];
3156               tmp1 = x[i2];
3157               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3158               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3159               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3160               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3161             }
3162 
3163             if (n == sz-1){
3164               tmp0  = x[*idx];
3165               sum1 -= *v1*tmp0;
3166               sum2 -= *v2*tmp0;
3167               sum3 -= *v3*tmp0;
3168               sum4 -= *v4*tmp0;
3169             }
3170             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3171             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3172             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3173             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3174             break;
3175           case 5:
3176 
3177             sum1  = xb[row];
3178             sum2  = xb[row-1];
3179             sum3  = xb[row-2];
3180             sum4  = xb[row-3];
3181             sum5  = xb[row-4];
3182             v2    = a->a + diag[row-1] + 2;
3183             v3    = a->a + diag[row-2] + 3;
3184             v4    = a->a + diag[row-3] + 4;
3185             v5    = a->a + diag[row-4] + 5;
3186             for(n = 0; n<sz-1; n+=2) {
3187               i1   = idx[0];
3188               i2   = idx[1];
3189               idx += 2;
3190               tmp0 = x[i1];
3191               tmp1 = x[i2];
3192               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3193               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3194               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3195               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3196               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3197             }
3198 
3199             if (n == sz-1){
3200               tmp0  = x[*idx];
3201               sum1 -= *v1*tmp0;
3202               sum2 -= *v2*tmp0;
3203               sum3 -= *v3*tmp0;
3204               sum4 -= *v4*tmp0;
3205               sum5 -= *v5*tmp0;
3206             }
3207             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3208             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3209             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3210             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3211             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3212             break;
3213 	  default:
3214    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3215         }
3216       }
3217 
3218       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3219     }
3220     its--;
3221     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3222     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
3223   }
3224   if (flag & SOR_EISENSTAT) {
3225     const PetscScalar *b;
3226     MatScalar         *t = a->inode.ssor_work;
3227 
3228     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3229     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3230     /*
3231           Apply  (U + D)^-1  where D is now the block diagonal
3232     */
3233     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3234     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3235       ibdiag -= sizes[i]*sizes[i];
3236       sz      = ii[row+1] - diag[row] - 1;
3237       v1      = a->a + diag[row] + 1;
3238       idx     = a->j + diag[row] + 1;
3239       CHKMEMQ;
3240       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3241       switch (sizes[i]){
3242         case 1:
3243 
3244 	  sum1  = b[row];
3245 	  for(n = 0; n<sz-1; n+=2) {
3246 	    i1   = idx[0];
3247 	    i2   = idx[1];
3248 	    idx += 2;
3249 	    tmp0 = x[i1];
3250 	    tmp1 = x[i2];
3251 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3252 	  }
3253 
3254 	  if (n == sz-1){
3255 	    tmp0  = x[*idx];
3256 	    sum1 -= *v1*tmp0;
3257 	  }
3258 	  x[row] = sum1*(*ibdiag);row--;
3259 	  break;
3260 
3261         case 2:
3262 
3263 	  sum1  = b[row];
3264 	  sum2  = b[row-1];
3265 	  /* note that sum1 is associated with the second of the two rows */
3266 	  v2    = a->a + diag[row-1] + 2;
3267 	  for(n = 0; n<sz-1; n+=2) {
3268 	    i1   = idx[0];
3269 	    i2   = idx[1];
3270 	    idx += 2;
3271 	    tmp0 = x[i1];
3272 	    tmp1 = x[i2];
3273 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3274 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3275 	  }
3276 
3277 	  if (n == sz-1){
3278 	    tmp0  = x[*idx];
3279 	    sum1 -= *v1*tmp0;
3280 	    sum2 -= *v2*tmp0;
3281 	  }
3282 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3283 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3284           row -= 2;
3285 	  break;
3286         case 3:
3287 
3288 	  sum1  = b[row];
3289 	  sum2  = b[row-1];
3290 	  sum3  = b[row-2];
3291 	  v2    = a->a + diag[row-1] + 2;
3292 	  v3    = a->a + diag[row-2] + 3;
3293 	  for(n = 0; n<sz-1; n+=2) {
3294 	    i1   = idx[0];
3295 	    i2   = idx[1];
3296 	    idx += 2;
3297 	    tmp0 = x[i1];
3298 	    tmp1 = x[i2];
3299 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3300 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3301 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3302 	  }
3303 
3304 	  if (n == sz-1){
3305 	    tmp0  = x[*idx];
3306 	    sum1 -= *v1*tmp0;
3307 	    sum2 -= *v2*tmp0;
3308 	    sum3 -= *v3*tmp0;
3309 	  }
3310 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3311 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3312 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3313           row -= 3;
3314 	  break;
3315         case 4:
3316 
3317 	  sum1  = b[row];
3318 	  sum2  = b[row-1];
3319 	  sum3  = b[row-2];
3320 	  sum4  = b[row-3];
3321 	  v2    = a->a + diag[row-1] + 2;
3322 	  v3    = a->a + diag[row-2] + 3;
3323 	  v4    = a->a + diag[row-3] + 4;
3324 	  for(n = 0; n<sz-1; n+=2) {
3325 	    i1   = idx[0];
3326 	    i2   = idx[1];
3327 	    idx += 2;
3328 	    tmp0 = x[i1];
3329 	    tmp1 = x[i2];
3330 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3331 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3332 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3333 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3334 	  }
3335 
3336 	  if (n == sz-1){
3337 	    tmp0  = x[*idx];
3338 	    sum1 -= *v1*tmp0;
3339 	    sum2 -= *v2*tmp0;
3340 	    sum3 -= *v3*tmp0;
3341 	    sum4 -= *v4*tmp0;
3342 	  }
3343 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3344 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3345 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3346 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3347           row -= 4;
3348 	  break;
3349         case 5:
3350 
3351 	  sum1  = b[row];
3352 	  sum2  = b[row-1];
3353 	  sum3  = b[row-2];
3354 	  sum4  = b[row-3];
3355 	  sum5  = b[row-4];
3356 	  v2    = a->a + diag[row-1] + 2;
3357 	  v3    = a->a + diag[row-2] + 3;
3358 	  v4    = a->a + diag[row-3] + 4;
3359 	  v5    = a->a + diag[row-4] + 5;
3360 	  for(n = 0; n<sz-1; n+=2) {
3361 	    i1   = idx[0];
3362 	    i2   = idx[1];
3363 	    idx += 2;
3364 	    tmp0 = x[i1];
3365 	    tmp1 = x[i2];
3366 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3367 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3368 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3369 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3370 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3371 	  }
3372 
3373 	  if (n == sz-1){
3374 	    tmp0  = x[*idx];
3375 	    sum1 -= *v1*tmp0;
3376 	    sum2 -= *v2*tmp0;
3377 	    sum3 -= *v3*tmp0;
3378 	    sum4 -= *v4*tmp0;
3379 	    sum5 -= *v5*tmp0;
3380 	  }
3381 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3382 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3383 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3384 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3385 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3386           row -= 5;
3387 	  break;
3388         default:
3389 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3390       }
3391       CHKMEMQ;
3392     }
3393     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3394 
3395     /*
3396            t = b - D x    where D is the block diagonal
3397     */
3398     cnt = 0;
3399     for (i=0, row=0; i<m; i++) {
3400       CHKMEMQ;
3401       switch (sizes[i]){
3402         case 1:
3403 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3404 	  break;
3405         case 2:
3406 	  x1   = x[row]; x2 = x[row+1];
3407 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3408 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3409 	  t[row]   = b[row] - tmp1;
3410 	  t[row+1] = b[row+1] - tmp2; row += 2;
3411 	  cnt += 4;
3412 	  break;
3413         case 3:
3414 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3415 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3416 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3417 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3418 	  t[row] = b[row] - tmp1;
3419 	  t[row+1] = b[row+1] - tmp2;
3420 	  t[row+2] = b[row+2] - tmp3; row += 3;
3421 	  cnt += 9;
3422 	  break;
3423         case 4:
3424 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3425 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3426 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3427 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3428 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3429 	  t[row] = b[row] - tmp1;
3430 	  t[row+1] = b[row+1] - tmp2;
3431 	  t[row+2] = b[row+2] - tmp3;
3432 	  t[row+3] = b[row+3] - tmp4; row += 4;
3433 	  cnt += 16;
3434 	  break;
3435         case 5:
3436 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3437 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3438 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3439 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3440 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3441 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3442 	  t[row] = b[row] - tmp1;
3443 	  t[row+1] = b[row+1] - tmp2;
3444 	  t[row+2] = b[row+2] - tmp3;
3445 	  t[row+3] = b[row+3] - tmp4;
3446 	  t[row+4] = b[row+4] - tmp5;row += 5;
3447 	  cnt += 25;
3448 	  break;
3449         default:
3450 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3451       }
3452       CHKMEMQ;
3453     }
3454     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3455 
3456 
3457 
3458     /*
3459           Apply (L + D)^-1 where D is the block diagonal
3460     */
3461     for (i=0, row=0; i<m; i++) {
3462       sz  = diag[row] - ii[row];
3463       v1  = a->a + ii[row];
3464       idx = a->j + ii[row];
3465       CHKMEMQ;
3466       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3467       switch (sizes[i]){
3468         case 1:
3469 
3470 	  sum1  = t[row];
3471 	  for(n = 0; n<sz-1; n+=2) {
3472 	    i1   = idx[0];
3473 	    i2   = idx[1];
3474 	    idx += 2;
3475 	    tmp0 = t[i1];
3476 	    tmp1 = t[i2];
3477 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3478 	  }
3479 
3480 	  if (n == sz-1){
3481 	    tmp0  = t[*idx];
3482 	    sum1 -= *v1 * tmp0;
3483 	  }
3484 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3485 	  break;
3486         case 2:
3487 	  v2    = a->a + ii[row+1];
3488 	  sum1  = t[row];
3489 	  sum2  = t[row+1];
3490 	  for(n = 0; n<sz-1; n+=2) {
3491 	    i1   = idx[0];
3492 	    i2   = idx[1];
3493 	    idx += 2;
3494 	    tmp0 = t[i1];
3495 	    tmp1 = t[i2];
3496 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3497 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3498 	  }
3499 
3500 	  if (n == sz-1){
3501 	    tmp0  = t[*idx];
3502 	    sum1 -= v1[0] * tmp0;
3503 	    sum2 -= v2[0] * tmp0;
3504 	  }
3505 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3506 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3507 	  ibdiag  += 4; row += 2;
3508 	  break;
3509         case 3:
3510 	  v2    = a->a + ii[row+1];
3511 	  v3    = a->a + ii[row+2];
3512 	  sum1  = t[row];
3513 	  sum2  = t[row+1];
3514 	  sum3  = t[row+2];
3515 	  for(n = 0; n<sz-1; n+=2) {
3516 	    i1   = idx[0];
3517 	    i2   = idx[1];
3518 	    idx += 2;
3519 	    tmp0 = t[i1];
3520 	    tmp1 = t[i2];
3521 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3522 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3523 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3524 	  }
3525 
3526 	  if (n == sz-1){
3527 	    tmp0  = t[*idx];
3528 	    sum1 -= v1[0] * tmp0;
3529 	    sum2 -= v2[0] * tmp0;
3530 	    sum3 -= v3[0] * tmp0;
3531 	  }
3532 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3533 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3534 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3535 	  ibdiag  += 9; row += 3;
3536 	  break;
3537         case 4:
3538 	  v2    = a->a + ii[row+1];
3539 	  v3    = a->a + ii[row+2];
3540 	  v4    = a->a + ii[row+3];
3541 	  sum1  = t[row];
3542 	  sum2  = t[row+1];
3543 	  sum3  = t[row+2];
3544 	  sum4  = t[row+3];
3545 	  for(n = 0; n<sz-1; n+=2) {
3546 	    i1   = idx[0];
3547 	    i2   = idx[1];
3548 	    idx += 2;
3549 	    tmp0 = t[i1];
3550 	    tmp1 = t[i2];
3551 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3552 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3553 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3554 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3555 	  }
3556 
3557 	  if (n == sz-1){
3558 	    tmp0  = t[*idx];
3559 	    sum1 -= v1[0] * tmp0;
3560 	    sum2 -= v2[0] * tmp0;
3561 	    sum3 -= v3[0] * tmp0;
3562 	    sum4 -= v4[0] * tmp0;
3563 	  }
3564 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3565 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3566 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3567 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3568 	  ibdiag  += 16; row += 4;
3569 	  break;
3570         case 5:
3571 	  v2    = a->a + ii[row+1];
3572 	  v3    = a->a + ii[row+2];
3573 	  v4    = a->a + ii[row+3];
3574 	  v5    = a->a + ii[row+4];
3575 	  sum1  = t[row];
3576 	  sum2  = t[row+1];
3577 	  sum3  = t[row+2];
3578 	  sum4  = t[row+3];
3579 	  sum5  = t[row+4];
3580 	  for(n = 0; n<sz-1; n+=2) {
3581 	    i1   = idx[0];
3582 	    i2   = idx[1];
3583 	    idx += 2;
3584 	    tmp0 = t[i1];
3585 	    tmp1 = t[i2];
3586 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3587 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3588 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3589 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3590 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3591 	  }
3592 
3593 	  if (n == sz-1){
3594 	    tmp0  = t[*idx];
3595 	    sum1 -= v1[0] * tmp0;
3596 	    sum2 -= v2[0] * tmp0;
3597 	    sum3 -= v3[0] * tmp0;
3598 	    sum4 -= v4[0] * tmp0;
3599 	    sum5 -= v5[0] * tmp0;
3600 	  }
3601 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3602 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3603 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3604 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3605 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3606 	  ibdiag  += 25; row += 5;
3607 	  break;
3608         default:
3609 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3610       }
3611       CHKMEMQ;
3612     }
3613     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3614     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3615     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3616   }
3617   PetscFunctionReturn(0);
3618 }
3619 
3620 #undef __FUNCT__
3621 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3622 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3623 {
3624   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3625   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3626   const MatScalar    *bdiag = a->inode.bdiag;
3627   const PetscScalar  *b;
3628   PetscErrorCode      ierr;
3629   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3630   const PetscInt      *sizes = a->inode.size;
3631 
3632   PetscFunctionBegin;
3633   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3634   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3635   cnt = 0;
3636   for (i=0, row=0; i<m; i++) {
3637     switch (sizes[i]){
3638       case 1:
3639 	x[row] = b[row]*bdiag[cnt++];row++;
3640 	break;
3641       case 2:
3642 	x1   = b[row]; x2 = b[row+1];
3643 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3644 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3645 	x[row++] = tmp1;
3646 	x[row++] = tmp2;
3647 	cnt += 4;
3648 	break;
3649       case 3:
3650 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3651 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3652 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3653 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3654 	x[row++] = tmp1;
3655 	x[row++] = tmp2;
3656 	x[row++] = tmp3;
3657 	cnt += 9;
3658 	break;
3659       case 4:
3660 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3661 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3662 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3663 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3664 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3665 	x[row++] = tmp1;
3666 	x[row++] = tmp2;
3667 	x[row++] = tmp3;
3668 	x[row++] = tmp4;
3669 	cnt += 16;
3670 	break;
3671       case 5:
3672 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3673 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3674 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3675 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3676 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3677 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3678 	x[row++] = tmp1;
3679 	x[row++] = tmp2;
3680 	x[row++] = tmp3;
3681 	x[row++] = tmp4;
3682 	x[row++] = tmp5;
3683 	cnt += 25;
3684 	break;
3685       default:
3686 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3687     }
3688   }
3689   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3690   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3691   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3692   PetscFunctionReturn(0);
3693 }
3694 
3695 /*
3696     samestructure indicates that the matrix has not changed its nonzero structure so we
3697     do not need to recompute the inodes
3698 */
3699 #undef __FUNCT__
3700 #define __FUNCT__ "Mat_CheckInode"
3701 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3702 {
3703   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3704   PetscErrorCode ierr;
3705   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3706   PetscTruth     flag;
3707 
3708   PetscFunctionBegin;
3709   if (!a->inode.use)                     PetscFunctionReturn(0);
3710   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3711 
3712 
3713   m = A->rmap->n;
3714   if (a->inode.size) {ns = a->inode.size;}
3715   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3716 
3717   i          = 0;
3718   node_count = 0;
3719   idx        = a->j;
3720   ii         = a->i;
3721   while (i < m){                /* For each row */
3722     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3723     /* Limits the number of elements in a node to 'a->inode.limit' */
3724     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3725       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3726       if (nzy != nzx) break;
3727       idy  += nzx;             /* Same nonzero pattern */
3728       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3729       if (!flag) break;
3730     }
3731     ns[node_count++] = blk_size;
3732     idx += blk_size*nzx;
3733     i    = j;
3734   }
3735   /* If not enough inodes found,, do not use inode version of the routines */
3736   if (!a->inode.size && m && node_count > .9*m) {
3737     ierr = PetscFree(ns);CHKERRQ(ierr);
3738     a->inode.node_count     = 0;
3739     a->inode.size           = PETSC_NULL;
3740     a->inode.use            = PETSC_FALSE;
3741     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3742   } else {
3743     if (!A->factortype) {
3744       A->ops->mult              = MatMult_SeqAIJ_Inode;
3745       A->ops->sor               = MatSOR_SeqAIJ_Inode;
3746       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3747       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3748       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3749       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3750       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3751       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3752       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3753     } else {
3754       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
3755     }
3756     a->inode.node_count       = node_count;
3757     a->inode.size             = ns;
3758     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3759   }
3760   PetscFunctionReturn(0);
3761 }
3762 
3763 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3764 PetscInt __k, *__vi; \
3765 __vi = aj + ai[row];				\
3766 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3767 __vi = aj + adiag[row];				\
3768 cols[nzl] = __vi[0];\
3769 __vi = aj + adiag[row+1]+1;\
3770 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3771 
3772 
3773 /*
3774    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3775    Modified from Mat_CheckInode().
3776 
3777    Input Parameters:
3778 +  Mat A - ILU or LU matrix factor
3779 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3780     do not need to recompute the inodes
3781 */
3782 #undef __FUNCT__
3783 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3784 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3785 {
3786   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3787   PetscErrorCode ierr;
3788   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3789   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3790   PetscTruth     flag;
3791 
3792   PetscFunctionBegin;
3793   if (!a->inode.use)                     PetscFunctionReturn(0);
3794   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3795 
3796   m = A->rmap->n;
3797   if (a->inode.size) {ns = a->inode.size;}
3798   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3799 
3800   i          = 0;
3801   node_count = 0;
3802   while (i < m){                /* For each row */
3803     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3804     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3805     nzx  = nzl1 + nzu1 + 1;
3806     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3807     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3808 
3809     /* Limits the number of elements in a node to 'a->inode.limit' */
3810     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3811       nzl2    = ai[j+1] - ai[j];
3812       nzu2    = adiag[j] - adiag[j+1] - 1;
3813       nzy     = nzl2 + nzu2 + 1;
3814       if( nzy != nzx) break;
3815       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3816       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3817       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3818       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3819       ierr = PetscFree(cols2);CHKERRQ(ierr);
3820     }
3821     ns[node_count++] = blk_size;
3822     ierr = PetscFree(cols1);CHKERRQ(ierr);
3823     i    = j;
3824   }
3825   /* If not enough inodes found,, do not use inode version of the routines */
3826   if (!a->inode.size && m && node_count > .9*m) {
3827     ierr = PetscFree(ns);CHKERRQ(ierr);
3828     a->inode.node_count     = 0;
3829     a->inode.size           = PETSC_NULL;
3830     a->inode.use            = PETSC_FALSE;
3831     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3832   } else {
3833     A->ops->mult              = 0;
3834     A->ops->sor               = 0;
3835     A->ops->multadd           = 0;
3836     A->ops->getrowij          = 0;
3837     A->ops->restorerowij      = 0;
3838     A->ops->getcolumnij       = 0;
3839     A->ops->restorecolumnij   = 0;
3840     A->ops->coloringpatch     = 0;
3841     A->ops->multdiagonalblock = 0;
3842     A->ops->solve             = MatSolve_SeqAIJ_Inode;
3843     a->inode.node_count       = node_count;
3844     a->inode.size             = ns;
3845     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3846   }
3847   PetscFunctionReturn(0);
3848 }
3849 
3850 /*
3851      This is really ugly. if inodes are used this replaces the
3852   permutations with ones that correspond to rows/cols of the matrix
3853   rather then inode blocks
3854 */
3855 #undef __FUNCT__
3856 #define __FUNCT__ "MatInodeAdjustForInodes"
3857 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3858 {
3859   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3860 
3861   PetscFunctionBegin;
3862   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3863   if (f) {
3864     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3865   }
3866   PetscFunctionReturn(0);
3867 }
3868 
3869 EXTERN_C_BEGIN
3870 #undef __FUNCT__
3871 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3872 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3873 {
3874   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3875   PetscErrorCode ierr;
3876   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3877   const PetscInt *ridx,*cidx;
3878   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3879   PetscInt       nslim_col,*ns_col;
3880   IS             ris = *rperm,cis = *cperm;
3881 
3882   PetscFunctionBegin;
3883   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3884   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3885 
3886   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3887   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3888   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3889 
3890   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3891   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3892 
3893   /* Form the inode structure for the rows of permuted matric using inv perm*/
3894   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3895 
3896   /* Construct the permutations for rows*/
3897   for (i=0,row = 0; i<nslim_row; ++i){
3898     indx      = ridx[i];
3899     start_val = tns[indx];
3900     end_val   = tns[indx + 1];
3901     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3902   }
3903 
3904   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3905   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3906 
3907  /* Construct permutations for columns */
3908   for (i=0,col=0; i<nslim_col; ++i){
3909     indx      = cidx[i];
3910     start_val = tns[indx];
3911     end_val   = tns[indx + 1];
3912     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3913   }
3914 
3915   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3916   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3917   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3918   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3919 
3920   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3921   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3922 
3923   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3924   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3925   ierr = ISDestroy(cis);CHKERRQ(ierr);
3926   ierr = ISDestroy(ris);CHKERRQ(ierr);
3927   ierr = PetscFree(tns);CHKERRQ(ierr);
3928   PetscFunctionReturn(0);
3929 }
3930 EXTERN_C_END
3931 
3932 #undef __FUNCT__
3933 #define __FUNCT__ "MatInodeGetInodeSizes"
3934 /*@C
3935    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3936 
3937    Collective on Mat
3938 
3939    Input Parameter:
3940 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3941 
3942    Output Parameter:
3943 +  node_count - no of inodes present in the matrix.
3944 .  sizes      - an array of size node_count,with sizes of each inode.
3945 -  limit      - the max size used to generate the inodes.
3946 
3947    Level: advanced
3948 
3949    Notes: This routine returns some internal storage information
3950    of the matrix, it is intended to be used by advanced users.
3951    It should be called after the matrix is assembled.
3952    The contents of the sizes[] array should not be changed.
3953    PETSC_NULL may be passed for information not requested.
3954 
3955 .keywords: matrix, seqaij, get, inode
3956 
3957 .seealso: MatGetInfo()
3958 @*/
3959 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3960 {
3961   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3962 
3963   PetscFunctionBegin;
3964   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3965   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3966   if (f) {
3967     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3968   }
3969   PetscFunctionReturn(0);
3970 }
3971 
3972 EXTERN_C_BEGIN
3973 #undef __FUNCT__
3974 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3975 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3976 {
3977   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3978 
3979   PetscFunctionBegin;
3980   if (node_count) *node_count = a->inode.node_count;
3981   if (sizes)      *sizes      = a->inode.size;
3982   if (limit)      *limit      = a->inode.limit;
3983   PetscFunctionReturn(0);
3984 }
3985 EXTERN_C_END
3986