xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 70663e4a70994b7bf6a2e51d4030e794ec74df4c)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1195   FactorShiftCtx   sctx;
1196   const PetscInt   *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,node_max,col;
1200   const PetscInt   *ns;
1201   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1202 
1203   PetscFunctionBegin;
1204   /* MatPivotSetUp(): initialize shift context sctx */
1205   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1206 
1207   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1208     ddiag          = a->diag;
1209     sctx.shift_top = info->zeropivot;
1210     for (i=0; i<n; i++) {
1211       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1212       d  = (aa)[ddiag[i]];
1213       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1214       v  = aa+ai[i];
1215       nz = ai[i+1] - ai[i];
1216       for (j=0; j<nz; j++)
1217 	rs += PetscAbsScalar(v[j]);
1218       if (rs>sctx.shift_top) sctx.shift_top = rs;
1219     }
1220     sctx.shift_top   *= 1.1;
1221     sctx.nshift_max   = 5;
1222     sctx.shift_lo     = 0.;
1223     sctx.shift_hi     = 1.;
1224   }
1225 
1226   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1227   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1228 
1229   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1230   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1231   rtmp2 = rtmp1 + n;
1232   rtmp3 = rtmp2 + n;
1233   rtmp4 = rtmp3 + n;
1234   ics   = ic;
1235 
1236   node_max = a->inode.node_count;
1237   ns       = a->inode.size;
1238   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1239 
1240   /* If max inode size > 4, split it into two inodes.*/
1241   /* also map the inode sizes according to the ordering */
1242   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1243   for (i=0,j=0; i<node_max; ++i,++j){
1244     if (ns[i] > 4) {
1245       tmp_vec1[j] = 4;
1246       ++j;
1247       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1248     } else {
1249       tmp_vec1[j] = ns[i];
1250     }
1251   }
1252   /* Use the correct node_max */
1253   node_max = j;
1254 
1255   /* Now reorder the inode info based on mat re-ordering info */
1256   /* First create a row -> inode_size_array_index map */
1257   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1258   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1259   for (i=0,row=0; i<node_max; i++) {
1260     nodesz = tmp_vec1[i];
1261     for (j=0; j<nodesz; j++,row++) {
1262       nsmap[row] = i;
1263     }
1264   }
1265   /* Using nsmap, create a reordered ns structure */
1266   for (i=0,j=0; i< node_max; i++) {
1267     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1268     tmp_vec2[i]  = nodesz;
1269     j           += nodesz;
1270   }
1271   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1272   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1273 
1274   /* Now use the correct ns */
1275   ns = tmp_vec2;
1276 
1277   do {
1278     sctx.newshift = PETSC_FALSE;
1279     /* Now loop over each block-row, and do the factorization */
1280     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1281       nodesz = ns[inod];
1282 
1283       switch (nodesz){
1284       case 1:
1285       /*----------*/
1286         /* zero rtmp1 */
1287         /* L part */
1288         nz    = bi[i+1] - bi[i];
1289         bjtmp = bj + bi[i];
1290         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1291 
1292         /* U part */
1293         nz = bdiag[i]-bdiag[i+1];
1294         bjtmp = bj + bdiag[i+1]+1;
1295         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1296 
1297         /* load in initial (unfactored row) */
1298         nz    = ai[r[i]+1] - ai[r[i]];
1299         ajtmp = aj + ai[r[i]];
1300         v     = aa + ai[r[i]];
1301         for (j=0; j<nz; j++) {
1302           rtmp1[ics[ajtmp[j]]] = v[j];
1303         }
1304         /* ZeropivotApply() */
1305         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1306 
1307         /* elimination */
1308         bjtmp = bj + bi[i];
1309         row   = *bjtmp++;
1310         nzL   = bi[i+1] - bi[i];
1311         for(k=0; k < nzL;k++) {
1312           pc = rtmp1 + row;
1313           if (*pc != 0.0) {
1314             pv   = b->a + bdiag[row];
1315             mul1 = *pc * (*pv);
1316             *pc  = mul1;
1317             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1318             pv = b->a + bdiag[row+1]+1;
1319             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1320             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1321             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1322           }
1323           row = *bjtmp++;
1324         }
1325 
1326         /* finished row so stick it into b->a */
1327         rs = 0.0;
1328         /* L part */
1329         pv = b->a + bi[i] ;
1330         pj = b->j + bi[i] ;
1331         nz = bi[i+1] - bi[i];
1332         for (j=0; j<nz; j++) {
1333           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1334         }
1335 
1336         /* U part */
1337         pv = b->a + bdiag[i+1]+1;
1338         pj = b->j + bdiag[i+1]+1;
1339         nz = bdiag[i] - bdiag[i+1]-1;
1340         for (j=0; j<nz; j++) {
1341           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* Check zero pivot */
1345         sctx.rs = rs;
1346         sctx.pv = rtmp1[i];
1347         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1348 	if(sctx.newshift) break;
1349 
1350         /* Mark diagonal and invert diagonal for simplier triangular solves */
1351         pv  = b->a + bdiag[i];
1352         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1353         break;
1354 
1355       case 2:
1356       /*----------*/
1357         /* zero rtmp1 and rtmp2 */
1358         /* L part */
1359         nz    = bi[i+1] - bi[i];
1360         bjtmp = bj + bi[i];
1361         for  (j=0; j<nz; j++) {
1362           col = bjtmp[j];
1363           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1364         }
1365 
1366         /* U part */
1367         nz = bdiag[i]-bdiag[i+1];
1368         bjtmp = bj + bdiag[i+1]+1;
1369         for  (j=0; j<nz; j++) {
1370           col = bjtmp[j];
1371           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1372         }
1373 
1374         /* load in initial (unfactored row) */
1375         nz    = ai[r[i]+1] - ai[r[i]];
1376         ajtmp = aj + ai[r[i]];
1377         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1378         for (j=0; j<nz; j++) {
1379           col = ics[ajtmp[j]];
1380           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1381         }
1382         /* ZeropivotApply(): shift the diagonal of the matrix  */
1383         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1384 
1385         /* elimination */
1386         bjtmp = bj + bi[i];
1387         row   = *bjtmp++; /* pivot row */
1388         nzL   = bi[i+1] - bi[i];
1389         for(k=0; k < nzL;k++) {
1390           pc1 = rtmp1 + row;
1391           pc2 = rtmp2 + row;
1392           if (*pc1 != 0.0 || *pc2 != 0.0) {
1393             pv   = b->a + bdiag[row];
1394             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1395             *pc1 = mul1;       *pc2 = mul2;
1396 
1397             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1398             pv = b->a + bdiag[row+1]+1;
1399             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1400             for (j=0; j<nz; j++){
1401               col = pj[j];
1402               rtmp1[col] -= mul1 * pv[j];
1403               rtmp2[col] -= mul2 * pv[j];
1404             }
1405             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1406           }
1407           row = *bjtmp++;
1408         }
1409 
1410         /* finished row i; check zero pivot, then stick row i into b->a */
1411         rs  = 0.0;
1412         /* L part */
1413         pc1 = b->a + bi[i];
1414         pj  = b->j + bi[i] ;
1415         nz  = bi[i+1] - bi[i];
1416         for (j=0; j<nz; j++) {
1417           col = pj[j];
1418           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1419         }
1420         /* U part */
1421         pc1 = b->a + bdiag[i+1]+1;
1422         pj  = b->j + bdiag[i+1]+1;
1423         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1424         for (j=0; j<nz; j++) {
1425           col = pj[j];
1426           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1427         }
1428 
1429         sctx.rs  = rs;
1430         sctx.pv  = rtmp1[i];
1431         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1432 	if(sctx.newshift) break;
1433         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1434         *pc1 = 1.0/sctx.pv;
1435 
1436         /* Now take care of diagonal 2x2 block. */
1437         pc2 = rtmp2 + i;
1438         if (*pc2 != 0.0){
1439           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1440           *pc2 = mul1;          /* insert L entry */
1441           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1442           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1443           for (j=0; j<nz; j++) {
1444             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1445           }
1446           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1447         }
1448 
1449         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1450         rs = 0.0;
1451         /* L part */
1452         pc2 = b->a + bi[i+1];
1453         pj  = b->j + bi[i+1] ;
1454         nz  = bi[i+2] - bi[i+1];
1455         for (j=0; j<nz; j++) {
1456           col = pj[j];
1457           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1458         }
1459         /* U part */
1460         pc2 = b->a + bdiag[i+2]+1;
1461         pj  = b->j + bdiag[i+2]+1;
1462         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1463         for (j=0; j<nz; j++) {
1464           col = pj[j];
1465           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1466         }
1467 
1468         sctx.rs  = rs;
1469         sctx.pv  = rtmp2[i+1];
1470         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1471 	if(sctx.newshift) break;
1472         pc2  = b->a + bdiag[i+1];
1473         *pc2 = 1.0/sctx.pv;
1474         break;
1475 
1476       case 3:
1477       /*----------*/
1478         /* zero rtmp */
1479         /* L part */
1480         nz    = bi[i+1] - bi[i];
1481         bjtmp = bj + bi[i];
1482         for  (j=0; j<nz; j++) {
1483           col = bjtmp[j];
1484           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1485         }
1486 
1487         /* U part */
1488         nz = bdiag[i]-bdiag[i+1];
1489         bjtmp = bj + bdiag[i+1]+1;
1490         for  (j=0; j<nz; j++) {
1491           col = bjtmp[j];
1492           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1493         }
1494 
1495         /* load in initial (unfactored row) */
1496         nz    = ai[r[i]+1] - ai[r[i]];
1497         ajtmp = aj + ai[r[i]];
1498         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1499         for (j=0; j<nz; j++) {
1500           col = ics[ajtmp[j]];
1501           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1502         }
1503         /* ZeropivotApply(): shift the diagonal of the matrix  */
1504         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1505 
1506         /* elimination */
1507         bjtmp = bj + bi[i];
1508         row   = *bjtmp++; /* pivot row */
1509         nzL   = bi[i+1] - bi[i];
1510         for(k=0; k < nzL;k++) {
1511           pc1 = rtmp1 + row;
1512           pc2 = rtmp2 + row;
1513           pc3 = rtmp3 + row;
1514           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1515             pv  = b->a + bdiag[row];
1516             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1517             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1518 
1519             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1520             pv = b->a + bdiag[row+1]+1;
1521             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1522             for (j=0; j<nz; j++){
1523               col = pj[j];
1524               rtmp1[col] -= mul1 * pv[j];
1525               rtmp2[col] -= mul2 * pv[j];
1526               rtmp3[col] -= mul3 * pv[j];
1527             }
1528             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1529           }
1530           row = *bjtmp++;
1531         }
1532 
1533         /* finished row i; check zero pivot, then stick row i into b->a */
1534         rs  = 0.0;
1535         /* L part */
1536         pc1 = b->a + bi[i];
1537         pj  = b->j + bi[i] ;
1538         nz  = bi[i+1] - bi[i];
1539         for (j=0; j<nz; j++) {
1540           col = pj[j];
1541           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1542         }
1543         /* U part */
1544         pc1 = b->a + bdiag[i+1]+1;
1545         pj  = b->j + bdiag[i+1]+1;
1546         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1547         for (j=0; j<nz; j++) {
1548           col = pj[j];
1549           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1550         }
1551 
1552         sctx.rs  = rs;
1553         sctx.pv  = rtmp1[i];
1554         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1555 	if(sctx.newshift) break;
1556         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1557         *pc1 = 1.0/sctx.pv;
1558 
1559         /* Now take care of 1st column of diagonal 3x3 block. */
1560         pc2 = rtmp2 + i;
1561         pc3 = rtmp3 + i;
1562         if (*pc2 != 0.0 || *pc3 != 0.0){
1563           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1564           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1565           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1566           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1567           for (j=0; j<nz; j++) {
1568             col = pj[j];
1569             rtmp2[col] -= mul2 * rtmp1[col];
1570             rtmp3[col] -= mul3 * rtmp1[col];
1571           }
1572           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1573         }
1574 
1575         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1576         rs = 0.0;
1577         /* L part */
1578         pc2 = b->a + bi[i+1];
1579         pj  = b->j + bi[i+1] ;
1580         nz  = bi[i+2] - bi[i+1];
1581         for (j=0; j<nz; j++) {
1582           col = pj[j];
1583           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1584         }
1585         /* U part */
1586         pc2 = b->a + bdiag[i+2]+1;
1587         pj  = b->j + bdiag[i+2]+1;
1588         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1589         for (j=0; j<nz; j++) {
1590           col = pj[j];
1591           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1592         }
1593 
1594         sctx.rs  = rs;
1595         sctx.pv  = rtmp2[i+1];
1596         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1597 	if(sctx.newshift) break;
1598         pc2  = b->a + bdiag[i+1];
1599         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1600 
1601         /* Now take care of 2nd column of diagonal 3x3 block. */
1602         pc3 = rtmp3 + i+1;
1603         if (*pc3 != 0.0){
1604           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1605           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1606           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1607           for (j=0; j<nz; j++) {
1608             col = pj[j];
1609             rtmp3[col] -= mul3 * rtmp2[col];
1610           }
1611           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1612         }
1613 
1614         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1615         rs = 0.0;
1616         /* L part */
1617         pc3 = b->a + bi[i+2];
1618         pj  = b->j + bi[i+2] ;
1619         nz  = bi[i+3] - bi[i+2];
1620         for (j=0; j<nz; j++) {
1621           col = pj[j];
1622           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1623         }
1624         /* U part */
1625         pc3 = b->a + bdiag[i+3]+1;
1626         pj  = b->j + bdiag[i+3]+1;
1627         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1628         for (j=0; j<nz; j++) {
1629           col = pj[j];
1630           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1631         }
1632 
1633         sctx.rs  = rs;
1634         sctx.pv  = rtmp3[i+2];
1635         ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr);
1636 	if(sctx.newshift) break;
1637         pc3  = b->a + bdiag[i+2];
1638         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1639         break;
1640       case 4:
1641       /*----------*/
1642         /* zero rtmp */
1643         /* L part */
1644         nz    = bi[i+1] - bi[i];
1645         bjtmp = bj + bi[i];
1646         for  (j=0; j<nz; j++) {
1647           col = bjtmp[j];
1648           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1649         }
1650 
1651         /* U part */
1652         nz = bdiag[i]-bdiag[i+1];
1653         bjtmp = bj + bdiag[i+1]+1;
1654         for  (j=0; j<nz; j++) {
1655           col = bjtmp[j];
1656           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1657         }
1658 
1659         /* load in initial (unfactored row) */
1660         nz    = ai[r[i]+1] - ai[r[i]];
1661         ajtmp = aj + ai[r[i]];
1662         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1663         for (j=0; j<nz; j++) {
1664           col = ics[ajtmp[j]];
1665           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1666         }
1667         /* ZeropivotApply(): shift the diagonal of the matrix  */
1668         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1669 
1670         /* elimination */
1671         bjtmp = bj + bi[i];
1672         row   = *bjtmp++; /* pivot row */
1673         nzL   = bi[i+1] - bi[i];
1674         for(k=0; k < nzL;k++) {
1675           pc1 = rtmp1 + row;
1676           pc2 = rtmp2 + row;
1677           pc3 = rtmp3 + row;
1678 	  pc4 = rtmp4 + row;
1679           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1680             pv  = b->a + bdiag[row];
1681             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1682             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1683 
1684             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1685             pv = b->a + bdiag[row+1]+1;
1686             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1687             for (j=0; j<nz; j++){
1688               col = pj[j];
1689               rtmp1[col] -= mul1 * pv[j];
1690               rtmp2[col] -= mul2 * pv[j];
1691               rtmp3[col] -= mul3 * pv[j];
1692 	      rtmp4[col] -= mul4 * pv[j];
1693             }
1694             ierr = PetscLogFlops(8*nz);CHKERRQ(ierr);
1695           }
1696           row = *bjtmp++;
1697         }
1698 
1699         /* finished row i; check zero pivot, then stick row i into b->a */
1700         rs  = 0.0;
1701         /* L part */
1702         pc1 = b->a + bi[i];
1703         pj  = b->j + bi[i] ;
1704         nz  = bi[i+1] - bi[i];
1705         for (j=0; j<nz; j++) {
1706           col = pj[j];
1707           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1708         }
1709         /* U part */
1710         pc1 = b->a + bdiag[i+1]+1;
1711         pj  = b->j + bdiag[i+1]+1;
1712         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1713         for (j=0; j<nz; j++) {
1714           col = pj[j];
1715           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1716         }
1717 
1718         sctx.rs  = rs;
1719         sctx.pv  = rtmp1[i];
1720         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1721 	if(sctx.newshift) break;
1722         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1723         *pc1 = 1.0/sctx.pv;
1724 
1725         /* Now take care of 1st column of diagonal 4x4 block. */
1726         pc2 = rtmp2 + i;
1727         pc3 = rtmp3 + i;
1728 	pc4 = rtmp4 + i;
1729         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1730           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1731           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1732 	  mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1733           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1734           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1735           for (j=0; j<nz; j++) {
1736             col = pj[j];
1737             rtmp2[col] -= mul2 * rtmp1[col];
1738             rtmp3[col] -= mul3 * rtmp1[col];
1739 	    rtmp4[col] -= mul4 * rtmp1[col];
1740           }
1741           ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1742         }
1743 
1744         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1745         rs = 0.0;
1746         /* L part */
1747         pc2 = b->a + bi[i+1];
1748         pj  = b->j + bi[i+1] ;
1749         nz  = bi[i+2] - bi[i+1];
1750         for (j=0; j<nz; j++) {
1751           col = pj[j];
1752           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1753         }
1754         /* U part */
1755         pc2 = b->a + bdiag[i+2]+1;
1756         pj  = b->j + bdiag[i+2]+1;
1757         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1758         for (j=0; j<nz; j++) {
1759           col = pj[j];
1760           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1761         }
1762 
1763         sctx.rs  = rs;
1764         sctx.pv  = rtmp2[i+1];
1765         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1766 	if(sctx.newshift) break;
1767         pc2  = b->a + bdiag[i+1];
1768         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1769 
1770         /* Now take care of 2nd column of diagonal 4x4 block. */
1771         pc3 = rtmp3 + i+1;
1772 	pc4 = rtmp4 + i+1;
1773         if (*pc3 != 0.0 || *pc4 != 0.0){
1774           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1775 	  mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1776           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1777           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1778           for (j=0; j<nz; j++) {
1779             col = pj[j];
1780             rtmp3[col] -= mul3 * rtmp2[col];
1781 	    rtmp4[col] -= mul4 * rtmp2[col];
1782           }
1783           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1784         }
1785 
1786         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1787         rs = 0.0;
1788         /* L part */
1789         pc3 = b->a + bi[i+2];
1790         pj  = b->j + bi[i+2] ;
1791         nz  = bi[i+3] - bi[i+2];
1792         for (j=0; j<nz; j++) {
1793           col = pj[j];
1794           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1795         }
1796         /* U part */
1797         pc3 = b->a + bdiag[i+3]+1;
1798         pj  = b->j + bdiag[i+3]+1;
1799         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1800         for (j=0; j<nz; j++) {
1801           col = pj[j];
1802           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1803         }
1804 
1805         sctx.rs  = rs;
1806         sctx.pv  = rtmp3[i+2];
1807         ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr);
1808 	if(sctx.newshift) break;
1809         pc3  = b->a + bdiag[i+2];
1810         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1811 
1812 	/* Now take care of 3rd column of diagonal 4x4 block. */
1813 	pc4 = rtmp4 + i+2;
1814         if (*pc4 != 0.0){
1815 	  mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1816           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1817           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1818           for (j=0; j<nz; j++) {
1819             col = pj[j];
1820 	    rtmp4[col] -= mul4 * rtmp3[col];
1821           }
1822           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1823         }
1824 
1825         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1826         rs = 0.0;
1827         /* L part */
1828         pc4 = b->a + bi[i+3];
1829         pj  = b->j + bi[i+3] ;
1830         nz  = bi[i+4] - bi[i+3];
1831         for (j=0; j<nz; j++) {
1832           col = pj[j];
1833           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1834         }
1835         /* U part */
1836         pc4 = b->a + bdiag[i+4]+1;
1837         pj  = b->j + bdiag[i+4]+1;
1838         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1839         for (j=0; j<nz; j++) {
1840           col = pj[j];
1841           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1842         }
1843 
1844         sctx.rs  = rs;
1845         sctx.pv  = rtmp4[i+3];
1846         ierr = MatPivotCheck(info,&sctx,i+3);CHKERRQ(ierr);
1847 	if(sctx.newshift) break;
1848         pc4  = b->a + bdiag[i+3];
1849         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1850          break;
1851 
1852       default:
1853         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1854       }
1855       i += nodesz;                 /* Update the row */
1856     }
1857 
1858     /* MatPivotRefine() */
1859     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1860       /*
1861        * if no shift in this attempt & shifting & started shifting & can refine,
1862        * then try lower shift
1863        */
1864       sctx.shift_hi       = sctx.shift_fraction;
1865       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1866       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1867       sctx.newshift       = PETSC_TRUE;
1868       sctx.nshift++;
1869     }
1870   } while (sctx.newshift);
1871 
1872   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1873   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1874   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1875   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1876 
1877   C->ops->solve              = MatSolve_SeqAIJ;
1878   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1879   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1880   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1881   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1882   C->assembled    = PETSC_TRUE;
1883   C->preallocated = PETSC_TRUE;
1884   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1885 
1886   /* MatShiftView(A,info,&sctx) */
1887   if (sctx.nshift){
1888     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1889       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1890     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1891       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1892     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1893       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1894     }
1895   }
1896   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1897   PetscFunctionReturn(0);
1898 }
1899 
1900 #undef __FUNCT__
1901 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1902 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1903 {
1904   Mat               C = B;
1905   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1906   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1907   PetscErrorCode    ierr;
1908   const PetscInt    *r,*ic,*c,*ics;
1909   PetscInt          n = A->rmap->n,*bi = b->i;
1910   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1911   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1912   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1913   PetscScalar       mul1,mul2,mul3,tmp;
1914   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1915   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1916   PetscReal         rs=0.0;
1917   FactorShiftCtx    sctx;
1918 
1919   PetscFunctionBegin;
1920   sctx.shift_top      = 0;
1921   sctx.nshift_max     = 0;
1922   sctx.shift_lo       = 0;
1923   sctx.shift_hi       = 0;
1924   sctx.shift_fraction = 0;
1925 
1926   /* if both shift schemes are chosen by user, only use info->shiftpd */
1927   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1928     sctx.shift_top = 0;
1929     for (i=0; i<n; i++) {
1930       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1931       rs    = 0.0;
1932       ajtmp = aj + ai[i];
1933       rtmp1 = aa + ai[i];
1934       nz = ai[i+1] - ai[i];
1935       for (j=0; j<nz; j++){
1936         if (*ajtmp != i){
1937           rs += PetscAbsScalar(*rtmp1++);
1938         } else {
1939           rs -= PetscRealPart(*rtmp1++);
1940         }
1941         ajtmp++;
1942       }
1943       if (rs>sctx.shift_top) sctx.shift_top = rs;
1944     }
1945     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1946     sctx.shift_top *= 1.1;
1947     sctx.nshift_max = 5;
1948     sctx.shift_lo   = 0.;
1949     sctx.shift_hi   = 1.;
1950   }
1951   sctx.shift_amount = 0;
1952   sctx.nshift       = 0;
1953 
1954   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1955   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1956   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1957   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1958   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1959   ics   = ic ;
1960   rtmp22 = rtmp11 + n;
1961   rtmp33 = rtmp22 + n;
1962 
1963   node_max = a->inode.node_count;
1964   ns       = a->inode.size;
1965   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1966 
1967   /* If max inode size > 3, split it into two inodes.*/
1968   /* also map the inode sizes according to the ordering */
1969   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1970   for (i=0,j=0; i<node_max; ++i,++j){
1971     if (ns[i]>3) {
1972       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1973       ++j;
1974       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1975     } else {
1976       tmp_vec1[j] = ns[i];
1977     }
1978   }
1979   /* Use the correct node_max */
1980   node_max = j;
1981 
1982   /* Now reorder the inode info based on mat re-ordering info */
1983   /* First create a row -> inode_size_array_index map */
1984   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1985   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1986   for (i=0,row=0; i<node_max; i++) {
1987     nodesz = tmp_vec1[i];
1988     for (j=0; j<nodesz; j++,row++) {
1989       nsmap[row] = i;
1990     }
1991   }
1992   /* Using nsmap, create a reordered ns structure */
1993   for (i=0,j=0; i< node_max; i++) {
1994     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1995     tmp_vec2[i]  = nodesz;
1996     j           += nodesz;
1997   }
1998   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1999   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2000   /* Now use the correct ns */
2001   ns = tmp_vec2;
2002 
2003   do {
2004     sctx.newshift = PETSC_FALSE;
2005     /* Now loop over each block-row, and do the factorization */
2006     for (i=0,row=0; i<node_max; i++) {
2007       nodesz = ns[i];
2008       nz     = bi[row+1] - bi[row];
2009       bjtmp  = bj + bi[row];
2010 
2011       switch (nodesz){
2012       case 1:
2013         for  (j=0; j<nz; j++){
2014           idx        = bjtmp[j];
2015           rtmp11[idx] = 0.0;
2016         }
2017 
2018         /* load in initial (unfactored row) */
2019         idx    = r[row];
2020         nz_tmp = ai[idx+1] - ai[idx];
2021         ajtmp  = aj + ai[idx];
2022         v1     = aa + ai[idx];
2023 
2024         for (j=0; j<nz_tmp; j++) {
2025           idx        = ics[ajtmp[j]];
2026           rtmp11[idx] = v1[j];
2027         }
2028         rtmp11[ics[r[row]]] += sctx.shift_amount;
2029 
2030         prow = *bjtmp++ ;
2031         while (prow < row) {
2032           pc1 = rtmp11 + prow;
2033           if (*pc1 != 0.0){
2034             pv   = ba + bd[prow];
2035             pj   = nbj + bd[prow];
2036             mul1 = *pc1 * *pv++;
2037             *pc1 = mul1;
2038             nz_tmp = bi[prow+1] - bd[prow] - 1;
2039             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2040             for (j=0; j<nz_tmp; j++) {
2041               tmp = pv[j];
2042               idx = pj[j];
2043               rtmp11[idx] -= mul1 * tmp;
2044             }
2045           }
2046           prow = *bjtmp++ ;
2047         }
2048         pj  = bj + bi[row];
2049         pc1 = ba + bi[row];
2050 
2051         sctx.pv    = rtmp11[row];
2052         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2053         rs         = 0.0;
2054         for (j=0; j<nz; j++) {
2055           idx    = pj[j];
2056           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2057           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2058         }
2059         sctx.rs  = rs;
2060         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2061         if (sctx.newshift) goto endofwhile;
2062         break;
2063 
2064       case 2:
2065         for (j=0; j<nz; j++) {
2066           idx        = bjtmp[j];
2067           rtmp11[idx] = 0.0;
2068           rtmp22[idx] = 0.0;
2069         }
2070 
2071         /* load in initial (unfactored row) */
2072         idx    = r[row];
2073         nz_tmp = ai[idx+1] - ai[idx];
2074         ajtmp  = aj + ai[idx];
2075         v1     = aa + ai[idx];
2076         v2     = aa + ai[idx+1];
2077         for (j=0; j<nz_tmp; j++) {
2078           idx        = ics[ajtmp[j]];
2079           rtmp11[idx] = v1[j];
2080           rtmp22[idx] = v2[j];
2081         }
2082         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2083         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2084 
2085         prow = *bjtmp++ ;
2086         while (prow < row) {
2087           pc1 = rtmp11 + prow;
2088           pc2 = rtmp22 + prow;
2089           if (*pc1 != 0.0 || *pc2 != 0.0){
2090             pv   = ba + bd[prow];
2091             pj   = nbj + bd[prow];
2092             mul1 = *pc1 * *pv;
2093             mul2 = *pc2 * *pv;
2094             ++pv;
2095             *pc1 = mul1;
2096             *pc2 = mul2;
2097 
2098             nz_tmp = bi[prow+1] - bd[prow] - 1;
2099             for (j=0; j<nz_tmp; j++) {
2100               tmp = pv[j];
2101               idx = pj[j];
2102               rtmp11[idx] -= mul1 * tmp;
2103               rtmp22[idx] -= mul2 * tmp;
2104             }
2105             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2106           }
2107           prow = *bjtmp++ ;
2108         }
2109 
2110         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2111         pc1 = rtmp11 + prow;
2112         pc2 = rtmp22 + prow;
2113 
2114         sctx.pv = *pc1;
2115         pj      = bj + bi[prow];
2116         rs      = 0.0;
2117         for (j=0; j<nz; j++){
2118           idx = pj[j];
2119           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2120         }
2121         sctx.rs = rs;
2122         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2123         if (sctx.newshift) goto endofwhile;
2124 
2125         if (*pc2 != 0.0){
2126           pj     = nbj + bd[prow];
2127           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2128           *pc2   = mul2;
2129           nz_tmp = bi[prow+1] - bd[prow] - 1;
2130           for (j=0; j<nz_tmp; j++) {
2131             idx = pj[j] ;
2132             tmp = rtmp11[idx];
2133             rtmp22[idx] -= mul2 * tmp;
2134           }
2135           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2136         }
2137 
2138         pj  = bj + bi[row];
2139         pc1 = ba + bi[row];
2140         pc2 = ba + bi[row+1];
2141 
2142         sctx.pv = rtmp22[row+1];
2143         rs = 0.0;
2144         rtmp11[row]   = 1.0/rtmp11[row];
2145         rtmp22[row+1] = 1.0/rtmp22[row+1];
2146         /* copy row entries from dense representation to sparse */
2147         for (j=0; j<nz; j++) {
2148           idx    = pj[j];
2149           pc1[j] = rtmp11[idx];
2150           pc2[j] = rtmp22[idx];
2151           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2152         }
2153         sctx.rs = rs;
2154         ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr);
2155         if (sctx.newshift) goto endofwhile;
2156         break;
2157 
2158       case 3:
2159         for  (j=0; j<nz; j++) {
2160           idx        = bjtmp[j];
2161           rtmp11[idx] = 0.0;
2162           rtmp22[idx] = 0.0;
2163           rtmp33[idx] = 0.0;
2164         }
2165         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2166         idx    = r[row];
2167         nz_tmp = ai[idx+1] - ai[idx];
2168         ajtmp = aj + ai[idx];
2169         v1    = aa + ai[idx];
2170         v2    = aa + ai[idx+1];
2171         v3    = aa + ai[idx+2];
2172         for (j=0; j<nz_tmp; j++) {
2173           idx        = ics[ajtmp[j]];
2174           rtmp11[idx] = v1[j];
2175           rtmp22[idx] = v2[j];
2176           rtmp33[idx] = v3[j];
2177         }
2178         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2179         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2180         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2181 
2182         /* loop over all pivot row blocks above this row block */
2183         prow = *bjtmp++ ;
2184         while (prow < row) {
2185           pc1 = rtmp11 + prow;
2186           pc2 = rtmp22 + prow;
2187           pc3 = rtmp33 + prow;
2188           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2189             pv   = ba  + bd[prow];
2190             pj   = nbj + bd[prow];
2191             mul1 = *pc1 * *pv;
2192             mul2 = *pc2 * *pv;
2193             mul3 = *pc3 * *pv;
2194             ++pv;
2195             *pc1 = mul1;
2196             *pc2 = mul2;
2197             *pc3 = mul3;
2198 
2199             nz_tmp = bi[prow+1] - bd[prow] - 1;
2200             /* update this row based on pivot row */
2201             for (j=0; j<nz_tmp; j++) {
2202               tmp = pv[j];
2203               idx = pj[j];
2204               rtmp11[idx] -= mul1 * tmp;
2205               rtmp22[idx] -= mul2 * tmp;
2206               rtmp33[idx] -= mul3 * tmp;
2207             }
2208             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
2209           }
2210           prow = *bjtmp++ ;
2211         }
2212 
2213         /* Now take care of diagonal 3x3 block in this set of rows */
2214         /* note: prow = row here */
2215         pc1 = rtmp11 + prow;
2216         pc2 = rtmp22 + prow;
2217         pc3 = rtmp33 + prow;
2218 
2219         sctx.pv = *pc1;
2220         pj      = bj + bi[prow];
2221         rs      = 0.0;
2222         for (j=0; j<nz; j++){
2223           idx = pj[j];
2224           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2225         }
2226         sctx.rs = rs;
2227         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2228         if (sctx.newshift) goto endofwhile;
2229 
2230         if (*pc2 != 0.0 || *pc3 != 0.0){
2231           mul2 = (*pc2)/(*pc1);
2232           mul3 = (*pc3)/(*pc1);
2233           *pc2 = mul2;
2234           *pc3 = mul3;
2235           nz_tmp = bi[prow+1] - bd[prow] - 1;
2236           pj     = nbj + bd[prow];
2237           for (j=0; j<nz_tmp; j++) {
2238             idx = pj[j] ;
2239             tmp = rtmp11[idx];
2240             rtmp22[idx] -= mul2 * tmp;
2241             rtmp33[idx] -= mul3 * tmp;
2242           }
2243           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2244         }
2245         ++prow;
2246 
2247         pc2 = rtmp22 + prow;
2248         pc3 = rtmp33 + prow;
2249         sctx.pv = *pc2;
2250         pj      = bj + bi[prow];
2251         rs      = 0.0;
2252         for (j=0; j<nz; j++){
2253           idx = pj[j];
2254           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2255         }
2256         sctx.rs = rs;
2257         ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr);
2258         if (sctx.newshift) goto endofwhile;
2259 
2260         if (*pc3 != 0.0){
2261           mul3   = (*pc3)/(*pc2);
2262           *pc3   = mul3;
2263           pj     = nbj + bd[prow];
2264           nz_tmp = bi[prow+1] - bd[prow] - 1;
2265           for (j=0; j<nz_tmp; j++) {
2266             idx = pj[j] ;
2267             tmp = rtmp22[idx];
2268             rtmp33[idx] -= mul3 * tmp;
2269           }
2270           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2271         }
2272 
2273         pj  = bj + bi[row];
2274         pc1 = ba + bi[row];
2275         pc2 = ba + bi[row+1];
2276         pc3 = ba + bi[row+2];
2277 
2278         sctx.pv = rtmp33[row+2];
2279         rs = 0.0;
2280         rtmp11[row]   = 1.0/rtmp11[row];
2281         rtmp22[row+1] = 1.0/rtmp22[row+1];
2282         rtmp33[row+2] = 1.0/rtmp33[row+2];
2283         /* copy row entries from dense representation to sparse */
2284         for (j=0; j<nz; j++) {
2285           idx    = pj[j];
2286           pc1[j] = rtmp11[idx];
2287           pc2[j] = rtmp22[idx];
2288           pc3[j] = rtmp33[idx];
2289           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2290         }
2291 
2292         sctx.rs = rs;
2293         ierr = MatPivotCheck(info,&sctx,row+2);CHKERRQ(ierr);
2294         if (sctx.newshift) goto endofwhile;
2295         break;
2296 
2297       default:
2298         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2299       }
2300       row += nodesz;                 /* Update the row */
2301     }
2302     endofwhile:;
2303   } while (sctx.newshift);
2304   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2305   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2306   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2307   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2308   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2309   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2310   /* do not set solve add, since MatSolve_Inode + Add is faster */
2311   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2312   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2313   C->assembled   = PETSC_TRUE;
2314   C->preallocated = PETSC_TRUE;
2315   if (sctx.nshift) {
2316     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2317       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2318     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2319       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2320     }
2321   }
2322   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2323   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2324   PetscFunctionReturn(0);
2325 }
2326 
2327 
2328 /* ----------------------------------------------------------- */
2329 #undef __FUNCT__
2330 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2331 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2332 {
2333   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2334   IS                iscol = a->col,isrow = a->row;
2335   PetscErrorCode    ierr;
2336   const PetscInt    *r,*c,*rout,*cout;
2337   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2338   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2339   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2340   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2341   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2342   const PetscScalar *b;
2343 
2344   PetscFunctionBegin;
2345   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2346   node_max = a->inode.node_count;
2347   ns       = a->inode.size;     /* Node Size array */
2348 
2349   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2350   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2351   tmp  = a->solve_work;
2352 
2353   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2354   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2355 
2356   /* forward solve the lower triangular */
2357   tmps = tmp ;
2358   aa   = a_a ;
2359   aj   = a_j ;
2360   ad   = a->diag;
2361 
2362   for (i = 0,row = 0; i< node_max; ++i){
2363     nsz = ns[i];
2364     aii = ai[row];
2365     v1  = aa + aii;
2366     vi  = aj + aii;
2367     nz  = ai[row+1]- ai[row];
2368 
2369     if (i < node_max-1) {
2370       /* Prefetch the indices for the next block */
2371       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2372       /* Prefetch the data for the next block */
2373       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2374     }
2375 
2376     switch (nsz){               /* Each loop in 'case' is unrolled */
2377     case 1 :
2378       sum1 = b[r[row]];
2379       for(j=0; j<nz-1; j+=2){
2380         i0   = vi[j];
2381         i1   = vi[j+1];
2382         tmp0 = tmps[i0];
2383         tmp1 = tmps[i1];
2384         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2385       }
2386       if(j == nz-1){
2387         tmp0 = tmps[vi[j]];
2388         sum1 -= v1[j]*tmp0;
2389       }
2390       tmp[row++]=sum1;
2391       break;
2392     case 2:
2393       sum1 = b[r[row]];
2394       sum2 = b[r[row+1]];
2395       v2   = aa + ai[row+1];
2396 
2397       for(j=0; j<nz-1; j+=2){
2398         i0   = vi[j];
2399         i1   = vi[j+1];
2400         tmp0 = tmps[i0];
2401         tmp1 = tmps[i1];
2402         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2403         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2404       }
2405       if(j == nz-1){
2406         tmp0 = tmps[vi[j]];
2407         sum1 -= v1[j] *tmp0;
2408         sum2 -= v2[j] *tmp0;
2409       }
2410       sum2 -= v2[nz] * sum1;
2411       tmp[row ++]=sum1;
2412       tmp[row ++]=sum2;
2413       break;
2414     case 3:
2415       sum1 = b[r[row]];
2416       sum2 = b[r[row+1]];
2417       sum3 = b[r[row+2]];
2418       v2   = aa + ai[row+1];
2419       v3   = aa + ai[row+2];
2420 
2421       for (j=0; j<nz-1; j+=2){
2422         i0   = vi[j];
2423         i1   = vi[j+1];
2424         tmp0 = tmps[i0];
2425         tmp1 = tmps[i1];
2426         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2427         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2428         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2429       }
2430       if (j == nz-1){
2431         tmp0 = tmps[vi[j]];
2432         sum1 -= v1[j] *tmp0;
2433         sum2 -= v2[j] *tmp0;
2434         sum3 -= v3[j] *tmp0;
2435       }
2436       sum2 -= v2[nz] * sum1;
2437       sum3 -= v3[nz] * sum1;
2438       sum3 -= v3[nz+1] * sum2;
2439       tmp[row ++]=sum1;
2440       tmp[row ++]=sum2;
2441       tmp[row ++]=sum3;
2442       break;
2443 
2444     case 4:
2445       sum1 = b[r[row]];
2446       sum2 = b[r[row+1]];
2447       sum3 = b[r[row+2]];
2448       sum4 = b[r[row+3]];
2449       v2   = aa + ai[row+1];
2450       v3   = aa + ai[row+2];
2451       v4   = aa + ai[row+3];
2452 
2453       for (j=0; j<nz-1; j+=2){
2454         i0   = vi[j];
2455         i1   = vi[j+1];
2456         tmp0 = tmps[i0];
2457         tmp1 = tmps[i1];
2458         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2459         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2460         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2461         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2462       }
2463       if (j == nz-1){
2464         tmp0 = tmps[vi[j]];
2465         sum1 -= v1[j] *tmp0;
2466         sum2 -= v2[j] *tmp0;
2467         sum3 -= v3[j] *tmp0;
2468         sum4 -= v4[j] *tmp0;
2469       }
2470       sum2 -= v2[nz] * sum1;
2471       sum3 -= v3[nz] * sum1;
2472       sum4 -= v4[nz] * sum1;
2473       sum3 -= v3[nz+1] * sum2;
2474       sum4 -= v4[nz+1] * sum2;
2475       sum4 -= v4[nz+2] * sum3;
2476 
2477       tmp[row ++]=sum1;
2478       tmp[row ++]=sum2;
2479       tmp[row ++]=sum3;
2480       tmp[row ++]=sum4;
2481       break;
2482     case 5:
2483       sum1 = b[r[row]];
2484       sum2 = b[r[row+1]];
2485       sum3 = b[r[row+2]];
2486       sum4 = b[r[row+3]];
2487       sum5 = b[r[row+4]];
2488       v2   = aa + ai[row+1];
2489       v3   = aa + ai[row+2];
2490       v4   = aa + ai[row+3];
2491       v5   = aa + ai[row+4];
2492 
2493       for (j=0; j<nz-1; j+=2){
2494         i0   = vi[j];
2495         i1   = vi[j+1];
2496         tmp0 = tmps[i0];
2497         tmp1 = tmps[i1];
2498         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2499         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2500         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2501         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2502         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2503       }
2504       if (j == nz-1){
2505         tmp0 = tmps[vi[j]];
2506         sum1 -= v1[j] *tmp0;
2507         sum2 -= v2[j] *tmp0;
2508         sum3 -= v3[j] *tmp0;
2509         sum4 -= v4[j] *tmp0;
2510         sum5 -= v5[j] *tmp0;
2511       }
2512 
2513       sum2 -= v2[nz] * sum1;
2514       sum3 -= v3[nz] * sum1;
2515       sum4 -= v4[nz] * sum1;
2516       sum5 -= v5[nz] * sum1;
2517       sum3 -= v3[nz+1] * sum2;
2518       sum4 -= v4[nz+1] * sum2;
2519       sum5 -= v5[nz+1] * sum2;
2520       sum4 -= v4[nz+2] * sum3;
2521       sum5 -= v5[nz+2] * sum3;
2522       sum5 -= v5[nz+3] * sum4;
2523 
2524       tmp[row ++]=sum1;
2525       tmp[row ++]=sum2;
2526       tmp[row ++]=sum3;
2527       tmp[row ++]=sum4;
2528       tmp[row ++]=sum5;
2529       break;
2530     default:
2531       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2532     }
2533   }
2534   /* backward solve the upper triangular */
2535   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2536     nsz = ns[i];
2537     aii = ad[row+1] + 1;
2538     v1  = aa + aii;
2539     vi  = aj + aii;
2540     nz  = ad[row]- ad[row+1] - 1;
2541 
2542     if (i > 0) {
2543       /* Prefetch the indices for the next block */
2544       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2545       /* Prefetch the data for the next block */
2546       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2547     }
2548 
2549     switch (nsz){               /* Each loop in 'case' is unrolled */
2550     case 1 :
2551       sum1 = tmp[row];
2552 
2553       for(j=0 ; j<nz-1; j+=2){
2554         i0   = vi[j];
2555         i1   = vi[j+1];
2556         tmp0 = tmps[i0];
2557         tmp1 = tmps[i1];
2558         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2559       }
2560       if (j == nz-1){
2561         tmp0  = tmps[vi[j]];
2562         sum1 -= v1[j]*tmp0;
2563       }
2564       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2565       break;
2566     case 2 :
2567       sum1 = tmp[row];
2568       sum2 = tmp[row-1];
2569       v2   = aa + ad[row] + 1;
2570       for (j=0 ; j<nz-1; j+=2){
2571         i0   = vi[j];
2572         i1   = vi[j+1];
2573         tmp0 = tmps[i0];
2574         tmp1 = tmps[i1];
2575         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2576         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2577       }
2578       if (j == nz-1){
2579         tmp0  = tmps[vi[j]];
2580         sum1 -= v1[j]* tmp0;
2581         sum2 -= v2[j+1]* tmp0;
2582       }
2583 
2584       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2585       sum2   -= v2[0] * tmp0;
2586       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2587       break;
2588     case 3 :
2589       sum1 = tmp[row];
2590       sum2 = tmp[row -1];
2591       sum3 = tmp[row -2];
2592       v2   = aa + ad[row] + 1;
2593       v3   = aa + ad[row -1] + 1;
2594       for (j=0 ; j<nz-1; j+=2){
2595         i0   = vi[j];
2596         i1   = vi[j+1];
2597         tmp0 = tmps[i0];
2598         tmp1 = tmps[i1];
2599         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2600         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2601         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2602       }
2603       if (j== nz-1){
2604         tmp0  = tmps[vi[j]];
2605         sum1 -= v1[j] * tmp0;
2606         sum2 -= v2[j+1] * tmp0;
2607         sum3 -= v3[j+2] * tmp0;
2608       }
2609       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2610       sum2   -= v2[0]* tmp0;
2611       sum3   -= v3[1] * tmp0;
2612       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2613       sum3   -= v3[0]* tmp0;
2614       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2615 
2616       break;
2617     case 4 :
2618       sum1 = tmp[row];
2619       sum2 = tmp[row -1];
2620       sum3 = tmp[row -2];
2621       sum4 = tmp[row -3];
2622       v2   = aa + ad[row]+1;
2623       v3   = aa + ad[row -1]+1;
2624       v4   = aa + ad[row -2]+1;
2625 
2626       for (j=0 ; j<nz-1; j+=2){
2627         i0   = vi[j];
2628         i1   = vi[j+1];
2629         tmp0 = tmps[i0];
2630         tmp1 = tmps[i1];
2631         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2632         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2633         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2634         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2635       }
2636       if (j== nz-1){
2637         tmp0  = tmps[vi[j]];
2638         sum1 -= v1[j] * tmp0;
2639         sum2 -= v2[j+1] * tmp0;
2640         sum3 -= v3[j+2] * tmp0;
2641         sum4 -= v4[j+3] * tmp0;
2642       }
2643 
2644       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2645       sum2   -= v2[0] * tmp0;
2646       sum3   -= v3[1] * tmp0;
2647       sum4   -= v4[2] * tmp0;
2648       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2649       sum3   -= v3[0] * tmp0;
2650       sum4   -= v4[1] * tmp0;
2651       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2652       sum4   -= v4[0] * tmp0;
2653       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2654       break;
2655     case 5 :
2656       sum1 = tmp[row];
2657       sum2 = tmp[row -1];
2658       sum3 = tmp[row -2];
2659       sum4 = tmp[row -3];
2660       sum5 = tmp[row -4];
2661       v2   = aa + ad[row]+1;
2662       v3   = aa + ad[row -1]+1;
2663       v4   = aa + ad[row -2]+1;
2664       v5   = aa + ad[row -3]+1;
2665       for (j=0 ; j<nz-1; j+=2){
2666         i0   = vi[j];
2667         i1   = vi[j+1];
2668         tmp0 = tmps[i0];
2669         tmp1 = tmps[i1];
2670         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2671         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2672         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2673         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2674         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2675       }
2676       if (j==nz-1){
2677         tmp0  = tmps[vi[j]];
2678         sum1 -= v1[j] * tmp0;
2679         sum2 -= v2[j+1] * tmp0;
2680         sum3 -= v3[j+2] * tmp0;
2681         sum4 -= v4[j+3] * tmp0;
2682         sum5 -= v5[j+4] * tmp0;
2683       }
2684 
2685       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2686       sum2   -= v2[0] * tmp0;
2687       sum3   -= v3[1] * tmp0;
2688       sum4   -= v4[2] * tmp0;
2689       sum5   -= v5[3] * tmp0;
2690       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2691       sum3   -= v3[0] * tmp0;
2692       sum4   -= v4[1] * tmp0;
2693       sum5   -= v5[2] * tmp0;
2694       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2695       sum4   -= v4[0] * tmp0;
2696       sum5   -= v5[1] * tmp0;
2697       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2698       sum5   -= v5[0] * tmp0;
2699       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2700       break;
2701     default:
2702       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2703     }
2704   }
2705   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2706   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2707   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2708   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2709   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2710   PetscFunctionReturn(0);
2711 }
2712 
2713 
2714 /*
2715      Makes a longer coloring[] array and calls the usual code with that
2716 */
2717 #undef __FUNCT__
2718 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2719 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2720 {
2721   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2722   PetscErrorCode  ierr;
2723   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2724   PetscInt        *colorused,i;
2725   ISColoringValue *newcolor;
2726 
2727   PetscFunctionBegin;
2728   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2729   /* loop over inodes, marking a color for each column*/
2730   row = 0;
2731   for (i=0; i<m; i++){
2732     for (j=0; j<ns[i]; j++) {
2733       newcolor[row++] = coloring[i] + j*ncolors;
2734     }
2735   }
2736 
2737   /* eliminate unneeded colors */
2738   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2739   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2740   for (i=0; i<n; i++) {
2741     colorused[newcolor[i]] = 1;
2742   }
2743 
2744   for (i=1; i<5*ncolors; i++) {
2745     colorused[i] += colorused[i-1];
2746   }
2747   ncolors = colorused[5*ncolors-1];
2748   for (i=0; i<n; i++) {
2749     newcolor[i] = colorused[newcolor[i]]-1;
2750   }
2751   ierr = PetscFree(colorused);CHKERRQ(ierr);
2752   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2753   ierr = PetscFree(coloring);CHKERRQ(ierr);
2754   PetscFunctionReturn(0);
2755 }
2756 
2757 #include "../src/mat/blockinvert.h"
2758 
2759 #undef __FUNCT__
2760 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2761 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2762 {
2763   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2764   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2765   MatScalar          *ibdiag,*bdiag,work[25];
2766   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2767   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2768   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2769   PetscErrorCode     ierr;
2770   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2771   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2772 
2773   PetscFunctionBegin;
2774   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2775   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2776   if (its > 1) {
2777     /* switch to non-inode version */
2778     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2779     PetscFunctionReturn(0);
2780   }
2781 
2782   if (!a->inode.ibdiagvalid) {
2783     if (!a->inode.ibdiag) {
2784       /* calculate space needed for diagonal blocks */
2785       for (i=0; i<m; i++) {
2786 	cnt += sizes[i]*sizes[i];
2787       }
2788       a->inode.bdiagsize = cnt;
2789       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2790     }
2791 
2792     /* copy over the diagonal blocks and invert them */
2793     ibdiag = a->inode.ibdiag;
2794     bdiag  = a->inode.bdiag;
2795     cnt = 0;
2796     for (i=0, row = 0; i<m; i++) {
2797       for (j=0; j<sizes[i]; j++) {
2798         for (k=0; k<sizes[i]; k++) {
2799           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2800         }
2801       }
2802       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2803 
2804       switch(sizes[i]) {
2805         case 1:
2806           /* Create matrix data structure */
2807           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2808           ibdiag[cnt] = 1.0/ibdiag[cnt];
2809           break;
2810         case 2:
2811           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2812           break;
2813         case 3:
2814           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2815           break;
2816         case 4:
2817           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2818           break;
2819         case 5:
2820           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2821           break;
2822        default:
2823 	 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2824       }
2825       cnt += sizes[i]*sizes[i];
2826       row += sizes[i];
2827     }
2828     a->inode.ibdiagvalid = PETSC_TRUE;
2829   }
2830   ibdiag = a->inode.ibdiag;
2831   bdiag  = a->inode.bdiag;
2832 
2833 
2834   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2835   if (flag & SOR_ZERO_INITIAL_GUESS) {
2836     PetscScalar *b;
2837     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2838     if (xx != bb) {
2839       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2840     } else {
2841       b = x;
2842     }
2843     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2844 
2845       for (i=0, row=0; i<m; i++) {
2846         sz  = diag[row] - ii[row];
2847         v1  = a->a + ii[row];
2848         idx = a->j + ii[row];
2849 
2850         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2851         switch (sizes[i]){
2852           case 1:
2853 
2854             sum1  = b[row];
2855             for(n = 0; n<sz-1; n+=2) {
2856               i1   = idx[0];
2857               i2   = idx[1];
2858               idx += 2;
2859               tmp0 = x[i1];
2860               tmp1 = x[i2];
2861               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2862             }
2863 
2864             if (n == sz-1){
2865               tmp0  = x[*idx];
2866               sum1 -= *v1 * tmp0;
2867             }
2868             x[row++] = sum1*(*ibdiag++);
2869             break;
2870           case 2:
2871             v2    = a->a + ii[row+1];
2872             sum1  = b[row];
2873             sum2  = b[row+1];
2874             for(n = 0; n<sz-1; n+=2) {
2875               i1   = idx[0];
2876               i2   = idx[1];
2877               idx += 2;
2878               tmp0 = x[i1];
2879               tmp1 = x[i2];
2880               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2881               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2882             }
2883 
2884             if (n == sz-1){
2885               tmp0  = x[*idx];
2886               sum1 -= v1[0] * tmp0;
2887               sum2 -= v2[0] * tmp0;
2888             }
2889             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2890             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2891             ibdiag  += 4;
2892             break;
2893           case 3:
2894             v2    = a->a + ii[row+1];
2895             v3    = a->a + ii[row+2];
2896             sum1  = b[row];
2897             sum2  = b[row+1];
2898             sum3  = b[row+2];
2899             for(n = 0; n<sz-1; n+=2) {
2900               i1   = idx[0];
2901               i2   = idx[1];
2902               idx += 2;
2903               tmp0 = x[i1];
2904               tmp1 = x[i2];
2905               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2906               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2907               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2908             }
2909 
2910             if (n == sz-1){
2911               tmp0  = x[*idx];
2912               sum1 -= v1[0] * tmp0;
2913               sum2 -= v2[0] * tmp0;
2914               sum3 -= v3[0] * tmp0;
2915             }
2916             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2917             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2918             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2919             ibdiag  += 9;
2920             break;
2921           case 4:
2922             v2    = a->a + ii[row+1];
2923             v3    = a->a + ii[row+2];
2924             v4    = a->a + ii[row+3];
2925             sum1  = b[row];
2926             sum2  = b[row+1];
2927             sum3  = b[row+2];
2928             sum4  = b[row+3];
2929             for(n = 0; n<sz-1; n+=2) {
2930               i1   = idx[0];
2931               i2   = idx[1];
2932               idx += 2;
2933               tmp0 = x[i1];
2934               tmp1 = x[i2];
2935               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2936               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2937               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2938               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2939             }
2940 
2941             if (n == sz-1){
2942               tmp0  = x[*idx];
2943               sum1 -= v1[0] * tmp0;
2944               sum2 -= v2[0] * tmp0;
2945               sum3 -= v3[0] * tmp0;
2946               sum4 -= v4[0] * tmp0;
2947             }
2948             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2949             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2950             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2951             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2952             ibdiag  += 16;
2953             break;
2954           case 5:
2955             v2    = a->a + ii[row+1];
2956             v3    = a->a + ii[row+2];
2957             v4    = a->a + ii[row+3];
2958             v5    = a->a + ii[row+4];
2959             sum1  = b[row];
2960             sum2  = b[row+1];
2961             sum3  = b[row+2];
2962             sum4  = b[row+3];
2963             sum5  = b[row+4];
2964             for(n = 0; n<sz-1; n+=2) {
2965               i1   = idx[0];
2966               i2   = idx[1];
2967               idx += 2;
2968               tmp0 = x[i1];
2969               tmp1 = x[i2];
2970               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2971               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2972               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2973               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2974               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2975             }
2976 
2977             if (n == sz-1){
2978               tmp0  = x[*idx];
2979               sum1 -= v1[0] * tmp0;
2980               sum2 -= v2[0] * tmp0;
2981               sum3 -= v3[0] * tmp0;
2982               sum4 -= v4[0] * tmp0;
2983               sum5 -= v5[0] * tmp0;
2984             }
2985             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2986             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2987             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2988             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2989             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2990             ibdiag  += 25;
2991             break;
2992 	  default:
2993    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2994         }
2995       }
2996 
2997       xb = x;
2998       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2999     } else xb = b;
3000     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
3001         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
3002       cnt = 0;
3003       for (i=0, row=0; i<m; i++) {
3004 
3005         switch (sizes[i]){
3006           case 1:
3007             x[row++] *= bdiag[cnt++];
3008             break;
3009           case 2:
3010             x1   = x[row]; x2 = x[row+1];
3011             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3012             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3013             x[row++] = tmp1;
3014             x[row++] = tmp2;
3015             cnt += 4;
3016             break;
3017           case 3:
3018             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3019             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3020             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3021             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3022             x[row++] = tmp1;
3023             x[row++] = tmp2;
3024             x[row++] = tmp3;
3025             cnt += 9;
3026             break;
3027           case 4:
3028             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3029             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3030             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3031             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3032             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3033             x[row++] = tmp1;
3034             x[row++] = tmp2;
3035             x[row++] = tmp3;
3036             x[row++] = tmp4;
3037             cnt += 16;
3038             break;
3039           case 5:
3040             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3041             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3042             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3043             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3044             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3045             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3046             x[row++] = tmp1;
3047             x[row++] = tmp2;
3048             x[row++] = tmp3;
3049             x[row++] = tmp4;
3050             x[row++] = tmp5;
3051             cnt += 25;
3052             break;
3053 	  default:
3054    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3055         }
3056       }
3057       ierr = PetscLogFlops(m);CHKERRQ(ierr);
3058     }
3059     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3060 
3061       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3062       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3063         ibdiag -= sizes[i]*sizes[i];
3064         sz      = ii[row+1] - diag[row] - 1;
3065         v1      = a->a + diag[row] + 1;
3066         idx     = a->j + diag[row] + 1;
3067 
3068         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3069         switch (sizes[i]){
3070           case 1:
3071 
3072             sum1  = xb[row];
3073             for(n = 0; n<sz-1; n+=2) {
3074               i1   = idx[0];
3075               i2   = idx[1];
3076               idx += 2;
3077               tmp0 = x[i1];
3078               tmp1 = x[i2];
3079               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3080             }
3081 
3082             if (n == sz-1){
3083               tmp0  = x[*idx];
3084               sum1 -= *v1*tmp0;
3085             }
3086             x[row--] = sum1*(*ibdiag);
3087             break;
3088 
3089           case 2:
3090 
3091             sum1  = xb[row];
3092             sum2  = xb[row-1];
3093             /* note that sum1 is associated with the second of the two rows */
3094             v2    = a->a + diag[row-1] + 2;
3095             for(n = 0; n<sz-1; n+=2) {
3096               i1   = idx[0];
3097               i2   = idx[1];
3098               idx += 2;
3099               tmp0 = x[i1];
3100               tmp1 = x[i2];
3101               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3102               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3103             }
3104 
3105             if (n == sz-1){
3106               tmp0  = x[*idx];
3107               sum1 -= *v1*tmp0;
3108               sum2 -= *v2*tmp0;
3109             }
3110             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3111             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3112             break;
3113           case 3:
3114 
3115             sum1  = xb[row];
3116             sum2  = xb[row-1];
3117             sum3  = xb[row-2];
3118             v2    = a->a + diag[row-1] + 2;
3119             v3    = a->a + diag[row-2] + 3;
3120             for(n = 0; n<sz-1; n+=2) {
3121               i1   = idx[0];
3122               i2   = idx[1];
3123               idx += 2;
3124               tmp0 = x[i1];
3125               tmp1 = x[i2];
3126               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3127               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3128               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3129             }
3130 
3131             if (n == sz-1){
3132               tmp0  = x[*idx];
3133               sum1 -= *v1*tmp0;
3134               sum2 -= *v2*tmp0;
3135               sum3 -= *v3*tmp0;
3136             }
3137             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3138             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3139             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3140             break;
3141           case 4:
3142 
3143             sum1  = xb[row];
3144             sum2  = xb[row-1];
3145             sum3  = xb[row-2];
3146             sum4  = xb[row-3];
3147             v2    = a->a + diag[row-1] + 2;
3148             v3    = a->a + diag[row-2] + 3;
3149             v4    = a->a + diag[row-3] + 4;
3150             for(n = 0; n<sz-1; n+=2) {
3151               i1   = idx[0];
3152               i2   = idx[1];
3153               idx += 2;
3154               tmp0 = x[i1];
3155               tmp1 = x[i2];
3156               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3157               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3158               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3159               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3160             }
3161 
3162             if (n == sz-1){
3163               tmp0  = x[*idx];
3164               sum1 -= *v1*tmp0;
3165               sum2 -= *v2*tmp0;
3166               sum3 -= *v3*tmp0;
3167               sum4 -= *v4*tmp0;
3168             }
3169             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3170             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3171             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3172             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3173             break;
3174           case 5:
3175 
3176             sum1  = xb[row];
3177             sum2  = xb[row-1];
3178             sum3  = xb[row-2];
3179             sum4  = xb[row-3];
3180             sum5  = xb[row-4];
3181             v2    = a->a + diag[row-1] + 2;
3182             v3    = a->a + diag[row-2] + 3;
3183             v4    = a->a + diag[row-3] + 4;
3184             v5    = a->a + diag[row-4] + 5;
3185             for(n = 0; n<sz-1; n+=2) {
3186               i1   = idx[0];
3187               i2   = idx[1];
3188               idx += 2;
3189               tmp0 = x[i1];
3190               tmp1 = x[i2];
3191               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3192               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3193               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3194               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3195               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3196             }
3197 
3198             if (n == sz-1){
3199               tmp0  = x[*idx];
3200               sum1 -= *v1*tmp0;
3201               sum2 -= *v2*tmp0;
3202               sum3 -= *v3*tmp0;
3203               sum4 -= *v4*tmp0;
3204               sum5 -= *v5*tmp0;
3205             }
3206             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3207             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3208             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3209             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3210             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3211             break;
3212 	  default:
3213    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3214         }
3215       }
3216 
3217       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3218     }
3219     its--;
3220     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3221     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
3222   }
3223   if (flag & SOR_EISENSTAT) {
3224     const PetscScalar *b;
3225     MatScalar         *t = a->inode.ssor_work;
3226 
3227     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3228     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3229     /*
3230           Apply  (U + D)^-1  where D is now the block diagonal
3231     */
3232     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3233     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3234       ibdiag -= sizes[i]*sizes[i];
3235       sz      = ii[row+1] - diag[row] - 1;
3236       v1      = a->a + diag[row] + 1;
3237       idx     = a->j + diag[row] + 1;
3238       CHKMEMQ;
3239       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3240       switch (sizes[i]){
3241         case 1:
3242 
3243 	  sum1  = b[row];
3244 	  for(n = 0; n<sz-1; n+=2) {
3245 	    i1   = idx[0];
3246 	    i2   = idx[1];
3247 	    idx += 2;
3248 	    tmp0 = x[i1];
3249 	    tmp1 = x[i2];
3250 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3251 	  }
3252 
3253 	  if (n == sz-1){
3254 	    tmp0  = x[*idx];
3255 	    sum1 -= *v1*tmp0;
3256 	  }
3257 	  x[row] = sum1*(*ibdiag);row--;
3258 	  break;
3259 
3260         case 2:
3261 
3262 	  sum1  = b[row];
3263 	  sum2  = b[row-1];
3264 	  /* note that sum1 is associated with the second of the two rows */
3265 	  v2    = a->a + diag[row-1] + 2;
3266 	  for(n = 0; n<sz-1; n+=2) {
3267 	    i1   = idx[0];
3268 	    i2   = idx[1];
3269 	    idx += 2;
3270 	    tmp0 = x[i1];
3271 	    tmp1 = x[i2];
3272 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3273 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3274 	  }
3275 
3276 	  if (n == sz-1){
3277 	    tmp0  = x[*idx];
3278 	    sum1 -= *v1*tmp0;
3279 	    sum2 -= *v2*tmp0;
3280 	  }
3281 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3282 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3283           row -= 2;
3284 	  break;
3285         case 3:
3286 
3287 	  sum1  = b[row];
3288 	  sum2  = b[row-1];
3289 	  sum3  = b[row-2];
3290 	  v2    = a->a + diag[row-1] + 2;
3291 	  v3    = a->a + diag[row-2] + 3;
3292 	  for(n = 0; n<sz-1; n+=2) {
3293 	    i1   = idx[0];
3294 	    i2   = idx[1];
3295 	    idx += 2;
3296 	    tmp0 = x[i1];
3297 	    tmp1 = x[i2];
3298 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3299 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3300 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3301 	  }
3302 
3303 	  if (n == sz-1){
3304 	    tmp0  = x[*idx];
3305 	    sum1 -= *v1*tmp0;
3306 	    sum2 -= *v2*tmp0;
3307 	    sum3 -= *v3*tmp0;
3308 	  }
3309 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3310 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3311 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3312           row -= 3;
3313 	  break;
3314         case 4:
3315 
3316 	  sum1  = b[row];
3317 	  sum2  = b[row-1];
3318 	  sum3  = b[row-2];
3319 	  sum4  = b[row-3];
3320 	  v2    = a->a + diag[row-1] + 2;
3321 	  v3    = a->a + diag[row-2] + 3;
3322 	  v4    = a->a + diag[row-3] + 4;
3323 	  for(n = 0; n<sz-1; n+=2) {
3324 	    i1   = idx[0];
3325 	    i2   = idx[1];
3326 	    idx += 2;
3327 	    tmp0 = x[i1];
3328 	    tmp1 = x[i2];
3329 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3330 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3331 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3332 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3333 	  }
3334 
3335 	  if (n == sz-1){
3336 	    tmp0  = x[*idx];
3337 	    sum1 -= *v1*tmp0;
3338 	    sum2 -= *v2*tmp0;
3339 	    sum3 -= *v3*tmp0;
3340 	    sum4 -= *v4*tmp0;
3341 	  }
3342 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3343 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3344 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3345 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3346           row -= 4;
3347 	  break;
3348         case 5:
3349 
3350 	  sum1  = b[row];
3351 	  sum2  = b[row-1];
3352 	  sum3  = b[row-2];
3353 	  sum4  = b[row-3];
3354 	  sum5  = b[row-4];
3355 	  v2    = a->a + diag[row-1] + 2;
3356 	  v3    = a->a + diag[row-2] + 3;
3357 	  v4    = a->a + diag[row-3] + 4;
3358 	  v5    = a->a + diag[row-4] + 5;
3359 	  for(n = 0; n<sz-1; n+=2) {
3360 	    i1   = idx[0];
3361 	    i2   = idx[1];
3362 	    idx += 2;
3363 	    tmp0 = x[i1];
3364 	    tmp1 = x[i2];
3365 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3366 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3367 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3368 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3369 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3370 	  }
3371 
3372 	  if (n == sz-1){
3373 	    tmp0  = x[*idx];
3374 	    sum1 -= *v1*tmp0;
3375 	    sum2 -= *v2*tmp0;
3376 	    sum3 -= *v3*tmp0;
3377 	    sum4 -= *v4*tmp0;
3378 	    sum5 -= *v5*tmp0;
3379 	  }
3380 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3381 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3382 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3383 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3384 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3385           row -= 5;
3386 	  break;
3387         default:
3388 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3389       }
3390       CHKMEMQ;
3391     }
3392     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3393 
3394     /*
3395            t = b - D x    where D is the block diagonal
3396     */
3397     cnt = 0;
3398     for (i=0, row=0; i<m; i++) {
3399       CHKMEMQ;
3400       switch (sizes[i]){
3401         case 1:
3402 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3403 	  break;
3404         case 2:
3405 	  x1   = x[row]; x2 = x[row+1];
3406 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3407 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3408 	  t[row]   = b[row] - tmp1;
3409 	  t[row+1] = b[row+1] - tmp2; row += 2;
3410 	  cnt += 4;
3411 	  break;
3412         case 3:
3413 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3414 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3415 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3416 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3417 	  t[row] = b[row] - tmp1;
3418 	  t[row+1] = b[row+1] - tmp2;
3419 	  t[row+2] = b[row+2] - tmp3; row += 3;
3420 	  cnt += 9;
3421 	  break;
3422         case 4:
3423 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3424 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3425 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3426 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3427 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3428 	  t[row] = b[row] - tmp1;
3429 	  t[row+1] = b[row+1] - tmp2;
3430 	  t[row+2] = b[row+2] - tmp3;
3431 	  t[row+3] = b[row+3] - tmp4; row += 4;
3432 	  cnt += 16;
3433 	  break;
3434         case 5:
3435 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3436 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3437 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3438 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3439 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3440 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3441 	  t[row] = b[row] - tmp1;
3442 	  t[row+1] = b[row+1] - tmp2;
3443 	  t[row+2] = b[row+2] - tmp3;
3444 	  t[row+3] = b[row+3] - tmp4;
3445 	  t[row+4] = b[row+4] - tmp5;row += 5;
3446 	  cnt += 25;
3447 	  break;
3448         default:
3449 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3450       }
3451       CHKMEMQ;
3452     }
3453     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3454 
3455 
3456 
3457     /*
3458           Apply (L + D)^-1 where D is the block diagonal
3459     */
3460     for (i=0, row=0; i<m; i++) {
3461       sz  = diag[row] - ii[row];
3462       v1  = a->a + ii[row];
3463       idx = a->j + ii[row];
3464       CHKMEMQ;
3465       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3466       switch (sizes[i]){
3467         case 1:
3468 
3469 	  sum1  = t[row];
3470 	  for(n = 0; n<sz-1; n+=2) {
3471 	    i1   = idx[0];
3472 	    i2   = idx[1];
3473 	    idx += 2;
3474 	    tmp0 = t[i1];
3475 	    tmp1 = t[i2];
3476 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3477 	  }
3478 
3479 	  if (n == sz-1){
3480 	    tmp0  = t[*idx];
3481 	    sum1 -= *v1 * tmp0;
3482 	  }
3483 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3484 	  break;
3485         case 2:
3486 	  v2    = a->a + ii[row+1];
3487 	  sum1  = t[row];
3488 	  sum2  = t[row+1];
3489 	  for(n = 0; n<sz-1; n+=2) {
3490 	    i1   = idx[0];
3491 	    i2   = idx[1];
3492 	    idx += 2;
3493 	    tmp0 = t[i1];
3494 	    tmp1 = t[i2];
3495 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3496 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3497 	  }
3498 
3499 	  if (n == sz-1){
3500 	    tmp0  = t[*idx];
3501 	    sum1 -= v1[0] * tmp0;
3502 	    sum2 -= v2[0] * tmp0;
3503 	  }
3504 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3505 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3506 	  ibdiag  += 4; row += 2;
3507 	  break;
3508         case 3:
3509 	  v2    = a->a + ii[row+1];
3510 	  v3    = a->a + ii[row+2];
3511 	  sum1  = t[row];
3512 	  sum2  = t[row+1];
3513 	  sum3  = t[row+2];
3514 	  for(n = 0; n<sz-1; n+=2) {
3515 	    i1   = idx[0];
3516 	    i2   = idx[1];
3517 	    idx += 2;
3518 	    tmp0 = t[i1];
3519 	    tmp1 = t[i2];
3520 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3521 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3522 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3523 	  }
3524 
3525 	  if (n == sz-1){
3526 	    tmp0  = t[*idx];
3527 	    sum1 -= v1[0] * tmp0;
3528 	    sum2 -= v2[0] * tmp0;
3529 	    sum3 -= v3[0] * tmp0;
3530 	  }
3531 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3532 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3533 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3534 	  ibdiag  += 9; row += 3;
3535 	  break;
3536         case 4:
3537 	  v2    = a->a + ii[row+1];
3538 	  v3    = a->a + ii[row+2];
3539 	  v4    = a->a + ii[row+3];
3540 	  sum1  = t[row];
3541 	  sum2  = t[row+1];
3542 	  sum3  = t[row+2];
3543 	  sum4  = t[row+3];
3544 	  for(n = 0; n<sz-1; n+=2) {
3545 	    i1   = idx[0];
3546 	    i2   = idx[1];
3547 	    idx += 2;
3548 	    tmp0 = t[i1];
3549 	    tmp1 = t[i2];
3550 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3551 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3552 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3553 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3554 	  }
3555 
3556 	  if (n == sz-1){
3557 	    tmp0  = t[*idx];
3558 	    sum1 -= v1[0] * tmp0;
3559 	    sum2 -= v2[0] * tmp0;
3560 	    sum3 -= v3[0] * tmp0;
3561 	    sum4 -= v4[0] * tmp0;
3562 	  }
3563 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3564 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3565 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3566 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3567 	  ibdiag  += 16; row += 4;
3568 	  break;
3569         case 5:
3570 	  v2    = a->a + ii[row+1];
3571 	  v3    = a->a + ii[row+2];
3572 	  v4    = a->a + ii[row+3];
3573 	  v5    = a->a + ii[row+4];
3574 	  sum1  = t[row];
3575 	  sum2  = t[row+1];
3576 	  sum3  = t[row+2];
3577 	  sum4  = t[row+3];
3578 	  sum5  = t[row+4];
3579 	  for(n = 0; n<sz-1; n+=2) {
3580 	    i1   = idx[0];
3581 	    i2   = idx[1];
3582 	    idx += 2;
3583 	    tmp0 = t[i1];
3584 	    tmp1 = t[i2];
3585 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3586 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3587 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3588 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3589 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3590 	  }
3591 
3592 	  if (n == sz-1){
3593 	    tmp0  = t[*idx];
3594 	    sum1 -= v1[0] * tmp0;
3595 	    sum2 -= v2[0] * tmp0;
3596 	    sum3 -= v3[0] * tmp0;
3597 	    sum4 -= v4[0] * tmp0;
3598 	    sum5 -= v5[0] * tmp0;
3599 	  }
3600 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3601 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3602 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3603 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3604 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3605 	  ibdiag  += 25; row += 5;
3606 	  break;
3607         default:
3608 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3609       }
3610       CHKMEMQ;
3611     }
3612     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3613     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3614     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3615   }
3616   PetscFunctionReturn(0);
3617 }
3618 
3619 #undef __FUNCT__
3620 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3621 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3622 {
3623   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3624   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3625   const MatScalar    *bdiag = a->inode.bdiag;
3626   const PetscScalar  *b;
3627   PetscErrorCode      ierr;
3628   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3629   const PetscInt      *sizes = a->inode.size;
3630 
3631   PetscFunctionBegin;
3632   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3633   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3634   cnt = 0;
3635   for (i=0, row=0; i<m; i++) {
3636     switch (sizes[i]){
3637       case 1:
3638 	x[row] = b[row]*bdiag[cnt++];row++;
3639 	break;
3640       case 2:
3641 	x1   = b[row]; x2 = b[row+1];
3642 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3643 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3644 	x[row++] = tmp1;
3645 	x[row++] = tmp2;
3646 	cnt += 4;
3647 	break;
3648       case 3:
3649 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3650 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3651 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3652 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3653 	x[row++] = tmp1;
3654 	x[row++] = tmp2;
3655 	x[row++] = tmp3;
3656 	cnt += 9;
3657 	break;
3658       case 4:
3659 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3660 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3661 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3662 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3663 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3664 	x[row++] = tmp1;
3665 	x[row++] = tmp2;
3666 	x[row++] = tmp3;
3667 	x[row++] = tmp4;
3668 	cnt += 16;
3669 	break;
3670       case 5:
3671 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3672 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3673 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3674 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3675 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3676 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3677 	x[row++] = tmp1;
3678 	x[row++] = tmp2;
3679 	x[row++] = tmp3;
3680 	x[row++] = tmp4;
3681 	x[row++] = tmp5;
3682 	cnt += 25;
3683 	break;
3684       default:
3685 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3686     }
3687   }
3688   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3689   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3690   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3691   PetscFunctionReturn(0);
3692 }
3693 
3694 /*
3695     samestructure indicates that the matrix has not changed its nonzero structure so we
3696     do not need to recompute the inodes
3697 */
3698 #undef __FUNCT__
3699 #define __FUNCT__ "Mat_CheckInode"
3700 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3701 {
3702   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3703   PetscErrorCode ierr;
3704   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3705   PetscTruth     flag;
3706 
3707   PetscFunctionBegin;
3708   if (!a->inode.use)                     PetscFunctionReturn(0);
3709   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3710 
3711 
3712   m = A->rmap->n;
3713   if (a->inode.size) {ns = a->inode.size;}
3714   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3715 
3716   i          = 0;
3717   node_count = 0;
3718   idx        = a->j;
3719   ii         = a->i;
3720   while (i < m){                /* For each row */
3721     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3722     /* Limits the number of elements in a node to 'a->inode.limit' */
3723     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3724       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3725       if (nzy != nzx) break;
3726       idy  += nzx;             /* Same nonzero pattern */
3727       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3728       if (!flag) break;
3729     }
3730     ns[node_count++] = blk_size;
3731     idx += blk_size*nzx;
3732     i    = j;
3733   }
3734   /* If not enough inodes found,, do not use inode version of the routines */
3735   if (!a->inode.size && m && node_count > .9*m) {
3736     ierr = PetscFree(ns);CHKERRQ(ierr);
3737     a->inode.node_count     = 0;
3738     a->inode.size           = PETSC_NULL;
3739     a->inode.use            = PETSC_FALSE;
3740     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3741   } else {
3742     if (!A->factortype) {
3743       A->ops->mult              = MatMult_SeqAIJ_Inode;
3744       A->ops->sor               = MatSOR_SeqAIJ_Inode;
3745       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3746       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3747       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3748       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3749       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3750       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3751       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3752     } else {
3753       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
3754     }
3755     a->inode.node_count       = node_count;
3756     a->inode.size             = ns;
3757     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3758   }
3759   PetscFunctionReturn(0);
3760 }
3761 
3762 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3763 PetscInt __k, *__vi; \
3764 __vi = aj + ai[row];				\
3765 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3766 __vi = aj + adiag[row];				\
3767 cols[nzl] = __vi[0];\
3768 __vi = aj + adiag[row+1]+1;\
3769 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3770 
3771 
3772 /*
3773    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3774    Modified from Mat_CheckInode().
3775 
3776    Input Parameters:
3777 +  Mat A - ILU or LU matrix factor
3778 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3779     do not need to recompute the inodes
3780 */
3781 #undef __FUNCT__
3782 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3783 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3784 {
3785   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3786   PetscErrorCode ierr;
3787   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3788   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3789   PetscTruth     flag;
3790 
3791   PetscFunctionBegin;
3792   if (!a->inode.use)                     PetscFunctionReturn(0);
3793   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3794 
3795   m = A->rmap->n;
3796   if (a->inode.size) {ns = a->inode.size;}
3797   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3798 
3799   i          = 0;
3800   node_count = 0;
3801   while (i < m){                /* For each row */
3802     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3803     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3804     nzx  = nzl1 + nzu1 + 1;
3805     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3806     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3807 
3808     /* Limits the number of elements in a node to 'a->inode.limit' */
3809     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3810       nzl2    = ai[j+1] - ai[j];
3811       nzu2    = adiag[j] - adiag[j+1] - 1;
3812       nzy     = nzl2 + nzu2 + 1;
3813       if( nzy != nzx) break;
3814       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3815       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3816       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3817       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3818       ierr = PetscFree(cols2);CHKERRQ(ierr);
3819     }
3820     ns[node_count++] = blk_size;
3821     ierr = PetscFree(cols1);CHKERRQ(ierr);
3822     i    = j;
3823   }
3824   /* If not enough inodes found,, do not use inode version of the routines */
3825   if (!a->inode.size && m && node_count > .9*m) {
3826     ierr = PetscFree(ns);CHKERRQ(ierr);
3827     a->inode.node_count     = 0;
3828     a->inode.size           = PETSC_NULL;
3829     a->inode.use            = PETSC_FALSE;
3830     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3831   } else {
3832     A->ops->mult              = 0;
3833     A->ops->sor               = 0;
3834     A->ops->multadd           = 0;
3835     A->ops->getrowij          = 0;
3836     A->ops->restorerowij      = 0;
3837     A->ops->getcolumnij       = 0;
3838     A->ops->restorecolumnij   = 0;
3839     A->ops->coloringpatch     = 0;
3840     A->ops->multdiagonalblock = 0;
3841     A->ops->solve             = MatSolve_SeqAIJ_Inode;
3842     a->inode.node_count       = node_count;
3843     a->inode.size             = ns;
3844     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3845   }
3846   PetscFunctionReturn(0);
3847 }
3848 
3849 /*
3850      This is really ugly. if inodes are used this replaces the
3851   permutations with ones that correspond to rows/cols of the matrix
3852   rather then inode blocks
3853 */
3854 #undef __FUNCT__
3855 #define __FUNCT__ "MatInodeAdjustForInodes"
3856 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3857 {
3858   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3859 
3860   PetscFunctionBegin;
3861   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3862   if (f) {
3863     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3864   }
3865   PetscFunctionReturn(0);
3866 }
3867 
3868 EXTERN_C_BEGIN
3869 #undef __FUNCT__
3870 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3871 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3872 {
3873   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3874   PetscErrorCode ierr;
3875   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3876   const PetscInt *ridx,*cidx;
3877   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3878   PetscInt       nslim_col,*ns_col;
3879   IS             ris = *rperm,cis = *cperm;
3880 
3881   PetscFunctionBegin;
3882   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3883   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3884 
3885   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3886   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3887   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3888 
3889   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3890   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3891 
3892   /* Form the inode structure for the rows of permuted matric using inv perm*/
3893   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3894 
3895   /* Construct the permutations for rows*/
3896   for (i=0,row = 0; i<nslim_row; ++i){
3897     indx      = ridx[i];
3898     start_val = tns[indx];
3899     end_val   = tns[indx + 1];
3900     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3901   }
3902 
3903   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3904   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3905 
3906  /* Construct permutations for columns */
3907   for (i=0,col=0; i<nslim_col; ++i){
3908     indx      = cidx[i];
3909     start_val = tns[indx];
3910     end_val   = tns[indx + 1];
3911     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3912   }
3913 
3914   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3915   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3916   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3917   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3918 
3919   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3920   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3921 
3922   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3923   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3924   ierr = ISDestroy(cis);CHKERRQ(ierr);
3925   ierr = ISDestroy(ris);CHKERRQ(ierr);
3926   ierr = PetscFree(tns);CHKERRQ(ierr);
3927   PetscFunctionReturn(0);
3928 }
3929 EXTERN_C_END
3930 
3931 #undef __FUNCT__
3932 #define __FUNCT__ "MatInodeGetInodeSizes"
3933 /*@C
3934    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3935 
3936    Collective on Mat
3937 
3938    Input Parameter:
3939 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3940 
3941    Output Parameter:
3942 +  node_count - no of inodes present in the matrix.
3943 .  sizes      - an array of size node_count,with sizes of each inode.
3944 -  limit      - the max size used to generate the inodes.
3945 
3946    Level: advanced
3947 
3948    Notes: This routine returns some internal storage information
3949    of the matrix, it is intended to be used by advanced users.
3950    It should be called after the matrix is assembled.
3951    The contents of the sizes[] array should not be changed.
3952    PETSC_NULL may be passed for information not requested.
3953 
3954 .keywords: matrix, seqaij, get, inode
3955 
3956 .seealso: MatGetInfo()
3957 @*/
3958 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3959 {
3960   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3961 
3962   PetscFunctionBegin;
3963   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3964   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3965   if (f) {
3966     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3967   }
3968   PetscFunctionReturn(0);
3969 }
3970 
3971 EXTERN_C_BEGIN
3972 #undef __FUNCT__
3973 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3974 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3975 {
3976   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3977 
3978   PetscFunctionBegin;
3979   if (node_count) *node_count = a->inode.node_count;
3980   if (sizes)      *sizes      = a->inode.size;
3981   if (limit)      *limit      = a->inode.limit;
3982   PetscFunctionReturn(0);
3983 }
3984 EXTERN_C_END
3985