xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 3649974ff2fe2c90f327f29aac9f68ec978ade9e)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1195   FactorShiftCtx   sctx;
1196   const PetscInt   *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,node_max,col;
1200   const PetscInt   *ns;
1201   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1202 
1203   PetscFunctionBegin;
1204   /* MatPivotSetUp(): initialize shift context sctx */
1205   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1206 
1207   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1208     ddiag          = a->diag;
1209     sctx.shift_top = info->zeropivot;
1210     for (i=0; i<n; i++) {
1211       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1212       d  = (aa)[ddiag[i]];
1213       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1214       v  = aa+ai[i];
1215       nz = ai[i+1] - ai[i];
1216       for (j=0; j<nz; j++)
1217 	rs += PetscAbsScalar(v[j]);
1218       if (rs>sctx.shift_top) sctx.shift_top = rs;
1219     }
1220     sctx.shift_top   *= 1.1;
1221     sctx.nshift_max   = 5;
1222     sctx.shift_lo     = 0.;
1223     sctx.shift_hi     = 1.;
1224   }
1225 
1226   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1227   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1228 
1229   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1230   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1231   rtmp2 = rtmp1 + n;
1232   rtmp3 = rtmp2 + n;
1233   rtmp4 = rtmp3 + n;
1234   ics   = ic;
1235 
1236   node_max = a->inode.node_count;
1237   ns       = a->inode.size;
1238   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1239 
1240   /* If max inode size > 4, split it into two inodes.*/
1241   /* also map the inode sizes according to the ordering */
1242   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1243   for (i=0,j=0; i<node_max; ++i,++j){
1244     if (ns[i] > 4) {
1245       tmp_vec1[j] = 4;
1246       ++j;
1247       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1248     } else {
1249       tmp_vec1[j] = ns[i];
1250     }
1251   }
1252   /* Use the correct node_max */
1253   node_max = j;
1254 
1255   /* Now reorder the inode info based on mat re-ordering info */
1256   /* First create a row -> inode_size_array_index map */
1257   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1258   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1259   for (i=0,row=0; i<node_max; i++) {
1260     nodesz = tmp_vec1[i];
1261     for (j=0; j<nodesz; j++,row++) {
1262       nsmap[row] = i;
1263     }
1264   }
1265   /* Using nsmap, create a reordered ns structure */
1266   for (i=0,j=0; i< node_max; i++) {
1267     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1268     tmp_vec2[i]  = nodesz;
1269     j           += nodesz;
1270   }
1271   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1272   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1273 
1274   /* Now use the correct ns */
1275   ns = tmp_vec2;
1276 
1277   do {
1278     sctx.newshift = PETSC_FALSE;
1279     /* Now loop over each block-row, and do the factorization */
1280     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1281       nodesz = ns[inod];
1282 
1283       switch (nodesz){
1284       case 1:
1285       /*----------*/
1286         /* zero rtmp1 */
1287         /* L part */
1288         nz    = bi[i+1] - bi[i];
1289         bjtmp = bj + bi[i];
1290         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1291 
1292         /* U part */
1293         nz = bdiag[i]-bdiag[i+1];
1294         bjtmp = bj + bdiag[i+1]+1;
1295         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1296 
1297         /* load in initial (unfactored row) */
1298         nz    = ai[r[i]+1] - ai[r[i]];
1299         ajtmp = aj + ai[r[i]];
1300         v     = aa + ai[r[i]];
1301         for (j=0; j<nz; j++) {
1302           rtmp1[ics[ajtmp[j]]] = v[j];
1303         }
1304         /* ZeropivotApply() */
1305         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1306 
1307         /* elimination */
1308         bjtmp = bj + bi[i];
1309         row   = *bjtmp++;
1310         nzL   = bi[i+1] - bi[i];
1311         for(k=0; k < nzL;k++) {
1312           pc = rtmp1 + row;
1313           if (*pc != 0.0) {
1314             pv   = b->a + bdiag[row];
1315             mul1 = *pc * (*pv);
1316             *pc  = mul1;
1317             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1318             pv = b->a + bdiag[row+1]+1;
1319             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1320             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1321             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1322           }
1323           row = *bjtmp++;
1324         }
1325 
1326         /* finished row so stick it into b->a */
1327         rs = 0.0;
1328         /* L part */
1329         pv = b->a + bi[i] ;
1330         pj = b->j + bi[i] ;
1331         nz = bi[i+1] - bi[i];
1332         for (j=0; j<nz; j++) {
1333           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1334         }
1335 
1336         /* U part */
1337         pv = b->a + bdiag[i+1]+1;
1338         pj = b->j + bdiag[i+1]+1;
1339         nz = bdiag[i] - bdiag[i+1]-1;
1340         for (j=0; j<nz; j++) {
1341           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1342         }
1343 
1344         /* Check zero pivot */
1345         sctx.rs = rs;
1346         sctx.pv = rtmp1[i];
1347         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1348 	if(sctx.newshift) break;
1349 
1350         /* Mark diagonal and invert diagonal for simplier triangular solves */
1351         pv  = b->a + bdiag[i];
1352         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1353         break;
1354 
1355       case 2:
1356       /*----------*/
1357         /* zero rtmp1 and rtmp2 */
1358         /* L part */
1359         nz    = bi[i+1] - bi[i];
1360         bjtmp = bj + bi[i];
1361         for  (j=0; j<nz; j++) {
1362           col = bjtmp[j];
1363           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1364         }
1365 
1366         /* U part */
1367         nz = bdiag[i]-bdiag[i+1];
1368         bjtmp = bj + bdiag[i+1]+1;
1369         for  (j=0; j<nz; j++) {
1370           col = bjtmp[j];
1371           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1372         }
1373 
1374         /* load in initial (unfactored row) */
1375         nz    = ai[r[i]+1] - ai[r[i]];
1376         ajtmp = aj + ai[r[i]];
1377         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1378         for (j=0; j<nz; j++) {
1379           col = ics[ajtmp[j]];
1380           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1381         }
1382         /* ZeropivotApply(): shift the diagonal of the matrix  */
1383         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1384 
1385         /* elimination */
1386         bjtmp = bj + bi[i];
1387         row   = *bjtmp++; /* pivot row */
1388         nzL   = bi[i+1] - bi[i];
1389         for(k=0; k < nzL;k++) {
1390           pc1 = rtmp1 + row;
1391           pc2 = rtmp2 + row;
1392           if (*pc1 != 0.0 || *pc2 != 0.0) {
1393             pv   = b->a + bdiag[row];
1394             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1395             *pc1 = mul1;       *pc2 = mul2;
1396 
1397             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1398             pv = b->a + bdiag[row+1]+1;
1399             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1400             for (j=0; j<nz; j++){
1401               col = pj[j];
1402               rtmp1[col] -= mul1 * pv[j];
1403               rtmp2[col] -= mul2 * pv[j];
1404             }
1405             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1406           }
1407           row = *bjtmp++;
1408         }
1409 
1410         /* finished row i; check zero pivot, then stick row i into b->a */
1411         rs  = 0.0;
1412         /* L part */
1413         pc1 = b->a + bi[i];
1414         pj  = b->j + bi[i] ;
1415         nz  = bi[i+1] - bi[i];
1416         for (j=0; j<nz; j++) {
1417           col = pj[j];
1418           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1419         }
1420         /* U part */
1421         pc1 = b->a + bdiag[i+1]+1;
1422         pj  = b->j + bdiag[i+1]+1;
1423         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1424         for (j=0; j<nz; j++) {
1425           col = pj[j];
1426           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1427         }
1428 
1429         sctx.rs  = rs;
1430         sctx.pv  = rtmp1[i];
1431         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1432 	if(sctx.newshift) break;
1433         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1434         *pc1 = 1.0/sctx.pv;
1435 
1436         /* Now take care of diagonal 2x2 block. */
1437         pc2 = rtmp2 + i;
1438         if (*pc2 != 0.0){
1439           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1440           *pc2 = mul1;          /* insert L entry */
1441           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1442           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1443           for (j=0; j<nz; j++) {
1444             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1445           }
1446           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1447         }
1448 
1449         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1450         rs = 0.0;
1451         /* L part */
1452         pc2 = b->a + bi[i+1];
1453         pj  = b->j + bi[i+1] ;
1454         nz  = bi[i+2] - bi[i+1];
1455         for (j=0; j<nz; j++) {
1456           col = pj[j];
1457           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1458         }
1459         /* U part */
1460         pc2 = b->a + bdiag[i+2]+1;
1461         pj  = b->j + bdiag[i+2]+1;
1462         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1463         for (j=0; j<nz; j++) {
1464           col = pj[j];
1465           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1466         }
1467 
1468         sctx.rs  = rs;
1469         sctx.pv  = rtmp2[i+1];
1470         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1471 	if(sctx.newshift) break;
1472         pc2  = b->a + bdiag[i+1];
1473         *pc2 = 1.0/sctx.pv;
1474         break;
1475 
1476       case 3:
1477       /*----------*/
1478         /* zero rtmp */
1479         /* L part */
1480         nz    = bi[i+1] - bi[i];
1481         bjtmp = bj + bi[i];
1482         for  (j=0; j<nz; j++) {
1483           col = bjtmp[j];
1484           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1485         }
1486 
1487         /* U part */
1488         nz = bdiag[i]-bdiag[i+1];
1489         bjtmp = bj + bdiag[i+1]+1;
1490         for  (j=0; j<nz; j++) {
1491           col = bjtmp[j];
1492           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1493         }
1494 
1495         /* load in initial (unfactored row) */
1496         nz    = ai[r[i]+1] - ai[r[i]];
1497         ajtmp = aj + ai[r[i]];
1498         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1499         for (j=0; j<nz; j++) {
1500           col = ics[ajtmp[j]];
1501           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1502         }
1503         /* ZeropivotApply(): shift the diagonal of the matrix  */
1504         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1505 
1506         /* elimination */
1507         bjtmp = bj + bi[i];
1508         row   = *bjtmp++; /* pivot row */
1509         nzL   = bi[i+1] - bi[i];
1510         for(k=0; k < nzL;k++) {
1511           pc1 = rtmp1 + row;
1512           pc2 = rtmp2 + row;
1513           pc3 = rtmp3 + row;
1514           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1515             pv  = b->a + bdiag[row];
1516             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1517             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1518 
1519             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1520             pv = b->a + bdiag[row+1]+1;
1521             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1522             for (j=0; j<nz; j++){
1523               col = pj[j];
1524               rtmp1[col] -= mul1 * pv[j];
1525               rtmp2[col] -= mul2 * pv[j];
1526               rtmp3[col] -= mul3 * pv[j];
1527             }
1528             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1529           }
1530           row = *bjtmp++;
1531         }
1532 
1533         /* finished row i; check zero pivot, then stick row i into b->a */
1534         rs  = 0.0;
1535         /* L part */
1536         pc1 = b->a + bi[i];
1537         pj  = b->j + bi[i] ;
1538         nz  = bi[i+1] - bi[i];
1539         for (j=0; j<nz; j++) {
1540           col = pj[j];
1541           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1542         }
1543         /* U part */
1544         pc1 = b->a + bdiag[i+1]+1;
1545         pj  = b->j + bdiag[i+1]+1;
1546         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1547         for (j=0; j<nz; j++) {
1548           col = pj[j];
1549           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1550         }
1551 
1552         sctx.rs  = rs;
1553         sctx.pv  = rtmp1[i];
1554         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1555 	if(sctx.newshift) break;
1556         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1557         *pc1 = 1.0/sctx.pv;
1558 
1559         /* Now take care of 1st column of diagonal 3x3 block. */
1560         pc2 = rtmp2 + i;
1561         pc3 = rtmp3 + i;
1562         if (*pc2 != 0.0 || *pc3 != 0.0){
1563           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1564           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1565           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1566           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1567           for (j=0; j<nz; j++) {
1568             col = pj[j];
1569             rtmp2[col] -= mul2 * rtmp1[col];
1570             rtmp3[col] -= mul3 * rtmp1[col];
1571           }
1572           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1573         }
1574 
1575         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1576         rs = 0.0;
1577         /* L part */
1578         pc2 = b->a + bi[i+1];
1579         pj  = b->j + bi[i+1] ;
1580         nz  = bi[i+2] - bi[i+1];
1581         for (j=0; j<nz; j++) {
1582           col = pj[j];
1583           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1584         }
1585         /* U part */
1586         pc2 = b->a + bdiag[i+2]+1;
1587         pj  = b->j + bdiag[i+2]+1;
1588         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1589         for (j=0; j<nz; j++) {
1590           col = pj[j];
1591           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1592         }
1593 
1594         sctx.rs  = rs;
1595         sctx.pv  = rtmp2[i+1];
1596         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1597 	if(sctx.newshift) break;
1598         pc2  = b->a + bdiag[i+1];
1599         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1600 
1601         /* Now take care of 2nd column of diagonal 3x3 block. */
1602         pc3 = rtmp3 + i+1;
1603         if (*pc3 != 0.0){
1604           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1605           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1606           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1607           for (j=0; j<nz; j++) {
1608             col = pj[j];
1609             rtmp3[col] -= mul3 * rtmp2[col];
1610           }
1611           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1612         }
1613 
1614         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1615         rs = 0.0;
1616         /* L part */
1617         pc3 = b->a + bi[i+2];
1618         pj  = b->j + bi[i+2] ;
1619         nz  = bi[i+3] - bi[i+2];
1620         for (j=0; j<nz; j++) {
1621           col = pj[j];
1622           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1623         }
1624         /* U part */
1625         pc3 = b->a + bdiag[i+3]+1;
1626         pj  = b->j + bdiag[i+3]+1;
1627         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1628         for (j=0; j<nz; j++) {
1629           col = pj[j];
1630           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1631         }
1632 
1633         sctx.rs  = rs;
1634         sctx.pv  = rtmp3[i+2];
1635         ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr);
1636 	if(sctx.newshift) break;
1637         pc3  = b->a + bdiag[i+2];
1638         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1639         break;
1640       case 4:
1641       /*----------*/
1642         /* zero rtmp */
1643         /* L part */
1644         nz    = bi[i+1] - bi[i];
1645         bjtmp = bj + bi[i];
1646         for  (j=0; j<nz; j++) {
1647           col = bjtmp[j];
1648           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1649         }
1650 
1651         /* U part */
1652         nz = bdiag[i]-bdiag[i+1];
1653         bjtmp = bj + bdiag[i+1]+1;
1654         for  (j=0; j<nz; j++) {
1655           col = bjtmp[j];
1656           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1657         }
1658 
1659         /* load in initial (unfactored row) */
1660         nz    = ai[r[i]+1] - ai[r[i]];
1661         ajtmp = aj + ai[r[i]];
1662         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1663         for (j=0; j<nz; j++) {
1664           col = ics[ajtmp[j]];
1665           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1666         }
1667         /* ZeropivotApply(): shift the diagonal of the matrix  */
1668         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1669 
1670         /* elimination */
1671         bjtmp = bj + bi[i];
1672         row   = *bjtmp++; /* pivot row */
1673         nzL   = bi[i+1] - bi[i];
1674         for(k=0; k < nzL;k++) {
1675           pc1 = rtmp1 + row;
1676           pc2 = rtmp2 + row;
1677           pc3 = rtmp3 + row;
1678 	  pc4 = rtmp4 + row;
1679           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1680             pv  = b->a + bdiag[row];
1681             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1682             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1683 
1684             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1685             pv = b->a + bdiag[row+1]+1;
1686             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1687             for (j=0; j<nz; j++){
1688               col = pj[j];
1689               rtmp1[col] -= mul1 * pv[j];
1690               rtmp2[col] -= mul2 * pv[j];
1691               rtmp3[col] -= mul3 * pv[j];
1692 	      rtmp4[col] -= mul4 * pv[j];
1693             }
1694             ierr = PetscLogFlops(8*nz);CHKERRQ(ierr);
1695           }
1696           row = *bjtmp++;
1697         }
1698 
1699         /* finished row i; check zero pivot, then stick row i into b->a */
1700         rs  = 0.0;
1701         /* L part */
1702         pc1 = b->a + bi[i];
1703         pj  = b->j + bi[i] ;
1704         nz  = bi[i+1] - bi[i];
1705         for (j=0; j<nz; j++) {
1706           col = pj[j];
1707           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1708         }
1709         /* U part */
1710         pc1 = b->a + bdiag[i+1]+1;
1711         pj  = b->j + bdiag[i+1]+1;
1712         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1713         for (j=0; j<nz; j++) {
1714           col = pj[j];
1715           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1716         }
1717 
1718         sctx.rs  = rs;
1719         sctx.pv  = rtmp1[i];
1720         ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr);
1721 	if(sctx.newshift) break;
1722         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1723         *pc1 = 1.0/sctx.pv;
1724 
1725         /* Now take care of 1st column of diagonal 4x4 block. */
1726         pc2 = rtmp2 + i;
1727         pc3 = rtmp3 + i;
1728 	pc4 = rtmp4 + i;
1729         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1730           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1731           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1732 	  mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1733           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1734           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1735           for (j=0; j<nz; j++) {
1736             col = pj[j];
1737             rtmp2[col] -= mul2 * rtmp1[col];
1738             rtmp3[col] -= mul3 * rtmp1[col];
1739 	    rtmp4[col] -= mul4 * rtmp1[col];
1740           }
1741           ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1742         }
1743 
1744         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1745         rs = 0.0;
1746         /* L part */
1747         pc2 = b->a + bi[i+1];
1748         pj  = b->j + bi[i+1] ;
1749         nz  = bi[i+2] - bi[i+1];
1750         for (j=0; j<nz; j++) {
1751           col = pj[j];
1752           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1753         }
1754         /* U part */
1755         pc2 = b->a + bdiag[i+2]+1;
1756         pj  = b->j + bdiag[i+2]+1;
1757         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1758         for (j=0; j<nz; j++) {
1759           col = pj[j];
1760           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1761         }
1762 
1763         sctx.rs  = rs;
1764         sctx.pv  = rtmp2[i+1];
1765         ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr);
1766 	if(sctx.newshift) break;
1767         pc2  = b->a + bdiag[i+1];
1768         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1769 
1770         /* Now take care of 2nd column of diagonal 4x4 block. */
1771         pc3 = rtmp3 + i+1;
1772 	pc4 = rtmp4 + i+1;
1773         if (*pc3 != 0.0 || *pc4 != 0.0){
1774           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1775 	  mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1776           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1777           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1778           for (j=0; j<nz; j++) {
1779             col = pj[j];
1780             rtmp3[col] -= mul3 * rtmp2[col];
1781 	    rtmp4[col] -= mul4 * rtmp2[col];
1782           }
1783           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1784         }
1785 
1786         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1787         rs = 0.0;
1788         /* L part */
1789         pc3 = b->a + bi[i+2];
1790         pj  = b->j + bi[i+2] ;
1791         nz  = bi[i+3] - bi[i+2];
1792         for (j=0; j<nz; j++) {
1793           col = pj[j];
1794           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1795         }
1796         /* U part */
1797         pc3 = b->a + bdiag[i+3]+1;
1798         pj  = b->j + bdiag[i+3]+1;
1799         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1800         for (j=0; j<nz; j++) {
1801           col = pj[j];
1802           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1803         }
1804 
1805         sctx.rs  = rs;
1806         sctx.pv  = rtmp3[i+2];
1807         ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr);
1808 	if(sctx.newshift) break;
1809         pc3  = b->a + bdiag[i+2];
1810         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1811 
1812 	/* Now take care of 3rd column of diagonal 4x4 block. */
1813 	pc4 = rtmp4 + i+2;
1814         if (*pc4 != 0.0){
1815 	  mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1816           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1817           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1818           for (j=0; j<nz; j++) {
1819             col = pj[j];
1820 	    rtmp4[col] -= mul4 * rtmp3[col];
1821           }
1822           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1823         }
1824 
1825         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1826         rs = 0.0;
1827         /* L part */
1828         pc4 = b->a + bi[i+3];
1829         pj  = b->j + bi[i+3] ;
1830         nz  = bi[i+4] - bi[i+3];
1831         for (j=0; j<nz; j++) {
1832           col = pj[j];
1833           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1834         }
1835         /* U part */
1836         pc4 = b->a + bdiag[i+4]+1;
1837         pj  = b->j + bdiag[i+4]+1;
1838         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1839         for (j=0; j<nz; j++) {
1840           col = pj[j];
1841           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1842         }
1843 
1844         sctx.rs  = rs;
1845         sctx.pv  = rtmp4[i+3];
1846         ierr = MatPivotCheck(info,&sctx,i+3);CHKERRQ(ierr);
1847 	if(sctx.newshift) break;
1848         pc4  = b->a + bdiag[i+3];
1849         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1850          break;
1851 
1852       default:
1853         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1854       }
1855       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1856       i += nodesz;                 /* Update the row */
1857     }
1858 
1859     /* MatPivotRefine() */
1860     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1861       /*
1862        * if no shift in this attempt & shifting & started shifting & can refine,
1863        * then try lower shift
1864        */
1865       sctx.shift_hi       = sctx.shift_fraction;
1866       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1867       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1868       sctx.newshift       = PETSC_TRUE;
1869       sctx.nshift++;
1870     }
1871   } while (sctx.newshift);
1872 
1873   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1874   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1875   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1876   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1877 
1878   C->ops->solve              = MatSolve_SeqAIJ;
1879   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1880   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1881   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1882   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1883   C->assembled    = PETSC_TRUE;
1884   C->preallocated = PETSC_TRUE;
1885   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1886 
1887   /* MatShiftView(A,info,&sctx) */
1888   if (sctx.nshift){
1889     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1890       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1891     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1892       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1893     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1894       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1895     }
1896   }
1897   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1898   PetscFunctionReturn(0);
1899 }
1900 
1901 #undef __FUNCT__
1902 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1903 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1904 {
1905   Mat               C = B;
1906   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1907   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1908   PetscErrorCode    ierr;
1909   const PetscInt    *r,*ic,*c,*ics;
1910   PetscInt          n = A->rmap->n,*bi = b->i;
1911   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1912   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1913   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1914   PetscScalar       mul1,mul2,mul3,tmp;
1915   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1916   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1917   PetscReal         rs=0.0;
1918   FactorShiftCtx    sctx;
1919 
1920   PetscFunctionBegin;
1921   sctx.shift_top      = 0;
1922   sctx.nshift_max     = 0;
1923   sctx.shift_lo       = 0;
1924   sctx.shift_hi       = 0;
1925   sctx.shift_fraction = 0;
1926 
1927   /* if both shift schemes are chosen by user, only use info->shiftpd */
1928   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1929     sctx.shift_top = 0;
1930     for (i=0; i<n; i++) {
1931       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1932       rs    = 0.0;
1933       ajtmp = aj + ai[i];
1934       rtmp1 = aa + ai[i];
1935       nz = ai[i+1] - ai[i];
1936       for (j=0; j<nz; j++){
1937         if (*ajtmp != i){
1938           rs += PetscAbsScalar(*rtmp1++);
1939         } else {
1940           rs -= PetscRealPart(*rtmp1++);
1941         }
1942         ajtmp++;
1943       }
1944       if (rs>sctx.shift_top) sctx.shift_top = rs;
1945     }
1946     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1947     sctx.shift_top *= 1.1;
1948     sctx.nshift_max = 5;
1949     sctx.shift_lo   = 0.;
1950     sctx.shift_hi   = 1.;
1951   }
1952   sctx.shift_amount = 0;
1953   sctx.nshift       = 0;
1954 
1955   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1956   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1957   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1958   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1959   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1960   ics   = ic ;
1961   rtmp22 = rtmp11 + n;
1962   rtmp33 = rtmp22 + n;
1963 
1964   node_max = a->inode.node_count;
1965   ns       = a->inode.size;
1966   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1967 
1968   /* If max inode size > 3, split it into two inodes.*/
1969   /* also map the inode sizes according to the ordering */
1970   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1971   for (i=0,j=0; i<node_max; ++i,++j){
1972     if (ns[i]>3) {
1973       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1974       ++j;
1975       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1976     } else {
1977       tmp_vec1[j] = ns[i];
1978     }
1979   }
1980   /* Use the correct node_max */
1981   node_max = j;
1982 
1983   /* Now reorder the inode info based on mat re-ordering info */
1984   /* First create a row -> inode_size_array_index map */
1985   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1986   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1987   for (i=0,row=0; i<node_max; i++) {
1988     nodesz = tmp_vec1[i];
1989     for (j=0; j<nodesz; j++,row++) {
1990       nsmap[row] = i;
1991     }
1992   }
1993   /* Using nsmap, create a reordered ns structure */
1994   for (i=0,j=0; i< node_max; i++) {
1995     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1996     tmp_vec2[i]  = nodesz;
1997     j           += nodesz;
1998   }
1999   ierr = PetscFree(nsmap);CHKERRQ(ierr);
2000   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2001   /* Now use the correct ns */
2002   ns = tmp_vec2;
2003 
2004   do {
2005     sctx.newshift = PETSC_FALSE;
2006     /* Now loop over each block-row, and do the factorization */
2007     for (i=0,row=0; i<node_max; i++) {
2008       nodesz = ns[i];
2009       nz     = bi[row+1] - bi[row];
2010       bjtmp  = bj + bi[row];
2011 
2012       switch (nodesz){
2013       case 1:
2014         for  (j=0; j<nz; j++){
2015           idx        = bjtmp[j];
2016           rtmp11[idx] = 0.0;
2017         }
2018 
2019         /* load in initial (unfactored row) */
2020         idx    = r[row];
2021         nz_tmp = ai[idx+1] - ai[idx];
2022         ajtmp  = aj + ai[idx];
2023         v1     = aa + ai[idx];
2024 
2025         for (j=0; j<nz_tmp; j++) {
2026           idx        = ics[ajtmp[j]];
2027           rtmp11[idx] = v1[j];
2028         }
2029         rtmp11[ics[r[row]]] += sctx.shift_amount;
2030 
2031         prow = *bjtmp++ ;
2032         while (prow < row) {
2033           pc1 = rtmp11 + prow;
2034           if (*pc1 != 0.0){
2035             pv   = ba + bd[prow];
2036             pj   = nbj + bd[prow];
2037             mul1 = *pc1 * *pv++;
2038             *pc1 = mul1;
2039             nz_tmp = bi[prow+1] - bd[prow] - 1;
2040             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2041             for (j=0; j<nz_tmp; j++) {
2042               tmp = pv[j];
2043               idx = pj[j];
2044               rtmp11[idx] -= mul1 * tmp;
2045             }
2046           }
2047           prow = *bjtmp++ ;
2048         }
2049         pj  = bj + bi[row];
2050         pc1 = ba + bi[row];
2051 
2052         sctx.pv    = rtmp11[row];
2053         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2054         rs         = 0.0;
2055         for (j=0; j<nz; j++) {
2056           idx    = pj[j];
2057           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2058           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2059         }
2060         sctx.rs  = rs;
2061         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2062         if (sctx.newshift) goto endofwhile;
2063         break;
2064 
2065       case 2:
2066         for (j=0; j<nz; j++) {
2067           idx        = bjtmp[j];
2068           rtmp11[idx] = 0.0;
2069           rtmp22[idx] = 0.0;
2070         }
2071 
2072         /* load in initial (unfactored row) */
2073         idx    = r[row];
2074         nz_tmp = ai[idx+1] - ai[idx];
2075         ajtmp  = aj + ai[idx];
2076         v1     = aa + ai[idx];
2077         v2     = aa + ai[idx+1];
2078         for (j=0; j<nz_tmp; j++) {
2079           idx        = ics[ajtmp[j]];
2080           rtmp11[idx] = v1[j];
2081           rtmp22[idx] = v2[j];
2082         }
2083         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2084         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2085 
2086         prow = *bjtmp++ ;
2087         while (prow < row) {
2088           pc1 = rtmp11 + prow;
2089           pc2 = rtmp22 + prow;
2090           if (*pc1 != 0.0 || *pc2 != 0.0){
2091             pv   = ba + bd[prow];
2092             pj   = nbj + bd[prow];
2093             mul1 = *pc1 * *pv;
2094             mul2 = *pc2 * *pv;
2095             ++pv;
2096             *pc1 = mul1;
2097             *pc2 = mul2;
2098 
2099             nz_tmp = bi[prow+1] - bd[prow] - 1;
2100             for (j=0; j<nz_tmp; j++) {
2101               tmp = pv[j];
2102               idx = pj[j];
2103               rtmp11[idx] -= mul1 * tmp;
2104               rtmp22[idx] -= mul2 * tmp;
2105             }
2106             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2107           }
2108           prow = *bjtmp++ ;
2109         }
2110 
2111         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2112         pc1 = rtmp11 + prow;
2113         pc2 = rtmp22 + prow;
2114 
2115         sctx.pv = *pc1;
2116         pj      = bj + bi[prow];
2117         rs      = 0.0;
2118         for (j=0; j<nz; j++){
2119           idx = pj[j];
2120           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2121         }
2122         sctx.rs = rs;
2123         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2124         if (sctx.newshift) goto endofwhile;
2125 
2126         if (*pc2 != 0.0){
2127           pj     = nbj + bd[prow];
2128           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2129           *pc2   = mul2;
2130           nz_tmp = bi[prow+1] - bd[prow] - 1;
2131           for (j=0; j<nz_tmp; j++) {
2132             idx = pj[j] ;
2133             tmp = rtmp11[idx];
2134             rtmp22[idx] -= mul2 * tmp;
2135           }
2136           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2137         }
2138 
2139         pj  = bj + bi[row];
2140         pc1 = ba + bi[row];
2141         pc2 = ba + bi[row+1];
2142 
2143         sctx.pv = rtmp22[row+1];
2144         rs = 0.0;
2145         rtmp11[row]   = 1.0/rtmp11[row];
2146         rtmp22[row+1] = 1.0/rtmp22[row+1];
2147         /* copy row entries from dense representation to sparse */
2148         for (j=0; j<nz; j++) {
2149           idx    = pj[j];
2150           pc1[j] = rtmp11[idx];
2151           pc2[j] = rtmp22[idx];
2152           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2153         }
2154         sctx.rs = rs;
2155         ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr);
2156         if (sctx.newshift) goto endofwhile;
2157         break;
2158 
2159       case 3:
2160         for  (j=0; j<nz; j++) {
2161           idx        = bjtmp[j];
2162           rtmp11[idx] = 0.0;
2163           rtmp22[idx] = 0.0;
2164           rtmp33[idx] = 0.0;
2165         }
2166         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2167         idx    = r[row];
2168         nz_tmp = ai[idx+1] - ai[idx];
2169         ajtmp = aj + ai[idx];
2170         v1    = aa + ai[idx];
2171         v2    = aa + ai[idx+1];
2172         v3    = aa + ai[idx+2];
2173         for (j=0; j<nz_tmp; j++) {
2174           idx        = ics[ajtmp[j]];
2175           rtmp11[idx] = v1[j];
2176           rtmp22[idx] = v2[j];
2177           rtmp33[idx] = v3[j];
2178         }
2179         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2180         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2181         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2182 
2183         /* loop over all pivot row blocks above this row block */
2184         prow = *bjtmp++ ;
2185         while (prow < row) {
2186           pc1 = rtmp11 + prow;
2187           pc2 = rtmp22 + prow;
2188           pc3 = rtmp33 + prow;
2189           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2190             pv   = ba  + bd[prow];
2191             pj   = nbj + bd[prow];
2192             mul1 = *pc1 * *pv;
2193             mul2 = *pc2 * *pv;
2194             mul3 = *pc3 * *pv;
2195             ++pv;
2196             *pc1 = mul1;
2197             *pc2 = mul2;
2198             *pc3 = mul3;
2199 
2200             nz_tmp = bi[prow+1] - bd[prow] - 1;
2201             /* update this row based on pivot row */
2202             for (j=0; j<nz_tmp; j++) {
2203               tmp = pv[j];
2204               idx = pj[j];
2205               rtmp11[idx] -= mul1 * tmp;
2206               rtmp22[idx] -= mul2 * tmp;
2207               rtmp33[idx] -= mul3 * tmp;
2208             }
2209             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
2210           }
2211           prow = *bjtmp++ ;
2212         }
2213 
2214         /* Now take care of diagonal 3x3 block in this set of rows */
2215         /* note: prow = row here */
2216         pc1 = rtmp11 + prow;
2217         pc2 = rtmp22 + prow;
2218         pc3 = rtmp33 + prow;
2219 
2220         sctx.pv = *pc1;
2221         pj      = bj + bi[prow];
2222         rs      = 0.0;
2223         for (j=0; j<nz; j++){
2224           idx = pj[j];
2225           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2226         }
2227         sctx.rs = rs;
2228         ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr);
2229         if (sctx.newshift) goto endofwhile;
2230 
2231         if (*pc2 != 0.0 || *pc3 != 0.0){
2232           mul2 = (*pc2)/(*pc1);
2233           mul3 = (*pc3)/(*pc1);
2234           *pc2 = mul2;
2235           *pc3 = mul3;
2236           nz_tmp = bi[prow+1] - bd[prow] - 1;
2237           pj     = nbj + bd[prow];
2238           for (j=0; j<nz_tmp; j++) {
2239             idx = pj[j] ;
2240             tmp = rtmp11[idx];
2241             rtmp22[idx] -= mul2 * tmp;
2242             rtmp33[idx] -= mul3 * tmp;
2243           }
2244           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2245         }
2246         ++prow;
2247 
2248         pc2 = rtmp22 + prow;
2249         pc3 = rtmp33 + prow;
2250         sctx.pv = *pc2;
2251         pj      = bj + bi[prow];
2252         rs      = 0.0;
2253         for (j=0; j<nz; j++){
2254           idx = pj[j];
2255           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2256         }
2257         sctx.rs = rs;
2258         ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr);
2259         if (sctx.newshift) goto endofwhile;
2260 
2261         if (*pc3 != 0.0){
2262           mul3   = (*pc3)/(*pc2);
2263           *pc3   = mul3;
2264           pj     = nbj + bd[prow];
2265           nz_tmp = bi[prow+1] - bd[prow] - 1;
2266           for (j=0; j<nz_tmp; j++) {
2267             idx = pj[j] ;
2268             tmp = rtmp22[idx];
2269             rtmp33[idx] -= mul3 * tmp;
2270           }
2271           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2272         }
2273 
2274         pj  = bj + bi[row];
2275         pc1 = ba + bi[row];
2276         pc2 = ba + bi[row+1];
2277         pc3 = ba + bi[row+2];
2278 
2279         sctx.pv = rtmp33[row+2];
2280         rs = 0.0;
2281         rtmp11[row]   = 1.0/rtmp11[row];
2282         rtmp22[row+1] = 1.0/rtmp22[row+1];
2283         rtmp33[row+2] = 1.0/rtmp33[row+2];
2284         /* copy row entries from dense representation to sparse */
2285         for (j=0; j<nz; j++) {
2286           idx    = pj[j];
2287           pc1[j] = rtmp11[idx];
2288           pc2[j] = rtmp22[idx];
2289           pc3[j] = rtmp33[idx];
2290           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2291         }
2292 
2293         sctx.rs = rs;
2294         ierr = MatPivotCheck(info,&sctx,row+2);CHKERRQ(ierr);
2295         if (sctx.newshift) goto endofwhile;
2296         break;
2297 
2298       default:
2299         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2300       }
2301       row += nodesz;                 /* Update the row */
2302     }
2303     endofwhile:;
2304   } while (sctx.newshift);
2305   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2306   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2307   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2308   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2309   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2310   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2311   /* do not set solve add, since MatSolve_Inode + Add is faster */
2312   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2313   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2314   C->assembled   = PETSC_TRUE;
2315   C->preallocated = PETSC_TRUE;
2316   if (sctx.nshift) {
2317     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2318       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2319     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2320       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2321     }
2322   }
2323   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2324   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2325   PetscFunctionReturn(0);
2326 }
2327 
2328 
2329 /* ----------------------------------------------------------- */
2330 #undef __FUNCT__
2331 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2332 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2333 {
2334   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2335   IS                iscol = a->col,isrow = a->row;
2336   PetscErrorCode    ierr;
2337   const PetscInt    *r,*c,*rout,*cout;
2338   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2339   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2340   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2341   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2342   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2343   const PetscScalar *b;
2344 
2345   PetscFunctionBegin;
2346   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2347   node_max = a->inode.node_count;
2348   ns       = a->inode.size;     /* Node Size array */
2349 
2350   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2351   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2352   tmp  = a->solve_work;
2353 
2354   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2355   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2356 
2357   /* forward solve the lower triangular */
2358   tmps = tmp ;
2359   aa   = a_a ;
2360   aj   = a_j ;
2361   ad   = a->diag;
2362 
2363   for (i = 0,row = 0; i< node_max; ++i){
2364     nsz = ns[i];
2365     aii = ai[row];
2366     v1  = aa + aii;
2367     vi  = aj + aii;
2368     nz  = ai[row+1]- ai[row];
2369 
2370     if (i < node_max-1) {
2371       /* Prefetch the indices for the next block */
2372       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2373       /* Prefetch the data for the next block */
2374       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2375     }
2376 
2377     switch (nsz){               /* Each loop in 'case' is unrolled */
2378     case 1 :
2379       sum1 = b[r[row]];
2380       for(j=0; j<nz-1; j+=2){
2381         i0   = vi[j];
2382         i1   = vi[j+1];
2383         tmp0 = tmps[i0];
2384         tmp1 = tmps[i1];
2385         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2386       }
2387       if(j == nz-1){
2388         tmp0 = tmps[vi[j]];
2389         sum1 -= v1[j]*tmp0;
2390       }
2391       tmp[row++]=sum1;
2392       break;
2393     case 2:
2394       sum1 = b[r[row]];
2395       sum2 = b[r[row+1]];
2396       v2   = aa + ai[row+1];
2397 
2398       for(j=0; j<nz-1; j+=2){
2399         i0   = vi[j];
2400         i1   = vi[j+1];
2401         tmp0 = tmps[i0];
2402         tmp1 = tmps[i1];
2403         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2404         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2405       }
2406       if(j == nz-1){
2407         tmp0 = tmps[vi[j]];
2408         sum1 -= v1[j] *tmp0;
2409         sum2 -= v2[j] *tmp0;
2410       }
2411       sum2 -= v2[nz] * sum1;
2412       tmp[row ++]=sum1;
2413       tmp[row ++]=sum2;
2414       break;
2415     case 3:
2416       sum1 = b[r[row]];
2417       sum2 = b[r[row+1]];
2418       sum3 = b[r[row+2]];
2419       v2   = aa + ai[row+1];
2420       v3   = aa + ai[row+2];
2421 
2422       for (j=0; j<nz-1; j+=2){
2423         i0   = vi[j];
2424         i1   = vi[j+1];
2425         tmp0 = tmps[i0];
2426         tmp1 = tmps[i1];
2427         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2428         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2429         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2430       }
2431       if (j == nz-1){
2432         tmp0 = tmps[vi[j]];
2433         sum1 -= v1[j] *tmp0;
2434         sum2 -= v2[j] *tmp0;
2435         sum3 -= v3[j] *tmp0;
2436       }
2437       sum2 -= v2[nz] * sum1;
2438       sum3 -= v3[nz] * sum1;
2439       sum3 -= v3[nz+1] * sum2;
2440       tmp[row ++]=sum1;
2441       tmp[row ++]=sum2;
2442       tmp[row ++]=sum3;
2443       break;
2444 
2445     case 4:
2446       sum1 = b[r[row]];
2447       sum2 = b[r[row+1]];
2448       sum3 = b[r[row+2]];
2449       sum4 = b[r[row+3]];
2450       v2   = aa + ai[row+1];
2451       v3   = aa + ai[row+2];
2452       v4   = aa + ai[row+3];
2453 
2454       for (j=0; j<nz-1; j+=2){
2455         i0   = vi[j];
2456         i1   = vi[j+1];
2457         tmp0 = tmps[i0];
2458         tmp1 = tmps[i1];
2459         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2460         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2461         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2462         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2463       }
2464       if (j == nz-1){
2465         tmp0 = tmps[vi[j]];
2466         sum1 -= v1[j] *tmp0;
2467         sum2 -= v2[j] *tmp0;
2468         sum3 -= v3[j] *tmp0;
2469         sum4 -= v4[j] *tmp0;
2470       }
2471       sum2 -= v2[nz] * sum1;
2472       sum3 -= v3[nz] * sum1;
2473       sum4 -= v4[nz] * sum1;
2474       sum3 -= v3[nz+1] * sum2;
2475       sum4 -= v4[nz+1] * sum2;
2476       sum4 -= v4[nz+2] * sum3;
2477 
2478       tmp[row ++]=sum1;
2479       tmp[row ++]=sum2;
2480       tmp[row ++]=sum3;
2481       tmp[row ++]=sum4;
2482       break;
2483     case 5:
2484       sum1 = b[r[row]];
2485       sum2 = b[r[row+1]];
2486       sum3 = b[r[row+2]];
2487       sum4 = b[r[row+3]];
2488       sum5 = b[r[row+4]];
2489       v2   = aa + ai[row+1];
2490       v3   = aa + ai[row+2];
2491       v4   = aa + ai[row+3];
2492       v5   = aa + ai[row+4];
2493 
2494       for (j=0; j<nz-1; j+=2){
2495         i0   = vi[j];
2496         i1   = vi[j+1];
2497         tmp0 = tmps[i0];
2498         tmp1 = tmps[i1];
2499         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2500         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2501         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2502         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2503         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2504       }
2505       if (j == nz-1){
2506         tmp0 = tmps[vi[j]];
2507         sum1 -= v1[j] *tmp0;
2508         sum2 -= v2[j] *tmp0;
2509         sum3 -= v3[j] *tmp0;
2510         sum4 -= v4[j] *tmp0;
2511         sum5 -= v5[j] *tmp0;
2512       }
2513 
2514       sum2 -= v2[nz] * sum1;
2515       sum3 -= v3[nz] * sum1;
2516       sum4 -= v4[nz] * sum1;
2517       sum5 -= v5[nz] * sum1;
2518       sum3 -= v3[nz+1] * sum2;
2519       sum4 -= v4[nz+1] * sum2;
2520       sum5 -= v5[nz+1] * sum2;
2521       sum4 -= v4[nz+2] * sum3;
2522       sum5 -= v5[nz+2] * sum3;
2523       sum5 -= v5[nz+3] * sum4;
2524 
2525       tmp[row ++]=sum1;
2526       tmp[row ++]=sum2;
2527       tmp[row ++]=sum3;
2528       tmp[row ++]=sum4;
2529       tmp[row ++]=sum5;
2530       break;
2531     default:
2532       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2533     }
2534   }
2535   /* backward solve the upper triangular */
2536   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2537     nsz = ns[i];
2538     aii = ad[row+1] + 1;
2539     v1  = aa + aii;
2540     vi  = aj + aii;
2541     nz  = ad[row]- ad[row+1] - 1;
2542 
2543     if (i > 0) {
2544       /* Prefetch the indices for the next block */
2545       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2546       /* Prefetch the data for the next block */
2547       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2548     }
2549 
2550     switch (nsz){               /* Each loop in 'case' is unrolled */
2551     case 1 :
2552       sum1 = tmp[row];
2553 
2554       for(j=0 ; j<nz-1; j+=2){
2555         i0   = vi[j];
2556         i1   = vi[j+1];
2557         tmp0 = tmps[i0];
2558         tmp1 = tmps[i1];
2559         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2560       }
2561       if (j == nz-1){
2562         tmp0  = tmps[vi[j]];
2563         sum1 -= v1[j]*tmp0;
2564       }
2565       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2566       break;
2567     case 2 :
2568       sum1 = tmp[row];
2569       sum2 = tmp[row-1];
2570       v2   = aa + ad[row] + 1;
2571       for (j=0 ; j<nz-1; j+=2){
2572         i0   = vi[j];
2573         i1   = vi[j+1];
2574         tmp0 = tmps[i0];
2575         tmp1 = tmps[i1];
2576         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2577         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2578       }
2579       if (j == nz-1){
2580         tmp0  = tmps[vi[j]];
2581         sum1 -= v1[j]* tmp0;
2582         sum2 -= v2[j+1]* tmp0;
2583       }
2584 
2585       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2586       sum2   -= v2[0] * tmp0;
2587       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2588       break;
2589     case 3 :
2590       sum1 = tmp[row];
2591       sum2 = tmp[row -1];
2592       sum3 = tmp[row -2];
2593       v2   = aa + ad[row] + 1;
2594       v3   = aa + ad[row -1] + 1;
2595       for (j=0 ; j<nz-1; j+=2){
2596         i0   = vi[j];
2597         i1   = vi[j+1];
2598         tmp0 = tmps[i0];
2599         tmp1 = tmps[i1];
2600         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2601         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2602         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2603       }
2604       if (j== nz-1){
2605         tmp0  = tmps[vi[j]];
2606         sum1 -= v1[j] * tmp0;
2607         sum2 -= v2[j+1] * tmp0;
2608         sum3 -= v3[j+2] * tmp0;
2609       }
2610       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2611       sum2   -= v2[0]* tmp0;
2612       sum3   -= v3[1] * tmp0;
2613       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2614       sum3   -= v3[0]* tmp0;
2615       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2616 
2617       break;
2618     case 4 :
2619       sum1 = tmp[row];
2620       sum2 = tmp[row -1];
2621       sum3 = tmp[row -2];
2622       sum4 = tmp[row -3];
2623       v2   = aa + ad[row]+1;
2624       v3   = aa + ad[row -1]+1;
2625       v4   = aa + ad[row -2]+1;
2626 
2627       for (j=0 ; j<nz-1; j+=2){
2628         i0   = vi[j];
2629         i1   = vi[j+1];
2630         tmp0 = tmps[i0];
2631         tmp1 = tmps[i1];
2632         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2633         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2634         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2635         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2636       }
2637       if (j== nz-1){
2638         tmp0  = tmps[vi[j]];
2639         sum1 -= v1[j] * tmp0;
2640         sum2 -= v2[j+1] * tmp0;
2641         sum3 -= v3[j+2] * tmp0;
2642         sum4 -= v4[j+3] * tmp0;
2643       }
2644 
2645       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2646       sum2   -= v2[0] * tmp0;
2647       sum3   -= v3[1] * tmp0;
2648       sum4   -= v4[2] * tmp0;
2649       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2650       sum3   -= v3[0] * tmp0;
2651       sum4   -= v4[1] * tmp0;
2652       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2653       sum4   -= v4[0] * tmp0;
2654       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2655       break;
2656     case 5 :
2657       sum1 = tmp[row];
2658       sum2 = tmp[row -1];
2659       sum3 = tmp[row -2];
2660       sum4 = tmp[row -3];
2661       sum5 = tmp[row -4];
2662       v2   = aa + ad[row]+1;
2663       v3   = aa + ad[row -1]+1;
2664       v4   = aa + ad[row -2]+1;
2665       v5   = aa + ad[row -3]+1;
2666       for (j=0 ; j<nz-1; j+=2){
2667         i0   = vi[j];
2668         i1   = vi[j+1];
2669         tmp0 = tmps[i0];
2670         tmp1 = tmps[i1];
2671         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2672         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2673         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2674         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2675         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2676       }
2677       if (j==nz-1){
2678         tmp0  = tmps[vi[j]];
2679         sum1 -= v1[j] * tmp0;
2680         sum2 -= v2[j+1] * tmp0;
2681         sum3 -= v3[j+2] * tmp0;
2682         sum4 -= v4[j+3] * tmp0;
2683         sum5 -= v5[j+4] * tmp0;
2684       }
2685 
2686       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2687       sum2   -= v2[0] * tmp0;
2688       sum3   -= v3[1] * tmp0;
2689       sum4   -= v4[2] * tmp0;
2690       sum5   -= v5[3] * tmp0;
2691       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2692       sum3   -= v3[0] * tmp0;
2693       sum4   -= v4[1] * tmp0;
2694       sum5   -= v5[2] * tmp0;
2695       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2696       sum4   -= v4[0] * tmp0;
2697       sum5   -= v5[1] * tmp0;
2698       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2699       sum5   -= v5[0] * tmp0;
2700       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2701       break;
2702     default:
2703       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2704     }
2705   }
2706   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2707   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2708   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2709   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2710   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2711   PetscFunctionReturn(0);
2712 }
2713 
2714 
2715 /*
2716      Makes a longer coloring[] array and calls the usual code with that
2717 */
2718 #undef __FUNCT__
2719 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2720 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2721 {
2722   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2723   PetscErrorCode  ierr;
2724   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2725   PetscInt        *colorused,i;
2726   ISColoringValue *newcolor;
2727 
2728   PetscFunctionBegin;
2729   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2730   /* loop over inodes, marking a color for each column*/
2731   row = 0;
2732   for (i=0; i<m; i++){
2733     for (j=0; j<ns[i]; j++) {
2734       newcolor[row++] = coloring[i] + j*ncolors;
2735     }
2736   }
2737 
2738   /* eliminate unneeded colors */
2739   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2740   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2741   for (i=0; i<n; i++) {
2742     colorused[newcolor[i]] = 1;
2743   }
2744 
2745   for (i=1; i<5*ncolors; i++) {
2746     colorused[i] += colorused[i-1];
2747   }
2748   ncolors = colorused[5*ncolors-1];
2749   for (i=0; i<n; i++) {
2750     newcolor[i] = colorused[newcolor[i]]-1;
2751   }
2752   ierr = PetscFree(colorused);CHKERRQ(ierr);
2753   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2754   ierr = PetscFree(coloring);CHKERRQ(ierr);
2755   PetscFunctionReturn(0);
2756 }
2757 
2758 #include "../src/mat/blockinvert.h"
2759 
2760 #undef __FUNCT__
2761 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2762 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2763 {
2764   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2765   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2766   MatScalar          *ibdiag,*bdiag,work[25];
2767   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2768   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2769   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2770   PetscErrorCode     ierr;
2771   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2772   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2773 
2774   PetscFunctionBegin;
2775   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2776   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2777   if (its > 1) {
2778     /* switch to non-inode version */
2779     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2780     PetscFunctionReturn(0);
2781   }
2782 
2783   if (!a->inode.ibdiagvalid) {
2784     if (!a->inode.ibdiag) {
2785       /* calculate space needed for diagonal blocks */
2786       for (i=0; i<m; i++) {
2787 	cnt += sizes[i]*sizes[i];
2788       }
2789       a->inode.bdiagsize = cnt;
2790       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2791     }
2792 
2793     /* copy over the diagonal blocks and invert them */
2794     ibdiag = a->inode.ibdiag;
2795     bdiag  = a->inode.bdiag;
2796     cnt = 0;
2797     for (i=0, row = 0; i<m; i++) {
2798       for (j=0; j<sizes[i]; j++) {
2799         for (k=0; k<sizes[i]; k++) {
2800           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2801         }
2802       }
2803       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2804 
2805       switch(sizes[i]) {
2806         case 1:
2807           /* Create matrix data structure */
2808           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2809           ibdiag[cnt] = 1.0/ibdiag[cnt];
2810           break;
2811         case 2:
2812           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2813           break;
2814         case 3:
2815           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2816           break;
2817         case 4:
2818           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2819           break;
2820         case 5:
2821           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2822           break;
2823        default:
2824 	 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2825       }
2826       cnt += sizes[i]*sizes[i];
2827       row += sizes[i];
2828     }
2829     a->inode.ibdiagvalid = PETSC_TRUE;
2830   }
2831   ibdiag = a->inode.ibdiag;
2832   bdiag  = a->inode.bdiag;
2833 
2834 
2835   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2836   if (flag & SOR_ZERO_INITIAL_GUESS) {
2837     const PetscScalar *b;
2838     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2839     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2840     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2841 
2842       for (i=0, row=0; i<m; i++) {
2843         sz  = diag[row] - ii[row];
2844         v1  = a->a + ii[row];
2845         idx = a->j + ii[row];
2846 
2847         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2848         switch (sizes[i]){
2849           case 1:
2850 
2851             sum1  = b[row];
2852             for(n = 0; n<sz-1; n+=2) {
2853               i1   = idx[0];
2854               i2   = idx[1];
2855               idx += 2;
2856               tmp0 = x[i1];
2857               tmp1 = x[i2];
2858               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2859             }
2860 
2861             if (n == sz-1){
2862               tmp0  = x[*idx];
2863               sum1 -= *v1 * tmp0;
2864             }
2865             x[row++] = sum1*(*ibdiag++);
2866             break;
2867           case 2:
2868             v2    = a->a + ii[row+1];
2869             sum1  = b[row];
2870             sum2  = b[row+1];
2871             for(n = 0; n<sz-1; n+=2) {
2872               i1   = idx[0];
2873               i2   = idx[1];
2874               idx += 2;
2875               tmp0 = x[i1];
2876               tmp1 = x[i2];
2877               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2878               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2879             }
2880 
2881             if (n == sz-1){
2882               tmp0  = x[*idx];
2883               sum1 -= v1[0] * tmp0;
2884               sum2 -= v2[0] * tmp0;
2885             }
2886             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2887             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2888             ibdiag  += 4;
2889             break;
2890           case 3:
2891             v2    = a->a + ii[row+1];
2892             v3    = a->a + ii[row+2];
2893             sum1  = b[row];
2894             sum2  = b[row+1];
2895             sum3  = b[row+2];
2896             for(n = 0; n<sz-1; n+=2) {
2897               i1   = idx[0];
2898               i2   = idx[1];
2899               idx += 2;
2900               tmp0 = x[i1];
2901               tmp1 = x[i2];
2902               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2903               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2904               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2905             }
2906 
2907             if (n == sz-1){
2908               tmp0  = x[*idx];
2909               sum1 -= v1[0] * tmp0;
2910               sum2 -= v2[0] * tmp0;
2911               sum3 -= v3[0] * tmp0;
2912             }
2913             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2914             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2915             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2916             ibdiag  += 9;
2917             break;
2918           case 4:
2919             v2    = a->a + ii[row+1];
2920             v3    = a->a + ii[row+2];
2921             v4    = a->a + ii[row+3];
2922             sum1  = b[row];
2923             sum2  = b[row+1];
2924             sum3  = b[row+2];
2925             sum4  = b[row+3];
2926             for(n = 0; n<sz-1; n+=2) {
2927               i1   = idx[0];
2928               i2   = idx[1];
2929               idx += 2;
2930               tmp0 = x[i1];
2931               tmp1 = x[i2];
2932               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2933               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2934               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2935               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2936             }
2937 
2938             if (n == sz-1){
2939               tmp0  = x[*idx];
2940               sum1 -= v1[0] * tmp0;
2941               sum2 -= v2[0] * tmp0;
2942               sum3 -= v3[0] * tmp0;
2943               sum4 -= v4[0] * tmp0;
2944             }
2945             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2946             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2947             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2948             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2949             ibdiag  += 16;
2950             break;
2951           case 5:
2952             v2    = a->a + ii[row+1];
2953             v3    = a->a + ii[row+2];
2954             v4    = a->a + ii[row+3];
2955             v5    = a->a + ii[row+4];
2956             sum1  = b[row];
2957             sum2  = b[row+1];
2958             sum3  = b[row+2];
2959             sum4  = b[row+3];
2960             sum5  = b[row+4];
2961             for(n = 0; n<sz-1; n+=2) {
2962               i1   = idx[0];
2963               i2   = idx[1];
2964               idx += 2;
2965               tmp0 = x[i1];
2966               tmp1 = x[i2];
2967               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2968               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2969               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2970               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2971               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2972             }
2973 
2974             if (n == sz-1){
2975               tmp0  = x[*idx];
2976               sum1 -= v1[0] * tmp0;
2977               sum2 -= v2[0] * tmp0;
2978               sum3 -= v3[0] * tmp0;
2979               sum4 -= v4[0] * tmp0;
2980               sum5 -= v5[0] * tmp0;
2981             }
2982             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2983             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2984             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2985             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2986             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2987             ibdiag  += 25;
2988             break;
2989 	  default:
2990    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2991         }
2992       }
2993 
2994       xb = x;
2995       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2996     } else xb = b;
2997     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2998         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2999       cnt = 0;
3000       for (i=0, row=0; i<m; i++) {
3001 
3002         switch (sizes[i]){
3003           case 1:
3004             x[row++] *= bdiag[cnt++];
3005             break;
3006           case 2:
3007             x1   = x[row]; x2 = x[row+1];
3008             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3009             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3010             x[row++] = tmp1;
3011             x[row++] = tmp2;
3012             cnt += 4;
3013             break;
3014           case 3:
3015             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3016             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3017             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3018             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3019             x[row++] = tmp1;
3020             x[row++] = tmp2;
3021             x[row++] = tmp3;
3022             cnt += 9;
3023             break;
3024           case 4:
3025             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3026             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3027             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3028             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3029             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3030             x[row++] = tmp1;
3031             x[row++] = tmp2;
3032             x[row++] = tmp3;
3033             x[row++] = tmp4;
3034             cnt += 16;
3035             break;
3036           case 5:
3037             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3038             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3039             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3040             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3041             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3042             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3043             x[row++] = tmp1;
3044             x[row++] = tmp2;
3045             x[row++] = tmp3;
3046             x[row++] = tmp4;
3047             x[row++] = tmp5;
3048             cnt += 25;
3049             break;
3050 	  default:
3051    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3052         }
3053       }
3054       ierr = PetscLogFlops(m);CHKERRQ(ierr);
3055     }
3056     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3057 
3058       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3059       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3060         ibdiag -= sizes[i]*sizes[i];
3061         sz      = ii[row+1] - diag[row] - 1;
3062         v1      = a->a + diag[row] + 1;
3063         idx     = a->j + diag[row] + 1;
3064 
3065         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3066         switch (sizes[i]){
3067           case 1:
3068 
3069             sum1  = xb[row];
3070             for(n = 0; n<sz-1; n+=2) {
3071               i1   = idx[0];
3072               i2   = idx[1];
3073               idx += 2;
3074               tmp0 = x[i1];
3075               tmp1 = x[i2];
3076               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3077             }
3078 
3079             if (n == sz-1){
3080               tmp0  = x[*idx];
3081               sum1 -= *v1*tmp0;
3082             }
3083             x[row--] = sum1*(*ibdiag);
3084             break;
3085 
3086           case 2:
3087 
3088             sum1  = xb[row];
3089             sum2  = xb[row-1];
3090             /* note that sum1 is associated with the second of the two rows */
3091             v2    = a->a + diag[row-1] + 2;
3092             for(n = 0; n<sz-1; n+=2) {
3093               i1   = idx[0];
3094               i2   = idx[1];
3095               idx += 2;
3096               tmp0 = x[i1];
3097               tmp1 = x[i2];
3098               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3099               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3100             }
3101 
3102             if (n == sz-1){
3103               tmp0  = x[*idx];
3104               sum1 -= *v1*tmp0;
3105               sum2 -= *v2*tmp0;
3106             }
3107             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3108             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3109             break;
3110           case 3:
3111 
3112             sum1  = xb[row];
3113             sum2  = xb[row-1];
3114             sum3  = xb[row-2];
3115             v2    = a->a + diag[row-1] + 2;
3116             v3    = a->a + diag[row-2] + 3;
3117             for(n = 0; n<sz-1; n+=2) {
3118               i1   = idx[0];
3119               i2   = idx[1];
3120               idx += 2;
3121               tmp0 = x[i1];
3122               tmp1 = x[i2];
3123               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3124               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3125               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3126             }
3127 
3128             if (n == sz-1){
3129               tmp0  = x[*idx];
3130               sum1 -= *v1*tmp0;
3131               sum2 -= *v2*tmp0;
3132               sum3 -= *v3*tmp0;
3133             }
3134             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3135             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3136             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3137             break;
3138           case 4:
3139 
3140             sum1  = xb[row];
3141             sum2  = xb[row-1];
3142             sum3  = xb[row-2];
3143             sum4  = xb[row-3];
3144             v2    = a->a + diag[row-1] + 2;
3145             v3    = a->a + diag[row-2] + 3;
3146             v4    = a->a + diag[row-3] + 4;
3147             for(n = 0; n<sz-1; n+=2) {
3148               i1   = idx[0];
3149               i2   = idx[1];
3150               idx += 2;
3151               tmp0 = x[i1];
3152               tmp1 = x[i2];
3153               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3154               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3155               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3156               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3157             }
3158 
3159             if (n == sz-1){
3160               tmp0  = x[*idx];
3161               sum1 -= *v1*tmp0;
3162               sum2 -= *v2*tmp0;
3163               sum3 -= *v3*tmp0;
3164               sum4 -= *v4*tmp0;
3165             }
3166             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3167             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3168             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3169             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3170             break;
3171           case 5:
3172 
3173             sum1  = xb[row];
3174             sum2  = xb[row-1];
3175             sum3  = xb[row-2];
3176             sum4  = xb[row-3];
3177             sum5  = xb[row-4];
3178             v2    = a->a + diag[row-1] + 2;
3179             v3    = a->a + diag[row-2] + 3;
3180             v4    = a->a + diag[row-3] + 4;
3181             v5    = a->a + diag[row-4] + 5;
3182             for(n = 0; n<sz-1; n+=2) {
3183               i1   = idx[0];
3184               i2   = idx[1];
3185               idx += 2;
3186               tmp0 = x[i1];
3187               tmp1 = x[i2];
3188               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3189               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3190               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3191               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3192               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3193             }
3194 
3195             if (n == sz-1){
3196               tmp0  = x[*idx];
3197               sum1 -= *v1*tmp0;
3198               sum2 -= *v2*tmp0;
3199               sum3 -= *v3*tmp0;
3200               sum4 -= *v4*tmp0;
3201               sum5 -= *v5*tmp0;
3202             }
3203             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3204             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3205             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3206             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3207             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3208             break;
3209 	  default:
3210    	    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3211         }
3212       }
3213 
3214       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3215     }
3216     its--;
3217     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3218     ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3219   }
3220   if (flag & SOR_EISENSTAT) {
3221     const PetscScalar *b;
3222     MatScalar         *t = a->inode.ssor_work;
3223 
3224     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3225     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3226     /*
3227           Apply  (U + D)^-1  where D is now the block diagonal
3228     */
3229     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3230     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3231       ibdiag -= sizes[i]*sizes[i];
3232       sz      = ii[row+1] - diag[row] - 1;
3233       v1      = a->a + diag[row] + 1;
3234       idx     = a->j + diag[row] + 1;
3235       CHKMEMQ;
3236       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3237       switch (sizes[i]){
3238         case 1:
3239 
3240 	  sum1  = b[row];
3241 	  for(n = 0; n<sz-1; n+=2) {
3242 	    i1   = idx[0];
3243 	    i2   = idx[1];
3244 	    idx += 2;
3245 	    tmp0 = x[i1];
3246 	    tmp1 = x[i2];
3247 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3248 	  }
3249 
3250 	  if (n == sz-1){
3251 	    tmp0  = x[*idx];
3252 	    sum1 -= *v1*tmp0;
3253 	  }
3254 	  x[row] = sum1*(*ibdiag);row--;
3255 	  break;
3256 
3257         case 2:
3258 
3259 	  sum1  = b[row];
3260 	  sum2  = b[row-1];
3261 	  /* note that sum1 is associated with the second of the two rows */
3262 	  v2    = a->a + diag[row-1] + 2;
3263 	  for(n = 0; n<sz-1; n+=2) {
3264 	    i1   = idx[0];
3265 	    i2   = idx[1];
3266 	    idx += 2;
3267 	    tmp0 = x[i1];
3268 	    tmp1 = x[i2];
3269 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3270 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3271 	  }
3272 
3273 	  if (n == sz-1){
3274 	    tmp0  = x[*idx];
3275 	    sum1 -= *v1*tmp0;
3276 	    sum2 -= *v2*tmp0;
3277 	  }
3278 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3279 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3280           row -= 2;
3281 	  break;
3282         case 3:
3283 
3284 	  sum1  = b[row];
3285 	  sum2  = b[row-1];
3286 	  sum3  = b[row-2];
3287 	  v2    = a->a + diag[row-1] + 2;
3288 	  v3    = a->a + diag[row-2] + 3;
3289 	  for(n = 0; n<sz-1; n+=2) {
3290 	    i1   = idx[0];
3291 	    i2   = idx[1];
3292 	    idx += 2;
3293 	    tmp0 = x[i1];
3294 	    tmp1 = x[i2];
3295 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3296 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3297 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3298 	  }
3299 
3300 	  if (n == sz-1){
3301 	    tmp0  = x[*idx];
3302 	    sum1 -= *v1*tmp0;
3303 	    sum2 -= *v2*tmp0;
3304 	    sum3 -= *v3*tmp0;
3305 	  }
3306 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3307 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3308 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3309           row -= 3;
3310 	  break;
3311         case 4:
3312 
3313 	  sum1  = b[row];
3314 	  sum2  = b[row-1];
3315 	  sum3  = b[row-2];
3316 	  sum4  = b[row-3];
3317 	  v2    = a->a + diag[row-1] + 2;
3318 	  v3    = a->a + diag[row-2] + 3;
3319 	  v4    = a->a + diag[row-3] + 4;
3320 	  for(n = 0; n<sz-1; n+=2) {
3321 	    i1   = idx[0];
3322 	    i2   = idx[1];
3323 	    idx += 2;
3324 	    tmp0 = x[i1];
3325 	    tmp1 = x[i2];
3326 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3327 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3328 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3329 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3330 	  }
3331 
3332 	  if (n == sz-1){
3333 	    tmp0  = x[*idx];
3334 	    sum1 -= *v1*tmp0;
3335 	    sum2 -= *v2*tmp0;
3336 	    sum3 -= *v3*tmp0;
3337 	    sum4 -= *v4*tmp0;
3338 	  }
3339 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3340 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3341 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3342 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3343           row -= 4;
3344 	  break;
3345         case 5:
3346 
3347 	  sum1  = b[row];
3348 	  sum2  = b[row-1];
3349 	  sum3  = b[row-2];
3350 	  sum4  = b[row-3];
3351 	  sum5  = b[row-4];
3352 	  v2    = a->a + diag[row-1] + 2;
3353 	  v3    = a->a + diag[row-2] + 3;
3354 	  v4    = a->a + diag[row-3] + 4;
3355 	  v5    = a->a + diag[row-4] + 5;
3356 	  for(n = 0; n<sz-1; n+=2) {
3357 	    i1   = idx[0];
3358 	    i2   = idx[1];
3359 	    idx += 2;
3360 	    tmp0 = x[i1];
3361 	    tmp1 = x[i2];
3362 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3363 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3364 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3365 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3366 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3367 	  }
3368 
3369 	  if (n == sz-1){
3370 	    tmp0  = x[*idx];
3371 	    sum1 -= *v1*tmp0;
3372 	    sum2 -= *v2*tmp0;
3373 	    sum3 -= *v3*tmp0;
3374 	    sum4 -= *v4*tmp0;
3375 	    sum5 -= *v5*tmp0;
3376 	  }
3377 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3378 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3379 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3380 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3381 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3382           row -= 5;
3383 	  break;
3384         default:
3385 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3386       }
3387       CHKMEMQ;
3388     }
3389     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3390 
3391     /*
3392            t = b - D x    where D is the block diagonal
3393     */
3394     cnt = 0;
3395     for (i=0, row=0; i<m; i++) {
3396       CHKMEMQ;
3397       switch (sizes[i]){
3398         case 1:
3399 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3400 	  break;
3401         case 2:
3402 	  x1   = x[row]; x2 = x[row+1];
3403 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3404 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3405 	  t[row]   = b[row] - tmp1;
3406 	  t[row+1] = b[row+1] - tmp2; row += 2;
3407 	  cnt += 4;
3408 	  break;
3409         case 3:
3410 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3411 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3412 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3413 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3414 	  t[row] = b[row] - tmp1;
3415 	  t[row+1] = b[row+1] - tmp2;
3416 	  t[row+2] = b[row+2] - tmp3; row += 3;
3417 	  cnt += 9;
3418 	  break;
3419         case 4:
3420 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3421 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3422 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3423 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3424 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3425 	  t[row] = b[row] - tmp1;
3426 	  t[row+1] = b[row+1] - tmp2;
3427 	  t[row+2] = b[row+2] - tmp3;
3428 	  t[row+3] = b[row+3] - tmp4; row += 4;
3429 	  cnt += 16;
3430 	  break;
3431         case 5:
3432 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3433 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3434 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3435 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3436 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3437 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3438 	  t[row] = b[row] - tmp1;
3439 	  t[row+1] = b[row+1] - tmp2;
3440 	  t[row+2] = b[row+2] - tmp3;
3441 	  t[row+3] = b[row+3] - tmp4;
3442 	  t[row+4] = b[row+4] - tmp5;row += 5;
3443 	  cnt += 25;
3444 	  break;
3445         default:
3446 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3447       }
3448       CHKMEMQ;
3449     }
3450     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3451 
3452 
3453 
3454     /*
3455           Apply (L + D)^-1 where D is the block diagonal
3456     */
3457     for (i=0, row=0; i<m; i++) {
3458       sz  = diag[row] - ii[row];
3459       v1  = a->a + ii[row];
3460       idx = a->j + ii[row];
3461       CHKMEMQ;
3462       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3463       switch (sizes[i]){
3464         case 1:
3465 
3466 	  sum1  = t[row];
3467 	  for(n = 0; n<sz-1; n+=2) {
3468 	    i1   = idx[0];
3469 	    i2   = idx[1];
3470 	    idx += 2;
3471 	    tmp0 = t[i1];
3472 	    tmp1 = t[i2];
3473 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3474 	  }
3475 
3476 	  if (n == sz-1){
3477 	    tmp0  = t[*idx];
3478 	    sum1 -= *v1 * tmp0;
3479 	  }
3480 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3481 	  break;
3482         case 2:
3483 	  v2    = a->a + ii[row+1];
3484 	  sum1  = t[row];
3485 	  sum2  = t[row+1];
3486 	  for(n = 0; n<sz-1; n+=2) {
3487 	    i1   = idx[0];
3488 	    i2   = idx[1];
3489 	    idx += 2;
3490 	    tmp0 = t[i1];
3491 	    tmp1 = t[i2];
3492 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3493 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3494 	  }
3495 
3496 	  if (n == sz-1){
3497 	    tmp0  = t[*idx];
3498 	    sum1 -= v1[0] * tmp0;
3499 	    sum2 -= v2[0] * tmp0;
3500 	  }
3501 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3502 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3503 	  ibdiag  += 4; row += 2;
3504 	  break;
3505         case 3:
3506 	  v2    = a->a + ii[row+1];
3507 	  v3    = a->a + ii[row+2];
3508 	  sum1  = t[row];
3509 	  sum2  = t[row+1];
3510 	  sum3  = t[row+2];
3511 	  for(n = 0; n<sz-1; n+=2) {
3512 	    i1   = idx[0];
3513 	    i2   = idx[1];
3514 	    idx += 2;
3515 	    tmp0 = t[i1];
3516 	    tmp1 = t[i2];
3517 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3518 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3519 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3520 	  }
3521 
3522 	  if (n == sz-1){
3523 	    tmp0  = t[*idx];
3524 	    sum1 -= v1[0] * tmp0;
3525 	    sum2 -= v2[0] * tmp0;
3526 	    sum3 -= v3[0] * tmp0;
3527 	  }
3528 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3529 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3530 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3531 	  ibdiag  += 9; row += 3;
3532 	  break;
3533         case 4:
3534 	  v2    = a->a + ii[row+1];
3535 	  v3    = a->a + ii[row+2];
3536 	  v4    = a->a + ii[row+3];
3537 	  sum1  = t[row];
3538 	  sum2  = t[row+1];
3539 	  sum3  = t[row+2];
3540 	  sum4  = t[row+3];
3541 	  for(n = 0; n<sz-1; n+=2) {
3542 	    i1   = idx[0];
3543 	    i2   = idx[1];
3544 	    idx += 2;
3545 	    tmp0 = t[i1];
3546 	    tmp1 = t[i2];
3547 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3548 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3549 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3550 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3551 	  }
3552 
3553 	  if (n == sz-1){
3554 	    tmp0  = t[*idx];
3555 	    sum1 -= v1[0] * tmp0;
3556 	    sum2 -= v2[0] * tmp0;
3557 	    sum3 -= v3[0] * tmp0;
3558 	    sum4 -= v4[0] * tmp0;
3559 	  }
3560 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3561 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3562 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3563 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3564 	  ibdiag  += 16; row += 4;
3565 	  break;
3566         case 5:
3567 	  v2    = a->a + ii[row+1];
3568 	  v3    = a->a + ii[row+2];
3569 	  v4    = a->a + ii[row+3];
3570 	  v5    = a->a + ii[row+4];
3571 	  sum1  = t[row];
3572 	  sum2  = t[row+1];
3573 	  sum3  = t[row+2];
3574 	  sum4  = t[row+3];
3575 	  sum5  = t[row+4];
3576 	  for(n = 0; n<sz-1; n+=2) {
3577 	    i1   = idx[0];
3578 	    i2   = idx[1];
3579 	    idx += 2;
3580 	    tmp0 = t[i1];
3581 	    tmp1 = t[i2];
3582 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3583 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3584 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3585 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3586 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3587 	  }
3588 
3589 	  if (n == sz-1){
3590 	    tmp0  = t[*idx];
3591 	    sum1 -= v1[0] * tmp0;
3592 	    sum2 -= v2[0] * tmp0;
3593 	    sum3 -= v3[0] * tmp0;
3594 	    sum4 -= v4[0] * tmp0;
3595 	    sum5 -= v5[0] * tmp0;
3596 	  }
3597 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3598 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3599 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3600 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3601 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3602 	  ibdiag  += 25; row += 5;
3603 	  break;
3604         default:
3605 	  SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3606       }
3607       CHKMEMQ;
3608     }
3609     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3610     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3611     ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3612   }
3613   PetscFunctionReturn(0);
3614 }
3615 
3616 #undef __FUNCT__
3617 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3618 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3619 {
3620   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3621   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3622   const MatScalar    *bdiag = a->inode.bdiag;
3623   const PetscScalar  *b;
3624   PetscErrorCode      ierr;
3625   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3626   const PetscInt      *sizes = a->inode.size;
3627 
3628   PetscFunctionBegin;
3629   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3630   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3631   cnt = 0;
3632   for (i=0, row=0; i<m; i++) {
3633     switch (sizes[i]){
3634       case 1:
3635 	x[row] = b[row]*bdiag[cnt++];row++;
3636 	break;
3637       case 2:
3638 	x1   = b[row]; x2 = b[row+1];
3639 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3640 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3641 	x[row++] = tmp1;
3642 	x[row++] = tmp2;
3643 	cnt += 4;
3644 	break;
3645       case 3:
3646 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3647 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3648 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3649 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3650 	x[row++] = tmp1;
3651 	x[row++] = tmp2;
3652 	x[row++] = tmp3;
3653 	cnt += 9;
3654 	break;
3655       case 4:
3656 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3657 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3658 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3659 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3660 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3661 	x[row++] = tmp1;
3662 	x[row++] = tmp2;
3663 	x[row++] = tmp3;
3664 	x[row++] = tmp4;
3665 	cnt += 16;
3666 	break;
3667       case 5:
3668 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3669 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3670 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3671 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3672 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3673 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3674 	x[row++] = tmp1;
3675 	x[row++] = tmp2;
3676 	x[row++] = tmp3;
3677 	x[row++] = tmp4;
3678 	x[row++] = tmp5;
3679 	cnt += 25;
3680 	break;
3681       default:
3682 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3683     }
3684   }
3685   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3686   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3687   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3688   PetscFunctionReturn(0);
3689 }
3690 
3691 /*
3692     samestructure indicates that the matrix has not changed its nonzero structure so we
3693     do not need to recompute the inodes
3694 */
3695 #undef __FUNCT__
3696 #define __FUNCT__ "Mat_CheckInode"
3697 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3698 {
3699   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3700   PetscErrorCode ierr;
3701   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3702   PetscTruth     flag;
3703 
3704   PetscFunctionBegin;
3705   if (!a->inode.use)                     PetscFunctionReturn(0);
3706   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3707 
3708 
3709   m = A->rmap->n;
3710   if (a->inode.size) {ns = a->inode.size;}
3711   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3712 
3713   i          = 0;
3714   node_count = 0;
3715   idx        = a->j;
3716   ii         = a->i;
3717   while (i < m){                /* For each row */
3718     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3719     /* Limits the number of elements in a node to 'a->inode.limit' */
3720     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3721       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3722       if (nzy != nzx) break;
3723       idy  += nzx;             /* Same nonzero pattern */
3724       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3725       if (!flag) break;
3726     }
3727     ns[node_count++] = blk_size;
3728     idx += blk_size*nzx;
3729     i    = j;
3730   }
3731   /* If not enough inodes found,, do not use inode version of the routines */
3732   if (!m || node_count > .9*m) {
3733     ierr = PetscFree(ns);CHKERRQ(ierr);
3734     a->inode.node_count       = 0;
3735     a->inode.size             = PETSC_NULL;
3736     a->inode.use              = PETSC_FALSE;
3737     A->ops->mult              = MatMult_SeqAIJ;
3738     A->ops->sor               = MatSOR_SeqAIJ;
3739     A->ops->multadd           = MatMultAdd_SeqAIJ;
3740     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
3741     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
3742     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
3743     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
3744     A->ops->coloringpatch     = 0;
3745     A->ops->multdiagonalblock = 0;
3746     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3747   } else {
3748     if (!A->factortype) {
3749       A->ops->mult              = MatMult_SeqAIJ_Inode;
3750       A->ops->sor               = MatSOR_SeqAIJ_Inode;
3751       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3752       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3753       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3754       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3755       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3756       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3757       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3758     } else {
3759       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
3760     }
3761     a->inode.node_count       = node_count;
3762     a->inode.size             = ns;
3763     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3764   }
3765   PetscFunctionReturn(0);
3766 }
3767 
3768 #undef __FUNCT__
3769 #define __FUNCT__ "MatGetRow_FactoredLU"
3770 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
3771 {
3772   PetscInt k, *vi;
3773   PetscFunctionBegin;
3774   vi = aj + ai[row];
3775   for(k=0;k<nzl;k++) cols[k] = vi[k];
3776   vi = aj + adiag[row];
3777   cols[nzl] = vi[0];
3778   vi = aj + adiag[row+1]+1;
3779   for(k=0;k<nzu;k++) cols[nzl+1+k] = vi[k];
3780   PetscFunctionReturn(0);
3781 }
3782 /*
3783    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3784    Modified from Mat_CheckInode().
3785 
3786    Input Parameters:
3787 +  Mat A - ILU or LU matrix factor
3788 -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
3789     do not need to recompute the inodes
3790 */
3791 #undef __FUNCT__
3792 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3793 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3794 {
3795   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3796   PetscErrorCode ierr;
3797   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3798   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3799   PetscTruth     flag;
3800 
3801   PetscFunctionBegin;
3802   if (!a->inode.use)                     PetscFunctionReturn(0);
3803   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3804 
3805   m = A->rmap->n;
3806   if (a->inode.size) {ns = a->inode.size;}
3807   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3808 
3809   i          = 0;
3810   node_count = 0;
3811   while (i < m){                /* For each row */
3812     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3813     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3814     nzx  = nzl1 + nzu1 + 1;
3815     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3816     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3817 
3818     /* Limits the number of elements in a node to 'a->inode.limit' */
3819     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3820       nzl2    = ai[j+1] - ai[j];
3821       nzu2    = adiag[j] - adiag[j+1] - 1;
3822       nzy     = nzl2 + nzu2 + 1;
3823       if( nzy != nzx) break;
3824       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3825       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
3826       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3827       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3828       ierr = PetscFree(cols2);CHKERRQ(ierr);
3829     }
3830     ns[node_count++] = blk_size;
3831     ierr = PetscFree(cols1);CHKERRQ(ierr);
3832     i    = j;
3833   }
3834   /* If not enough inodes found,, do not use inode version of the routines */
3835   if (!m || node_count > .9*m) {
3836     ierr = PetscFree(ns);CHKERRQ(ierr);
3837     a->inode.node_count     = 0;
3838     a->inode.size           = PETSC_NULL;
3839     a->inode.use            = PETSC_FALSE;
3840     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3841   } else {
3842     A->ops->mult              = 0;
3843     A->ops->sor               = 0;
3844     A->ops->multadd           = 0;
3845     A->ops->getrowij          = 0;
3846     A->ops->restorerowij      = 0;
3847     A->ops->getcolumnij       = 0;
3848     A->ops->restorecolumnij   = 0;
3849     A->ops->coloringpatch     = 0;
3850     A->ops->multdiagonalblock = 0;
3851     A->ops->solve             = MatSolve_SeqAIJ_Inode;
3852     a->inode.node_count       = node_count;
3853     a->inode.size             = ns;
3854     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3855   }
3856   PetscFunctionReturn(0);
3857 }
3858 
3859 /*
3860      This is really ugly. if inodes are used this replaces the
3861   permutations with ones that correspond to rows/cols of the matrix
3862   rather then inode blocks
3863 */
3864 #undef __FUNCT__
3865 #define __FUNCT__ "MatInodeAdjustForInodes"
3866 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3867 {
3868   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3869 
3870   PetscFunctionBegin;
3871   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3872   if (f) {
3873     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3874   }
3875   PetscFunctionReturn(0);
3876 }
3877 
3878 EXTERN_C_BEGIN
3879 #undef __FUNCT__
3880 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3881 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3882 {
3883   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3884   PetscErrorCode ierr;
3885   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3886   const PetscInt *ridx,*cidx;
3887   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3888   PetscInt       nslim_col,*ns_col;
3889   IS             ris = *rperm,cis = *cperm;
3890 
3891   PetscFunctionBegin;
3892   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3893   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3894 
3895   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3896   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3897   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3898 
3899   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3900   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3901 
3902   /* Form the inode structure for the rows of permuted matric using inv perm*/
3903   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3904 
3905   /* Construct the permutations for rows*/
3906   for (i=0,row = 0; i<nslim_row; ++i){
3907     indx      = ridx[i];
3908     start_val = tns[indx];
3909     end_val   = tns[indx + 1];
3910     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3911   }
3912 
3913   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3914   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3915 
3916  /* Construct permutations for columns */
3917   for (i=0,col=0; i<nslim_col; ++i){
3918     indx      = cidx[i];
3919     start_val = tns[indx];
3920     end_val   = tns[indx + 1];
3921     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3922   }
3923 
3924   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3925   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3926   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3927   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3928 
3929   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3930   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3931 
3932   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3933   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3934   ierr = ISDestroy(cis);CHKERRQ(ierr);
3935   ierr = ISDestroy(ris);CHKERRQ(ierr);
3936   ierr = PetscFree(tns);CHKERRQ(ierr);
3937   PetscFunctionReturn(0);
3938 }
3939 EXTERN_C_END
3940 
3941 #undef __FUNCT__
3942 #define __FUNCT__ "MatInodeGetInodeSizes"
3943 /*@C
3944    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3945 
3946    Collective on Mat
3947 
3948    Input Parameter:
3949 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3950 
3951    Output Parameter:
3952 +  node_count - no of inodes present in the matrix.
3953 .  sizes      - an array of size node_count,with sizes of each inode.
3954 -  limit      - the max size used to generate the inodes.
3955 
3956    Level: advanced
3957 
3958    Notes: This routine returns some internal storage information
3959    of the matrix, it is intended to be used by advanced users.
3960    It should be called after the matrix is assembled.
3961    The contents of the sizes[] array should not be changed.
3962    PETSC_NULL may be passed for information not requested.
3963 
3964 .keywords: matrix, seqaij, get, inode
3965 
3966 .seealso: MatGetInfo()
3967 @*/
3968 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3969 {
3970   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3971 
3972   PetscFunctionBegin;
3973   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3974   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3975   if (f) {
3976     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3977   }
3978   PetscFunctionReturn(0);
3979 }
3980 
3981 EXTERN_C_BEGIN
3982 #undef __FUNCT__
3983 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3984 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3985 {
3986   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3987 
3988   PetscFunctionBegin;
3989   if (node_count) *node_count = a->inode.node_count;
3990   if (sizes)      *sizes      = a->inode.size;
3991   if (limit)      *limit      = a->inode.limit;
3992   PetscFunctionReturn(0);
3993 }
3994 EXTERN_C_END
3995