xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 6c69925837aec6980b8a9f96be97fd240ba46d6a)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1195   FactorShiftCtx   sctx;
1196   const PetscInt   *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,node_max,col;
1200   const PetscInt   *ns;
1201   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1202 
1203   PetscFunctionBegin;
1204   /* MatPivotSetUp(): initialize shift context sctx */
1205   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1206 
1207   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1208     ddiag          = a->diag;
1209     sctx.shift_top = info->zeropivot;
1210     for (i=0; i<n; i++) {
1211       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1212       d  = (aa)[ddiag[i]];
1213       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1214       v  = aa+ai[i];
1215       nz = ai[i+1] - ai[i];
1216       for (j=0; j<nz; j++)
1217 	rs += PetscAbsScalar(v[j]);
1218       if (rs>sctx.shift_top) sctx.shift_top = rs;
1219     }
1220     sctx.shift_top   *= 1.1;
1221     sctx.nshift_max   = 5;
1222     sctx.shift_lo     = 0.;
1223     sctx.shift_hi     = 1.;
1224   }
1225 
1226   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1227   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1228 
1229   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1230   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1231   rtmp2 = rtmp1 + n;
1232   rtmp3 = rtmp2 + n;
1233   rtmp4 = rtmp3 + n;
1234   ics   = ic;
1235 
1236   node_max = a->inode.node_count;
1237   ns       = a->inode.size;
1238   if (!ns){
1239     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1240   }
1241 
1242   /* If max inode size > 4, split it into two inodes.*/
1243   /* also map the inode sizes according to the ordering */
1244   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1245   for (i=0,j=0; i<node_max; ++i,++j){
1246     if (ns[i]>4) {
1247       tmp_vec1[j] = 4;
1248       ++j;
1249       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1250     } else {
1251       tmp_vec1[j] = ns[i];
1252     }
1253   }
1254   /* Use the correct node_max */
1255   node_max = j;
1256 
1257   /* Now reorder the inode info based on mat re-ordering info */
1258   /* First create a row -> inode_size_array_index map */
1259   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1260   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1261   for (i=0,row=0; i<node_max; i++) {
1262     nodesz = tmp_vec1[i];
1263     for (j=0; j<nodesz; j++,row++) {
1264       nsmap[row] = i;
1265     }
1266   }
1267   /* Using nsmap, create a reordered ns structure */
1268   for (i=0,j=0; i< node_max; i++) {
1269     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1270     tmp_vec2[i]  = nodesz;
1271     j           += nodesz;
1272   }
1273   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1274   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1275 
1276   /* Now use the correct ns */
1277   ns = tmp_vec2;
1278 
1279   do {
1280     sctx.useshift = PETSC_FALSE;
1281     /* Now loop over each block-row, and do the factorization */
1282     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1283       nodesz = ns[inod];
1284 
1285       switch (nodesz){
1286       case 1:
1287       /*----------*/
1288         /* zero rtmp1 */
1289         /* L part */
1290         nz    = bi[i+1] - bi[i];
1291         bjtmp = bj + bi[i];
1292         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1293 
1294         /* U part */
1295         nz = bdiag[i]-bdiag[i+1];
1296         bjtmp = bj + bdiag[i+1]+1;
1297         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1298 
1299         /* load in initial (unfactored row) */
1300         nz    = ai[r[i]+1] - ai[r[i]];
1301         ajtmp = aj + ai[r[i]];
1302         v     = aa + ai[r[i]];
1303         for (j=0; j<nz; j++) {
1304           rtmp1[ics[ajtmp[j]]] = v[j];
1305         }
1306         /* ZeropivotApply() */
1307         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1308 
1309         /* elimination */
1310         bjtmp = bj + bi[i];
1311         row   = *bjtmp++;
1312         nzL   = bi[i+1] - bi[i];
1313         for(k=0; k < nzL;k++) {
1314           pc = rtmp1 + row;
1315           if (*pc != 0.0) {
1316             pv   = b->a + bdiag[row];
1317             mul1 = *pc * (*pv);
1318             *pc  = mul1;
1319             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1320             pv = b->a + bdiag[row+1]+1;
1321             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1322             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1323             ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1324           }
1325           row = *bjtmp++;
1326         }
1327 
1328         /* finished row so stick it into b->a */
1329         rs = 0.0;
1330         /* L part */
1331         pv = b->a + bi[i] ;
1332         pj = b->j + bi[i] ;
1333         nz = bi[i+1] - bi[i];
1334         for (j=0; j<nz; j++) {
1335           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1336         }
1337 
1338         /* U part */
1339         pv = b->a + bdiag[i+1]+1;
1340         pj = b->j + bdiag[i+1]+1;
1341         nz = bdiag[i] - bdiag[i+1]-1;
1342         for (j=0; j<nz; j++) {
1343           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1344         }
1345 
1346         /* Check zero pivot */
1347         sctx.rs = rs;
1348         sctx.pv = rtmp1[i];
1349         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1350 
1351         /* Mark diagonal and invert diagonal for simplier triangular solves */
1352         pv  = b->a + bdiag[i];
1353         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1354         break;
1355 
1356       case 2:
1357       /*----------*/
1358         /* zero rtmp1 and rtmp2 */
1359         /* L part */
1360         nz    = bi[i+1] - bi[i];
1361         bjtmp = bj + bi[i];
1362         for  (j=0; j<nz; j++) {
1363           col = bjtmp[j];
1364           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1365         }
1366 
1367         /* U part */
1368         nz = bdiag[i]-bdiag[i+1];
1369         bjtmp = bj + bdiag[i+1]+1;
1370         for  (j=0; j<nz; j++) {
1371           col = bjtmp[j];
1372           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1373         }
1374 
1375         /* load in initial (unfactored row) */
1376         nz    = ai[r[i]+1] - ai[r[i]];
1377         ajtmp = aj + ai[r[i]];
1378         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1379         for (j=0; j<nz; j++) {
1380           col = ics[ajtmp[j]];
1381           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1382         }
1383         /* ZeropivotApply(): shift the diagonal of the matrix  */
1384         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1385 
1386         /* elimination */
1387         bjtmp = bj + bi[i];
1388         row   = *bjtmp++; /* pivot row */
1389         nzL   = bi[i+1] - bi[i];
1390         for(k=0; k < nzL;k++) {
1391           pc1 = rtmp1 + row;
1392           pc2 = rtmp2 + row;
1393           if (*pc1 != 0.0 || *pc2 != 0.0) {
1394             pv   = b->a + bdiag[row];
1395             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1396             *pc1 = mul1;       *pc2 = mul2;
1397 
1398             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1399             pv = b->a + bdiag[row+1]+1;
1400             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1401             for (j=0; j<nz; j++){
1402               col = pj[j];
1403               rtmp1[col] -= mul1 * pv[j];
1404               rtmp2[col] -= mul2 * pv[j];
1405             }
1406             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1407           }
1408           row = *bjtmp++;
1409         }
1410 
1411         /* finished row i; check zero pivot, then stick row i into b->a */
1412         rs  = 0.0;
1413         /* L part */
1414         pc1 = b->a + bi[i];
1415         pj  = b->j + bi[i] ;
1416         nz  = bi[i+1] - bi[i];
1417         for (j=0; j<nz; j++) {
1418           col = pj[j];
1419           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1420         }
1421         /* U part */
1422         pc1 = b->a + bdiag[i+1]+1;
1423         pj  = b->j + bdiag[i+1]+1;
1424         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1425         for (j=0; j<nz; j++) {
1426           col = pj[j];
1427           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1428         }
1429 
1430         sctx.rs  = rs;
1431         sctx.pv  = rtmp1[i];
1432         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1433         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1434         *pc1 = 1.0/sctx.pv;
1435 
1436         /* Now take care of diagonal 2x2 block. */
1437         pc2 = rtmp2 + i;
1438         if (*pc2 != 0.0){
1439           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1440           *pc2 = mul1;          /* insert L entry */
1441           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1442           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1443           for (j=0; j<nz; j++) {
1444             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1445           }
1446           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1447         }
1448 
1449         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1450         rs = 0.0;
1451         /* L part */
1452         pc2 = b->a + bi[i+1];
1453         pj  = b->j + bi[i+1] ;
1454         nz  = bi[i+2] - bi[i+1];
1455         for (j=0; j<nz; j++) {
1456           col = pj[j];
1457           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1458         }
1459         /* U part */
1460         pc2 = b->a + bdiag[i+2]+1;
1461         pj  = b->j + bdiag[i+2]+1;
1462         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1463         for (j=0; j<nz; j++) {
1464           col = pj[j];
1465           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1466         }
1467 
1468         sctx.rs  = rs;
1469         sctx.pv  = rtmp2[i+1];
1470         ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr);
1471         pc2  = b->a + bdiag[i+1];
1472         *pc2 = 1.0/sctx.pv;
1473         break;
1474 
1475       case 3:
1476       /*----------*/
1477         /* zero rtmp */
1478         /* L part */
1479         nz    = bi[i+1] - bi[i];
1480         bjtmp = bj + bi[i];
1481         for  (j=0; j<nz; j++) {
1482           col = bjtmp[j];
1483           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1484         }
1485 
1486         /* U part */
1487         nz = bdiag[i]-bdiag[i+1];
1488         bjtmp = bj + bdiag[i+1]+1;
1489         for  (j=0; j<nz; j++) {
1490           col = bjtmp[j];
1491           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1492         }
1493 
1494         /* load in initial (unfactored row) */
1495         nz    = ai[r[i]+1] - ai[r[i]];
1496         ajtmp = aj + ai[r[i]];
1497         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1498         for (j=0; j<nz; j++) {
1499           col = ics[ajtmp[j]];
1500           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1501         }
1502         /* ZeropivotApply(): shift the diagonal of the matrix  */
1503         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1504 
1505         /* elimination */
1506         bjtmp = bj + bi[i];
1507         row   = *bjtmp++; /* pivot row */
1508         nzL   = bi[i+1] - bi[i];
1509         for(k=0; k < nzL;k++) {
1510           pc1 = rtmp1 + row;
1511           pc2 = rtmp2 + row;
1512           pc3 = rtmp3 + row;
1513           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1514             pv  = b->a + bdiag[row];
1515             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1516             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1517 
1518             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1519             pv = b->a + bdiag[row+1]+1;
1520             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1521             for (j=0; j<nz; j++){
1522               col = pj[j];
1523               rtmp1[col] -= mul1 * pv[j];
1524               rtmp2[col] -= mul2 * pv[j];
1525               rtmp3[col] -= mul3 * pv[j];
1526             }
1527             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1528           }
1529           row = *bjtmp++;
1530         }
1531 
1532         /* finished row i; check zero pivot, then stick row i into b->a */
1533         rs  = 0.0;
1534         /* L part */
1535         pc1 = b->a + bi[i];
1536         pj  = b->j + bi[i] ;
1537         nz  = bi[i+1] - bi[i];
1538         for (j=0; j<nz; j++) {
1539           col = pj[j];
1540           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1541         }
1542         /* U part */
1543         pc1 = b->a + bdiag[i+1]+1;
1544         pj  = b->j + bdiag[i+1]+1;
1545         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1546         for (j=0; j<nz; j++) {
1547           col = pj[j];
1548           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1549         }
1550 
1551         sctx.rs  = rs;
1552         sctx.pv  = rtmp1[i];
1553         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1554         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1555         *pc1 = 1.0/sctx.pv;
1556 
1557         /* Now take care of 1st column of diagonal 3x3 block. */
1558         pc2 = rtmp2 + i;
1559         pc3 = rtmp3 + i;
1560         if (*pc2 != 0.0 || *pc3 != 0.0){
1561           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1562           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1563           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1564           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1565           for (j=0; j<nz; j++) {
1566             col = pj[j];
1567             rtmp2[col] -= mul2 * rtmp1[col];
1568             rtmp3[col] -= mul3 * rtmp1[col];
1569           }
1570           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1571         }
1572 
1573         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1574         rs = 0.0;
1575         /* L part */
1576         pc2 = b->a + bi[i+1];
1577         pj  = b->j + bi[i+1] ;
1578         nz  = bi[i+2] - bi[i+1];
1579         for (j=0; j<nz; j++) {
1580           col = pj[j];
1581           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1582         }
1583         /* U part */
1584         pc2 = b->a + bdiag[i+2]+1;
1585         pj  = b->j + bdiag[i+2]+1;
1586         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1587         for (j=0; j<nz; j++) {
1588           col = pj[j];
1589           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1590         }
1591 
1592         sctx.rs  = rs;
1593         sctx.pv  = rtmp2[i+1];
1594         ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr);
1595         pc2  = b->a + bdiag[i+1];
1596         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1597 
1598         /* Now take care of 2nd column of diagonal 3x3 block. */
1599         pc3 = rtmp3 + i+1;
1600         if (*pc3 != 0.0){
1601           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1602           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1603           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1604           for (j=0; j<nz; j++) {
1605             col = pj[j];
1606             rtmp3[col] -= mul3 * rtmp2[col];
1607           }
1608           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1609         }
1610 
1611         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1612         rs = 0.0;
1613         /* L part */
1614         pc3 = b->a + bi[i+2];
1615         pj  = b->j + bi[i+2] ;
1616         nz  = bi[i+3] - bi[i+2];
1617         for (j=0; j<nz; j++) {
1618           col = pj[j];
1619           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1620         }
1621         /* U part */
1622         pc3 = b->a + bdiag[i+3]+1;
1623         pj  = b->j + bdiag[i+3]+1;
1624         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1625         for (j=0; j<nz; j++) {
1626           col = pj[j];
1627           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1628         }
1629 
1630         sctx.rs  = rs;
1631         sctx.pv  = rtmp3[i+2];
1632         ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr);
1633         pc3  = b->a + bdiag[i+2];
1634         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1635         break;
1636       case 4:
1637       /*----------*/
1638         /* zero rtmp */
1639         /* L part */
1640         nz    = bi[i+1] - bi[i];
1641         bjtmp = bj + bi[i];
1642         for  (j=0; j<nz; j++) {
1643           col = bjtmp[j];
1644           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1645         }
1646 
1647         /* U part */
1648         nz = bdiag[i]-bdiag[i+1];
1649         bjtmp = bj + bdiag[i+1]+1;
1650         for  (j=0; j<nz; j++) {
1651           col = bjtmp[j];
1652           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1653         }
1654 
1655         /* load in initial (unfactored row) */
1656         nz    = ai[r[i]+1] - ai[r[i]];
1657         ajtmp = aj + ai[r[i]];
1658         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1659         for (j=0; j<nz; j++) {
1660           col = ics[ajtmp[j]];
1661           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1662         }
1663         /* ZeropivotApply(): shift the diagonal of the matrix  */
1664         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1665 
1666         /* elimination */
1667         bjtmp = bj + bi[i];
1668         row   = *bjtmp++; /* pivot row */
1669         nzL   = bi[i+1] - bi[i];
1670         for(k=0; k < nzL;k++) {
1671           pc1 = rtmp1 + row;
1672           pc2 = rtmp2 + row;
1673           pc3 = rtmp3 + row;
1674 	  pc4 = rtmp4 + row;
1675           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1676             pv  = b->a + bdiag[row];
1677             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1678             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1679 
1680             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1681             pv = b->a + bdiag[row+1]+1;
1682             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1683             for (j=0; j<nz; j++){
1684               col = pj[j];
1685               rtmp1[col] -= mul1 * pv[j];
1686               rtmp2[col] -= mul2 * pv[j];
1687               rtmp3[col] -= mul3 * pv[j];
1688 	      rtmp4[col] -= mul4 * pv[j];
1689             }
1690             ierr = PetscLogFlops(8*nz);CHKERRQ(ierr);
1691           }
1692           row = *bjtmp++;
1693         }
1694 
1695         /* finished row i; check zero pivot, then stick row i into b->a */
1696         rs  = 0.0;
1697         /* L part */
1698         pc1 = b->a + bi[i];
1699         pj  = b->j + bi[i] ;
1700         nz  = bi[i+1] - bi[i];
1701         for (j=0; j<nz; j++) {
1702           col = pj[j];
1703           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1704         }
1705         /* U part */
1706         pc1 = b->a + bdiag[i+1]+1;
1707         pj  = b->j + bdiag[i+1]+1;
1708         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1709         for (j=0; j<nz; j++) {
1710           col = pj[j];
1711           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1712         }
1713 
1714         sctx.rs  = rs;
1715         sctx.pv  = rtmp1[i];
1716         ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr);
1717         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1718         *pc1 = 1.0/sctx.pv;
1719 
1720         /* Now take care of 1st column of diagonal 4x4 block. */
1721         pc2 = rtmp2 + i;
1722         pc3 = rtmp3 + i;
1723 	pc4 = rtmp4 + i;
1724         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){
1725           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1726           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1727 	  mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1728           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1729           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1730           for (j=0; j<nz; j++) {
1731             col = pj[j];
1732             rtmp2[col] -= mul2 * rtmp1[col];
1733             rtmp3[col] -= mul3 * rtmp1[col];
1734 	    rtmp4[col] -= mul4 * rtmp1[col];
1735           }
1736           ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1737         }
1738 
1739         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1740         rs = 0.0;
1741         /* L part */
1742         pc2 = b->a + bi[i+1];
1743         pj  = b->j + bi[i+1] ;
1744         nz  = bi[i+2] - bi[i+1];
1745         for (j=0; j<nz; j++) {
1746           col = pj[j];
1747           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1748         }
1749         /* U part */
1750         pc2 = b->a + bdiag[i+2]+1;
1751         pj  = b->j + bdiag[i+2]+1;
1752         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1753         for (j=0; j<nz; j++) {
1754           col = pj[j];
1755           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1756         }
1757 
1758         sctx.rs  = rs;
1759         sctx.pv  = rtmp2[i+1];
1760         ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr);
1761         pc2  = b->a + bdiag[i+1];
1762         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1763 
1764         /* Now take care of 2nd column of diagonal 4x4 block. */
1765         pc3 = rtmp3 + i+1;
1766 	pc4 = rtmp4 + i+1;
1767         if (*pc3 != 0.0 || *pc4 != 0.0){
1768           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1769 	  mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1770           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1771           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1772           for (j=0; j<nz; j++) {
1773             col = pj[j];
1774             rtmp3[col] -= mul3 * rtmp2[col];
1775 	    rtmp4[col] -= mul4 * rtmp2[col];
1776           }
1777           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1778         }
1779 
1780         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1781         rs = 0.0;
1782         /* L part */
1783         pc3 = b->a + bi[i+2];
1784         pj  = b->j + bi[i+2] ;
1785         nz  = bi[i+3] - bi[i+2];
1786         for (j=0; j<nz; j++) {
1787           col = pj[j];
1788           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1789         }
1790         /* U part */
1791         pc3 = b->a + bdiag[i+3]+1;
1792         pj  = b->j + bdiag[i+3]+1;
1793         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1794         for (j=0; j<nz; j++) {
1795           col = pj[j];
1796           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1797         }
1798 
1799         sctx.rs  = rs;
1800         sctx.pv  = rtmp3[i+2];
1801         ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr);
1802         pc3  = b->a + bdiag[i+2];
1803         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1804 
1805 	/* Now take care of 3rd column of diagonal 4x4 block. */
1806 	pc4 = rtmp4 + i+2;
1807         if (*pc4 != 0.0){
1808 	  mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1809           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1810           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1811           for (j=0; j<nz; j++) {
1812             col = pj[j];
1813 	    rtmp4[col] -= mul4 * rtmp3[col];
1814           }
1815           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1816         }
1817 
1818         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1819         rs = 0.0;
1820         /* L part */
1821         pc4 = b->a + bi[i+3];
1822         pj  = b->j + bi[i+3] ;
1823         nz  = bi[i+4] - bi[i+3];
1824         for (j=0; j<nz; j++) {
1825           col = pj[j];
1826           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827         }
1828         /* U part */
1829         pc4 = b->a + bdiag[i+4]+1;
1830         pj  = b->j + bdiag[i+4]+1;
1831         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1832         for (j=0; j<nz; j++) {
1833           col = pj[j];
1834           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1835         }
1836 
1837         sctx.rs  = rs;
1838         sctx.pv  = rtmp4[i+3];
1839         ierr = MatPivotCheck(info,sctx,i+3);CHKERRQ(ierr);
1840         pc4  = b->a + bdiag[i+3];
1841         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1842          break;
1843 
1844       default:
1845         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
1846       }
1847       i += nodesz;                 /* Update the row */
1848     }
1849 
1850     /* MatPivotRefine() */
1851     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1852       /*
1853        * if no shift in this attempt & shifting & started shifting & can refine,
1854        * then try lower shift
1855        */
1856       sctx.shift_hi       = sctx.shift_fraction;
1857       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1858       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1859       sctx.useshift       = PETSC_TRUE;
1860       sctx.nshift++;
1861     }
1862   } while (sctx.useshift);
1863 
1864   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1865   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1866   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1867   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1868 
1869   C->ops->solve              = MatSolve_SeqAIJ;
1870   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1871   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1872   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1873   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1874   C->assembled    = PETSC_TRUE;
1875   C->preallocated = PETSC_TRUE;
1876   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1877 
1878   /* MatShiftView(A,info,&sctx) */
1879   if (sctx.nshift){
1880     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1881       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1882     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1883       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1884     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){
1885       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1886     }
1887   }
1888   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1889   PetscFunctionReturn(0);
1890 }
1891 
1892 #undef __FUNCT__
1893 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1894 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1895 {
1896   Mat               C = B;
1897   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1898   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1899   PetscErrorCode    ierr;
1900   const PetscInt    *r,*ic,*c,*ics;
1901   PetscInt          n = A->rmap->n,*bi = b->i;
1902   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1903   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1904   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1905   PetscScalar       mul1,mul2,mul3,tmp;
1906   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1907   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1908   PetscReal         rs=0.0;
1909   FactorShiftCtx    sctx;
1910   PetscInt          newshift;
1911 
1912   PetscFunctionBegin;
1913   sctx.shift_top      = 0;
1914   sctx.nshift_max     = 0;
1915   sctx.shift_lo       = 0;
1916   sctx.shift_hi       = 0;
1917   sctx.shift_fraction = 0;
1918 
1919   /* if both shift schemes are chosen by user, only use info->shiftpd */
1920   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1921     sctx.shift_top = 0;
1922     for (i=0; i<n; i++) {
1923       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1924       rs    = 0.0;
1925       ajtmp = aj + ai[i];
1926       rtmp1 = aa + ai[i];
1927       nz = ai[i+1] - ai[i];
1928       for (j=0; j<nz; j++){
1929         if (*ajtmp != i){
1930           rs += PetscAbsScalar(*rtmp1++);
1931         } else {
1932           rs -= PetscRealPart(*rtmp1++);
1933         }
1934         ajtmp++;
1935       }
1936       if (rs>sctx.shift_top) sctx.shift_top = rs;
1937     }
1938     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1939     sctx.shift_top *= 1.1;
1940     sctx.nshift_max = 5;
1941     sctx.shift_lo   = 0.;
1942     sctx.shift_hi   = 1.;
1943   }
1944   sctx.shift_amount = 0;
1945   sctx.nshift       = 0;
1946 
1947   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1948   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1949   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1950   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1951   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1952   ics   = ic ;
1953   rtmp22 = rtmp11 + n;
1954   rtmp33 = rtmp22 + n;
1955 
1956   node_max = a->inode.node_count;
1957   ns       = a->inode.size;
1958   if (!ns){
1959     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1960   }
1961 
1962   /* If max inode size > 3, split it into two inodes.*/
1963   /* also map the inode sizes according to the ordering */
1964   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1965   for (i=0,j=0; i<node_max; ++i,++j){
1966     if (ns[i]>3) {
1967       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1968       ++j;
1969       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1970     } else {
1971       tmp_vec1[j] = ns[i];
1972     }
1973   }
1974   /* Use the correct node_max */
1975   node_max = j;
1976 
1977   /* Now reorder the inode info based on mat re-ordering info */
1978   /* First create a row -> inode_size_array_index map */
1979   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1980   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1981   for (i=0,row=0; i<node_max; i++) {
1982     nodesz = tmp_vec1[i];
1983     for (j=0; j<nodesz; j++,row++) {
1984       nsmap[row] = i;
1985     }
1986   }
1987   /* Using nsmap, create a reordered ns structure */
1988   for (i=0,j=0; i< node_max; i++) {
1989     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1990     tmp_vec2[i]  = nodesz;
1991     j           += nodesz;
1992   }
1993   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1994   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1995   /* Now use the correct ns */
1996   ns = tmp_vec2;
1997 
1998   do {
1999     sctx.useshift = PETSC_FALSE;
2000     /* Now loop over each block-row, and do the factorization */
2001     for (i=0,row=0; i<node_max; i++) {
2002       nodesz = ns[i];
2003       nz     = bi[row+1] - bi[row];
2004       bjtmp  = bj + bi[row];
2005 
2006       switch (nodesz){
2007       case 1:
2008         for  (j=0; j<nz; j++){
2009           idx        = bjtmp[j];
2010           rtmp11[idx] = 0.0;
2011         }
2012 
2013         /* load in initial (unfactored row) */
2014         idx    = r[row];
2015         nz_tmp = ai[idx+1] - ai[idx];
2016         ajtmp  = aj + ai[idx];
2017         v1     = aa + ai[idx];
2018 
2019         for (j=0; j<nz_tmp; j++) {
2020           idx        = ics[ajtmp[j]];
2021           rtmp11[idx] = v1[j];
2022         }
2023         rtmp11[ics[r[row]]] += sctx.shift_amount;
2024 
2025         prow = *bjtmp++ ;
2026         while (prow < row) {
2027           pc1 = rtmp11 + prow;
2028           if (*pc1 != 0.0){
2029             pv   = ba + bd[prow];
2030             pj   = nbj + bd[prow];
2031             mul1 = *pc1 * *pv++;
2032             *pc1 = mul1;
2033             nz_tmp = bi[prow+1] - bd[prow] - 1;
2034             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2035             for (j=0; j<nz_tmp; j++) {
2036               tmp = pv[j];
2037               idx = pj[j];
2038               rtmp11[idx] -= mul1 * tmp;
2039             }
2040           }
2041           prow = *bjtmp++ ;
2042         }
2043         pj  = bj + bi[row];
2044         pc1 = ba + bi[row];
2045 
2046         sctx.pv    = rtmp11[row];
2047         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2048         rs         = 0.0;
2049         for (j=0; j<nz; j++) {
2050           idx    = pj[j];
2051           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2052           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2053         }
2054         sctx.rs  = rs;
2055         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2056         if (newshift == 1) goto endofwhile;
2057         break;
2058 
2059       case 2:
2060         for (j=0; j<nz; j++) {
2061           idx        = bjtmp[j];
2062           rtmp11[idx] = 0.0;
2063           rtmp22[idx] = 0.0;
2064         }
2065 
2066         /* load in initial (unfactored row) */
2067         idx    = r[row];
2068         nz_tmp = ai[idx+1] - ai[idx];
2069         ajtmp  = aj + ai[idx];
2070         v1     = aa + ai[idx];
2071         v2     = aa + ai[idx+1];
2072         for (j=0; j<nz_tmp; j++) {
2073           idx        = ics[ajtmp[j]];
2074           rtmp11[idx] = v1[j];
2075           rtmp22[idx] = v2[j];
2076         }
2077         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2078         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2079 
2080         prow = *bjtmp++ ;
2081         while (prow < row) {
2082           pc1 = rtmp11 + prow;
2083           pc2 = rtmp22 + prow;
2084           if (*pc1 != 0.0 || *pc2 != 0.0){
2085             pv   = ba + bd[prow];
2086             pj   = nbj + bd[prow];
2087             mul1 = *pc1 * *pv;
2088             mul2 = *pc2 * *pv;
2089             ++pv;
2090             *pc1 = mul1;
2091             *pc2 = mul2;
2092 
2093             nz_tmp = bi[prow+1] - bd[prow] - 1;
2094             for (j=0; j<nz_tmp; j++) {
2095               tmp = pv[j];
2096               idx = pj[j];
2097               rtmp11[idx] -= mul1 * tmp;
2098               rtmp22[idx] -= mul2 * tmp;
2099             }
2100             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2101           }
2102           prow = *bjtmp++ ;
2103         }
2104 
2105         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2106         pc1 = rtmp11 + prow;
2107         pc2 = rtmp22 + prow;
2108 
2109         sctx.pv = *pc1;
2110         pj      = bj + bi[prow];
2111         rs      = 0.0;
2112         for (j=0; j<nz; j++){
2113           idx = pj[j];
2114           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2115         }
2116         sctx.rs = rs;
2117         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2118         if (newshift == 1) goto endofwhile;
2119 
2120         if (*pc2 != 0.0){
2121           pj     = nbj + bd[prow];
2122           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2123           *pc2   = mul2;
2124           nz_tmp = bi[prow+1] - bd[prow] - 1;
2125           for (j=0; j<nz_tmp; j++) {
2126             idx = pj[j] ;
2127             tmp = rtmp11[idx];
2128             rtmp22[idx] -= mul2 * tmp;
2129           }
2130           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2131         }
2132 
2133         pj  = bj + bi[row];
2134         pc1 = ba + bi[row];
2135         pc2 = ba + bi[row+1];
2136 
2137         sctx.pv = rtmp22[row+1];
2138         rs = 0.0;
2139         rtmp11[row]   = 1.0/rtmp11[row];
2140         rtmp22[row+1] = 1.0/rtmp22[row+1];
2141         /* copy row entries from dense representation to sparse */
2142         for (j=0; j<nz; j++) {
2143           idx    = pj[j];
2144           pc1[j] = rtmp11[idx];
2145           pc2[j] = rtmp22[idx];
2146           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2147         }
2148         sctx.rs = rs;
2149         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
2150         if (newshift == 1) goto endofwhile;
2151         break;
2152 
2153       case 3:
2154         for  (j=0; j<nz; j++) {
2155           idx        = bjtmp[j];
2156           rtmp11[idx] = 0.0;
2157           rtmp22[idx] = 0.0;
2158           rtmp33[idx] = 0.0;
2159         }
2160         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2161         idx    = r[row];
2162         nz_tmp = ai[idx+1] - ai[idx];
2163         ajtmp = aj + ai[idx];
2164         v1    = aa + ai[idx];
2165         v2    = aa + ai[idx+1];
2166         v3    = aa + ai[idx+2];
2167         for (j=0; j<nz_tmp; j++) {
2168           idx        = ics[ajtmp[j]];
2169           rtmp11[idx] = v1[j];
2170           rtmp22[idx] = v2[j];
2171           rtmp33[idx] = v3[j];
2172         }
2173         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2174         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2175         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2176 
2177         /* loop over all pivot row blocks above this row block */
2178         prow = *bjtmp++ ;
2179         while (prow < row) {
2180           pc1 = rtmp11 + prow;
2181           pc2 = rtmp22 + prow;
2182           pc3 = rtmp33 + prow;
2183           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
2184             pv   = ba  + bd[prow];
2185             pj   = nbj + bd[prow];
2186             mul1 = *pc1 * *pv;
2187             mul2 = *pc2 * *pv;
2188             mul3 = *pc3 * *pv;
2189             ++pv;
2190             *pc1 = mul1;
2191             *pc2 = mul2;
2192             *pc3 = mul3;
2193 
2194             nz_tmp = bi[prow+1] - bd[prow] - 1;
2195             /* update this row based on pivot row */
2196             for (j=0; j<nz_tmp; j++) {
2197               tmp = pv[j];
2198               idx = pj[j];
2199               rtmp11[idx] -= mul1 * tmp;
2200               rtmp22[idx] -= mul2 * tmp;
2201               rtmp33[idx] -= mul3 * tmp;
2202             }
2203             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
2204           }
2205           prow = *bjtmp++ ;
2206         }
2207 
2208         /* Now take care of diagonal 3x3 block in this set of rows */
2209         /* note: prow = row here */
2210         pc1 = rtmp11 + prow;
2211         pc2 = rtmp22 + prow;
2212         pc3 = rtmp33 + prow;
2213 
2214         sctx.pv = *pc1;
2215         pj      = bj + bi[prow];
2216         rs      = 0.0;
2217         for (j=0; j<nz; j++){
2218           idx = pj[j];
2219           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2220         }
2221         sctx.rs = rs;
2222         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2223         if (newshift == 1) goto endofwhile;
2224 
2225         if (*pc2 != 0.0 || *pc3 != 0.0){
2226           mul2 = (*pc2)/(*pc1);
2227           mul3 = (*pc3)/(*pc1);
2228           *pc2 = mul2;
2229           *pc3 = mul3;
2230           nz_tmp = bi[prow+1] - bd[prow] - 1;
2231           pj     = nbj + bd[prow];
2232           for (j=0; j<nz_tmp; j++) {
2233             idx = pj[j] ;
2234             tmp = rtmp11[idx];
2235             rtmp22[idx] -= mul2 * tmp;
2236             rtmp33[idx] -= mul3 * tmp;
2237           }
2238           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2239         }
2240         ++prow;
2241 
2242         pc2 = rtmp22 + prow;
2243         pc3 = rtmp33 + prow;
2244         sctx.pv = *pc2;
2245         pj      = bj + bi[prow];
2246         rs      = 0.0;
2247         for (j=0; j<nz; j++){
2248           idx = pj[j];
2249           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2250         }
2251         sctx.rs = rs;
2252         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
2253         if (newshift == 1) goto endofwhile;
2254 
2255         if (*pc3 != 0.0){
2256           mul3   = (*pc3)/(*pc2);
2257           *pc3   = mul3;
2258           pj     = nbj + bd[prow];
2259           nz_tmp = bi[prow+1] - bd[prow] - 1;
2260           for (j=0; j<nz_tmp; j++) {
2261             idx = pj[j] ;
2262             tmp = rtmp22[idx];
2263             rtmp33[idx] -= mul3 * tmp;
2264           }
2265           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2266         }
2267 
2268         pj  = bj + bi[row];
2269         pc1 = ba + bi[row];
2270         pc2 = ba + bi[row+1];
2271         pc3 = ba + bi[row+2];
2272 
2273         sctx.pv = rtmp33[row+2];
2274         rs = 0.0;
2275         rtmp11[row]   = 1.0/rtmp11[row];
2276         rtmp22[row+1] = 1.0/rtmp22[row+1];
2277         rtmp33[row+2] = 1.0/rtmp33[row+2];
2278         /* copy row entries from dense representation to sparse */
2279         for (j=0; j<nz; j++) {
2280           idx    = pj[j];
2281           pc1[j] = rtmp11[idx];
2282           pc2[j] = rtmp22[idx];
2283           pc3[j] = rtmp33[idx];
2284           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2285         }
2286 
2287         sctx.rs = rs;
2288         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
2289         if (newshift == 1) goto endofwhile;
2290         break;
2291 
2292       default:
2293         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
2294       }
2295       row += nodesz;                 /* Update the row */
2296     }
2297     endofwhile:;
2298   } while (sctx.useshift);
2299   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2300   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2301   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2302   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2303   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2304   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2305   /* do not set solve add, since MatSolve_Inode + Add is faster */
2306   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2307   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2308   C->assembled   = PETSC_TRUE;
2309   C->preallocated = PETSC_TRUE;
2310   if (sctx.nshift) {
2311     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2312       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2313     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2314       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2315     }
2316   }
2317   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2318   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2319   PetscFunctionReturn(0);
2320 }
2321 
2322 
2323 /* ----------------------------------------------------------- */
2324 #undef __FUNCT__
2325 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2326 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2327 {
2328   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2329   IS                iscol = a->col,isrow = a->row;
2330   PetscErrorCode    ierr;
2331   const PetscInt    *r,*c,*rout,*cout;
2332   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2333   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2334   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2335   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2336   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2337   const PetscScalar *b;
2338 
2339   PetscFunctionBegin;
2340   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
2341   node_max = a->inode.node_count;
2342   ns       = a->inode.size;     /* Node Size array */
2343 
2344   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2345   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2346   tmp  = a->solve_work;
2347 
2348   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2349   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2350 
2351   /* forward solve the lower triangular */
2352   tmps = tmp ;
2353   aa   = a_a ;
2354   aj   = a_j ;
2355   ad   = a->diag;
2356 
2357   for (i = 0,row = 0; i< node_max; ++i){
2358     nsz = ns[i];
2359     aii = ai[row];
2360     v1  = aa + aii;
2361     vi  = aj + aii;
2362     nz  = ai[row+1]- ai[row];
2363 
2364     if (i < node_max-1) {
2365       /* Prefetch the indices for the next block */
2366       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2367       /* Prefetch the data for the next block */
2368       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2369     }
2370 
2371     switch (nsz){               /* Each loop in 'case' is unrolled */
2372     case 1 :
2373       sum1 = b[r[row]];
2374       for(j=0; j<nz-1; j+=2){
2375         i0   = vi[j];
2376         i1   = vi[j+1];
2377         tmp0 = tmps[i0];
2378         tmp1 = tmps[i1];
2379         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2380       }
2381       if(j == nz-1){
2382         tmp0 = tmps[vi[j]];
2383         sum1 -= v1[j]*tmp0;
2384       }
2385       tmp[row++]=sum1;
2386       break;
2387     case 2:
2388       sum1 = b[r[row]];
2389       sum2 = b[r[row+1]];
2390       v2   = aa + ai[row+1];
2391 
2392       for(j=0; j<nz-1; j+=2){
2393         i0   = vi[j];
2394         i1   = vi[j+1];
2395         tmp0 = tmps[i0];
2396         tmp1 = tmps[i1];
2397         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2398         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2399       }
2400       if(j == nz-1){
2401         tmp0 = tmps[vi[j]];
2402         sum1 -= v1[j] *tmp0;
2403         sum2 -= v2[j] *tmp0;
2404       }
2405       sum2 -= v2[nz] * sum1;
2406       tmp[row ++]=sum1;
2407       tmp[row ++]=sum2;
2408       break;
2409     case 3:
2410       sum1 = b[r[row]];
2411       sum2 = b[r[row+1]];
2412       sum3 = b[r[row+2]];
2413       v2   = aa + ai[row+1];
2414       v3   = aa + ai[row+2];
2415 
2416       for (j=0; j<nz-1; j+=2){
2417         i0   = vi[j];
2418         i1   = vi[j+1];
2419         tmp0 = tmps[i0];
2420         tmp1 = tmps[i1];
2421         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2422         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2423         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2424       }
2425       if (j == nz-1){
2426         tmp0 = tmps[vi[j]];
2427         sum1 -= v1[j] *tmp0;
2428         sum2 -= v2[j] *tmp0;
2429         sum3 -= v3[j] *tmp0;
2430       }
2431       sum2 -= v2[nz] * sum1;
2432       sum3 -= v3[nz] * sum1;
2433       sum3 -= v3[nz+1] * sum2;
2434       tmp[row ++]=sum1;
2435       tmp[row ++]=sum2;
2436       tmp[row ++]=sum3;
2437       break;
2438 
2439     case 4:
2440       sum1 = b[r[row]];
2441       sum2 = b[r[row+1]];
2442       sum3 = b[r[row+2]];
2443       sum4 = b[r[row+3]];
2444       v2   = aa + ai[row+1];
2445       v3   = aa + ai[row+2];
2446       v4   = aa + ai[row+3];
2447 
2448       for (j=0; j<nz-1; j+=2){
2449         i0   = vi[j];
2450         i1   = vi[j+1];
2451         tmp0 = tmps[i0];
2452         tmp1 = tmps[i1];
2453         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2454         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2455         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2456         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2457       }
2458       if (j == nz-1){
2459         tmp0 = tmps[vi[j]];
2460         sum1 -= v1[j] *tmp0;
2461         sum2 -= v2[j] *tmp0;
2462         sum3 -= v3[j] *tmp0;
2463         sum4 -= v4[j] *tmp0;
2464       }
2465       sum2 -= v2[nz] * sum1;
2466       sum3 -= v3[nz] * sum1;
2467       sum4 -= v4[nz] * sum1;
2468       sum3 -= v3[nz+1] * sum2;
2469       sum4 -= v4[nz+1] * sum2;
2470       sum4 -= v4[nz+2] * sum3;
2471 
2472       tmp[row ++]=sum1;
2473       tmp[row ++]=sum2;
2474       tmp[row ++]=sum3;
2475       tmp[row ++]=sum4;
2476       break;
2477     case 5:
2478       sum1 = b[r[row]];
2479       sum2 = b[r[row+1]];
2480       sum3 = b[r[row+2]];
2481       sum4 = b[r[row+3]];
2482       sum5 = b[r[row+4]];
2483       v2   = aa + ai[row+1];
2484       v3   = aa + ai[row+2];
2485       v4   = aa + ai[row+3];
2486       v5   = aa + ai[row+4];
2487 
2488       for (j=0; j<nz-1; j+=2){
2489         i0   = vi[j];
2490         i1   = vi[j+1];
2491         tmp0 = tmps[i0];
2492         tmp1 = tmps[i1];
2493         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2494         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2495         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2496         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2497         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2498       }
2499       if (j == nz-1){
2500         tmp0 = tmps[vi[j]];
2501         sum1 -= v1[j] *tmp0;
2502         sum2 -= v2[j] *tmp0;
2503         sum3 -= v3[j] *tmp0;
2504         sum4 -= v4[j] *tmp0;
2505         sum5 -= v5[j] *tmp0;
2506       }
2507 
2508       sum2 -= v2[nz] * sum1;
2509       sum3 -= v3[nz] * sum1;
2510       sum4 -= v4[nz] * sum1;
2511       sum5 -= v5[nz] * sum1;
2512       sum3 -= v3[nz+1] * sum2;
2513       sum4 -= v4[nz+1] * sum2;
2514       sum5 -= v5[nz+1] * sum2;
2515       sum4 -= v4[nz+2] * sum3;
2516       sum5 -= v5[nz+2] * sum3;
2517       sum5 -= v5[nz+3] * sum4;
2518 
2519       tmp[row ++]=sum1;
2520       tmp[row ++]=sum2;
2521       tmp[row ++]=sum3;
2522       tmp[row ++]=sum4;
2523       tmp[row ++]=sum5;
2524       break;
2525     default:
2526       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2527     }
2528   }
2529   /* backward solve the upper triangular */
2530   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2531     nsz = ns[i];
2532     aii = ad[row+1] + 1;
2533     v1  = aa + aii;
2534     vi  = aj + aii;
2535     nz  = ad[row]- ad[row+1] - 1;
2536 
2537     if (i > 0) {
2538       /* Prefetch the indices for the next block */
2539       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2540       /* Prefetch the data for the next block */
2541       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2542     }
2543 
2544     switch (nsz){               /* Each loop in 'case' is unrolled */
2545     case 1 :
2546       sum1 = tmp[row];
2547 
2548       for(j=0 ; j<nz-1; j+=2){
2549         i0   = vi[j];
2550         i1   = vi[j+1];
2551         tmp0 = tmps[i0];
2552         tmp1 = tmps[i1];
2553         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2554       }
2555       if (j == nz-1){
2556         tmp0  = tmps[vi[j]];
2557         sum1 -= v1[j]*tmp0;
2558       }
2559       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2560       break;
2561     case 2 :
2562       sum1 = tmp[row];
2563       sum2 = tmp[row-1];
2564       v2   = aa + ad[row] + 1;
2565       for (j=0 ; j<nz-1; j+=2){
2566         i0   = vi[j];
2567         i1   = vi[j+1];
2568         tmp0 = tmps[i0];
2569         tmp1 = tmps[i1];
2570         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2571         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2572       }
2573       if (j == nz-1){
2574         tmp0  = tmps[vi[j]];
2575         sum1 -= v1[j]* tmp0;
2576         sum2 -= v2[j+1]* tmp0;
2577       }
2578 
2579       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2580       sum2   -= v2[0] * tmp0;
2581       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2582       break;
2583     case 3 :
2584       sum1 = tmp[row];
2585       sum2 = tmp[row -1];
2586       sum3 = tmp[row -2];
2587       v2   = aa + ad[row] + 1;
2588       v3   = aa + ad[row -1] + 1;
2589       for (j=0 ; j<nz-1; j+=2){
2590         i0   = vi[j];
2591         i1   = vi[j+1];
2592         tmp0 = tmps[i0];
2593         tmp1 = tmps[i1];
2594         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2595         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2596         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2597       }
2598       if (j== nz-1){
2599         tmp0  = tmps[vi[j]];
2600         sum1 -= v1[j] * tmp0;
2601         sum2 -= v2[j+1] * tmp0;
2602         sum3 -= v3[j+2] * tmp0;
2603       }
2604       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2605       sum2   -= v2[0]* tmp0;
2606       sum3   -= v3[1] * tmp0;
2607       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2608       sum3   -= v3[0]* tmp0;
2609       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2610 
2611       break;
2612     case 4 :
2613       sum1 = tmp[row];
2614       sum2 = tmp[row -1];
2615       sum3 = tmp[row -2];
2616       sum4 = tmp[row -3];
2617       v2   = aa + ad[row]+1;
2618       v3   = aa + ad[row -1]+1;
2619       v4   = aa + ad[row -2]+1;
2620 
2621       for (j=0 ; j<nz-1; j+=2){
2622         i0   = vi[j];
2623         i1   = vi[j+1];
2624         tmp0 = tmps[i0];
2625         tmp1 = tmps[i1];
2626         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2627         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2628         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2629         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2630       }
2631       if (j== nz-1){
2632         tmp0  = tmps[vi[j]];
2633         sum1 -= v1[j] * tmp0;
2634         sum2 -= v2[j+1] * tmp0;
2635         sum3 -= v3[j+2] * tmp0;
2636         sum4 -= v4[j+3] * tmp0;
2637       }
2638 
2639       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2640       sum2   -= v2[0] * tmp0;
2641       sum3   -= v3[1] * tmp0;
2642       sum4   -= v4[2] * tmp0;
2643       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2644       sum3   -= v3[0] * tmp0;
2645       sum4   -= v4[1] * tmp0;
2646       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2647       sum4   -= v4[0] * tmp0;
2648       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2649       break;
2650     case 5 :
2651       sum1 = tmp[row];
2652       sum2 = tmp[row -1];
2653       sum3 = tmp[row -2];
2654       sum4 = tmp[row -3];
2655       sum5 = tmp[row -4];
2656       v2   = aa + ad[row]+1;
2657       v3   = aa + ad[row -1]+1;
2658       v4   = aa + ad[row -2]+1;
2659       v5   = aa + ad[row -3]+1;
2660       for (j=0 ; j<nz-1; j+=2){
2661         i0   = vi[j];
2662         i1   = vi[j+1];
2663         tmp0 = tmps[i0];
2664         tmp1 = tmps[i1];
2665         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2666         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2667         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2668         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2669         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2670       }
2671       if (j==nz-1){
2672         tmp0  = tmps[vi[j]];
2673         sum1 -= v1[j] * tmp0;
2674         sum2 -= v2[j+1] * tmp0;
2675         sum3 -= v3[j+2] * tmp0;
2676         sum4 -= v4[j+3] * tmp0;
2677         sum5 -= v5[j+4] * tmp0;
2678       }
2679 
2680       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2681       sum2   -= v2[0] * tmp0;
2682       sum3   -= v3[1] * tmp0;
2683       sum4   -= v4[2] * tmp0;
2684       sum5   -= v5[3] * tmp0;
2685       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2686       sum3   -= v3[0] * tmp0;
2687       sum4   -= v4[1] * tmp0;
2688       sum5   -= v5[2] * tmp0;
2689       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2690       sum4   -= v4[0] * tmp0;
2691       sum5   -= v5[1] * tmp0;
2692       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2693       sum5   -= v5[0] * tmp0;
2694       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2695       break;
2696     default:
2697       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2698     }
2699   }
2700   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2701   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2702   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2703   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2704   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2705   PetscFunctionReturn(0);
2706 }
2707 
2708 
2709 /*
2710      Makes a longer coloring[] array and calls the usual code with that
2711 */
2712 #undef __FUNCT__
2713 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2714 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2715 {
2716   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2717   PetscErrorCode  ierr;
2718   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2719   PetscInt        *colorused,i;
2720   ISColoringValue *newcolor;
2721 
2722   PetscFunctionBegin;
2723   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2724   /* loop over inodes, marking a color for each column*/
2725   row = 0;
2726   for (i=0; i<m; i++){
2727     for (j=0; j<ns[i]; j++) {
2728       newcolor[row++] = coloring[i] + j*ncolors;
2729     }
2730   }
2731 
2732   /* eliminate unneeded colors */
2733   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2734   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2735   for (i=0; i<n; i++) {
2736     colorused[newcolor[i]] = 1;
2737   }
2738 
2739   for (i=1; i<5*ncolors; i++) {
2740     colorused[i] += colorused[i-1];
2741   }
2742   ncolors = colorused[5*ncolors-1];
2743   for (i=0; i<n; i++) {
2744     newcolor[i] = colorused[newcolor[i]]-1;
2745   }
2746   ierr = PetscFree(colorused);CHKERRQ(ierr);
2747   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2748   ierr = PetscFree(coloring);CHKERRQ(ierr);
2749   PetscFunctionReturn(0);
2750 }
2751 
2752 #include "../src/mat/blockinvert.h"
2753 
2754 #undef __FUNCT__
2755 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2756 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2757 {
2758   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2759   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2760   MatScalar          *ibdiag,*bdiag,work[25];
2761   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2762   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2763   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2764   PetscErrorCode     ierr;
2765   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2766   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2767 
2768   PetscFunctionBegin;
2769   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2770   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2771   if (its > 1) {
2772     /* switch to non-inode version */
2773     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2774     PetscFunctionReturn(0);
2775   }
2776 
2777   if (!a->inode.ibdiagvalid) {
2778     if (!a->inode.ibdiag) {
2779       /* calculate space needed for diagonal blocks */
2780       for (i=0; i<m; i++) {
2781 	cnt += sizes[i]*sizes[i];
2782       }
2783       a->inode.bdiagsize = cnt;
2784       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2785     }
2786 
2787     /* copy over the diagonal blocks and invert them */
2788     ibdiag = a->inode.ibdiag;
2789     bdiag  = a->inode.bdiag;
2790     cnt = 0;
2791     for (i=0, row = 0; i<m; i++) {
2792       for (j=0; j<sizes[i]; j++) {
2793         for (k=0; k<sizes[i]; k++) {
2794           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2795         }
2796       }
2797       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2798 
2799       switch(sizes[i]) {
2800         case 1:
2801           /* Create matrix data structure */
2802           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2803           ibdiag[cnt] = 1.0/ibdiag[cnt];
2804           break;
2805         case 2:
2806           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2807           break;
2808         case 3:
2809           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2810           break;
2811         case 4:
2812           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2813           break;
2814         case 5:
2815           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2816           break;
2817        default:
2818 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2819       }
2820       cnt += sizes[i]*sizes[i];
2821       row += sizes[i];
2822     }
2823     a->inode.ibdiagvalid = PETSC_TRUE;
2824   }
2825   ibdiag = a->inode.ibdiag;
2826   bdiag  = a->inode.bdiag;
2827 
2828 
2829   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2830   if (flag & SOR_ZERO_INITIAL_GUESS) {
2831     PetscScalar *b;
2832     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2833     if (xx != bb) {
2834       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2835     } else {
2836       b = x;
2837     }
2838     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2839 
2840       for (i=0, row=0; i<m; i++) {
2841         sz  = diag[row] - ii[row];
2842         v1  = a->a + ii[row];
2843         idx = a->j + ii[row];
2844 
2845         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2846         switch (sizes[i]){
2847           case 1:
2848 
2849             sum1  = b[row];
2850             for(n = 0; n<sz-1; n+=2) {
2851               i1   = idx[0];
2852               i2   = idx[1];
2853               idx += 2;
2854               tmp0 = x[i1];
2855               tmp1 = x[i2];
2856               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2857             }
2858 
2859             if (n == sz-1){
2860               tmp0  = x[*idx];
2861               sum1 -= *v1 * tmp0;
2862             }
2863             x[row++] = sum1*(*ibdiag++);
2864             break;
2865           case 2:
2866             v2    = a->a + ii[row+1];
2867             sum1  = b[row];
2868             sum2  = b[row+1];
2869             for(n = 0; n<sz-1; n+=2) {
2870               i1   = idx[0];
2871               i2   = idx[1];
2872               idx += 2;
2873               tmp0 = x[i1];
2874               tmp1 = x[i2];
2875               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2876               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2877             }
2878 
2879             if (n == sz-1){
2880               tmp0  = x[*idx];
2881               sum1 -= v1[0] * tmp0;
2882               sum2 -= v2[0] * tmp0;
2883             }
2884             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2885             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2886             ibdiag  += 4;
2887             break;
2888           case 3:
2889             v2    = a->a + ii[row+1];
2890             v3    = a->a + ii[row+2];
2891             sum1  = b[row];
2892             sum2  = b[row+1];
2893             sum3  = b[row+2];
2894             for(n = 0; n<sz-1; n+=2) {
2895               i1   = idx[0];
2896               i2   = idx[1];
2897               idx += 2;
2898               tmp0 = x[i1];
2899               tmp1 = x[i2];
2900               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2901               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2902               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2903             }
2904 
2905             if (n == sz-1){
2906               tmp0  = x[*idx];
2907               sum1 -= v1[0] * tmp0;
2908               sum2 -= v2[0] * tmp0;
2909               sum3 -= v3[0] * tmp0;
2910             }
2911             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2912             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2913             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2914             ibdiag  += 9;
2915             break;
2916           case 4:
2917             v2    = a->a + ii[row+1];
2918             v3    = a->a + ii[row+2];
2919             v4    = a->a + ii[row+3];
2920             sum1  = b[row];
2921             sum2  = b[row+1];
2922             sum3  = b[row+2];
2923             sum4  = b[row+3];
2924             for(n = 0; n<sz-1; n+=2) {
2925               i1   = idx[0];
2926               i2   = idx[1];
2927               idx += 2;
2928               tmp0 = x[i1];
2929               tmp1 = x[i2];
2930               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2931               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2932               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2933               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2934             }
2935 
2936             if (n == sz-1){
2937               tmp0  = x[*idx];
2938               sum1 -= v1[0] * tmp0;
2939               sum2 -= v2[0] * tmp0;
2940               sum3 -= v3[0] * tmp0;
2941               sum4 -= v4[0] * tmp0;
2942             }
2943             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2944             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2945             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2946             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2947             ibdiag  += 16;
2948             break;
2949           case 5:
2950             v2    = a->a + ii[row+1];
2951             v3    = a->a + ii[row+2];
2952             v4    = a->a + ii[row+3];
2953             v5    = a->a + ii[row+4];
2954             sum1  = b[row];
2955             sum2  = b[row+1];
2956             sum3  = b[row+2];
2957             sum4  = b[row+3];
2958             sum5  = b[row+4];
2959             for(n = 0; n<sz-1; n+=2) {
2960               i1   = idx[0];
2961               i2   = idx[1];
2962               idx += 2;
2963               tmp0 = x[i1];
2964               tmp1 = x[i2];
2965               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2966               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2967               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2968               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2969               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2970             }
2971 
2972             if (n == sz-1){
2973               tmp0  = x[*idx];
2974               sum1 -= v1[0] * tmp0;
2975               sum2 -= v2[0] * tmp0;
2976               sum3 -= v3[0] * tmp0;
2977               sum4 -= v4[0] * tmp0;
2978               sum5 -= v5[0] * tmp0;
2979             }
2980             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2981             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2982             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2983             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2984             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2985             ibdiag  += 25;
2986             break;
2987 	  default:
2988    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2989         }
2990       }
2991 
2992       xb = x;
2993       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2994     } else xb = b;
2995     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2996         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2997       cnt = 0;
2998       for (i=0, row=0; i<m; i++) {
2999 
3000         switch (sizes[i]){
3001           case 1:
3002             x[row++] *= bdiag[cnt++];
3003             break;
3004           case 2:
3005             x1   = x[row]; x2 = x[row+1];
3006             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3007             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3008             x[row++] = tmp1;
3009             x[row++] = tmp2;
3010             cnt += 4;
3011             break;
3012           case 3:
3013             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3014             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3015             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3016             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3017             x[row++] = tmp1;
3018             x[row++] = tmp2;
3019             x[row++] = tmp3;
3020             cnt += 9;
3021             break;
3022           case 4:
3023             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3024             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3025             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3026             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3027             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3028             x[row++] = tmp1;
3029             x[row++] = tmp2;
3030             x[row++] = tmp3;
3031             x[row++] = tmp4;
3032             cnt += 16;
3033             break;
3034           case 5:
3035             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3036             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3037             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3038             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3039             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3040             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3041             x[row++] = tmp1;
3042             x[row++] = tmp2;
3043             x[row++] = tmp3;
3044             x[row++] = tmp4;
3045             x[row++] = tmp5;
3046             cnt += 25;
3047             break;
3048 	  default:
3049    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3050         }
3051       }
3052       ierr = PetscLogFlops(m);CHKERRQ(ierr);
3053     }
3054     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
3055 
3056       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3057       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3058         ibdiag -= sizes[i]*sizes[i];
3059         sz      = ii[row+1] - diag[row] - 1;
3060         v1      = a->a + diag[row] + 1;
3061         idx     = a->j + diag[row] + 1;
3062 
3063         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3064         switch (sizes[i]){
3065           case 1:
3066 
3067             sum1  = xb[row];
3068             for(n = 0; n<sz-1; n+=2) {
3069               i1   = idx[0];
3070               i2   = idx[1];
3071               idx += 2;
3072               tmp0 = x[i1];
3073               tmp1 = x[i2];
3074               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3075             }
3076 
3077             if (n == sz-1){
3078               tmp0  = x[*idx];
3079               sum1 -= *v1*tmp0;
3080             }
3081             x[row--] = sum1*(*ibdiag);
3082             break;
3083 
3084           case 2:
3085 
3086             sum1  = xb[row];
3087             sum2  = xb[row-1];
3088             /* note that sum1 is associated with the second of the two rows */
3089             v2    = a->a + diag[row-1] + 2;
3090             for(n = 0; n<sz-1; n+=2) {
3091               i1   = idx[0];
3092               i2   = idx[1];
3093               idx += 2;
3094               tmp0 = x[i1];
3095               tmp1 = x[i2];
3096               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3097               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3098             }
3099 
3100             if (n == sz-1){
3101               tmp0  = x[*idx];
3102               sum1 -= *v1*tmp0;
3103               sum2 -= *v2*tmp0;
3104             }
3105             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3106             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3107             break;
3108           case 3:
3109 
3110             sum1  = xb[row];
3111             sum2  = xb[row-1];
3112             sum3  = xb[row-2];
3113             v2    = a->a + diag[row-1] + 2;
3114             v3    = a->a + diag[row-2] + 3;
3115             for(n = 0; n<sz-1; n+=2) {
3116               i1   = idx[0];
3117               i2   = idx[1];
3118               idx += 2;
3119               tmp0 = x[i1];
3120               tmp1 = x[i2];
3121               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3122               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3123               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3124             }
3125 
3126             if (n == sz-1){
3127               tmp0  = x[*idx];
3128               sum1 -= *v1*tmp0;
3129               sum2 -= *v2*tmp0;
3130               sum3 -= *v3*tmp0;
3131             }
3132             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3133             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3134             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3135             break;
3136           case 4:
3137 
3138             sum1  = xb[row];
3139             sum2  = xb[row-1];
3140             sum3  = xb[row-2];
3141             sum4  = xb[row-3];
3142             v2    = a->a + diag[row-1] + 2;
3143             v3    = a->a + diag[row-2] + 3;
3144             v4    = a->a + diag[row-3] + 4;
3145             for(n = 0; n<sz-1; n+=2) {
3146               i1   = idx[0];
3147               i2   = idx[1];
3148               idx += 2;
3149               tmp0 = x[i1];
3150               tmp1 = x[i2];
3151               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3152               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3153               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3154               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3155             }
3156 
3157             if (n == sz-1){
3158               tmp0  = x[*idx];
3159               sum1 -= *v1*tmp0;
3160               sum2 -= *v2*tmp0;
3161               sum3 -= *v3*tmp0;
3162               sum4 -= *v4*tmp0;
3163             }
3164             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3165             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3166             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3167             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3168             break;
3169           case 5:
3170 
3171             sum1  = xb[row];
3172             sum2  = xb[row-1];
3173             sum3  = xb[row-2];
3174             sum4  = xb[row-3];
3175             sum5  = xb[row-4];
3176             v2    = a->a + diag[row-1] + 2;
3177             v3    = a->a + diag[row-2] + 3;
3178             v4    = a->a + diag[row-3] + 4;
3179             v5    = a->a + diag[row-4] + 5;
3180             for(n = 0; n<sz-1; n+=2) {
3181               i1   = idx[0];
3182               i2   = idx[1];
3183               idx += 2;
3184               tmp0 = x[i1];
3185               tmp1 = x[i2];
3186               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3187               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3188               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3189               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3190               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3191             }
3192 
3193             if (n == sz-1){
3194               tmp0  = x[*idx];
3195               sum1 -= *v1*tmp0;
3196               sum2 -= *v2*tmp0;
3197               sum3 -= *v3*tmp0;
3198               sum4 -= *v4*tmp0;
3199               sum5 -= *v5*tmp0;
3200             }
3201             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3202             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3203             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3204             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3205             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3206             break;
3207 	  default:
3208    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3209         }
3210       }
3211 
3212       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3213     }
3214     its--;
3215     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3216     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
3217   }
3218   if (flag & SOR_EISENSTAT) {
3219     const PetscScalar *b;
3220     MatScalar         *t = a->inode.ssor_work;
3221 
3222     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3223     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3224     /*
3225           Apply  (U + D)^-1  where D is now the block diagonal
3226     */
3227     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3228     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3229       ibdiag -= sizes[i]*sizes[i];
3230       sz      = ii[row+1] - diag[row] - 1;
3231       v1      = a->a + diag[row] + 1;
3232       idx     = a->j + diag[row] + 1;
3233       CHKMEMQ;
3234       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3235       switch (sizes[i]){
3236         case 1:
3237 
3238 	  sum1  = b[row];
3239 	  for(n = 0; n<sz-1; n+=2) {
3240 	    i1   = idx[0];
3241 	    i2   = idx[1];
3242 	    idx += 2;
3243 	    tmp0 = x[i1];
3244 	    tmp1 = x[i2];
3245 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3246 	  }
3247 
3248 	  if (n == sz-1){
3249 	    tmp0  = x[*idx];
3250 	    sum1 -= *v1*tmp0;
3251 	  }
3252 	  x[row] = sum1*(*ibdiag);row--;
3253 	  break;
3254 
3255         case 2:
3256 
3257 	  sum1  = b[row];
3258 	  sum2  = b[row-1];
3259 	  /* note that sum1 is associated with the second of the two rows */
3260 	  v2    = a->a + diag[row-1] + 2;
3261 	  for(n = 0; n<sz-1; n+=2) {
3262 	    i1   = idx[0];
3263 	    i2   = idx[1];
3264 	    idx += 2;
3265 	    tmp0 = x[i1];
3266 	    tmp1 = x[i2];
3267 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3268 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3269 	  }
3270 
3271 	  if (n == sz-1){
3272 	    tmp0  = x[*idx];
3273 	    sum1 -= *v1*tmp0;
3274 	    sum2 -= *v2*tmp0;
3275 	  }
3276 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3277 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3278           row -= 2;
3279 	  break;
3280         case 3:
3281 
3282 	  sum1  = b[row];
3283 	  sum2  = b[row-1];
3284 	  sum3  = b[row-2];
3285 	  v2    = a->a + diag[row-1] + 2;
3286 	  v3    = a->a + diag[row-2] + 3;
3287 	  for(n = 0; n<sz-1; n+=2) {
3288 	    i1   = idx[0];
3289 	    i2   = idx[1];
3290 	    idx += 2;
3291 	    tmp0 = x[i1];
3292 	    tmp1 = x[i2];
3293 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3294 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3295 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3296 	  }
3297 
3298 	  if (n == sz-1){
3299 	    tmp0  = x[*idx];
3300 	    sum1 -= *v1*tmp0;
3301 	    sum2 -= *v2*tmp0;
3302 	    sum3 -= *v3*tmp0;
3303 	  }
3304 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3305 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3306 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3307           row -= 3;
3308 	  break;
3309         case 4:
3310 
3311 	  sum1  = b[row];
3312 	  sum2  = b[row-1];
3313 	  sum3  = b[row-2];
3314 	  sum4  = b[row-3];
3315 	  v2    = a->a + diag[row-1] + 2;
3316 	  v3    = a->a + diag[row-2] + 3;
3317 	  v4    = a->a + diag[row-3] + 4;
3318 	  for(n = 0; n<sz-1; n+=2) {
3319 	    i1   = idx[0];
3320 	    i2   = idx[1];
3321 	    idx += 2;
3322 	    tmp0 = x[i1];
3323 	    tmp1 = x[i2];
3324 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3325 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3326 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3327 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3328 	  }
3329 
3330 	  if (n == sz-1){
3331 	    tmp0  = x[*idx];
3332 	    sum1 -= *v1*tmp0;
3333 	    sum2 -= *v2*tmp0;
3334 	    sum3 -= *v3*tmp0;
3335 	    sum4 -= *v4*tmp0;
3336 	  }
3337 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3338 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3339 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3340 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3341           row -= 4;
3342 	  break;
3343         case 5:
3344 
3345 	  sum1  = b[row];
3346 	  sum2  = b[row-1];
3347 	  sum3  = b[row-2];
3348 	  sum4  = b[row-3];
3349 	  sum5  = b[row-4];
3350 	  v2    = a->a + diag[row-1] + 2;
3351 	  v3    = a->a + diag[row-2] + 3;
3352 	  v4    = a->a + diag[row-3] + 4;
3353 	  v5    = a->a + diag[row-4] + 5;
3354 	  for(n = 0; n<sz-1; n+=2) {
3355 	    i1   = idx[0];
3356 	    i2   = idx[1];
3357 	    idx += 2;
3358 	    tmp0 = x[i1];
3359 	    tmp1 = x[i2];
3360 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3361 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3362 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3363 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3364 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3365 	  }
3366 
3367 	  if (n == sz-1){
3368 	    tmp0  = x[*idx];
3369 	    sum1 -= *v1*tmp0;
3370 	    sum2 -= *v2*tmp0;
3371 	    sum3 -= *v3*tmp0;
3372 	    sum4 -= *v4*tmp0;
3373 	    sum5 -= *v5*tmp0;
3374 	  }
3375 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3376 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3377 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3378 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3379 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3380           row -= 5;
3381 	  break;
3382         default:
3383 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3384       }
3385       CHKMEMQ;
3386     }
3387     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3388 
3389     /*
3390            t = b - D x    where D is the block diagonal
3391     */
3392     cnt = 0;
3393     for (i=0, row=0; i<m; i++) {
3394       CHKMEMQ;
3395       switch (sizes[i]){
3396         case 1:
3397 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3398 	  break;
3399         case 2:
3400 	  x1   = x[row]; x2 = x[row+1];
3401 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3402 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3403 	  t[row]   = b[row] - tmp1;
3404 	  t[row+1] = b[row+1] - tmp2; row += 2;
3405 	  cnt += 4;
3406 	  break;
3407         case 3:
3408 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3409 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3410 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3411 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3412 	  t[row] = b[row] - tmp1;
3413 	  t[row+1] = b[row+1] - tmp2;
3414 	  t[row+2] = b[row+2] - tmp3; row += 3;
3415 	  cnt += 9;
3416 	  break;
3417         case 4:
3418 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3419 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3420 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3421 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3422 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3423 	  t[row] = b[row] - tmp1;
3424 	  t[row+1] = b[row+1] - tmp2;
3425 	  t[row+2] = b[row+2] - tmp3;
3426 	  t[row+3] = b[row+3] - tmp4; row += 4;
3427 	  cnt += 16;
3428 	  break;
3429         case 5:
3430 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3431 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3432 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3433 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3434 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3435 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3436 	  t[row] = b[row] - tmp1;
3437 	  t[row+1] = b[row+1] - tmp2;
3438 	  t[row+2] = b[row+2] - tmp3;
3439 	  t[row+3] = b[row+3] - tmp4;
3440 	  t[row+4] = b[row+4] - tmp5;row += 5;
3441 	  cnt += 25;
3442 	  break;
3443         default:
3444 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3445       }
3446       CHKMEMQ;
3447     }
3448     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3449 
3450 
3451 
3452     /*
3453           Apply (L + D)^-1 where D is the block diagonal
3454     */
3455     for (i=0, row=0; i<m; i++) {
3456       sz  = diag[row] - ii[row];
3457       v1  = a->a + ii[row];
3458       idx = a->j + ii[row];
3459       CHKMEMQ;
3460       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3461       switch (sizes[i]){
3462         case 1:
3463 
3464 	  sum1  = t[row];
3465 	  for(n = 0; n<sz-1; n+=2) {
3466 	    i1   = idx[0];
3467 	    i2   = idx[1];
3468 	    idx += 2;
3469 	    tmp0 = t[i1];
3470 	    tmp1 = t[i2];
3471 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3472 	  }
3473 
3474 	  if (n == sz-1){
3475 	    tmp0  = t[*idx];
3476 	    sum1 -= *v1 * tmp0;
3477 	  }
3478 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3479 	  break;
3480         case 2:
3481 	  v2    = a->a + ii[row+1];
3482 	  sum1  = t[row];
3483 	  sum2  = t[row+1];
3484 	  for(n = 0; n<sz-1; n+=2) {
3485 	    i1   = idx[0];
3486 	    i2   = idx[1];
3487 	    idx += 2;
3488 	    tmp0 = t[i1];
3489 	    tmp1 = t[i2];
3490 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3491 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3492 	  }
3493 
3494 	  if (n == sz-1){
3495 	    tmp0  = t[*idx];
3496 	    sum1 -= v1[0] * tmp0;
3497 	    sum2 -= v2[0] * tmp0;
3498 	  }
3499 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3500 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3501 	  ibdiag  += 4; row += 2;
3502 	  break;
3503         case 3:
3504 	  v2    = a->a + ii[row+1];
3505 	  v3    = a->a + ii[row+2];
3506 	  sum1  = t[row];
3507 	  sum2  = t[row+1];
3508 	  sum3  = t[row+2];
3509 	  for(n = 0; n<sz-1; n+=2) {
3510 	    i1   = idx[0];
3511 	    i2   = idx[1];
3512 	    idx += 2;
3513 	    tmp0 = t[i1];
3514 	    tmp1 = t[i2];
3515 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3516 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3517 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3518 	  }
3519 
3520 	  if (n == sz-1){
3521 	    tmp0  = t[*idx];
3522 	    sum1 -= v1[0] * tmp0;
3523 	    sum2 -= v2[0] * tmp0;
3524 	    sum3 -= v3[0] * tmp0;
3525 	  }
3526 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3527 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3528 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3529 	  ibdiag  += 9; row += 3;
3530 	  break;
3531         case 4:
3532 	  v2    = a->a + ii[row+1];
3533 	  v3    = a->a + ii[row+2];
3534 	  v4    = a->a + ii[row+3];
3535 	  sum1  = t[row];
3536 	  sum2  = t[row+1];
3537 	  sum3  = t[row+2];
3538 	  sum4  = t[row+3];
3539 	  for(n = 0; n<sz-1; n+=2) {
3540 	    i1   = idx[0];
3541 	    i2   = idx[1];
3542 	    idx += 2;
3543 	    tmp0 = t[i1];
3544 	    tmp1 = t[i2];
3545 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3546 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3547 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3548 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3549 	  }
3550 
3551 	  if (n == sz-1){
3552 	    tmp0  = t[*idx];
3553 	    sum1 -= v1[0] * tmp0;
3554 	    sum2 -= v2[0] * tmp0;
3555 	    sum3 -= v3[0] * tmp0;
3556 	    sum4 -= v4[0] * tmp0;
3557 	  }
3558 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3559 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3560 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3561 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3562 	  ibdiag  += 16; row += 4;
3563 	  break;
3564         case 5:
3565 	  v2    = a->a + ii[row+1];
3566 	  v3    = a->a + ii[row+2];
3567 	  v4    = a->a + ii[row+3];
3568 	  v5    = a->a + ii[row+4];
3569 	  sum1  = t[row];
3570 	  sum2  = t[row+1];
3571 	  sum3  = t[row+2];
3572 	  sum4  = t[row+3];
3573 	  sum5  = t[row+4];
3574 	  for(n = 0; n<sz-1; n+=2) {
3575 	    i1   = idx[0];
3576 	    i2   = idx[1];
3577 	    idx += 2;
3578 	    tmp0 = t[i1];
3579 	    tmp1 = t[i2];
3580 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3581 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3582 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3583 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3584 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3585 	  }
3586 
3587 	  if (n == sz-1){
3588 	    tmp0  = t[*idx];
3589 	    sum1 -= v1[0] * tmp0;
3590 	    sum2 -= v2[0] * tmp0;
3591 	    sum3 -= v3[0] * tmp0;
3592 	    sum4 -= v4[0] * tmp0;
3593 	    sum5 -= v5[0] * tmp0;
3594 	  }
3595 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3596 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3597 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3598 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3599 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3600 	  ibdiag  += 25; row += 5;
3601 	  break;
3602         default:
3603 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3604       }
3605       CHKMEMQ;
3606     }
3607     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3608     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3609     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3610   }
3611   PetscFunctionReturn(0);
3612 }
3613 
3614 #undef __FUNCT__
3615 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3616 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3617 {
3618   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3619   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3620   const MatScalar    *bdiag = a->inode.bdiag;
3621   const PetscScalar  *b;
3622   PetscErrorCode      ierr;
3623   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3624   const PetscInt      *sizes = a->inode.size;
3625 
3626   PetscFunctionBegin;
3627   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3628   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3629   cnt = 0;
3630   for (i=0, row=0; i<m; i++) {
3631     switch (sizes[i]){
3632       case 1:
3633 	x[row] = b[row]*bdiag[cnt++];row++;
3634 	break;
3635       case 2:
3636 	x1   = b[row]; x2 = b[row+1];
3637 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3638 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3639 	x[row++] = tmp1;
3640 	x[row++] = tmp2;
3641 	cnt += 4;
3642 	break;
3643       case 3:
3644 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3645 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3646 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3647 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3648 	x[row++] = tmp1;
3649 	x[row++] = tmp2;
3650 	x[row++] = tmp3;
3651 	cnt += 9;
3652 	break;
3653       case 4:
3654 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3655 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3656 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3657 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3658 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3659 	x[row++] = tmp1;
3660 	x[row++] = tmp2;
3661 	x[row++] = tmp3;
3662 	x[row++] = tmp4;
3663 	cnt += 16;
3664 	break;
3665       case 5:
3666 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3667 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3668 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3669 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3670 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3671 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3672 	x[row++] = tmp1;
3673 	x[row++] = tmp2;
3674 	x[row++] = tmp3;
3675 	x[row++] = tmp4;
3676 	x[row++] = tmp5;
3677 	cnt += 25;
3678 	break;
3679       default:
3680 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3681     }
3682   }
3683   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3684   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3685   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3686   PetscFunctionReturn(0);
3687 }
3688 
3689 /*
3690     samestructure indicates that the matrix has not changed its nonzero structure so we
3691     do not need to recompute the inodes
3692 */
3693 #undef __FUNCT__
3694 #define __FUNCT__ "Mat_CheckInode"
3695 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3696 {
3697   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3698   PetscErrorCode ierr;
3699   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3700   PetscTruth     flag;
3701 
3702   PetscFunctionBegin;
3703   if (!a->inode.use)                     PetscFunctionReturn(0);
3704   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3705 
3706 
3707   m = A->rmap->n;
3708   if (a->inode.size) {ns = a->inode.size;}
3709   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3710 
3711   i          = 0;
3712   node_count = 0;
3713   idx        = a->j;
3714   ii         = a->i;
3715   while (i < m){                /* For each row */
3716     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3717     /* Limits the number of elements in a node to 'a->inode.limit' */
3718     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3719       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3720       if (nzy != nzx) break;
3721       idy  += nzx;             /* Same nonzero pattern */
3722       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3723       if (!flag) break;
3724     }
3725     ns[node_count++] = blk_size;
3726     idx += blk_size*nzx;
3727     i    = j;
3728   }
3729   /* If not enough inodes found,, do not use inode version of the routines */
3730   if (!a->inode.size && m && node_count > .9*m) {
3731     ierr = PetscFree(ns);CHKERRQ(ierr);
3732     a->inode.node_count     = 0;
3733     a->inode.size           = PETSC_NULL;
3734     a->inode.use            = PETSC_FALSE;
3735     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3736   } else {
3737     if (!A->factor) {
3738       A->ops->mult              = MatMult_SeqAIJ_Inode;
3739       A->ops->sor               = MatSOR_SeqAIJ_Inode;
3740       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3741       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3742       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3743       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3744       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3745       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3746       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3747     } else {
3748       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
3749     }
3750     a->inode.node_count       = node_count;
3751     a->inode.size             = ns;
3752     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3753   }
3754   PetscFunctionReturn(0);
3755 }
3756 
3757 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3758 PetscInt __k, *__vi; \
3759 __vi = aj + ai[row];				\
3760 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3761 __vi = aj + adiag[row];				\
3762 cols[nzl] = __vi[0];\
3763 __vi = aj + adiag[row+1]+1;\
3764 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3765 
3766 
3767 /*
3768    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3769    Modified from Mat_CheckInode().
3770 
3771    Input Parameters:
3772 +  Mat A - ILU or LU matrix factor
3773 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3774     do not need to recompute the inodes
3775 */
3776 #undef __FUNCT__
3777 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3778 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3779 {
3780   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3781   PetscErrorCode ierr;
3782   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3783   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3784   PetscTruth     flag;
3785 
3786   PetscFunctionBegin;
3787   if (!a->inode.use)                     PetscFunctionReturn(0);
3788   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3789 
3790   m = A->rmap->n;
3791   if (a->inode.size) {ns = a->inode.size;}
3792   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3793 
3794   i          = 0;
3795   node_count = 0;
3796   while (i < m){                /* For each row */
3797     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3798     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3799     nzx  = nzl1 + nzu1 + 1;
3800     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3801     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3802 
3803     /* Limits the number of elements in a node to 'a->inode.limit' */
3804     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3805       nzl2    = ai[j+1] - ai[j];
3806       nzu2    = adiag[j] - adiag[j+1] - 1;
3807       nzy     = nzl2 + nzu2 + 1;
3808       if( nzy != nzx) break;
3809       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3810       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3811       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3812       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3813       ierr = PetscFree(cols2);CHKERRQ(ierr);
3814     }
3815     ns[node_count++] = blk_size;
3816     ierr = PetscFree(cols1);CHKERRQ(ierr);
3817     i    = j;
3818   }
3819   /* If not enough inodes found,, do not use inode version of the routines */
3820   if (!a->inode.size && m && node_count > .9*m) {
3821     ierr = PetscFree(ns);CHKERRQ(ierr);
3822     a->inode.node_count     = 0;
3823     a->inode.size           = PETSC_NULL;
3824     a->inode.use            = PETSC_FALSE;
3825     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3826   } else {
3827     A->ops->mult              = 0;
3828     A->ops->sor               = 0;
3829     A->ops->multadd           = 0;
3830     A->ops->getrowij          = 0;
3831     A->ops->restorerowij      = 0;
3832     A->ops->getcolumnij       = 0;
3833     A->ops->restorecolumnij   = 0;
3834     A->ops->coloringpatch     = 0;
3835     A->ops->multdiagonalblock = 0;
3836     A->ops->solve             = MatSolve_SeqAIJ_Inode;
3837     a->inode.node_count       = node_count;
3838     a->inode.size             = ns;
3839     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3840   }
3841   PetscFunctionReturn(0);
3842 }
3843 
3844 /*
3845      This is really ugly. if inodes are used this replaces the
3846   permutations with ones that correspond to rows/cols of the matrix
3847   rather then inode blocks
3848 */
3849 #undef __FUNCT__
3850 #define __FUNCT__ "MatInodeAdjustForInodes"
3851 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3852 {
3853   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3854 
3855   PetscFunctionBegin;
3856   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3857   if (f) {
3858     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3859   }
3860   PetscFunctionReturn(0);
3861 }
3862 
3863 EXTERN_C_BEGIN
3864 #undef __FUNCT__
3865 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3866 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3867 {
3868   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3869   PetscErrorCode ierr;
3870   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3871   const PetscInt *ridx,*cidx;
3872   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3873   PetscInt       nslim_col,*ns_col;
3874   IS             ris = *rperm,cis = *cperm;
3875 
3876   PetscFunctionBegin;
3877   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3878   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3879 
3880   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3881   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3882   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3883 
3884   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3885   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3886 
3887   /* Form the inode structure for the rows of permuted matric using inv perm*/
3888   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3889 
3890   /* Construct the permutations for rows*/
3891   for (i=0,row = 0; i<nslim_row; ++i){
3892     indx      = ridx[i];
3893     start_val = tns[indx];
3894     end_val   = tns[indx + 1];
3895     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3896   }
3897 
3898   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3899   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3900 
3901  /* Construct permutations for columns */
3902   for (i=0,col=0; i<nslim_col; ++i){
3903     indx      = cidx[i];
3904     start_val = tns[indx];
3905     end_val   = tns[indx + 1];
3906     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3907   }
3908 
3909   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3910   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3911   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3912   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3913 
3914   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3915   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3916 
3917   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3918   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3919   ierr = ISDestroy(cis);CHKERRQ(ierr);
3920   ierr = ISDestroy(ris);CHKERRQ(ierr);
3921   ierr = PetscFree(tns);CHKERRQ(ierr);
3922   PetscFunctionReturn(0);
3923 }
3924 EXTERN_C_END
3925 
3926 #undef __FUNCT__
3927 #define __FUNCT__ "MatInodeGetInodeSizes"
3928 /*@C
3929    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3930 
3931    Collective on Mat
3932 
3933    Input Parameter:
3934 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3935 
3936    Output Parameter:
3937 +  node_count - no of inodes present in the matrix.
3938 .  sizes      - an array of size node_count,with sizes of each inode.
3939 -  limit      - the max size used to generate the inodes.
3940 
3941    Level: advanced
3942 
3943    Notes: This routine returns some internal storage information
3944    of the matrix, it is intended to be used by advanced users.
3945    It should be called after the matrix is assembled.
3946    The contents of the sizes[] array should not be changed.
3947    PETSC_NULL may be passed for information not requested.
3948 
3949 .keywords: matrix, seqaij, get, inode
3950 
3951 .seealso: MatGetInfo()
3952 @*/
3953 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3954 {
3955   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3956 
3957   PetscFunctionBegin;
3958   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3959   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3960   if (f) {
3961     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3962   }
3963   PetscFunctionReturn(0);
3964 }
3965 
3966 EXTERN_C_BEGIN
3967 #undef __FUNCT__
3968 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3969 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3970 {
3971   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3972 
3973   PetscFunctionBegin;
3974   if (node_count) *node_count = a->inode.node_count;
3975   if (sizes)      *sizes      = a->inode.size;
3976   if (limit)      *limit      = a->inode.limit;
3977   PetscFunctionReturn(0);
3978 }
3979 EXTERN_C_END
3980