xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 5ebcd58d2ef61c69f1c1a24f2d3c9c0afa946639)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
781 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 /* ----------------------------------------------------------- */
1182 #undef __FUNCT__
1183 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_newdatastruct"
1184 PetscErrorCode MatSolve_SeqAIJ_Inode_newdatastruct(Mat A,Vec bb,Vec xx)
1185 {
1186   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
1187   IS                iscol = a->col,isrow = a->row;
1188   PetscErrorCode    ierr;
1189   const PetscInt    *r,*c,*rout,*cout;
1190   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
1191   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
1192   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
1193   PetscScalar       sum1,sum2,sum3,sum4,sum5;
1194   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
1195   const PetscScalar *b;
1196 
1197   PetscFunctionBegin;
1198   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
1199   node_max = a->inode.node_count;
1200   ns       = a->inode.size;     /* Node Size array */
1201 
1202   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1203   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1204   tmp  = a->solve_work;
1205 
1206   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1207   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1208 
1209   /* forward solve the lower triangular */
1210   tmps = tmp ;
1211   aa   = a_a ;
1212   aj   = a_j ;
1213   ad   = a->diag;
1214 
1215   for (i = 0,row = 0; i< node_max; ++i){
1216     nsz = ns[i];
1217     aii = ai[row];
1218     v1  = aa + aii;
1219     vi  = aj + aii;
1220     nz  = ai[row+1]- ai[row];
1221 
1222     switch (nsz){               /* Each loop in 'case' is unrolled */
1223     case 1 :
1224       sum1 = b[r[row]];
1225       for(j=0; j<nz-1; j+=2){
1226         i0   = vi[j];
1227         i1   = vi[j+1];
1228         tmp0 = tmps[i0];
1229         tmp1 = tmps[i1];
1230         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
1231       }
1232       if(j == nz-1){
1233         tmp0 = tmps[vi[j]];
1234         sum1 -= v1[j]*tmp0;
1235       }
1236       tmp[row++]=sum1;
1237       break;
1238     case 2:
1239       sum1 = b[r[row]];
1240       sum2 = b[r[row+1]];
1241       v2   = aa + ai[row+1];
1242 
1243       for(j=0; j<nz-1; j+=2){
1244         i0   = vi[j];
1245         i1   = vi[j+1];
1246         tmp0 = tmps[i0];
1247         tmp1 = tmps[i1];
1248         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1249         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1250       }
1251       if(j == nz-1){
1252         tmp0 = tmps[vi[j]];
1253         sum1 -= v1[j] *tmp0;
1254         sum2 -= v2[j] *tmp0;
1255       }
1256       sum2 -= v2[nz] * sum1;
1257       tmp[row ++]=sum1;
1258       tmp[row ++]=sum2;
1259       break;
1260     case 3:
1261       sum1 = b[r[row]];
1262       sum2 = b[r[row+1]];
1263       sum3 = b[r[row+2]];
1264       v2   = aa + ai[row+1];
1265       v3   = aa + ai[row+2];
1266 
1267       for (j=0; j<nz-1; j+=2){
1268         i0   = vi[j];
1269         i1   = vi[j+1];
1270         tmp0 = tmps[i0];
1271         tmp1 = tmps[i1];
1272         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1273         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1274         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1275       }
1276       if (j == nz-1){
1277         tmp0 = tmps[vi[j]];
1278         sum1 -= v1[j] *tmp0;
1279         sum2 -= v2[j] *tmp0;
1280         sum3 -= v3[j] *tmp0;
1281       }
1282       sum2 -= v2[nz] * sum1;
1283       sum3 -= v3[nz] * sum1;
1284       sum3 -= v3[nz+1] * sum2;
1285       tmp[row ++]=sum1;
1286       tmp[row ++]=sum2;
1287       tmp[row ++]=sum3;
1288       break;
1289 
1290     case 4:
1291       sum1 = b[r[row]];
1292       sum2 = b[r[row+1]];
1293       sum3 = b[r[row+2]];
1294       sum4 = b[r[row+3]];
1295       v2   = aa + ai[row+1];
1296       v3   = aa + ai[row+2];
1297       v4   = aa + ai[row+3];
1298 
1299       for (j=0; j<nz-1; j+=2){
1300         i0   = vi[j];
1301         i1   = vi[j+1];
1302         tmp0 = tmps[i0];
1303         tmp1 = tmps[i1];
1304         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1305         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1306         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1307         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
1308       }
1309       if (j == nz-1){
1310         tmp0 = tmps[vi[j]];
1311         sum1 -= v1[j] *tmp0;
1312         sum2 -= v2[j] *tmp0;
1313         sum3 -= v3[j] *tmp0;
1314         sum4 -= v4[j] *tmp0;
1315       }
1316       sum2 -= v2[nz] * sum1;
1317       sum3 -= v3[nz] * sum1;
1318       sum4 -= v4[nz] * sum1;
1319       sum3 -= v3[nz+1] * sum2;
1320       sum4 -= v4[nz+1] * sum2;
1321       sum4 -= v4[nz+2] * sum3;
1322 
1323       tmp[row ++]=sum1;
1324       tmp[row ++]=sum2;
1325       tmp[row ++]=sum3;
1326       tmp[row ++]=sum4;
1327       break;
1328     case 5:
1329       sum1 = b[r[row]];
1330       sum2 = b[r[row+1]];
1331       sum3 = b[r[row+2]];
1332       sum4 = b[r[row+3]];
1333       sum5 = b[r[row+4]];
1334       v2   = aa + ai[row+1];
1335       v3   = aa + ai[row+2];
1336       v4   = aa + ai[row+3];
1337       v5   = aa + ai[row+4];
1338 
1339       for (j=0; j<nz-1; j+=2){
1340         i0   = vi[j];
1341         i1   = vi[j+1];
1342         tmp0 = tmps[i0];
1343         tmp1 = tmps[i1];
1344         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1345         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1346         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1347         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
1348         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
1349       }
1350       if (j == nz-1){
1351         tmp0 = tmps[vi[j]];
1352         sum1 -= v1[j] *tmp0;
1353         sum2 -= v2[j] *tmp0;
1354         sum3 -= v3[j] *tmp0;
1355         sum4 -= v4[j] *tmp0;
1356         sum5 -= v5[j] *tmp0;
1357       }
1358 
1359       sum2 -= v2[nz] * sum1;
1360       sum3 -= v3[nz] * sum1;
1361       sum4 -= v4[nz] * sum1;
1362       sum5 -= v5[nz] * sum1;
1363       sum3 -= v3[nz+1] * sum2;
1364       sum4 -= v4[nz+1] * sum2;
1365       sum5 -= v5[nz+1] * sum2;
1366       sum4 -= v4[nz+2] * sum3;
1367       sum5 -= v5[nz+2] * sum3;
1368       sum5 -= v5[nz+3] * sum4;
1369 
1370       tmp[row ++]=sum1;
1371       tmp[row ++]=sum2;
1372       tmp[row ++]=sum3;
1373       tmp[row ++]=sum4;
1374       tmp[row ++]=sum5;
1375       break;
1376     default:
1377       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1378     }
1379   }
1380   /* backward solve the upper triangular */
1381   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1382     nsz = ns[i];
1383     aii = ad[row+1] + 1;
1384     v1  = aa + aii;
1385     vi  = aj + aii;
1386     nz  = ad[row]- ad[row+1] - 1;
1387     switch (nsz){               /* Each loop in 'case' is unrolled */
1388     case 1 :
1389       sum1 = tmp[row];
1390 
1391       for(j=0 ; j<nz-1; j+=2){
1392         i0   = vi[j];
1393         i1   = vi[j+1];
1394         tmp0 = tmps[i0];
1395         tmp1 = tmps[i1];
1396         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1397       }
1398       if (j == nz-1){
1399         tmp0  = tmps[vi[j]];
1400         sum1 -= v1[j]*tmp0;
1401       }
1402       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1403       break;
1404     case 2 :
1405       sum1 = tmp[row];
1406       sum2 = tmp[row-1];
1407       v2   = aa + ad[row] + 1;
1408       for (j=0 ; j<nz-1; j+=2){
1409         i0   = vi[j];
1410         i1   = vi[j+1];
1411         tmp0 = tmps[i0];
1412         tmp1 = tmps[i1];
1413         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1414         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1415       }
1416       if (j == nz-1){
1417         tmp0  = tmps[vi[j]];
1418         sum1 -= v1[j]* tmp0;
1419         sum2 -= v2[j+1]* tmp0;
1420       }
1421 
1422       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1423       sum2   -= v2[0] * tmp0;
1424       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1425       break;
1426     case 3 :
1427       sum1 = tmp[row];
1428       sum2 = tmp[row -1];
1429       sum3 = tmp[row -2];
1430       v2   = aa + ad[row] + 1;
1431       v3   = aa + ad[row -1] + 1;
1432       for (j=0 ; j<nz-1; j+=2){
1433         i0   = vi[j];
1434         i1   = vi[j+1];
1435         tmp0 = tmps[i0];
1436         tmp1 = tmps[i1];
1437         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1438         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1439         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1440       }
1441       if (j== nz-1){
1442         tmp0  = tmps[vi[j]];
1443         sum1 -= v1[j] * tmp0;
1444         sum2 -= v2[j+1] * tmp0;
1445         sum3 -= v3[j+2] * tmp0;
1446       }
1447       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1448       sum2   -= v2[0]* tmp0;
1449       sum3   -= v3[1] * tmp0;
1450       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1451       sum3   -= v3[0]* tmp0;
1452       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1453 
1454       break;
1455     case 4 :
1456       sum1 = tmp[row];
1457       sum2 = tmp[row -1];
1458       sum3 = tmp[row -2];
1459       sum4 = tmp[row -3];
1460       v2   = aa + ad[row]+1;
1461       v3   = aa + ad[row -1]+1;
1462       v4   = aa + ad[row -2]+1;
1463 
1464       for (j=0 ; j<nz-1; j+=2){
1465         i0   = vi[j];
1466         i1   = vi[j+1];
1467         tmp0 = tmps[i0];
1468         tmp1 = tmps[i1];
1469         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
1470         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1471         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1472         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
1473       }
1474       if (j== nz-1){
1475         tmp0  = tmps[vi[j]];
1476         sum1 -= v1[j] * tmp0;
1477         sum2 -= v2[j+1] * tmp0;
1478         sum3 -= v3[j+2] * tmp0;
1479         sum4 -= v4[j+3] * tmp0;
1480       }
1481 
1482       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1483       sum2   -= v2[0] * tmp0;
1484       sum3   -= v3[1] * tmp0;
1485       sum4   -= v4[2] * tmp0;
1486       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1487       sum3   -= v3[0] * tmp0;
1488       sum4   -= v4[1] * tmp0;
1489       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1490       sum4   -= v4[0] * tmp0;
1491       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
1492       break;
1493     case 5 :
1494       sum1 = tmp[row];
1495       sum2 = tmp[row -1];
1496       sum3 = tmp[row -2];
1497       sum4 = tmp[row -3];
1498       sum5 = tmp[row -4];
1499       v2   = aa + ad[row]+1;
1500       v3   = aa + ad[row -1]+1;
1501       v4   = aa + ad[row -2]+1;
1502       v5   = aa + ad[row -3]+1;
1503       for (j=0 ; j<nz-1; j+=2){
1504         i0   = vi[j];
1505         i1   = vi[j+1];
1506         tmp0 = tmps[i0];
1507         tmp1 = tmps[i1];
1508         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1509         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1510         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1511         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
1512         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
1513       }
1514       if (j==nz-1){
1515         tmp0  = tmps[vi[j]];
1516         sum1 -= v1[j] * tmp0;
1517         sum2 -= v2[j+1] * tmp0;
1518         sum3 -= v3[j+2] * tmp0;
1519         sum4 -= v4[j+3] * tmp0;
1520         sum5 -= v5[j+4] * tmp0;
1521       }
1522 
1523       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1524       sum2   -= v2[0] * tmp0;
1525       sum3   -= v3[1] * tmp0;
1526       sum4   -= v4[2] * tmp0;
1527       sum5   -= v5[3] * tmp0;
1528       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1529       sum3   -= v3[0] * tmp0;
1530       sum4   -= v4[1] * tmp0;
1531       sum5   -= v5[2] * tmp0;
1532       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1533       sum4   -= v4[0] * tmp0;
1534       sum5   -= v5[1] * tmp0;
1535       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
1536       sum5   -= v5[0] * tmp0;
1537       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
1538       break;
1539     default:
1540       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1541     }
1542   }
1543   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1544   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1545   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1546   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1547   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1548   PetscFunctionReturn(0);
1549 }
1550 
1551 
1552 /*
1553      Makes a longer coloring[] array and calls the usual code with that
1554 */
1555 #undef __FUNCT__
1556 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
1557 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1558 {
1559   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1560   PetscErrorCode  ierr;
1561   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1562   PetscInt        *colorused,i;
1563   ISColoringValue *newcolor;
1564 
1565   PetscFunctionBegin;
1566   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1567   /* loop over inodes, marking a color for each column*/
1568   row = 0;
1569   for (i=0; i<m; i++){
1570     for (j=0; j<ns[i]; j++) {
1571       newcolor[row++] = coloring[i] + j*ncolors;
1572     }
1573   }
1574 
1575   /* eliminate unneeded colors */
1576   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1577   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1578   for (i=0; i<n; i++) {
1579     colorused[newcolor[i]] = 1;
1580   }
1581 
1582   for (i=1; i<5*ncolors; i++) {
1583     colorused[i] += colorused[i-1];
1584   }
1585   ncolors = colorused[5*ncolors-1];
1586   for (i=0; i<n; i++) {
1587     newcolor[i] = colorused[newcolor[i]]-1;
1588   }
1589   ierr = PetscFree(colorused);CHKERRQ(ierr);
1590   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1591   ierr = PetscFree(coloring);CHKERRQ(ierr);
1592   PetscFunctionReturn(0);
1593 }
1594 
1595 #include "../src/mat/blockinvert.h"
1596 
1597 #undef __FUNCT__
1598 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
1599 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1600 {
1601   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1602   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1603   MatScalar          *ibdiag,*bdiag;
1604   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1605   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
1606   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1607   PetscErrorCode     ierr;
1608   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1609   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;
1610 
1611   PetscFunctionBegin;
1612   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1613   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1614   if (its > 1) {
1615     /* switch to non-inode version */
1616     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
1617     PetscFunctionReturn(0);
1618   }
1619 
1620   if (!a->inode.ibdiagvalid) {
1621     if (!a->inode.ibdiag) {
1622       /* calculate space needed for diagonal blocks */
1623       for (i=0; i<m; i++) {
1624 	cnt += sizes[i]*sizes[i];
1625       }
1626       a->inode.bdiagsize = cnt;
1627       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
1628     }
1629 
1630     /* copy over the diagonal blocks and invert them */
1631     ibdiag = a->inode.ibdiag;
1632     bdiag  = a->inode.bdiag;
1633     cnt = 0;
1634     for (i=0, row = 0; i<m; i++) {
1635       for (j=0; j<sizes[i]; j++) {
1636         for (k=0; k<sizes[i]; k++) {
1637           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1638         }
1639       }
1640       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
1641 
1642       switch(sizes[i]) {
1643         case 1:
1644           /* Create matrix data structure */
1645           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1646           ibdiag[cnt] = 1.0/ibdiag[cnt];
1647           break;
1648         case 2:
1649           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
1650           break;
1651         case 3:
1652           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
1653           break;
1654         case 4:
1655           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
1656           break;
1657         case 5:
1658           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr);
1659           break;
1660        default:
1661 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1662       }
1663       cnt += sizes[i]*sizes[i];
1664       row += sizes[i];
1665     }
1666     a->inode.ibdiagvalid = PETSC_TRUE;
1667   }
1668   ibdiag = a->inode.ibdiag;
1669   bdiag  = a->inode.bdiag;
1670 
1671 
1672   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1673   if (flag & SOR_ZERO_INITIAL_GUESS) {
1674     PetscScalar *b;
1675     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1676     if (xx != bb) {
1677       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1678     } else {
1679       b = x;
1680     }
1681     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1682 
1683       for (i=0, row=0; i<m; i++) {
1684         sz  = diag[row] - ii[row];
1685         v1  = a->a + ii[row];
1686         idx = a->j + ii[row];
1687 
1688         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1689         switch (sizes[i]){
1690           case 1:
1691 
1692             sum1  = b[row];
1693             for(n = 0; n<sz-1; n+=2) {
1694               i1   = idx[0];
1695               i2   = idx[1];
1696               idx += 2;
1697               tmp0 = x[i1];
1698               tmp1 = x[i2];
1699               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1700             }
1701 
1702             if (n == sz-1){
1703               tmp0  = x[*idx];
1704               sum1 -= *v1 * tmp0;
1705             }
1706             x[row++] = sum1*(*ibdiag++);
1707             break;
1708           case 2:
1709             v2    = a->a + ii[row+1];
1710             sum1  = b[row];
1711             sum2  = b[row+1];
1712             for(n = 0; n<sz-1; n+=2) {
1713               i1   = idx[0];
1714               i2   = idx[1];
1715               idx += 2;
1716               tmp0 = x[i1];
1717               tmp1 = x[i2];
1718               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1719               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1720             }
1721 
1722             if (n == sz-1){
1723               tmp0  = x[*idx];
1724               sum1 -= v1[0] * tmp0;
1725               sum2 -= v2[0] * tmp0;
1726             }
1727             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1728             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1729             ibdiag  += 4;
1730             break;
1731           case 3:
1732             v2    = a->a + ii[row+1];
1733             v3    = a->a + ii[row+2];
1734             sum1  = b[row];
1735             sum2  = b[row+1];
1736             sum3  = b[row+2];
1737             for(n = 0; n<sz-1; n+=2) {
1738               i1   = idx[0];
1739               i2   = idx[1];
1740               idx += 2;
1741               tmp0 = x[i1];
1742               tmp1 = x[i2];
1743               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1744               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1745               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1746             }
1747 
1748             if (n == sz-1){
1749               tmp0  = x[*idx];
1750               sum1 -= v1[0] * tmp0;
1751               sum2 -= v2[0] * tmp0;
1752               sum3 -= v3[0] * tmp0;
1753             }
1754             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1755             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1756             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1757             ibdiag  += 9;
1758             break;
1759           case 4:
1760             v2    = a->a + ii[row+1];
1761             v3    = a->a + ii[row+2];
1762             v4    = a->a + ii[row+3];
1763             sum1  = b[row];
1764             sum2  = b[row+1];
1765             sum3  = b[row+2];
1766             sum4  = b[row+3];
1767             for(n = 0; n<sz-1; n+=2) {
1768               i1   = idx[0];
1769               i2   = idx[1];
1770               idx += 2;
1771               tmp0 = x[i1];
1772               tmp1 = x[i2];
1773               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1774               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1775               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1776               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1777             }
1778 
1779             if (n == sz-1){
1780               tmp0  = x[*idx];
1781               sum1 -= v1[0] * tmp0;
1782               sum2 -= v2[0] * tmp0;
1783               sum3 -= v3[0] * tmp0;
1784               sum4 -= v4[0] * tmp0;
1785             }
1786             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1787             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1788             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1789             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1790             ibdiag  += 16;
1791             break;
1792           case 5:
1793             v2    = a->a + ii[row+1];
1794             v3    = a->a + ii[row+2];
1795             v4    = a->a + ii[row+3];
1796             v5    = a->a + ii[row+4];
1797             sum1  = b[row];
1798             sum2  = b[row+1];
1799             sum3  = b[row+2];
1800             sum4  = b[row+3];
1801             sum5  = b[row+4];
1802             for(n = 0; n<sz-1; n+=2) {
1803               i1   = idx[0];
1804               i2   = idx[1];
1805               idx += 2;
1806               tmp0 = x[i1];
1807               tmp1 = x[i2];
1808               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1809               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1810               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1811               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1812               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1813             }
1814 
1815             if (n == sz-1){
1816               tmp0  = x[*idx];
1817               sum1 -= v1[0] * tmp0;
1818               sum2 -= v2[0] * tmp0;
1819               sum3 -= v3[0] * tmp0;
1820               sum4 -= v4[0] * tmp0;
1821               sum5 -= v5[0] * tmp0;
1822             }
1823             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1824             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1825             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1826             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1827             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1828             ibdiag  += 25;
1829             break;
1830 	  default:
1831    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1832         }
1833       }
1834 
1835       xb = x;
1836       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1837     } else xb = b;
1838     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1839         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1840       cnt = 0;
1841       for (i=0, row=0; i<m; i++) {
1842 
1843         switch (sizes[i]){
1844           case 1:
1845             x[row++] *= bdiag[cnt++];
1846             break;
1847           case 2:
1848             x1   = x[row]; x2 = x[row+1];
1849             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1850             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1851             x[row++] = tmp1;
1852             x[row++] = tmp2;
1853             cnt += 4;
1854             break;
1855           case 3:
1856             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1857             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1858             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1859             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1860             x[row++] = tmp1;
1861             x[row++] = tmp2;
1862             x[row++] = tmp3;
1863             cnt += 9;
1864             break;
1865           case 4:
1866             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1867             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1868             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1869             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1870             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1871             x[row++] = tmp1;
1872             x[row++] = tmp2;
1873             x[row++] = tmp3;
1874             x[row++] = tmp4;
1875             cnt += 16;
1876             break;
1877           case 5:
1878             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1879             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1880             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1881             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1882             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1883             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1884             x[row++] = tmp1;
1885             x[row++] = tmp2;
1886             x[row++] = tmp3;
1887             x[row++] = tmp4;
1888             x[row++] = tmp5;
1889             cnt += 25;
1890             break;
1891 	  default:
1892    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1893         }
1894       }
1895       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1896     }
1897     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1898 
1899       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1900       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1901         ibdiag -= sizes[i]*sizes[i];
1902         sz      = ii[row+1] - diag[row] - 1;
1903         v1      = a->a + diag[row] + 1;
1904         idx     = a->j + diag[row] + 1;
1905 
1906         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1907         switch (sizes[i]){
1908           case 1:
1909 
1910             sum1  = xb[row];
1911             for(n = 0; n<sz-1; n+=2) {
1912               i1   = idx[0];
1913               i2   = idx[1];
1914               idx += 2;
1915               tmp0 = x[i1];
1916               tmp1 = x[i2];
1917               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1918             }
1919 
1920             if (n == sz-1){
1921               tmp0  = x[*idx];
1922               sum1 -= *v1*tmp0;
1923             }
1924             x[row--] = sum1*(*ibdiag);
1925             break;
1926 
1927           case 2:
1928 
1929             sum1  = xb[row];
1930             sum2  = xb[row-1];
1931             /* note that sum1 is associated with the second of the two rows */
1932             v2    = a->a + diag[row-1] + 2;
1933             for(n = 0; n<sz-1; n+=2) {
1934               i1   = idx[0];
1935               i2   = idx[1];
1936               idx += 2;
1937               tmp0 = x[i1];
1938               tmp1 = x[i2];
1939               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1940               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1941             }
1942 
1943             if (n == sz-1){
1944               tmp0  = x[*idx];
1945               sum1 -= *v1*tmp0;
1946               sum2 -= *v2*tmp0;
1947             }
1948             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1949             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1950             break;
1951           case 3:
1952 
1953             sum1  = xb[row];
1954             sum2  = xb[row-1];
1955             sum3  = xb[row-2];
1956             v2    = a->a + diag[row-1] + 2;
1957             v3    = a->a + diag[row-2] + 3;
1958             for(n = 0; n<sz-1; n+=2) {
1959               i1   = idx[0];
1960               i2   = idx[1];
1961               idx += 2;
1962               tmp0 = x[i1];
1963               tmp1 = x[i2];
1964               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1965               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1966               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1967             }
1968 
1969             if (n == sz-1){
1970               tmp0  = x[*idx];
1971               sum1 -= *v1*tmp0;
1972               sum2 -= *v2*tmp0;
1973               sum3 -= *v3*tmp0;
1974             }
1975             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
1976             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
1977             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
1978             break;
1979           case 4:
1980 
1981             sum1  = xb[row];
1982             sum2  = xb[row-1];
1983             sum3  = xb[row-2];
1984             sum4  = xb[row-3];
1985             v2    = a->a + diag[row-1] + 2;
1986             v3    = a->a + diag[row-2] + 3;
1987             v4    = a->a + diag[row-3] + 4;
1988             for(n = 0; n<sz-1; n+=2) {
1989               i1   = idx[0];
1990               i2   = idx[1];
1991               idx += 2;
1992               tmp0 = x[i1];
1993               tmp1 = x[i2];
1994               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1995               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1996               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1997               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1998             }
1999 
2000             if (n == sz-1){
2001               tmp0  = x[*idx];
2002               sum1 -= *v1*tmp0;
2003               sum2 -= *v2*tmp0;
2004               sum3 -= *v3*tmp0;
2005               sum4 -= *v4*tmp0;
2006             }
2007             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2008             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2009             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2010             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2011             break;
2012           case 5:
2013 
2014             sum1  = xb[row];
2015             sum2  = xb[row-1];
2016             sum3  = xb[row-2];
2017             sum4  = xb[row-3];
2018             sum5  = xb[row-4];
2019             v2    = a->a + diag[row-1] + 2;
2020             v3    = a->a + diag[row-2] + 3;
2021             v4    = a->a + diag[row-3] + 4;
2022             v5    = a->a + diag[row-4] + 5;
2023             for(n = 0; n<sz-1; n+=2) {
2024               i1   = idx[0];
2025               i2   = idx[1];
2026               idx += 2;
2027               tmp0 = x[i1];
2028               tmp1 = x[i2];
2029               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2030               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2031               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2032               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2033               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2034             }
2035 
2036             if (n == sz-1){
2037               tmp0  = x[*idx];
2038               sum1 -= *v1*tmp0;
2039               sum2 -= *v2*tmp0;
2040               sum3 -= *v3*tmp0;
2041               sum4 -= *v4*tmp0;
2042               sum5 -= *v5*tmp0;
2043             }
2044             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2045             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2046             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2047             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2048             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2049             break;
2050 	  default:
2051    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2052         }
2053       }
2054 
2055       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2056     }
2057     its--;
2058     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2059     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2060   }
2061   if (flag & SOR_EISENSTAT) {
2062     const PetscScalar *b;
2063     MatScalar         *t = a->inode.ssor_work;
2064 
2065     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2066     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2067     /*
2068           Apply  (U + D)^-1  where D is now the block diagonal
2069     */
2070     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2071     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2072       ibdiag -= sizes[i]*sizes[i];
2073       sz      = ii[row+1] - diag[row] - 1;
2074       v1      = a->a + diag[row] + 1;
2075       idx     = a->j + diag[row] + 1;
2076       CHKMEMQ;
2077       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2078       switch (sizes[i]){
2079         case 1:
2080 
2081 	  sum1  = b[row];
2082 	  for(n = 0; n<sz-1; n+=2) {
2083 	    i1   = idx[0];
2084 	    i2   = idx[1];
2085 	    idx += 2;
2086 	    tmp0 = x[i1];
2087 	    tmp1 = x[i2];
2088 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2089 	  }
2090 
2091 	  if (n == sz-1){
2092 	    tmp0  = x[*idx];
2093 	    sum1 -= *v1*tmp0;
2094 	  }
2095 	  x[row] = sum1*(*ibdiag);row--;
2096 	  break;
2097 
2098         case 2:
2099 
2100 	  sum1  = b[row];
2101 	  sum2  = b[row-1];
2102 	  /* note that sum1 is associated with the second of the two rows */
2103 	  v2    = a->a + diag[row-1] + 2;
2104 	  for(n = 0; n<sz-1; n+=2) {
2105 	    i1   = idx[0];
2106 	    i2   = idx[1];
2107 	    idx += 2;
2108 	    tmp0 = x[i1];
2109 	    tmp1 = x[i2];
2110 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2111 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2112 	  }
2113 
2114 	  if (n == sz-1){
2115 	    tmp0  = x[*idx];
2116 	    sum1 -= *v1*tmp0;
2117 	    sum2 -= *v2*tmp0;
2118 	  }
2119 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
2120 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
2121           row -= 2;
2122 	  break;
2123         case 3:
2124 
2125 	  sum1  = b[row];
2126 	  sum2  = b[row-1];
2127 	  sum3  = b[row-2];
2128 	  v2    = a->a + diag[row-1] + 2;
2129 	  v3    = a->a + diag[row-2] + 3;
2130 	  for(n = 0; n<sz-1; n+=2) {
2131 	    i1   = idx[0];
2132 	    i2   = idx[1];
2133 	    idx += 2;
2134 	    tmp0 = x[i1];
2135 	    tmp1 = x[i2];
2136 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2137 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2138 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2139 	  }
2140 
2141 	  if (n == sz-1){
2142 	    tmp0  = x[*idx];
2143 	    sum1 -= *v1*tmp0;
2144 	    sum2 -= *v2*tmp0;
2145 	    sum3 -= *v3*tmp0;
2146 	  }
2147 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2148 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2149 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2150           row -= 3;
2151 	  break;
2152         case 4:
2153 
2154 	  sum1  = b[row];
2155 	  sum2  = b[row-1];
2156 	  sum3  = b[row-2];
2157 	  sum4  = b[row-3];
2158 	  v2    = a->a + diag[row-1] + 2;
2159 	  v3    = a->a + diag[row-2] + 3;
2160 	  v4    = a->a + diag[row-3] + 4;
2161 	  for(n = 0; n<sz-1; n+=2) {
2162 	    i1   = idx[0];
2163 	    i2   = idx[1];
2164 	    idx += 2;
2165 	    tmp0 = x[i1];
2166 	    tmp1 = x[i2];
2167 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2168 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2169 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2170 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2171 	  }
2172 
2173 	  if (n == sz-1){
2174 	    tmp0  = x[*idx];
2175 	    sum1 -= *v1*tmp0;
2176 	    sum2 -= *v2*tmp0;
2177 	    sum3 -= *v3*tmp0;
2178 	    sum4 -= *v4*tmp0;
2179 	  }
2180 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2181 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2182 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2183 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2184           row -= 4;
2185 	  break;
2186         case 5:
2187 
2188 	  sum1  = b[row];
2189 	  sum2  = b[row-1];
2190 	  sum3  = b[row-2];
2191 	  sum4  = b[row-3];
2192 	  sum5  = b[row-4];
2193 	  v2    = a->a + diag[row-1] + 2;
2194 	  v3    = a->a + diag[row-2] + 3;
2195 	  v4    = a->a + diag[row-3] + 4;
2196 	  v5    = a->a + diag[row-4] + 5;
2197 	  for(n = 0; n<sz-1; n+=2) {
2198 	    i1   = idx[0];
2199 	    i2   = idx[1];
2200 	    idx += 2;
2201 	    tmp0 = x[i1];
2202 	    tmp1 = x[i2];
2203 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2204 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2205 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2206 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2207 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2208 	  }
2209 
2210 	  if (n == sz-1){
2211 	    tmp0  = x[*idx];
2212 	    sum1 -= *v1*tmp0;
2213 	    sum2 -= *v2*tmp0;
2214 	    sum3 -= *v3*tmp0;
2215 	    sum4 -= *v4*tmp0;
2216 	    sum5 -= *v5*tmp0;
2217 	  }
2218 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2219 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2220 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2221 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2222 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2223           row -= 5;
2224 	  break;
2225         default:
2226 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2227       }
2228       CHKMEMQ;
2229     }
2230     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2231 
2232     /*
2233            t = b - D x    where D is the block diagonal
2234     */
2235     cnt = 0;
2236     for (i=0, row=0; i<m; i++) {
2237       CHKMEMQ;
2238       switch (sizes[i]){
2239         case 1:
2240 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
2241 	  break;
2242         case 2:
2243 	  x1   = x[row]; x2 = x[row+1];
2244 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2245 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2246 	  t[row]   = b[row] - tmp1;
2247 	  t[row+1] = b[row+1] - tmp2; row += 2;
2248 	  cnt += 4;
2249 	  break;
2250         case 3:
2251 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2252 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2253 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2254 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2255 	  t[row] = b[row] - tmp1;
2256 	  t[row+1] = b[row+1] - tmp2;
2257 	  t[row+2] = b[row+2] - tmp3; row += 3;
2258 	  cnt += 9;
2259 	  break;
2260         case 4:
2261 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2262 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2263 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2264 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2265 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2266 	  t[row] = b[row] - tmp1;
2267 	  t[row+1] = b[row+1] - tmp2;
2268 	  t[row+2] = b[row+2] - tmp3;
2269 	  t[row+3] = b[row+3] - tmp4; row += 4;
2270 	  cnt += 16;
2271 	  break;
2272         case 5:
2273 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2274 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2275 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2276 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2277 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2278 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2279 	  t[row] = b[row] - tmp1;
2280 	  t[row+1] = b[row+1] - tmp2;
2281 	  t[row+2] = b[row+2] - tmp3;
2282 	  t[row+3] = b[row+3] - tmp4;
2283 	  t[row+4] = b[row+4] - tmp5;row += 5;
2284 	  cnt += 25;
2285 	  break;
2286         default:
2287 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2288       }
2289       CHKMEMQ;
2290     }
2291     ierr = PetscLogFlops(m);CHKERRQ(ierr);
2292 
2293 
2294 
2295     /*
2296           Apply (L + D)^-1 where D is the block diagonal
2297     */
2298     for (i=0, row=0; i<m; i++) {
2299       sz  = diag[row] - ii[row];
2300       v1  = a->a + ii[row];
2301       idx = a->j + ii[row];
2302       CHKMEMQ;
2303       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2304       switch (sizes[i]){
2305         case 1:
2306 
2307 	  sum1  = t[row];
2308 	  for(n = 0; n<sz-1; n+=2) {
2309 	    i1   = idx[0];
2310 	    i2   = idx[1];
2311 	    idx += 2;
2312 	    tmp0 = t[i1];
2313 	    tmp1 = t[i2];
2314 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2315 	  }
2316 
2317 	  if (n == sz-1){
2318 	    tmp0  = t[*idx];
2319 	    sum1 -= *v1 * tmp0;
2320 	  }
2321 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
2322 	  break;
2323         case 2:
2324 	  v2    = a->a + ii[row+1];
2325 	  sum1  = t[row];
2326 	  sum2  = t[row+1];
2327 	  for(n = 0; n<sz-1; n+=2) {
2328 	    i1   = idx[0];
2329 	    i2   = idx[1];
2330 	    idx += 2;
2331 	    tmp0 = t[i1];
2332 	    tmp1 = t[i2];
2333 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2334 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2335 	  }
2336 
2337 	  if (n == sz-1){
2338 	    tmp0  = t[*idx];
2339 	    sum1 -= v1[0] * tmp0;
2340 	    sum2 -= v2[0] * tmp0;
2341 	  }
2342 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
2343 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
2344 	  ibdiag  += 4; row += 2;
2345 	  break;
2346         case 3:
2347 	  v2    = a->a + ii[row+1];
2348 	  v3    = a->a + ii[row+2];
2349 	  sum1  = t[row];
2350 	  sum2  = t[row+1];
2351 	  sum3  = t[row+2];
2352 	  for(n = 0; n<sz-1; n+=2) {
2353 	    i1   = idx[0];
2354 	    i2   = idx[1];
2355 	    idx += 2;
2356 	    tmp0 = t[i1];
2357 	    tmp1 = t[i2];
2358 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2359 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2360 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2361 	  }
2362 
2363 	  if (n == sz-1){
2364 	    tmp0  = t[*idx];
2365 	    sum1 -= v1[0] * tmp0;
2366 	    sum2 -= v2[0] * tmp0;
2367 	    sum3 -= v3[0] * tmp0;
2368 	  }
2369 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2370 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2371 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2372 	  ibdiag  += 9; row += 3;
2373 	  break;
2374         case 4:
2375 	  v2    = a->a + ii[row+1];
2376 	  v3    = a->a + ii[row+2];
2377 	  v4    = a->a + ii[row+3];
2378 	  sum1  = t[row];
2379 	  sum2  = t[row+1];
2380 	  sum3  = t[row+2];
2381 	  sum4  = t[row+3];
2382 	  for(n = 0; n<sz-1; n+=2) {
2383 	    i1   = idx[0];
2384 	    i2   = idx[1];
2385 	    idx += 2;
2386 	    tmp0 = t[i1];
2387 	    tmp1 = t[i2];
2388 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2389 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2390 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2391 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2392 	  }
2393 
2394 	  if (n == sz-1){
2395 	    tmp0  = t[*idx];
2396 	    sum1 -= v1[0] * tmp0;
2397 	    sum2 -= v2[0] * tmp0;
2398 	    sum3 -= v3[0] * tmp0;
2399 	    sum4 -= v4[0] * tmp0;
2400 	  }
2401 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2402 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2403 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2404 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2405 	  ibdiag  += 16; row += 4;
2406 	  break;
2407         case 5:
2408 	  v2    = a->a + ii[row+1];
2409 	  v3    = a->a + ii[row+2];
2410 	  v4    = a->a + ii[row+3];
2411 	  v5    = a->a + ii[row+4];
2412 	  sum1  = t[row];
2413 	  sum2  = t[row+1];
2414 	  sum3  = t[row+2];
2415 	  sum4  = t[row+3];
2416 	  sum5  = t[row+4];
2417 	  for(n = 0; n<sz-1; n+=2) {
2418 	    i1   = idx[0];
2419 	    i2   = idx[1];
2420 	    idx += 2;
2421 	    tmp0 = t[i1];
2422 	    tmp1 = t[i2];
2423 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2424 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2425 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2426 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2427 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2428 	  }
2429 
2430 	  if (n == sz-1){
2431 	    tmp0  = t[*idx];
2432 	    sum1 -= v1[0] * tmp0;
2433 	    sum2 -= v2[0] * tmp0;
2434 	    sum3 -= v3[0] * tmp0;
2435 	    sum4 -= v4[0] * tmp0;
2436 	    sum5 -= v5[0] * tmp0;
2437 	  }
2438 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2439 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2440 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2441 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2442 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2443 	  ibdiag  += 25; row += 5;
2444 	  break;
2445         default:
2446 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2447       }
2448       CHKMEMQ;
2449     }
2450     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2451     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2452     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2453   }
2454   PetscFunctionReturn(0);
2455 }
2456 
2457 #undef __FUNCT__
2458 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
2459 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2460 {
2461   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2462   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
2463   const MatScalar    *bdiag = a->inode.bdiag;
2464   const PetscScalar  *b;
2465   PetscErrorCode      ierr;
2466   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
2467   const PetscInt      *sizes = a->inode.size;
2468 
2469   PetscFunctionBegin;
2470   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2471   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2472   cnt = 0;
2473   for (i=0, row=0; i<m; i++) {
2474     switch (sizes[i]){
2475       case 1:
2476 	x[row] = b[row]*bdiag[cnt++];row++;
2477 	break;
2478       case 2:
2479 	x1   = b[row]; x2 = b[row+1];
2480 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2481 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2482 	x[row++] = tmp1;
2483 	x[row++] = tmp2;
2484 	cnt += 4;
2485 	break;
2486       case 3:
2487 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
2488 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2489 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2490 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2491 	x[row++] = tmp1;
2492 	x[row++] = tmp2;
2493 	x[row++] = tmp3;
2494 	cnt += 9;
2495 	break;
2496       case 4:
2497 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
2498 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2499 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2500 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2501 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2502 	x[row++] = tmp1;
2503 	x[row++] = tmp2;
2504 	x[row++] = tmp3;
2505 	x[row++] = tmp4;
2506 	cnt += 16;
2507 	break;
2508       case 5:
2509 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
2510 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2511 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2512 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2513 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2514 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2515 	x[row++] = tmp1;
2516 	x[row++] = tmp2;
2517 	x[row++] = tmp3;
2518 	x[row++] = tmp4;
2519 	x[row++] = tmp5;
2520 	cnt += 25;
2521 	break;
2522       default:
2523 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2524     }
2525   }
2526   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
2527   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2528   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2529   PetscFunctionReturn(0);
2530 }
2531 
2532 /*
2533     samestructure indicates that the matrix has not changed its nonzero structure so we
2534     do not need to recompute the inodes
2535 */
2536 #undef __FUNCT__
2537 #define __FUNCT__ "Mat_CheckInode"
2538 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2539 {
2540   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2541   PetscErrorCode ierr;
2542   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2543   PetscTruth     flag;
2544 
2545   PetscFunctionBegin;
2546   if (!a->inode.use)                     PetscFunctionReturn(0);
2547   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2548 
2549 
2550   m = A->rmap->n;
2551   if (a->inode.size) {ns = a->inode.size;}
2552   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2553 
2554   i          = 0;
2555   node_count = 0;
2556   idx        = a->j;
2557   ii         = a->i;
2558   while (i < m){                /* For each row */
2559     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2560     /* Limits the number of elements in a node to 'a->inode.limit' */
2561     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2562       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2563       if (nzy != nzx) break;
2564       idy  += nzx;             /* Same nonzero pattern */
2565       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2566       if (!flag) break;
2567     }
2568     ns[node_count++] = blk_size;
2569     idx += blk_size*nzx;
2570     i    = j;
2571   }
2572   /* If not enough inodes found,, do not use inode version of the routines */
2573   if (!a->inode.size && m && node_count > .9*m) {
2574     ierr = PetscFree(ns);CHKERRQ(ierr);
2575     a->inode.node_count     = 0;
2576     a->inode.size           = PETSC_NULL;
2577     a->inode.use            = PETSC_FALSE;
2578     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2579   } else {
2580     A->ops->mult              = MatMult_SeqAIJ_Inode;
2581     A->ops->sor               = MatSOR_SeqAIJ_Inode;
2582     A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
2583     A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
2584     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
2585     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
2586     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
2587     A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
2588     A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
2589     a->inode.node_count       = node_count;
2590     a->inode.size             = ns;
2591     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2592   }
2593   PetscFunctionReturn(0);
2594 }
2595 
2596 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
2597 PetscInt __k, *__vi; \
2598 __vi = aj + ai[row];				\
2599 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
2600 __vi = aj + adiag[row];				\
2601 cols[nzl] = __vi[0];\
2602 __vi = aj + adiag[row+1]+1;\
2603 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
2604 
2605 #undef __FUNCT__
2606 #define __FUNCT__ "Mat_CheckInode_FactorLU"
2607 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
2608 {
2609   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2610   PetscErrorCode ierr;
2611   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
2612   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
2613   PetscTruth     flag;
2614 
2615   PetscFunctionBegin;
2616   if (!a->inode.use)                     PetscFunctionReturn(0);
2617   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2618 
2619   m = A->rmap->n;
2620   if (a->inode.size) {ns = a->inode.size;}
2621   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2622 
2623   i          = 0;
2624   node_count = 0;
2625   while (i < m){                /* For each row */
2626     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
2627     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
2628     nzx  = nzl1 + nzu1 + 1;
2629     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
2630     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
2631 
2632     /* Limits the number of elements in a node to 'a->inode.limit' */
2633     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2634       nzl2    = ai[j+1] - ai[j];
2635       nzu2    = adiag[j] - adiag[j+1] - 1;
2636       nzy     = nzl2 + nzu2 + 1;
2637       if( nzy != nzx) break;
2638       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
2639       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
2640       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2641       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
2642       ierr = PetscFree(cols2);CHKERRQ(ierr);
2643     }
2644     ns[node_count++] = blk_size;
2645     ierr = PetscFree(cols1);CHKERRQ(ierr);
2646     i    = j;
2647   }
2648   /* If not enough inodes found,, do not use inode version of the routines */
2649   if (!a->inode.size && m && node_count > .9*m) {
2650     ierr = PetscFree(ns);CHKERRQ(ierr);
2651     a->inode.node_count     = 0;
2652     a->inode.size           = PETSC_NULL;
2653     a->inode.use            = PETSC_FALSE;
2654     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2655   } else {
2656     A->ops->mult              = 0;
2657     A->ops->sor               = 0;
2658     A->ops->multadd           = 0;
2659     A->ops->getrowij          = 0;
2660     A->ops->restorerowij      = 0;
2661     A->ops->getcolumnij       = 0;
2662     A->ops->restorecolumnij   = 0;
2663     A->ops->coloringpatch     = 0;
2664     A->ops->multdiagonalblock = 0;
2665     a->inode.node_count       = node_count;
2666     a->inode.size             = ns;
2667     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2668   }
2669   PetscFunctionReturn(0);
2670 }
2671 
2672 /*
2673      This is really ugly. if inodes are used this replaces the
2674   permutations with ones that correspond to rows/cols of the matrix
2675   rather then inode blocks
2676 */
2677 #undef __FUNCT__
2678 #define __FUNCT__ "MatInodeAdjustForInodes"
2679 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2680 {
2681   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2682 
2683   PetscFunctionBegin;
2684   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2685   if (f) {
2686     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2687   }
2688   PetscFunctionReturn(0);
2689 }
2690 
2691 EXTERN_C_BEGIN
2692 #undef __FUNCT__
2693 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
2694 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
2695 {
2696   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2697   PetscErrorCode ierr;
2698   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
2699   const PetscInt *ridx,*cidx;
2700   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2701   PetscInt       nslim_col,*ns_col;
2702   IS             ris = *rperm,cis = *cperm;
2703 
2704   PetscFunctionBegin;
2705   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2706   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2707 
2708   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2709   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2710   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
2711 
2712   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2713   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2714 
2715   /* Form the inode structure for the rows of permuted matric using inv perm*/
2716   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2717 
2718   /* Construct the permutations for rows*/
2719   for (i=0,row = 0; i<nslim_row; ++i){
2720     indx      = ridx[i];
2721     start_val = tns[indx];
2722     end_val   = tns[indx + 1];
2723     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2724   }
2725 
2726   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2727   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2728 
2729  /* Construct permutations for columns */
2730   for (i=0,col=0; i<nslim_col; ++i){
2731     indx      = cidx[i];
2732     start_val = tns[indx];
2733     end_val   = tns[indx + 1];
2734     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2735   }
2736 
2737   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2738   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
2739   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2740   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
2741 
2742   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2743   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2744 
2745   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2746   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
2747   ierr = ISDestroy(cis);CHKERRQ(ierr);
2748   ierr = ISDestroy(ris);CHKERRQ(ierr);
2749   ierr = PetscFree(tns);CHKERRQ(ierr);
2750   PetscFunctionReturn(0);
2751 }
2752 EXTERN_C_END
2753 
2754 #undef __FUNCT__
2755 #define __FUNCT__ "MatInodeGetInodeSizes"
2756 /*@C
2757    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2758 
2759    Collective on Mat
2760 
2761    Input Parameter:
2762 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2763 
2764    Output Parameter:
2765 +  node_count - no of inodes present in the matrix.
2766 .  sizes      - an array of size node_count,with sizes of each inode.
2767 -  limit      - the max size used to generate the inodes.
2768 
2769    Level: advanced
2770 
2771    Notes: This routine returns some internal storage information
2772    of the matrix, it is intended to be used by advanced users.
2773    It should be called after the matrix is assembled.
2774    The contents of the sizes[] array should not be changed.
2775    PETSC_NULL may be passed for information not requested.
2776 
2777 .keywords: matrix, seqaij, get, inode
2778 
2779 .seealso: MatGetInfo()
2780 @*/
2781 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2782 {
2783   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2784 
2785   PetscFunctionBegin;
2786   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2787   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2788   if (f) {
2789     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2790   }
2791   PetscFunctionReturn(0);
2792 }
2793 
2794 EXTERN_C_BEGIN
2795 #undef __FUNCT__
2796 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
2797 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2798 {
2799   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2800 
2801   PetscFunctionBegin;
2802   if (node_count) *node_count = a->inode.node_count;
2803   if (sizes)      *sizes      = a->inode.size;
2804   if (limit)      *limit      = a->inode.limit;
2805   PetscFunctionReturn(0);
2806 }
2807 EXTERN_C_END
2808