xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 9596e0b48258fba4fca4f68feb5185896facfe69)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_Inode"
397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     sz   = n;                   /* No of non zeros in this row */
427                                 /* Switch on the size of Node */
428     switch (nsz){               /* Each loop in 'case' is unrolled */
429     case 1 :
430       sum1  = 0;
431 
432       for(n = 0; n< sz-1; n+=2) {
433         i1   = idx[0];          /* The instructions are ordered to */
434         i2   = idx[1];          /* make the compiler's job easy */
435         idx += 2;
436         tmp0 = x[i1];
437         tmp1 = x[i2];
438         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439        }
440 
441       if (n == sz-1){          /* Take care of the last nonzero  */
442         tmp0  = x[*idx++];
443         sum1 += *v1++ * tmp0;
444       }
445       y[row++]=sum1;
446       break;
447     case 2:
448       sum1  = 0;
449       sum2  = 0;
450       v2    = v1 + n;
451 
452       for (n = 0; n< sz-1; n+=2) {
453         i1   = idx[0];
454         i2   = idx[1];
455         idx += 2;
456         tmp0 = x[i1];
457         tmp1 = x[i2];
458         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460       }
461       if (n == sz-1){
462         tmp0  = x[*idx++];
463         sum1 += *v1++ * tmp0;
464         sum2 += *v2++ * tmp0;
465       }
466       y[row++]=sum1;
467       y[row++]=sum2;
468       v1      =v2;              /* Since the next block to be processed starts there*/
469       idx    +=sz;
470       break;
471     case 3:
472       sum1  = 0;
473       sum2  = 0;
474       sum3  = 0;
475       v2    = v1 + n;
476       v3    = v2 + n;
477 
478       for (n = 0; n< sz-1; n+=2) {
479         i1   = idx[0];
480         i2   = idx[1];
481         idx += 2;
482         tmp0 = x[i1];
483         tmp1 = x[i2];
484         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487       }
488       if (n == sz-1){
489         tmp0  = x[*idx++];
490         sum1 += *v1++ * tmp0;
491         sum2 += *v2++ * tmp0;
492         sum3 += *v3++ * tmp0;
493       }
494       y[row++]=sum1;
495       y[row++]=sum2;
496       y[row++]=sum3;
497       v1       =v3;             /* Since the next block to be processed starts there*/
498       idx     +=2*sz;
499       break;
500     case 4:
501       sum1  = 0;
502       sum2  = 0;
503       sum3  = 0;
504       sum4  = 0;
505       v2    = v1 + n;
506       v3    = v2 + n;
507       v4    = v3 + n;
508 
509       for (n = 0; n< sz-1; n+=2) {
510         i1   = idx[0];
511         i2   = idx[1];
512         idx += 2;
513         tmp0 = x[i1];
514         tmp1 = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519       }
520       if (n == sz-1){
521         tmp0  = x[*idx++];
522         sum1 += *v1++ * tmp0;
523         sum2 += *v2++ * tmp0;
524         sum3 += *v3++ * tmp0;
525         sum4 += *v4++ * tmp0;
526       }
527       y[row++]=sum1;
528       y[row++]=sum2;
529       y[row++]=sum3;
530       y[row++]=sum4;
531       v1      =v4;              /* Since the next block to be processed starts there*/
532       idx    +=3*sz;
533       break;
534     case 5:
535       sum1  = 0;
536       sum2  = 0;
537       sum3  = 0;
538       sum4  = 0;
539       sum5  = 0;
540       v2    = v1 + n;
541       v3    = v2 + n;
542       v4    = v3 + n;
543       v5    = v4 + n;
544 
545       for (n = 0; n<sz-1; n+=2) {
546         i1   = idx[0];
547         i2   = idx[1];
548         idx += 2;
549         tmp0 = x[i1];
550         tmp1 = x[i2];
551         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556       }
557       if (n == sz-1){
558         tmp0  = x[*idx++];
559         sum1 += *v1++ * tmp0;
560         sum2 += *v2++ * tmp0;
561         sum3 += *v3++ * tmp0;
562         sum4 += *v4++ * tmp0;
563         sum5 += *v5++ * tmp0;
564       }
565       y[row++]=sum1;
566       y[row++]=sum2;
567       y[row++]=sum3;
568       y[row++]=sum4;
569       y[row++]=sum5;
570       v1      =v5;       /* Since the next block to be processed starts there */
571       idx    +=4*sz;
572       break;
573     default :
574       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
575     }
576   }
577   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
578   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
579   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
580   PetscFunctionReturn(0);
581 }
582 /* ----------------------------------------------------------- */
583 /* Almost same code as the MatMult_Inode() */
584 #undef __FUNCT__
585 #define __FUNCT__ "MatMultAdd_Inode"
586 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587 {
588   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
589   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590   MatScalar      *v1,*v2,*v3,*v4,*v5;
591   PetscScalar    *x,*y,*z,*zt;
592   PetscErrorCode ierr;
593   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
594 
595   PetscFunctionBegin;
596   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
597   node_max = a->inode.node_count;
598   ns       = a->inode.size;     /* Node Size array */
599   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
600   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
601   if (zz != yy) {
602     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
603   } else {
604     z = y;
605   }
606   zt = z;
607 
608   idx  = a->j;
609   v1   = a->a;
610   ii   = a->i;
611 
612   for (i = 0,row = 0; i< node_max; ++i){
613     nsz  = ns[i];
614     n    = ii[1] - ii[0];
615     ii  += nsz;
616     sz   = n;                   /* No of non zeros in this row */
617                                 /* Switch on the size of Node */
618     switch (nsz){               /* Each loop in 'case' is unrolled */
619     case 1 :
620       sum1  = *zt++;
621 
622       for(n = 0; n< sz-1; n+=2) {
623         i1   = idx[0];          /* The instructions are ordered to */
624         i2   = idx[1];          /* make the compiler's job easy */
625         idx += 2;
626         tmp0 = x[i1];
627         tmp1 = x[i2];
628         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
629        }
630 
631       if(n   == sz-1){          /* Take care of the last nonzero  */
632         tmp0  = x[*idx++];
633         sum1 += *v1++ * tmp0;
634       }
635       y[row++]=sum1;
636       break;
637     case 2:
638       sum1  = *zt++;
639       sum2  = *zt++;
640       v2    = v1 + n;
641 
642       for(n = 0; n< sz-1; n+=2) {
643         i1   = idx[0];
644         i2   = idx[1];
645         idx += 2;
646         tmp0 = x[i1];
647         tmp1 = x[i2];
648         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
649         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
650       }
651       if(n   == sz-1){
652         tmp0  = x[*idx++];
653         sum1 += *v1++ * tmp0;
654         sum2 += *v2++ * tmp0;
655       }
656       y[row++]=sum1;
657       y[row++]=sum2;
658       v1      =v2;              /* Since the next block to be processed starts there*/
659       idx    +=sz;
660       break;
661     case 3:
662       sum1  = *zt++;
663       sum2  = *zt++;
664       sum3  = *zt++;
665       v2    = v1 + n;
666       v3    = v2 + n;
667 
668       for (n = 0; n< sz-1; n+=2) {
669         i1   = idx[0];
670         i2   = idx[1];
671         idx += 2;
672         tmp0 = x[i1];
673         tmp1 = x[i2];
674         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
675         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
676         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
677       }
678       if (n == sz-1){
679         tmp0  = x[*idx++];
680         sum1 += *v1++ * tmp0;
681         sum2 += *v2++ * tmp0;
682         sum3 += *v3++ * tmp0;
683       }
684       y[row++]=sum1;
685       y[row++]=sum2;
686       y[row++]=sum3;
687       v1       =v3;             /* Since the next block to be processed starts there*/
688       idx     +=2*sz;
689       break;
690     case 4:
691       sum1  = *zt++;
692       sum2  = *zt++;
693       sum3  = *zt++;
694       sum4  = *zt++;
695       v2    = v1 + n;
696       v3    = v2 + n;
697       v4    = v3 + n;
698 
699       for (n = 0; n< sz-1; n+=2) {
700         i1   = idx[0];
701         i2   = idx[1];
702         idx += 2;
703         tmp0 = x[i1];
704         tmp1 = x[i2];
705         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
706         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
707         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
708         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
709       }
710       if (n == sz-1){
711         tmp0  = x[*idx++];
712         sum1 += *v1++ * tmp0;
713         sum2 += *v2++ * tmp0;
714         sum3 += *v3++ * tmp0;
715         sum4 += *v4++ * tmp0;
716       }
717       y[row++]=sum1;
718       y[row++]=sum2;
719       y[row++]=sum3;
720       y[row++]=sum4;
721       v1      =v4;              /* Since the next block to be processed starts there*/
722       idx    +=3*sz;
723       break;
724     case 5:
725       sum1  = *zt++;
726       sum2  = *zt++;
727       sum3  = *zt++;
728       sum4  = *zt++;
729       sum5  = *zt++;
730       v2    = v1 + n;
731       v3    = v2 + n;
732       v4    = v3 + n;
733       v5    = v4 + n;
734 
735       for (n = 0; n<sz-1; n+=2) {
736         i1   = idx[0];
737         i2   = idx[1];
738         idx += 2;
739         tmp0 = x[i1];
740         tmp1 = x[i2];
741         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
742         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
743         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
744         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
745         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
746       }
747       if(n   == sz-1){
748         tmp0  = x[*idx++];
749         sum1 += *v1++ * tmp0;
750         sum2 += *v2++ * tmp0;
751         sum3 += *v3++ * tmp0;
752         sum4 += *v4++ * tmp0;
753         sum5 += *v5++ * tmp0;
754       }
755       y[row++]=sum1;
756       y[row++]=sum2;
757       y[row++]=sum3;
758       y[row++]=sum4;
759       y[row++]=sum5;
760       v1      =v5;       /* Since the next block to be processed starts there */
761       idx    +=4*sz;
762       break;
763     default :
764       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
765     }
766   }
767   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
768   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
769   if (zz != yy) {
770     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
771   }
772   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
773   PetscFunctionReturn(0);
774 }
775 
776 /* ----------------------------------------------------------- */
777 #undef __FUNCT__
778 #define __FUNCT__ "MatSolve_Inode"
779 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
780 {
781   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
782   IS                iscol = a->col,isrow = a->row;
783   PetscErrorCode    ierr;
784   const PetscInt    *r,*c,*rout,*cout;
785   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
786   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
787   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790   const PetscScalar *b;
791 
792   PetscFunctionBegin;
793   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
794   node_max = a->inode.node_count;
795   ns       = a->inode.size;     /* Node Size array */
796 
797   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   tmp  = a->solve_work;
800 
801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
803 
804   /* forward solve the lower triangular */
805   tmps = tmp ;
806   aa   = a_a ;
807   aj   = a_j ;
808   ad   = a->diag;
809 
810   for (i = 0,row = 0; i< node_max; ++i){
811     nsz = ns[i];
812     aii = ai[row];
813     v1  = aa + aii;
814     vi  = aj + aii;
815     nz  = ad[row]- aii;
816 
817     switch (nsz){               /* Each loop in 'case' is unrolled */
818     case 1 :
819       sum1 = b[*r++];
820       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
821       for(j=0; j<nz-1; j+=2){
822         i0   = vi[0];
823         i1   = vi[1];
824         vi  +=2;
825         tmp0 = tmps[i0];
826         tmp1 = tmps[i1];
827         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828       }
829       if(j == nz-1){
830         tmp0 = tmps[*vi++];
831         sum1 -= *v1++ *tmp0;
832       }
833       tmp[row ++]=sum1;
834       break;
835     case 2:
836       sum1 = b[*r++];
837       sum2 = b[*r++];
838       v2   = aa + ai[row+1];
839 
840       for(j=0; j<nz-1; j+=2){
841         i0   = vi[0];
842         i1   = vi[1];
843         vi  +=2;
844         tmp0 = tmps[i0];
845         tmp1 = tmps[i1];
846         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
847         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
848       }
849       if(j == nz-1){
850         tmp0 = tmps[*vi++];
851         sum1 -= *v1++ *tmp0;
852         sum2 -= *v2++ *tmp0;
853       }
854       sum2 -= *v2++ * sum1;
855       tmp[row ++]=sum1;
856       tmp[row ++]=sum2;
857       break;
858     case 3:
859       sum1 = b[*r++];
860       sum2 = b[*r++];
861       sum3 = b[*r++];
862       v2   = aa + ai[row+1];
863       v3   = aa + ai[row+2];
864 
865       for (j=0; j<nz-1; j+=2){
866         i0   = vi[0];
867         i1   = vi[1];
868         vi  +=2;
869         tmp0 = tmps[i0];
870         tmp1 = tmps[i1];
871         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
872         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
873         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
874       }
875       if (j == nz-1){
876         tmp0 = tmps[*vi++];
877         sum1 -= *v1++ *tmp0;
878         sum2 -= *v2++ *tmp0;
879         sum3 -= *v3++ *tmp0;
880       }
881       sum2 -= *v2++ * sum1;
882       sum3 -= *v3++ * sum1;
883       sum3 -= *v3++ * sum2;
884       tmp[row ++]=sum1;
885       tmp[row ++]=sum2;
886       tmp[row ++]=sum3;
887       break;
888 
889     case 4:
890       sum1 = b[*r++];
891       sum2 = b[*r++];
892       sum3 = b[*r++];
893       sum4 = b[*r++];
894       v2   = aa + ai[row+1];
895       v3   = aa + ai[row+2];
896       v4   = aa + ai[row+3];
897 
898       for (j=0; j<nz-1; j+=2){
899         i0   = vi[0];
900         i1   = vi[1];
901         vi  +=2;
902         tmp0 = tmps[i0];
903         tmp1 = tmps[i1];
904         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908       }
909       if (j == nz-1){
910         tmp0 = tmps[*vi++];
911         sum1 -= *v1++ *tmp0;
912         sum2 -= *v2++ *tmp0;
913         sum3 -= *v3++ *tmp0;
914         sum4 -= *v4++ *tmp0;
915       }
916       sum2 -= *v2++ * sum1;
917       sum3 -= *v3++ * sum1;
918       sum4 -= *v4++ * sum1;
919       sum3 -= *v3++ * sum2;
920       sum4 -= *v4++ * sum2;
921       sum4 -= *v4++ * sum3;
922 
923       tmp[row ++]=sum1;
924       tmp[row ++]=sum2;
925       tmp[row ++]=sum3;
926       tmp[row ++]=sum4;
927       break;
928     case 5:
929       sum1 = b[*r++];
930       sum2 = b[*r++];
931       sum3 = b[*r++];
932       sum4 = b[*r++];
933       sum5 = b[*r++];
934       v2   = aa + ai[row+1];
935       v3   = aa + ai[row+2];
936       v4   = aa + ai[row+3];
937       v5   = aa + ai[row+4];
938 
939       for (j=0; j<nz-1; j+=2){
940         i0   = vi[0];
941         i1   = vi[1];
942         vi  +=2;
943         tmp0 = tmps[i0];
944         tmp1 = tmps[i1];
945         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950       }
951       if (j == nz-1){
952         tmp0 = tmps[*vi++];
953         sum1 -= *v1++ *tmp0;
954         sum2 -= *v2++ *tmp0;
955         sum3 -= *v3++ *tmp0;
956         sum4 -= *v4++ *tmp0;
957         sum5 -= *v5++ *tmp0;
958       }
959 
960       sum2 -= *v2++ * sum1;
961       sum3 -= *v3++ * sum1;
962       sum4 -= *v4++ * sum1;
963       sum5 -= *v5++ * sum1;
964       sum3 -= *v3++ * sum2;
965       sum4 -= *v4++ * sum2;
966       sum5 -= *v5++ * sum2;
967       sum4 -= *v4++ * sum3;
968       sum5 -= *v5++ * sum3;
969       sum5 -= *v5++ * sum4;
970 
971       tmp[row ++]=sum1;
972       tmp[row ++]=sum2;
973       tmp[row ++]=sum3;
974       tmp[row ++]=sum4;
975       tmp[row ++]=sum5;
976       break;
977     default:
978       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
979     }
980   }
981   /* backward solve the upper triangular */
982   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
983     nsz = ns[i];
984     aii = ai[row+1] -1;
985     v1  = aa + aii;
986     vi  = aj + aii;
987     nz  = aii- ad[row];
988     switch (nsz){               /* Each loop in 'case' is unrolled */
989     case 1 :
990       sum1 = tmp[row];
991 
992       for(j=nz ; j>1; j-=2){
993         vi  -=2;
994         i0   = vi[2];
995         i1   = vi[1];
996         tmp0 = tmps[i0];
997         tmp1 = tmps[i1];
998         v1   -= 2;
999         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000       }
1001       if (j==1){
1002         tmp0  = tmps[*vi--];
1003         sum1 -= *v1-- * tmp0;
1004       }
1005       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006       break;
1007     case 2 :
1008       sum1 = tmp[row];
1009       sum2 = tmp[row -1];
1010       v2   = aa + ai[row]-1;
1011       for (j=nz ; j>1; j-=2){
1012         vi  -=2;
1013         i0   = vi[2];
1014         i1   = vi[1];
1015         tmp0 = tmps[i0];
1016         tmp1 = tmps[i1];
1017         v1   -= 2;
1018         v2   -= 2;
1019         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021       }
1022       if (j==1){
1023         tmp0  = tmps[*vi--];
1024         sum1 -= *v1-- * tmp0;
1025         sum2 -= *v2-- * tmp0;
1026       }
1027 
1028       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029       sum2   -= *v2-- * tmp0;
1030       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031       break;
1032     case 3 :
1033       sum1 = tmp[row];
1034       sum2 = tmp[row -1];
1035       sum3 = tmp[row -2];
1036       v2   = aa + ai[row]-1;
1037       v3   = aa + ai[row -1]-1;
1038       for (j=nz ; j>1; j-=2){
1039         vi  -=2;
1040         i0   = vi[2];
1041         i1   = vi[1];
1042         tmp0 = tmps[i0];
1043         tmp1 = tmps[i1];
1044         v1   -= 2;
1045         v2   -= 2;
1046         v3   -= 2;
1047         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050       }
1051       if (j==1){
1052         tmp0  = tmps[*vi--];
1053         sum1 -= *v1-- * tmp0;
1054         sum2 -= *v2-- * tmp0;
1055         sum3 -= *v3-- * tmp0;
1056       }
1057       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058       sum2   -= *v2-- * tmp0;
1059       sum3   -= *v3-- * tmp0;
1060       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061       sum3   -= *v3-- * tmp0;
1062       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1063 
1064       break;
1065     case 4 :
1066       sum1 = tmp[row];
1067       sum2 = tmp[row -1];
1068       sum3 = tmp[row -2];
1069       sum4 = tmp[row -3];
1070       v2   = aa + ai[row]-1;
1071       v3   = aa + ai[row -1]-1;
1072       v4   = aa + ai[row -2]-1;
1073 
1074       for (j=nz ; j>1; j-=2){
1075         vi  -=2;
1076         i0   = vi[2];
1077         i1   = vi[1];
1078         tmp0 = tmps[i0];
1079         tmp1 = tmps[i1];
1080         v1  -= 2;
1081         v2  -= 2;
1082         v3  -= 2;
1083         v4  -= 2;
1084         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088       }
1089       if (j==1){
1090         tmp0  = tmps[*vi--];
1091         sum1 -= *v1-- * tmp0;
1092         sum2 -= *v2-- * tmp0;
1093         sum3 -= *v3-- * tmp0;
1094         sum4 -= *v4-- * tmp0;
1095       }
1096 
1097       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098       sum2   -= *v2-- * tmp0;
1099       sum3   -= *v3-- * tmp0;
1100       sum4   -= *v4-- * tmp0;
1101       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102       sum3   -= *v3-- * tmp0;
1103       sum4   -= *v4-- * tmp0;
1104       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105       sum4   -= *v4-- * tmp0;
1106       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107       break;
1108     case 5 :
1109       sum1 = tmp[row];
1110       sum2 = tmp[row -1];
1111       sum3 = tmp[row -2];
1112       sum4 = tmp[row -3];
1113       sum5 = tmp[row -4];
1114       v2   = aa + ai[row]-1;
1115       v3   = aa + ai[row -1]-1;
1116       v4   = aa + ai[row -2]-1;
1117       v5   = aa + ai[row -3]-1;
1118       for (j=nz ; j>1; j-=2){
1119         vi  -= 2;
1120         i0   = vi[2];
1121         i1   = vi[1];
1122         tmp0 = tmps[i0];
1123         tmp1 = tmps[i1];
1124         v1   -= 2;
1125         v2   -= 2;
1126         v3   -= 2;
1127         v4   -= 2;
1128         v5   -= 2;
1129         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134       }
1135       if (j==1){
1136         tmp0  = tmps[*vi--];
1137         sum1 -= *v1-- * tmp0;
1138         sum2 -= *v2-- * tmp0;
1139         sum3 -= *v3-- * tmp0;
1140         sum4 -= *v4-- * tmp0;
1141         sum5 -= *v5-- * tmp0;
1142       }
1143 
1144       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145       sum2   -= *v2-- * tmp0;
1146       sum3   -= *v3-- * tmp0;
1147       sum4   -= *v4-- * tmp0;
1148       sum5   -= *v5-- * tmp0;
1149       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150       sum3   -= *v3-- * tmp0;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154       sum4   -= *v4-- * tmp0;
1155       sum5   -= *v5-- * tmp0;
1156       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157       sum5   -= *v5-- * tmp0;
1158       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159       break;
1160     default:
1161       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1162     }
1163   }
1164   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1165   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1166   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1168   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1169   PetscFunctionReturn(0);
1170 }
1171 
1172 /*
1173      Makes a longer coloring[] array and calls the usual code with that
1174 */
1175 #undef __FUNCT__
1176 #define __FUNCT__ "MatColoringPatch_Inode"
1177 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1178 {
1179   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1180   PetscErrorCode  ierr;
1181   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1182   PetscInt        *colorused,i;
1183   ISColoringValue *newcolor;
1184 
1185   PetscFunctionBegin;
1186   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1187   /* loop over inodes, marking a color for each column*/
1188   row = 0;
1189   for (i=0; i<m; i++){
1190     for (j=0; j<ns[i]; j++) {
1191       newcolor[row++] = coloring[i] + j*ncolors;
1192     }
1193   }
1194 
1195   /* eliminate unneeded colors */
1196   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1197   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1198   for (i=0; i<n; i++) {
1199     colorused[newcolor[i]] = 1;
1200   }
1201 
1202   for (i=1; i<5*ncolors; i++) {
1203     colorused[i] += colorused[i-1];
1204   }
1205   ncolors = colorused[5*ncolors-1];
1206   for (i=0; i<n; i++) {
1207     newcolor[i] = colorused[newcolor[i]]-1;
1208   }
1209   ierr = PetscFree(colorused);CHKERRQ(ierr);
1210   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1211   ierr = PetscFree(coloring);CHKERRQ(ierr);
1212   PetscFunctionReturn(0);
1213 }
1214 
1215 #include "../src/mat/blockinvert.h"
1216 
1217 #undef __FUNCT__
1218 #define __FUNCT__ "MatRelax_Inode"
1219 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1220 {
1221   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1222   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1223   MatScalar          *ibdiag,*bdiag;
1224   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1225   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
1226   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1227   PetscErrorCode     ierr;
1228   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1229   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;
1230 
1231   PetscFunctionBegin;
1232   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1233   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1234   if (its > 1) {
1235     /* switch to non-inode version */
1236     ierr = MatRelax_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
1237     PetscFunctionReturn(0);
1238   }
1239 
1240   if (!a->inode.ibdiagvalid) {
1241     if (!a->inode.ibdiag) {
1242       /* calculate space needed for diagonal blocks */
1243       for (i=0; i<m; i++) {
1244 	cnt += sizes[i]*sizes[i];
1245       }
1246       a->inode.bdiagsize = cnt;
1247       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
1248     }
1249 
1250     /* copy over the diagonal blocks and invert them */
1251     ibdiag = a->inode.ibdiag;
1252     bdiag  = a->inode.bdiag;
1253     cnt = 0;
1254     for (i=0, row = 0; i<m; i++) {
1255       for (j=0; j<sizes[i]; j++) {
1256         for (k=0; k<sizes[i]; k++) {
1257           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1258         }
1259       }
1260       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
1261 
1262       switch(sizes[i]) {
1263         case 1:
1264           /* Create matrix data structure */
1265           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1266           ibdiag[cnt] = 1.0/ibdiag[cnt];
1267           break;
1268         case 2:
1269           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
1270           break;
1271         case 3:
1272           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
1273           break;
1274         case 4:
1275           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
1276           break;
1277         case 5:
1278           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr);
1279           break;
1280        default:
1281 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1282       }
1283       cnt += sizes[i]*sizes[i];
1284       row += sizes[i];
1285     }
1286     a->inode.ibdiagvalid = PETSC_TRUE;
1287   }
1288   ibdiag = a->inode.ibdiag;
1289   bdiag  = a->inode.bdiag;
1290 
1291 
1292   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1293   if (flag & SOR_ZERO_INITIAL_GUESS) {
1294     PetscScalar *b;
1295     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1296     if (xx != bb) {
1297       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1298     } else {
1299       b = x;
1300     }
1301     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1302 
1303       for (i=0, row=0; i<m; i++) {
1304         sz  = diag[row] - ii[row];
1305         v1  = a->a + ii[row];
1306         idx = a->j + ii[row];
1307 
1308         /* see comments for MatMult_Inode() for how this is coded */
1309         switch (sizes[i]){
1310           case 1:
1311 
1312             sum1  = b[row];
1313             for(n = 0; n<sz-1; n+=2) {
1314               i1   = idx[0];
1315               i2   = idx[1];
1316               idx += 2;
1317               tmp0 = x[i1];
1318               tmp1 = x[i2];
1319               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1320             }
1321 
1322             if (n == sz-1){
1323               tmp0  = x[*idx];
1324               sum1 -= *v1 * tmp0;
1325             }
1326             x[row++] = sum1*(*ibdiag++);
1327             break;
1328           case 2:
1329             v2    = a->a + ii[row+1];
1330             sum1  = b[row];
1331             sum2  = b[row+1];
1332             for(n = 0; n<sz-1; n+=2) {
1333               i1   = idx[0];
1334               i2   = idx[1];
1335               idx += 2;
1336               tmp0 = x[i1];
1337               tmp1 = x[i2];
1338               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1339               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1340             }
1341 
1342             if (n == sz-1){
1343               tmp0  = x[*idx];
1344               sum1 -= v1[0] * tmp0;
1345               sum2 -= v2[0] * tmp0;
1346             }
1347             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1348             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1349             ibdiag  += 4;
1350             break;
1351           case 3:
1352             v2    = a->a + ii[row+1];
1353             v3    = a->a + ii[row+2];
1354             sum1  = b[row];
1355             sum2  = b[row+1];
1356             sum3  = b[row+2];
1357             for(n = 0; n<sz-1; n+=2) {
1358               i1   = idx[0];
1359               i2   = idx[1];
1360               idx += 2;
1361               tmp0 = x[i1];
1362               tmp1 = x[i2];
1363               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1364               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1365               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1366             }
1367 
1368             if (n == sz-1){
1369               tmp0  = x[*idx];
1370               sum1 -= v1[0] * tmp0;
1371               sum2 -= v2[0] * tmp0;
1372               sum3 -= v3[0] * tmp0;
1373             }
1374             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1375             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1376             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1377             ibdiag  += 9;
1378             break;
1379           case 4:
1380             v2    = a->a + ii[row+1];
1381             v3    = a->a + ii[row+2];
1382             v4    = a->a + ii[row+3];
1383             sum1  = b[row];
1384             sum2  = b[row+1];
1385             sum3  = b[row+2];
1386             sum4  = b[row+3];
1387             for(n = 0; n<sz-1; n+=2) {
1388               i1   = idx[0];
1389               i2   = idx[1];
1390               idx += 2;
1391               tmp0 = x[i1];
1392               tmp1 = x[i2];
1393               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1394               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1395               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1396               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1397             }
1398 
1399             if (n == sz-1){
1400               tmp0  = x[*idx];
1401               sum1 -= v1[0] * tmp0;
1402               sum2 -= v2[0] * tmp0;
1403               sum3 -= v3[0] * tmp0;
1404               sum4 -= v4[0] * tmp0;
1405             }
1406             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1407             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1408             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1409             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1410             ibdiag  += 16;
1411             break;
1412           case 5:
1413             v2    = a->a + ii[row+1];
1414             v3    = a->a + ii[row+2];
1415             v4    = a->a + ii[row+3];
1416             v5    = a->a + ii[row+4];
1417             sum1  = b[row];
1418             sum2  = b[row+1];
1419             sum3  = b[row+2];
1420             sum4  = b[row+3];
1421             sum5  = b[row+4];
1422             for(n = 0; n<sz-1; n+=2) {
1423               i1   = idx[0];
1424               i2   = idx[1];
1425               idx += 2;
1426               tmp0 = x[i1];
1427               tmp1 = x[i2];
1428               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1429               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1430               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1431               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1432               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1433             }
1434 
1435             if (n == sz-1){
1436               tmp0  = x[*idx];
1437               sum1 -= v1[0] * tmp0;
1438               sum2 -= v2[0] * tmp0;
1439               sum3 -= v3[0] * tmp0;
1440               sum4 -= v4[0] * tmp0;
1441               sum5 -= v5[0] * tmp0;
1442             }
1443             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1444             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1445             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1446             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1447             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1448             ibdiag  += 25;
1449             break;
1450 	  default:
1451    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1452         }
1453       }
1454 
1455       xb = x;
1456       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1457     } else xb = b;
1458     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1459         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1460       cnt = 0;
1461       for (i=0, row=0; i<m; i++) {
1462 
1463         switch (sizes[i]){
1464           case 1:
1465             x[row++] *= bdiag[cnt++];
1466             break;
1467           case 2:
1468             x1   = x[row]; x2 = x[row+1];
1469             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1470             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1471             x[row++] = tmp1;
1472             x[row++] = tmp2;
1473             cnt += 4;
1474             break;
1475           case 3:
1476             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1477             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1478             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1479             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1480             x[row++] = tmp1;
1481             x[row++] = tmp2;
1482             x[row++] = tmp3;
1483             cnt += 9;
1484             break;
1485           case 4:
1486             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1487             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1488             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1489             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1490             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1491             x[row++] = tmp1;
1492             x[row++] = tmp2;
1493             x[row++] = tmp3;
1494             x[row++] = tmp4;
1495             cnt += 16;
1496             break;
1497           case 5:
1498             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1499             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1500             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1501             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1502             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1503             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1504             x[row++] = tmp1;
1505             x[row++] = tmp2;
1506             x[row++] = tmp3;
1507             x[row++] = tmp4;
1508             x[row++] = tmp5;
1509             cnt += 25;
1510             break;
1511 	  default:
1512    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1513         }
1514       }
1515       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1516     }
1517     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1518 
1519       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1520       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1521         ibdiag -= sizes[i]*sizes[i];
1522         sz      = ii[row+1] - diag[row] - 1;
1523         v1      = a->a + diag[row] + 1;
1524         idx     = a->j + diag[row] + 1;
1525 
1526         /* see comments for MatMult_Inode() for how this is coded */
1527         switch (sizes[i]){
1528           case 1:
1529 
1530             sum1  = xb[row];
1531             for(n = 0; n<sz-1; n+=2) {
1532               i1   = idx[0];
1533               i2   = idx[1];
1534               idx += 2;
1535               tmp0 = x[i1];
1536               tmp1 = x[i2];
1537               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1538             }
1539 
1540             if (n == sz-1){
1541               tmp0  = x[*idx];
1542               sum1 -= *v1*tmp0;
1543             }
1544             x[row--] = sum1*(*ibdiag);
1545             break;
1546 
1547           case 2:
1548 
1549             sum1  = xb[row];
1550             sum2  = xb[row-1];
1551             /* note that sum1 is associated with the second of the two rows */
1552             v2    = a->a + diag[row-1] + 2;
1553             for(n = 0; n<sz-1; n+=2) {
1554               i1   = idx[0];
1555               i2   = idx[1];
1556               idx += 2;
1557               tmp0 = x[i1];
1558               tmp1 = x[i2];
1559               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1560               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1561             }
1562 
1563             if (n == sz-1){
1564               tmp0  = x[*idx];
1565               sum1 -= *v1*tmp0;
1566               sum2 -= *v2*tmp0;
1567             }
1568             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1569             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1570             break;
1571           case 3:
1572 
1573             sum1  = xb[row];
1574             sum2  = xb[row-1];
1575             sum3  = xb[row-2];
1576             v2    = a->a + diag[row-1] + 2;
1577             v3    = a->a + diag[row-2] + 3;
1578             for(n = 0; n<sz-1; n+=2) {
1579               i1   = idx[0];
1580               i2   = idx[1];
1581               idx += 2;
1582               tmp0 = x[i1];
1583               tmp1 = x[i2];
1584               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1585               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1586               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1587             }
1588 
1589             if (n == sz-1){
1590               tmp0  = x[*idx];
1591               sum1 -= *v1*tmp0;
1592               sum2 -= *v2*tmp0;
1593               sum3 -= *v3*tmp0;
1594             }
1595             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
1596             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
1597             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
1598             break;
1599           case 4:
1600 
1601             sum1  = xb[row];
1602             sum2  = xb[row-1];
1603             sum3  = xb[row-2];
1604             sum4  = xb[row-3];
1605             v2    = a->a + diag[row-1] + 2;
1606             v3    = a->a + diag[row-2] + 3;
1607             v4    = a->a + diag[row-3] + 4;
1608             for(n = 0; n<sz-1; n+=2) {
1609               i1   = idx[0];
1610               i2   = idx[1];
1611               idx += 2;
1612               tmp0 = x[i1];
1613               tmp1 = x[i2];
1614               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1615               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1616               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1617               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1618             }
1619 
1620             if (n == sz-1){
1621               tmp0  = x[*idx];
1622               sum1 -= *v1*tmp0;
1623               sum2 -= *v2*tmp0;
1624               sum3 -= *v3*tmp0;
1625               sum4 -= *v4*tmp0;
1626             }
1627             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
1628             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
1629             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
1630             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
1631             break;
1632           case 5:
1633 
1634             sum1  = xb[row];
1635             sum2  = xb[row-1];
1636             sum3  = xb[row-2];
1637             sum4  = xb[row-3];
1638             sum5  = xb[row-4];
1639             v2    = a->a + diag[row-1] + 2;
1640             v3    = a->a + diag[row-2] + 3;
1641             v4    = a->a + diag[row-3] + 4;
1642             v5    = a->a + diag[row-4] + 5;
1643             for(n = 0; n<sz-1; n+=2) {
1644               i1   = idx[0];
1645               i2   = idx[1];
1646               idx += 2;
1647               tmp0 = x[i1];
1648               tmp1 = x[i2];
1649               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1650               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1651               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1652               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1653               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1654             }
1655 
1656             if (n == sz-1){
1657               tmp0  = x[*idx];
1658               sum1 -= *v1*tmp0;
1659               sum2 -= *v2*tmp0;
1660               sum3 -= *v3*tmp0;
1661               sum4 -= *v4*tmp0;
1662               sum5 -= *v5*tmp0;
1663             }
1664             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
1665             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
1666             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
1667             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
1668             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
1669             break;
1670 	  default:
1671    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1672         }
1673       }
1674 
1675       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1676     }
1677     its--;
1678     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1679     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
1680   }
1681   if (flag & SOR_EISENSTAT) {
1682     const PetscScalar *b;
1683     MatScalar         *t = a->inode.ssor_work;
1684 
1685     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1686     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1687     /*
1688           Apply  (U + D)^-1  where D is now the block diagonal
1689     */
1690     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1691     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1692       ibdiag -= sizes[i]*sizes[i];
1693       sz      = ii[row+1] - diag[row] - 1;
1694       v1      = a->a + diag[row] + 1;
1695       idx     = a->j + diag[row] + 1;
1696       CHKMEMQ;
1697       /* see comments for MatMult_Inode() for how this is coded */
1698       switch (sizes[i]){
1699         case 1:
1700 
1701 	  sum1  = b[row];
1702 	  for(n = 0; n<sz-1; n+=2) {
1703 	    i1   = idx[0];
1704 	    i2   = idx[1];
1705 	    idx += 2;
1706 	    tmp0 = x[i1];
1707 	    tmp1 = x[i2];
1708 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1709 	  }
1710 
1711 	  if (n == sz-1){
1712 	    tmp0  = x[*idx];
1713 	    sum1 -= *v1*tmp0;
1714 	  }
1715 	  x[row] = sum1*(*ibdiag);row--;
1716 	  break;
1717 
1718         case 2:
1719 
1720 	  sum1  = b[row];
1721 	  sum2  = b[row-1];
1722 	  /* note that sum1 is associated with the second of the two rows */
1723 	  v2    = a->a + diag[row-1] + 2;
1724 	  for(n = 0; n<sz-1; n+=2) {
1725 	    i1   = idx[0];
1726 	    i2   = idx[1];
1727 	    idx += 2;
1728 	    tmp0 = x[i1];
1729 	    tmp1 = x[i2];
1730 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1731 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1732 	  }
1733 
1734 	  if (n == sz-1){
1735 	    tmp0  = x[*idx];
1736 	    sum1 -= *v1*tmp0;
1737 	    sum2 -= *v2*tmp0;
1738 	  }
1739 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
1740 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
1741           row -= 2;
1742 	  break;
1743         case 3:
1744 
1745 	  sum1  = b[row];
1746 	  sum2  = b[row-1];
1747 	  sum3  = b[row-2];
1748 	  v2    = a->a + diag[row-1] + 2;
1749 	  v3    = a->a + diag[row-2] + 3;
1750 	  for(n = 0; n<sz-1; n+=2) {
1751 	    i1   = idx[0];
1752 	    i2   = idx[1];
1753 	    idx += 2;
1754 	    tmp0 = x[i1];
1755 	    tmp1 = x[i2];
1756 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1757 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1758 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1759 	  }
1760 
1761 	  if (n == sz-1){
1762 	    tmp0  = x[*idx];
1763 	    sum1 -= *v1*tmp0;
1764 	    sum2 -= *v2*tmp0;
1765 	    sum3 -= *v3*tmp0;
1766 	  }
1767 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
1768 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
1769 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
1770           row -= 3;
1771 	  break;
1772         case 4:
1773 
1774 	  sum1  = b[row];
1775 	  sum2  = b[row-1];
1776 	  sum3  = b[row-2];
1777 	  sum4  = b[row-3];
1778 	  v2    = a->a + diag[row-1] + 2;
1779 	  v3    = a->a + diag[row-2] + 3;
1780 	  v4    = a->a + diag[row-3] + 4;
1781 	  for(n = 0; n<sz-1; n+=2) {
1782 	    i1   = idx[0];
1783 	    i2   = idx[1];
1784 	    idx += 2;
1785 	    tmp0 = x[i1];
1786 	    tmp1 = x[i2];
1787 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1788 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1789 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1790 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1791 	  }
1792 
1793 	  if (n == sz-1){
1794 	    tmp0  = x[*idx];
1795 	    sum1 -= *v1*tmp0;
1796 	    sum2 -= *v2*tmp0;
1797 	    sum3 -= *v3*tmp0;
1798 	    sum4 -= *v4*tmp0;
1799 	  }
1800 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
1801 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
1802 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
1803 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
1804           row -= 4;
1805 	  break;
1806         case 5:
1807 
1808 	  sum1  = b[row];
1809 	  sum2  = b[row-1];
1810 	  sum3  = b[row-2];
1811 	  sum4  = b[row-3];
1812 	  sum5  = b[row-4];
1813 	  v2    = a->a + diag[row-1] + 2;
1814 	  v3    = a->a + diag[row-2] + 3;
1815 	  v4    = a->a + diag[row-3] + 4;
1816 	  v5    = a->a + diag[row-4] + 5;
1817 	  for(n = 0; n<sz-1; n+=2) {
1818 	    i1   = idx[0];
1819 	    i2   = idx[1];
1820 	    idx += 2;
1821 	    tmp0 = x[i1];
1822 	    tmp1 = x[i2];
1823 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1824 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1825 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1826 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1827 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1828 	  }
1829 
1830 	  if (n == sz-1){
1831 	    tmp0  = x[*idx];
1832 	    sum1 -= *v1*tmp0;
1833 	    sum2 -= *v2*tmp0;
1834 	    sum3 -= *v3*tmp0;
1835 	    sum4 -= *v4*tmp0;
1836 	    sum5 -= *v5*tmp0;
1837 	  }
1838 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
1839 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
1840 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
1841 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
1842 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
1843           row -= 5;
1844 	  break;
1845         default:
1846 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1847       }
1848       CHKMEMQ;
1849     }
1850     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1851 
1852     /*
1853            t = b - D x    where D is the block diagonal
1854     */
1855     cnt = 0;
1856     for (i=0, row=0; i<m; i++) {
1857       CHKMEMQ;
1858       switch (sizes[i]){
1859         case 1:
1860 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
1861 	  break;
1862         case 2:
1863 	  x1   = x[row]; x2 = x[row+1];
1864 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1865 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1866 	  t[row]   = b[row] - tmp1;
1867 	  t[row+1] = b[row+1] - tmp2; row += 2;
1868 	  cnt += 4;
1869 	  break;
1870         case 3:
1871 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1872 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1873 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1874 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1875 	  t[row] = b[row] - tmp1;
1876 	  t[row+1] = b[row+1] - tmp2;
1877 	  t[row+2] = b[row+2] - tmp3; row += 3;
1878 	  cnt += 9;
1879 	  break;
1880         case 4:
1881 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1882 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1883 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1884 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1885 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1886 	  t[row] = b[row] - tmp1;
1887 	  t[row+1] = b[row+1] - tmp2;
1888 	  t[row+2] = b[row+2] - tmp3;
1889 	  t[row+3] = b[row+3] - tmp4; row += 4;
1890 	  cnt += 16;
1891 	  break;
1892         case 5:
1893 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1894 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1895 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1896 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1897 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1898 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1899 	  t[row] = b[row] - tmp1;
1900 	  t[row+1] = b[row+1] - tmp2;
1901 	  t[row+2] = b[row+2] - tmp3;
1902 	  t[row+3] = b[row+3] - tmp4;
1903 	  t[row+4] = b[row+4] - tmp5;row += 5;
1904 	  cnt += 25;
1905 	  break;
1906         default:
1907 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1908       }
1909       CHKMEMQ;
1910     }
1911     ierr = PetscLogFlops(m);CHKERRQ(ierr);
1912 
1913 
1914 
1915     /*
1916           Apply (L + D)^-1 where D is the block diagonal
1917     */
1918     for (i=0, row=0; i<m; i++) {
1919       sz  = diag[row] - ii[row];
1920       v1  = a->a + ii[row];
1921       idx = a->j + ii[row];
1922       CHKMEMQ;
1923       /* see comments for MatMult_Inode() for how this is coded */
1924       switch (sizes[i]){
1925         case 1:
1926 
1927 	  sum1  = t[row];
1928 	  for(n = 0; n<sz-1; n+=2) {
1929 	    i1   = idx[0];
1930 	    i2   = idx[1];
1931 	    idx += 2;
1932 	    tmp0 = t[i1];
1933 	    tmp1 = t[i2];
1934 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1935 	  }
1936 
1937 	  if (n == sz-1){
1938 	    tmp0  = t[*idx];
1939 	    sum1 -= *v1 * tmp0;
1940 	  }
1941 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
1942 	  break;
1943         case 2:
1944 	  v2    = a->a + ii[row+1];
1945 	  sum1  = t[row];
1946 	  sum2  = t[row+1];
1947 	  for(n = 0; n<sz-1; n+=2) {
1948 	    i1   = idx[0];
1949 	    i2   = idx[1];
1950 	    idx += 2;
1951 	    tmp0 = t[i1];
1952 	    tmp1 = t[i2];
1953 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1954 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1955 	  }
1956 
1957 	  if (n == sz-1){
1958 	    tmp0  = t[*idx];
1959 	    sum1 -= v1[0] * tmp0;
1960 	    sum2 -= v2[0] * tmp0;
1961 	  }
1962 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
1963 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
1964 	  ibdiag  += 4; row += 2;
1965 	  break;
1966         case 3:
1967 	  v2    = a->a + ii[row+1];
1968 	  v3    = a->a + ii[row+2];
1969 	  sum1  = t[row];
1970 	  sum2  = t[row+1];
1971 	  sum3  = t[row+2];
1972 	  for(n = 0; n<sz-1; n+=2) {
1973 	    i1   = idx[0];
1974 	    i2   = idx[1];
1975 	    idx += 2;
1976 	    tmp0 = t[i1];
1977 	    tmp1 = t[i2];
1978 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1979 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1980 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1981 	  }
1982 
1983 	  if (n == sz-1){
1984 	    tmp0  = t[*idx];
1985 	    sum1 -= v1[0] * tmp0;
1986 	    sum2 -= v2[0] * tmp0;
1987 	    sum3 -= v3[0] * tmp0;
1988 	  }
1989 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1990 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1991 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1992 	  ibdiag  += 9; row += 3;
1993 	  break;
1994         case 4:
1995 	  v2    = a->a + ii[row+1];
1996 	  v3    = a->a + ii[row+2];
1997 	  v4    = a->a + ii[row+3];
1998 	  sum1  = t[row];
1999 	  sum2  = t[row+1];
2000 	  sum3  = t[row+2];
2001 	  sum4  = t[row+3];
2002 	  for(n = 0; n<sz-1; n+=2) {
2003 	    i1   = idx[0];
2004 	    i2   = idx[1];
2005 	    idx += 2;
2006 	    tmp0 = t[i1];
2007 	    tmp1 = t[i2];
2008 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2009 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2010 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2011 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2012 	  }
2013 
2014 	  if (n == sz-1){
2015 	    tmp0  = t[*idx];
2016 	    sum1 -= v1[0] * tmp0;
2017 	    sum2 -= v2[0] * tmp0;
2018 	    sum3 -= v3[0] * tmp0;
2019 	    sum4 -= v4[0] * tmp0;
2020 	  }
2021 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2022 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2023 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2024 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2025 	  ibdiag  += 16; row += 4;
2026 	  break;
2027         case 5:
2028 	  v2    = a->a + ii[row+1];
2029 	  v3    = a->a + ii[row+2];
2030 	  v4    = a->a + ii[row+3];
2031 	  v5    = a->a + ii[row+4];
2032 	  sum1  = t[row];
2033 	  sum2  = t[row+1];
2034 	  sum3  = t[row+2];
2035 	  sum4  = t[row+3];
2036 	  sum5  = t[row+4];
2037 	  for(n = 0; n<sz-1; n+=2) {
2038 	    i1   = idx[0];
2039 	    i2   = idx[1];
2040 	    idx += 2;
2041 	    tmp0 = t[i1];
2042 	    tmp1 = t[i2];
2043 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2044 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2045 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2046 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2047 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2048 	  }
2049 
2050 	  if (n == sz-1){
2051 	    tmp0  = t[*idx];
2052 	    sum1 -= v1[0] * tmp0;
2053 	    sum2 -= v2[0] * tmp0;
2054 	    sum3 -= v3[0] * tmp0;
2055 	    sum4 -= v4[0] * tmp0;
2056 	    sum5 -= v5[0] * tmp0;
2057 	  }
2058 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2059 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2060 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2061 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2062 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2063 	  ibdiag  += 25; row += 5;
2064 	  break;
2065         default:
2066 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2067       }
2068       CHKMEMQ;
2069     }
2070     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2071     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2072     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2073   }
2074   PetscFunctionReturn(0);
2075 }
2076 
2077 #undef __FUNCT__
2078 #define __FUNCT__ "MatMultDiagonalBlock_Inode"
2079 PetscErrorCode MatMultDiagonalBlock_Inode(Mat A,Vec bb,Vec xx)
2080 {
2081   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2082   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
2083   const MatScalar    *bdiag = a->inode.bdiag;
2084   const PetscScalar  *b;
2085   PetscErrorCode      ierr;
2086   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
2087   const PetscInt      *sizes = a->inode.size;
2088 
2089   PetscFunctionBegin;
2090   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2091   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2092   cnt = 0;
2093   for (i=0, row=0; i<m; i++) {
2094     switch (sizes[i]){
2095       case 1:
2096 	x[row] = b[row]*bdiag[cnt++];row++;
2097 	break;
2098       case 2:
2099 	x1   = b[row]; x2 = b[row+1];
2100 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2101 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2102 	x[row++] = tmp1;
2103 	x[row++] = tmp2;
2104 	cnt += 4;
2105 	break;
2106       case 3:
2107 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
2108 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2109 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2110 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2111 	x[row++] = tmp1;
2112 	x[row++] = tmp2;
2113 	x[row++] = tmp3;
2114 	cnt += 9;
2115 	break;
2116       case 4:
2117 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
2118 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2119 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2120 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2121 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2122 	x[row++] = tmp1;
2123 	x[row++] = tmp2;
2124 	x[row++] = tmp3;
2125 	x[row++] = tmp4;
2126 	cnt += 16;
2127 	break;
2128       case 5:
2129 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
2130 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2131 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2132 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2133 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2134 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2135 	x[row++] = tmp1;
2136 	x[row++] = tmp2;
2137 	x[row++] = tmp3;
2138 	x[row++] = tmp4;
2139 	x[row++] = tmp5;
2140 	cnt += 25;
2141 	break;
2142       default:
2143 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2144     }
2145   }
2146   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
2147   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2148   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2149   PetscFunctionReturn(0);
2150 }
2151 
2152 extern PetscErrorCode MatMultDiagonalBlock_Inode(Mat,Vec,Vec);
2153 /*
2154     samestructure indicates that the matrix has not changed its nonzero structure so we
2155     do not need to recompute the inodes
2156 */
2157 #undef __FUNCT__
2158 #define __FUNCT__ "Mat_CheckInode"
2159 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2160 {
2161   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2162   PetscErrorCode ierr;
2163   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2164   PetscTruth     flag;
2165 
2166   PetscFunctionBegin;
2167   if (!a->inode.use)                     PetscFunctionReturn(0);
2168   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2169 
2170 
2171   m = A->rmap->n;
2172   if (a->inode.size) {ns = a->inode.size;}
2173   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2174 
2175   i          = 0;
2176   node_count = 0;
2177   idx        = a->j;
2178   ii         = a->i;
2179   while (i < m){                /* For each row */
2180     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2181     /* Limits the number of elements in a node to 'a->inode.limit' */
2182     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2183       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2184       if (nzy != nzx) break;
2185       idy  += nzx;             /* Same nonzero pattern */
2186       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2187       if (!flag) break;
2188     }
2189     ns[node_count++] = blk_size;
2190     idx += blk_size*nzx;
2191     i    = j;
2192   }
2193   /* If not enough inodes found,, do not use inode version of the routines */
2194   if (!a->inode.size && m && node_count > .9*m) {
2195     ierr = PetscFree(ns);CHKERRQ(ierr);
2196     a->inode.node_count     = 0;
2197     a->inode.size           = PETSC_NULL;
2198     a->inode.use            = PETSC_FALSE;
2199     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2200   } else {
2201     A->ops->mult              = MatMult_Inode;
2202     A->ops->relax             = MatRelax_Inode;
2203     A->ops->multadd           = MatMultAdd_Inode;
2204     A->ops->getrowij          = MatGetRowIJ_Inode;
2205     A->ops->restorerowij      = MatRestoreRowIJ_Inode;
2206     A->ops->getcolumnij       = MatGetColumnIJ_Inode;
2207     A->ops->restorecolumnij   = MatRestoreColumnIJ_Inode;
2208     A->ops->coloringpatch     = MatColoringPatch_Inode;
2209     A->ops->multdiagonalblock = MatMultDiagonalBlock_Inode;
2210     a->inode.node_count       = node_count;
2211     a->inode.size             = ns;
2212     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2213   }
2214   PetscFunctionReturn(0);
2215 }
2216 
2217 /*
2218      This is really ugly. if inodes are used this replaces the
2219   permutations with ones that correspond to rows/cols of the matrix
2220   rather then inode blocks
2221 */
2222 #undef __FUNCT__
2223 #define __FUNCT__ "MatInodeAdjustForInodes"
2224 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2225 {
2226   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2227 
2228   PetscFunctionBegin;
2229   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2230   if (f) {
2231     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2232   }
2233   PetscFunctionReturn(0);
2234 }
2235 
2236 EXTERN_C_BEGIN
2237 #undef __FUNCT__
2238 #define __FUNCT__ "MatAdjustForInodes_Inode"
2239 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2240 {
2241   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2242   PetscErrorCode ierr;
2243   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
2244   const PetscInt *ridx,*cidx;
2245   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2246   PetscInt       nslim_col,*ns_col;
2247   IS             ris = *rperm,cis = *cperm;
2248 
2249   PetscFunctionBegin;
2250   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2251   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2252 
2253   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2254   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2255   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2256   permc = permr + m;
2257 
2258   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2259   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2260 
2261   /* Form the inode structure for the rows of permuted matric using inv perm*/
2262   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2263 
2264   /* Construct the permutations for rows*/
2265   for (i=0,row = 0; i<nslim_row; ++i){
2266     indx      = ridx[i];
2267     start_val = tns[indx];
2268     end_val   = tns[indx + 1];
2269     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2270   }
2271 
2272   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2273   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2274 
2275  /* Construct permutations for columns */
2276   for (i=0,col=0; i<nslim_col; ++i){
2277     indx      = cidx[i];
2278     start_val = tns[indx];
2279     end_val   = tns[indx + 1];
2280     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2281   }
2282 
2283   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2284   ISSetPermutation(*rperm);
2285   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2286   ISSetPermutation(*cperm);
2287 
2288   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2289   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2290 
2291   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2292   ierr = PetscFree(permr);CHKERRQ(ierr);
2293   ierr = ISDestroy(cis);CHKERRQ(ierr);
2294   ierr = ISDestroy(ris);CHKERRQ(ierr);
2295   ierr = PetscFree(tns);CHKERRQ(ierr);
2296   PetscFunctionReturn(0);
2297 }
2298 EXTERN_C_END
2299 
2300 #undef __FUNCT__
2301 #define __FUNCT__ "MatInodeGetInodeSizes"
2302 /*@C
2303    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2304 
2305    Collective on Mat
2306 
2307    Input Parameter:
2308 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2309 
2310    Output Parameter:
2311 +  node_count - no of inodes present in the matrix.
2312 .  sizes      - an array of size node_count,with sizes of each inode.
2313 -  limit      - the max size used to generate the inodes.
2314 
2315    Level: advanced
2316 
2317    Notes: This routine returns some internal storage information
2318    of the matrix, it is intended to be used by advanced users.
2319    It should be called after the matrix is assembled.
2320    The contents of the sizes[] array should not be changed.
2321    PETSC_NULL may be passed for information not requested.
2322 
2323 .keywords: matrix, seqaij, get, inode
2324 
2325 .seealso: MatGetInfo()
2326 @*/
2327 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2328 {
2329   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2330 
2331   PetscFunctionBegin;
2332   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2333   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2334   if (f) {
2335     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2336   }
2337   PetscFunctionReturn(0);
2338 }
2339 
2340 EXTERN_C_BEGIN
2341 #undef __FUNCT__
2342 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
2343 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2344 {
2345   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2346 
2347   PetscFunctionBegin;
2348   if (node_count) *node_count = a->inode.node_count;
2349   if (sizes)      *sizes      = a->inode.size;
2350   if (limit)      *limit      = a->inode.limit;
2351   PetscFunctionReturn(0);
2352 }
2353 EXTERN_C_END
2354