xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 829b6ff0376e8d5b8b7c6c54aef4e99b13aa00d1)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     sz   = n;                   /* No of non zeros in this row */
427                                 /* Switch on the size of Node */
428     switch (nsz){               /* Each loop in 'case' is unrolled */
429     case 1 :
430       sum1  = 0;
431 
432       for(n = 0; n< sz-1; n+=2) {
433         i1   = idx[0];          /* The instructions are ordered to */
434         i2   = idx[1];          /* make the compiler's job easy */
435         idx += 2;
436         tmp0 = x[i1];
437         tmp1 = x[i2];
438         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439        }
440 
441       if (n == sz-1){          /* Take care of the last nonzero  */
442         tmp0  = x[*idx++];
443         sum1 += *v1++ * tmp0;
444       }
445       y[row++]=sum1;
446       break;
447     case 2:
448       sum1  = 0;
449       sum2  = 0;
450       v2    = v1 + n;
451 
452       for (n = 0; n< sz-1; n+=2) {
453         i1   = idx[0];
454         i2   = idx[1];
455         idx += 2;
456         tmp0 = x[i1];
457         tmp1 = x[i2];
458         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460       }
461       if (n == sz-1){
462         tmp0  = x[*idx++];
463         sum1 += *v1++ * tmp0;
464         sum2 += *v2++ * tmp0;
465       }
466       y[row++]=sum1;
467       y[row++]=sum2;
468       v1      =v2;              /* Since the next block to be processed starts there*/
469       idx    +=sz;
470       break;
471     case 3:
472       sum1  = 0;
473       sum2  = 0;
474       sum3  = 0;
475       v2    = v1 + n;
476       v3    = v2 + n;
477 
478       for (n = 0; n< sz-1; n+=2) {
479         i1   = idx[0];
480         i2   = idx[1];
481         idx += 2;
482         tmp0 = x[i1];
483         tmp1 = x[i2];
484         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487       }
488       if (n == sz-1){
489         tmp0  = x[*idx++];
490         sum1 += *v1++ * tmp0;
491         sum2 += *v2++ * tmp0;
492         sum3 += *v3++ * tmp0;
493       }
494       y[row++]=sum1;
495       y[row++]=sum2;
496       y[row++]=sum3;
497       v1       =v3;             /* Since the next block to be processed starts there*/
498       idx     +=2*sz;
499       break;
500     case 4:
501       sum1  = 0;
502       sum2  = 0;
503       sum3  = 0;
504       sum4  = 0;
505       v2    = v1 + n;
506       v3    = v2 + n;
507       v4    = v3 + n;
508 
509       for (n = 0; n< sz-1; n+=2) {
510         i1   = idx[0];
511         i2   = idx[1];
512         idx += 2;
513         tmp0 = x[i1];
514         tmp1 = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519       }
520       if (n == sz-1){
521         tmp0  = x[*idx++];
522         sum1 += *v1++ * tmp0;
523         sum2 += *v2++ * tmp0;
524         sum3 += *v3++ * tmp0;
525         sum4 += *v4++ * tmp0;
526       }
527       y[row++]=sum1;
528       y[row++]=sum2;
529       y[row++]=sum3;
530       y[row++]=sum4;
531       v1      =v4;              /* Since the next block to be processed starts there*/
532       idx    +=3*sz;
533       break;
534     case 5:
535       sum1  = 0;
536       sum2  = 0;
537       sum3  = 0;
538       sum4  = 0;
539       sum5  = 0;
540       v2    = v1 + n;
541       v3    = v2 + n;
542       v4    = v3 + n;
543       v5    = v4 + n;
544 
545       for (n = 0; n<sz-1; n+=2) {
546         i1   = idx[0];
547         i2   = idx[1];
548         idx += 2;
549         tmp0 = x[i1];
550         tmp1 = x[i2];
551         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556       }
557       if (n == sz-1){
558         tmp0  = x[*idx++];
559         sum1 += *v1++ * tmp0;
560         sum2 += *v2++ * tmp0;
561         sum3 += *v3++ * tmp0;
562         sum4 += *v4++ * tmp0;
563         sum5 += *v5++ * tmp0;
564       }
565       y[row++]=sum1;
566       y[row++]=sum2;
567       y[row++]=sum3;
568       y[row++]=sum4;
569       y[row++]=sum5;
570       v1      =v5;       /* Since the next block to be processed starts there */
571       idx    +=4*sz;
572       break;
573     default :
574       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
575     }
576   }
577   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
578   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
579   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
580   PetscFunctionReturn(0);
581 }
582 /* ----------------------------------------------------------- */
583 /* Almost same code as the MatMult_SeqAIJ_Inode() */
584 #undef __FUNCT__
585 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
586 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587 {
588   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
589   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590   MatScalar      *v1,*v2,*v3,*v4,*v5;
591   PetscScalar    *x,*y,*z,*zt;
592   PetscErrorCode ierr;
593   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
594 
595   PetscFunctionBegin;
596   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
597   node_max = a->inode.node_count;
598   ns       = a->inode.size;     /* Node Size array */
599   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
600   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
601   if (zz != yy) {
602     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
603   } else {
604     z = y;
605   }
606   zt = z;
607 
608   idx  = a->j;
609   v1   = a->a;
610   ii   = a->i;
611 
612   for (i = 0,row = 0; i< node_max; ++i){
613     nsz  = ns[i];
614     n    = ii[1] - ii[0];
615     ii  += nsz;
616     sz   = n;                   /* No of non zeros in this row */
617                                 /* Switch on the size of Node */
618     switch (nsz){               /* Each loop in 'case' is unrolled */
619     case 1 :
620       sum1  = *zt++;
621 
622       for(n = 0; n< sz-1; n+=2) {
623         i1   = idx[0];          /* The instructions are ordered to */
624         i2   = idx[1];          /* make the compiler's job easy */
625         idx += 2;
626         tmp0 = x[i1];
627         tmp1 = x[i2];
628         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
629        }
630 
631       if(n   == sz-1){          /* Take care of the last nonzero  */
632         tmp0  = x[*idx++];
633         sum1 += *v1++ * tmp0;
634       }
635       y[row++]=sum1;
636       break;
637     case 2:
638       sum1  = *zt++;
639       sum2  = *zt++;
640       v2    = v1 + n;
641 
642       for(n = 0; n< sz-1; n+=2) {
643         i1   = idx[0];
644         i2   = idx[1];
645         idx += 2;
646         tmp0 = x[i1];
647         tmp1 = x[i2];
648         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
649         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
650       }
651       if(n   == sz-1){
652         tmp0  = x[*idx++];
653         sum1 += *v1++ * tmp0;
654         sum2 += *v2++ * tmp0;
655       }
656       y[row++]=sum1;
657       y[row++]=sum2;
658       v1      =v2;              /* Since the next block to be processed starts there*/
659       idx    +=sz;
660       break;
661     case 3:
662       sum1  = *zt++;
663       sum2  = *zt++;
664       sum3  = *zt++;
665       v2    = v1 + n;
666       v3    = v2 + n;
667 
668       for (n = 0; n< sz-1; n+=2) {
669         i1   = idx[0];
670         i2   = idx[1];
671         idx += 2;
672         tmp0 = x[i1];
673         tmp1 = x[i2];
674         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
675         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
676         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
677       }
678       if (n == sz-1){
679         tmp0  = x[*idx++];
680         sum1 += *v1++ * tmp0;
681         sum2 += *v2++ * tmp0;
682         sum3 += *v3++ * tmp0;
683       }
684       y[row++]=sum1;
685       y[row++]=sum2;
686       y[row++]=sum3;
687       v1       =v3;             /* Since the next block to be processed starts there*/
688       idx     +=2*sz;
689       break;
690     case 4:
691       sum1  = *zt++;
692       sum2  = *zt++;
693       sum3  = *zt++;
694       sum4  = *zt++;
695       v2    = v1 + n;
696       v3    = v2 + n;
697       v4    = v3 + n;
698 
699       for (n = 0; n< sz-1; n+=2) {
700         i1   = idx[0];
701         i2   = idx[1];
702         idx += 2;
703         tmp0 = x[i1];
704         tmp1 = x[i2];
705         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
706         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
707         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
708         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
709       }
710       if (n == sz-1){
711         tmp0  = x[*idx++];
712         sum1 += *v1++ * tmp0;
713         sum2 += *v2++ * tmp0;
714         sum3 += *v3++ * tmp0;
715         sum4 += *v4++ * tmp0;
716       }
717       y[row++]=sum1;
718       y[row++]=sum2;
719       y[row++]=sum3;
720       y[row++]=sum4;
721       v1      =v4;              /* Since the next block to be processed starts there*/
722       idx    +=3*sz;
723       break;
724     case 5:
725       sum1  = *zt++;
726       sum2  = *zt++;
727       sum3  = *zt++;
728       sum4  = *zt++;
729       sum5  = *zt++;
730       v2    = v1 + n;
731       v3    = v2 + n;
732       v4    = v3 + n;
733       v5    = v4 + n;
734 
735       for (n = 0; n<sz-1; n+=2) {
736         i1   = idx[0];
737         i2   = idx[1];
738         idx += 2;
739         tmp0 = x[i1];
740         tmp1 = x[i2];
741         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
742         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
743         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
744         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
745         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
746       }
747       if(n   == sz-1){
748         tmp0  = x[*idx++];
749         sum1 += *v1++ * tmp0;
750         sum2 += *v2++ * tmp0;
751         sum3 += *v3++ * tmp0;
752         sum4 += *v4++ * tmp0;
753         sum5 += *v5++ * tmp0;
754       }
755       y[row++]=sum1;
756       y[row++]=sum2;
757       y[row++]=sum3;
758       y[row++]=sum4;
759       y[row++]=sum5;
760       v1      =v5;       /* Since the next block to be processed starts there */
761       idx    +=4*sz;
762       break;
763     default :
764       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
765     }
766   }
767   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
768   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
769   if (zz != yy) {
770     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
771   }
772   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
773   PetscFunctionReturn(0);
774 }
775 
776 /* ----------------------------------------------------------- */
777 #undef __FUNCT__
778 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
779 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
780 {
781   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
782   IS                iscol = a->col,isrow = a->row;
783   PetscErrorCode    ierr;
784   const PetscInt    *r,*c,*rout,*cout;
785   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
786   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
787   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790   const PetscScalar *b;
791 
792   PetscFunctionBegin;
793   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
794   node_max = a->inode.node_count;
795   ns       = a->inode.size;     /* Node Size array */
796 
797   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   tmp  = a->solve_work;
800 
801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
803 
804   /* forward solve the lower triangular */
805   tmps = tmp ;
806   aa   = a_a ;
807   aj   = a_j ;
808   ad   = a->diag;
809 
810   for (i = 0,row = 0; i< node_max; ++i){
811     nsz = ns[i];
812     aii = ai[row];
813     v1  = aa + aii;
814     vi  = aj + aii;
815     nz  = ad[row]- aii;
816 
817     switch (nsz){               /* Each loop in 'case' is unrolled */
818     case 1 :
819       sum1 = b[*r++];
820       for(j=0; j<nz-1; j+=2){
821         i0   = vi[0];
822         i1   = vi[1];
823         vi  +=2;
824         tmp0 = tmps[i0];
825         tmp1 = tmps[i1];
826         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
827       }
828       if(j == nz-1){
829         tmp0 = tmps[*vi++];
830         sum1 -= *v1++ *tmp0;
831       }
832       tmp[row ++]=sum1;
833       break;
834     case 2:
835       sum1 = b[*r++];
836       sum2 = b[*r++];
837       v2   = aa + ai[row+1];
838 
839       for(j=0; j<nz-1; j+=2){
840         i0   = vi[0];
841         i1   = vi[1];
842         vi  +=2;
843         tmp0 = tmps[i0];
844         tmp1 = tmps[i1];
845         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
846         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
847       }
848       if(j == nz-1){
849         tmp0 = tmps[*vi++];
850         sum1 -= *v1++ *tmp0;
851         sum2 -= *v2++ *tmp0;
852       }
853       sum2 -= *v2++ * sum1;
854       tmp[row ++]=sum1;
855       tmp[row ++]=sum2;
856       break;
857     case 3:
858       sum1 = b[*r++];
859       sum2 = b[*r++];
860       sum3 = b[*r++];
861       v2   = aa + ai[row+1];
862       v3   = aa + ai[row+2];
863 
864       for (j=0; j<nz-1; j+=2){
865         i0   = vi[0];
866         i1   = vi[1];
867         vi  +=2;
868         tmp0 = tmps[i0];
869         tmp1 = tmps[i1];
870         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
871         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
872         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
873       }
874       if (j == nz-1){
875         tmp0 = tmps[*vi++];
876         sum1 -= *v1++ *tmp0;
877         sum2 -= *v2++ *tmp0;
878         sum3 -= *v3++ *tmp0;
879       }
880       sum2 -= *v2++ * sum1;
881       sum3 -= *v3++ * sum1;
882       sum3 -= *v3++ * sum2;
883       tmp[row ++]=sum1;
884       tmp[row ++]=sum2;
885       tmp[row ++]=sum3;
886       break;
887 
888     case 4:
889       sum1 = b[*r++];
890       sum2 = b[*r++];
891       sum3 = b[*r++];
892       sum4 = b[*r++];
893       v2   = aa + ai[row+1];
894       v3   = aa + ai[row+2];
895       v4   = aa + ai[row+3];
896 
897       for (j=0; j<nz-1; j+=2){
898         i0   = vi[0];
899         i1   = vi[1];
900         vi  +=2;
901         tmp0 = tmps[i0];
902         tmp1 = tmps[i1];
903         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
904         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
905         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
906         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
907       }
908       if (j == nz-1){
909         tmp0 = tmps[*vi++];
910         sum1 -= *v1++ *tmp0;
911         sum2 -= *v2++ *tmp0;
912         sum3 -= *v3++ *tmp0;
913         sum4 -= *v4++ *tmp0;
914       }
915       sum2 -= *v2++ * sum1;
916       sum3 -= *v3++ * sum1;
917       sum4 -= *v4++ * sum1;
918       sum3 -= *v3++ * sum2;
919       sum4 -= *v4++ * sum2;
920       sum4 -= *v4++ * sum3;
921 
922       tmp[row ++]=sum1;
923       tmp[row ++]=sum2;
924       tmp[row ++]=sum3;
925       tmp[row ++]=sum4;
926       break;
927     case 5:
928       sum1 = b[*r++];
929       sum2 = b[*r++];
930       sum3 = b[*r++];
931       sum4 = b[*r++];
932       sum5 = b[*r++];
933       v2   = aa + ai[row+1];
934       v3   = aa + ai[row+2];
935       v4   = aa + ai[row+3];
936       v5   = aa + ai[row+4];
937 
938       for (j=0; j<nz-1; j+=2){
939         i0   = vi[0];
940         i1   = vi[1];
941         vi  +=2;
942         tmp0 = tmps[i0];
943         tmp1 = tmps[i1];
944         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
945         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
946         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
947         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
948         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
949       }
950       if (j == nz-1){
951         tmp0 = tmps[*vi++];
952         sum1 -= *v1++ *tmp0;
953         sum2 -= *v2++ *tmp0;
954         sum3 -= *v3++ *tmp0;
955         sum4 -= *v4++ *tmp0;
956         sum5 -= *v5++ *tmp0;
957       }
958 
959       sum2 -= *v2++ * sum1;
960       sum3 -= *v3++ * sum1;
961       sum4 -= *v4++ * sum1;
962       sum5 -= *v5++ * sum1;
963       sum3 -= *v3++ * sum2;
964       sum4 -= *v4++ * sum2;
965       sum5 -= *v5++ * sum2;
966       sum4 -= *v4++ * sum3;
967       sum5 -= *v5++ * sum3;
968       sum5 -= *v5++ * sum4;
969 
970       tmp[row ++]=sum1;
971       tmp[row ++]=sum2;
972       tmp[row ++]=sum3;
973       tmp[row ++]=sum4;
974       tmp[row ++]=sum5;
975       break;
976     default:
977       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
978     }
979   }
980   /* backward solve the upper triangular */
981   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
982     nsz = ns[i];
983     aii = ai[row+1] -1;
984     v1  = aa + aii;
985     vi  = aj + aii;
986     nz  = aii- ad[row];
987     switch (nsz){               /* Each loop in 'case' is unrolled */
988     case 1 :
989       sum1 = tmp[row];
990 
991       for(j=nz ; j>1; j-=2){
992         vi  -=2;
993         i0   = vi[2];
994         i1   = vi[1];
995         tmp0 = tmps[i0];
996         tmp1 = tmps[i1];
997         v1   -= 2;
998         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
999       }
1000       if (j==1){
1001         tmp0  = tmps[*vi--];
1002         sum1 -= *v1-- * tmp0;
1003       }
1004       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1005       break;
1006     case 2 :
1007       sum1 = tmp[row];
1008       sum2 = tmp[row -1];
1009       v2   = aa + ai[row]-1;
1010       for (j=nz ; j>1; j-=2){
1011         vi  -=2;
1012         i0   = vi[2];
1013         i1   = vi[1];
1014         tmp0 = tmps[i0];
1015         tmp1 = tmps[i1];
1016         v1   -= 2;
1017         v2   -= 2;
1018         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1019         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1020       }
1021       if (j==1){
1022         tmp0  = tmps[*vi--];
1023         sum1 -= *v1-- * tmp0;
1024         sum2 -= *v2-- * tmp0;
1025       }
1026 
1027       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1028       sum2   -= *v2-- * tmp0;
1029       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1030       break;
1031     case 3 :
1032       sum1 = tmp[row];
1033       sum2 = tmp[row -1];
1034       sum3 = tmp[row -2];
1035       v2   = aa + ai[row]-1;
1036       v3   = aa + ai[row -1]-1;
1037       for (j=nz ; j>1; j-=2){
1038         vi  -=2;
1039         i0   = vi[2];
1040         i1   = vi[1];
1041         tmp0 = tmps[i0];
1042         tmp1 = tmps[i1];
1043         v1   -= 2;
1044         v2   -= 2;
1045         v3   -= 2;
1046         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1047         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1048         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1049       }
1050       if (j==1){
1051         tmp0  = tmps[*vi--];
1052         sum1 -= *v1-- * tmp0;
1053         sum2 -= *v2-- * tmp0;
1054         sum3 -= *v3-- * tmp0;
1055       }
1056       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1057       sum2   -= *v2-- * tmp0;
1058       sum3   -= *v3-- * tmp0;
1059       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1060       sum3   -= *v3-- * tmp0;
1061       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1062 
1063       break;
1064     case 4 :
1065       sum1 = tmp[row];
1066       sum2 = tmp[row -1];
1067       sum3 = tmp[row -2];
1068       sum4 = tmp[row -3];
1069       v2   = aa + ai[row]-1;
1070       v3   = aa + ai[row -1]-1;
1071       v4   = aa + ai[row -2]-1;
1072 
1073       for (j=nz ; j>1; j-=2){
1074         vi  -=2;
1075         i0   = vi[2];
1076         i1   = vi[1];
1077         tmp0 = tmps[i0];
1078         tmp1 = tmps[i1];
1079         v1  -= 2;
1080         v2  -= 2;
1081         v3  -= 2;
1082         v4  -= 2;
1083         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1084         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1085         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1086         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1087       }
1088       if (j==1){
1089         tmp0  = tmps[*vi--];
1090         sum1 -= *v1-- * tmp0;
1091         sum2 -= *v2-- * tmp0;
1092         sum3 -= *v3-- * tmp0;
1093         sum4 -= *v4-- * tmp0;
1094       }
1095 
1096       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1097       sum2   -= *v2-- * tmp0;
1098       sum3   -= *v3-- * tmp0;
1099       sum4   -= *v4-- * tmp0;
1100       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1101       sum3   -= *v3-- * tmp0;
1102       sum4   -= *v4-- * tmp0;
1103       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1104       sum4   -= *v4-- * tmp0;
1105       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1106       break;
1107     case 5 :
1108       sum1 = tmp[row];
1109       sum2 = tmp[row -1];
1110       sum3 = tmp[row -2];
1111       sum4 = tmp[row -3];
1112       sum5 = tmp[row -4];
1113       v2   = aa + ai[row]-1;
1114       v3   = aa + ai[row -1]-1;
1115       v4   = aa + ai[row -2]-1;
1116       v5   = aa + ai[row -3]-1;
1117       for (j=nz ; j>1; j-=2){
1118         vi  -= 2;
1119         i0   = vi[2];
1120         i1   = vi[1];
1121         tmp0 = tmps[i0];
1122         tmp1 = tmps[i1];
1123         v1   -= 2;
1124         v2   -= 2;
1125         v3   -= 2;
1126         v4   -= 2;
1127         v5   -= 2;
1128         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1129         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1130         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1131         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1132         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1133       }
1134       if (j==1){
1135         tmp0  = tmps[*vi--];
1136         sum1 -= *v1-- * tmp0;
1137         sum2 -= *v2-- * tmp0;
1138         sum3 -= *v3-- * tmp0;
1139         sum4 -= *v4-- * tmp0;
1140         sum5 -= *v5-- * tmp0;
1141       }
1142 
1143       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1144       sum2   -= *v2-- * tmp0;
1145       sum3   -= *v3-- * tmp0;
1146       sum4   -= *v4-- * tmp0;
1147       sum5   -= *v5-- * tmp0;
1148       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1149       sum3   -= *v3-- * tmp0;
1150       sum4   -= *v4-- * tmp0;
1151       sum5   -= *v5-- * tmp0;
1152       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1153       sum4   -= *v4-- * tmp0;
1154       sum5   -= *v5-- * tmp0;
1155       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1156       sum5   -= *v5-- * tmp0;
1157       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1158       break;
1159     default:
1160       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1161     }
1162   }
1163   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1164   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1165   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1166   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1167   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1168   PetscFunctionReturn(0);
1169 }
1170 
1171 /*
1172      Makes a longer coloring[] array and calls the usual code with that
1173 */
1174 #undef __FUNCT__
1175 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
1176 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1177 {
1178   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1179   PetscErrorCode  ierr;
1180   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1181   PetscInt        *colorused,i;
1182   ISColoringValue *newcolor;
1183 
1184   PetscFunctionBegin;
1185   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1186   /* loop over inodes, marking a color for each column*/
1187   row = 0;
1188   for (i=0; i<m; i++){
1189     for (j=0; j<ns[i]; j++) {
1190       newcolor[row++] = coloring[i] + j*ncolors;
1191     }
1192   }
1193 
1194   /* eliminate unneeded colors */
1195   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1196   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1197   for (i=0; i<n; i++) {
1198     colorused[newcolor[i]] = 1;
1199   }
1200 
1201   for (i=1; i<5*ncolors; i++) {
1202     colorused[i] += colorused[i-1];
1203   }
1204   ncolors = colorused[5*ncolors-1];
1205   for (i=0; i<n; i++) {
1206     newcolor[i] = colorused[newcolor[i]]-1;
1207   }
1208   ierr = PetscFree(colorused);CHKERRQ(ierr);
1209   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1210   ierr = PetscFree(coloring);CHKERRQ(ierr);
1211   PetscFunctionReturn(0);
1212 }
1213 
1214 #include "../src/mat/blockinvert.h"
1215 
1216 #undef __FUNCT__
1217 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
1218 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1219 {
1220   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1221   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1222   MatScalar          *ibdiag,*bdiag;
1223   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1224   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
1225   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1226   PetscErrorCode     ierr;
1227   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1228   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;
1229 
1230   PetscFunctionBegin;
1231   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1232   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1233   if (its > 1) {
1234     /* switch to non-inode version */
1235     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
1236     PetscFunctionReturn(0);
1237   }
1238 
1239   if (!a->inode.ibdiagvalid) {
1240     if (!a->inode.ibdiag) {
1241       /* calculate space needed for diagonal blocks */
1242       for (i=0; i<m; i++) {
1243 	cnt += sizes[i]*sizes[i];
1244       }
1245       a->inode.bdiagsize = cnt;
1246       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
1247     }
1248 
1249     /* copy over the diagonal blocks and invert them */
1250     ibdiag = a->inode.ibdiag;
1251     bdiag  = a->inode.bdiag;
1252     cnt = 0;
1253     for (i=0, row = 0; i<m; i++) {
1254       for (j=0; j<sizes[i]; j++) {
1255         for (k=0; k<sizes[i]; k++) {
1256           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1257         }
1258       }
1259       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
1260 
1261       switch(sizes[i]) {
1262         case 1:
1263           /* Create matrix data structure */
1264           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1265           ibdiag[cnt] = 1.0/ibdiag[cnt];
1266           break;
1267         case 2:
1268           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
1269           break;
1270         case 3:
1271           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
1272           break;
1273         case 4:
1274           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
1275           break;
1276         case 5:
1277           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr);
1278           break;
1279        default:
1280 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1281       }
1282       cnt += sizes[i]*sizes[i];
1283       row += sizes[i];
1284     }
1285     a->inode.ibdiagvalid = PETSC_TRUE;
1286   }
1287   ibdiag = a->inode.ibdiag;
1288   bdiag  = a->inode.bdiag;
1289 
1290 
1291   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1292   if (flag & SOR_ZERO_INITIAL_GUESS) {
1293     PetscScalar *b;
1294     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1295     if (xx != bb) {
1296       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1297     } else {
1298       b = x;
1299     }
1300     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1301 
1302       for (i=0, row=0; i<m; i++) {
1303         sz  = diag[row] - ii[row];
1304         v1  = a->a + ii[row];
1305         idx = a->j + ii[row];
1306 
1307         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1308         switch (sizes[i]){
1309           case 1:
1310 
1311             sum1  = b[row];
1312             for(n = 0; n<sz-1; n+=2) {
1313               i1   = idx[0];
1314               i2   = idx[1];
1315               idx += 2;
1316               tmp0 = x[i1];
1317               tmp1 = x[i2];
1318               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1319             }
1320 
1321             if (n == sz-1){
1322               tmp0  = x[*idx];
1323               sum1 -= *v1 * tmp0;
1324             }
1325             x[row++] = sum1*(*ibdiag++);
1326             break;
1327           case 2:
1328             v2    = a->a + ii[row+1];
1329             sum1  = b[row];
1330             sum2  = b[row+1];
1331             for(n = 0; n<sz-1; n+=2) {
1332               i1   = idx[0];
1333               i2   = idx[1];
1334               idx += 2;
1335               tmp0 = x[i1];
1336               tmp1 = x[i2];
1337               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1338               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1339             }
1340 
1341             if (n == sz-1){
1342               tmp0  = x[*idx];
1343               sum1 -= v1[0] * tmp0;
1344               sum2 -= v2[0] * tmp0;
1345             }
1346             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1347             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1348             ibdiag  += 4;
1349             break;
1350           case 3:
1351             v2    = a->a + ii[row+1];
1352             v3    = a->a + ii[row+2];
1353             sum1  = b[row];
1354             sum2  = b[row+1];
1355             sum3  = b[row+2];
1356             for(n = 0; n<sz-1; n+=2) {
1357               i1   = idx[0];
1358               i2   = idx[1];
1359               idx += 2;
1360               tmp0 = x[i1];
1361               tmp1 = x[i2];
1362               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1363               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1364               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1365             }
1366 
1367             if (n == sz-1){
1368               tmp0  = x[*idx];
1369               sum1 -= v1[0] * tmp0;
1370               sum2 -= v2[0] * tmp0;
1371               sum3 -= v3[0] * tmp0;
1372             }
1373             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1374             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1375             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1376             ibdiag  += 9;
1377             break;
1378           case 4:
1379             v2    = a->a + ii[row+1];
1380             v3    = a->a + ii[row+2];
1381             v4    = a->a + ii[row+3];
1382             sum1  = b[row];
1383             sum2  = b[row+1];
1384             sum3  = b[row+2];
1385             sum4  = b[row+3];
1386             for(n = 0; n<sz-1; n+=2) {
1387               i1   = idx[0];
1388               i2   = idx[1];
1389               idx += 2;
1390               tmp0 = x[i1];
1391               tmp1 = x[i2];
1392               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1393               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1394               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1395               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1396             }
1397 
1398             if (n == sz-1){
1399               tmp0  = x[*idx];
1400               sum1 -= v1[0] * tmp0;
1401               sum2 -= v2[0] * tmp0;
1402               sum3 -= v3[0] * tmp0;
1403               sum4 -= v4[0] * tmp0;
1404             }
1405             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1406             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1407             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1408             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1409             ibdiag  += 16;
1410             break;
1411           case 5:
1412             v2    = a->a + ii[row+1];
1413             v3    = a->a + ii[row+2];
1414             v4    = a->a + ii[row+3];
1415             v5    = a->a + ii[row+4];
1416             sum1  = b[row];
1417             sum2  = b[row+1];
1418             sum3  = b[row+2];
1419             sum4  = b[row+3];
1420             sum5  = b[row+4];
1421             for(n = 0; n<sz-1; n+=2) {
1422               i1   = idx[0];
1423               i2   = idx[1];
1424               idx += 2;
1425               tmp0 = x[i1];
1426               tmp1 = x[i2];
1427               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1428               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1429               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1430               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1431               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1432             }
1433 
1434             if (n == sz-1){
1435               tmp0  = x[*idx];
1436               sum1 -= v1[0] * tmp0;
1437               sum2 -= v2[0] * tmp0;
1438               sum3 -= v3[0] * tmp0;
1439               sum4 -= v4[0] * tmp0;
1440               sum5 -= v5[0] * tmp0;
1441             }
1442             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1443             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1444             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1445             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1446             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1447             ibdiag  += 25;
1448             break;
1449 	  default:
1450    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1451         }
1452       }
1453 
1454       xb = x;
1455       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1456     } else xb = b;
1457     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1458         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1459       cnt = 0;
1460       for (i=0, row=0; i<m; i++) {
1461 
1462         switch (sizes[i]){
1463           case 1:
1464             x[row++] *= bdiag[cnt++];
1465             break;
1466           case 2:
1467             x1   = x[row]; x2 = x[row+1];
1468             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1469             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1470             x[row++] = tmp1;
1471             x[row++] = tmp2;
1472             cnt += 4;
1473             break;
1474           case 3:
1475             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1476             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1477             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1478             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1479             x[row++] = tmp1;
1480             x[row++] = tmp2;
1481             x[row++] = tmp3;
1482             cnt += 9;
1483             break;
1484           case 4:
1485             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1486             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1487             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1488             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1489             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1490             x[row++] = tmp1;
1491             x[row++] = tmp2;
1492             x[row++] = tmp3;
1493             x[row++] = tmp4;
1494             cnt += 16;
1495             break;
1496           case 5:
1497             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1498             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1499             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1500             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1501             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1502             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1503             x[row++] = tmp1;
1504             x[row++] = tmp2;
1505             x[row++] = tmp3;
1506             x[row++] = tmp4;
1507             x[row++] = tmp5;
1508             cnt += 25;
1509             break;
1510 	  default:
1511    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1512         }
1513       }
1514       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1515     }
1516     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1517 
1518       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1519       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1520         ibdiag -= sizes[i]*sizes[i];
1521         sz      = ii[row+1] - diag[row] - 1;
1522         v1      = a->a + diag[row] + 1;
1523         idx     = a->j + diag[row] + 1;
1524 
1525         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1526         switch (sizes[i]){
1527           case 1:
1528 
1529             sum1  = xb[row];
1530             for(n = 0; n<sz-1; n+=2) {
1531               i1   = idx[0];
1532               i2   = idx[1];
1533               idx += 2;
1534               tmp0 = x[i1];
1535               tmp1 = x[i2];
1536               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1537             }
1538 
1539             if (n == sz-1){
1540               tmp0  = x[*idx];
1541               sum1 -= *v1*tmp0;
1542             }
1543             x[row--] = sum1*(*ibdiag);
1544             break;
1545 
1546           case 2:
1547 
1548             sum1  = xb[row];
1549             sum2  = xb[row-1];
1550             /* note that sum1 is associated with the second of the two rows */
1551             v2    = a->a + diag[row-1] + 2;
1552             for(n = 0; n<sz-1; n+=2) {
1553               i1   = idx[0];
1554               i2   = idx[1];
1555               idx += 2;
1556               tmp0 = x[i1];
1557               tmp1 = x[i2];
1558               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1559               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1560             }
1561 
1562             if (n == sz-1){
1563               tmp0  = x[*idx];
1564               sum1 -= *v1*tmp0;
1565               sum2 -= *v2*tmp0;
1566             }
1567             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1568             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1569             break;
1570           case 3:
1571 
1572             sum1  = xb[row];
1573             sum2  = xb[row-1];
1574             sum3  = xb[row-2];
1575             v2    = a->a + diag[row-1] + 2;
1576             v3    = a->a + diag[row-2] + 3;
1577             for(n = 0; n<sz-1; n+=2) {
1578               i1   = idx[0];
1579               i2   = idx[1];
1580               idx += 2;
1581               tmp0 = x[i1];
1582               tmp1 = x[i2];
1583               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1584               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1585               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1586             }
1587 
1588             if (n == sz-1){
1589               tmp0  = x[*idx];
1590               sum1 -= *v1*tmp0;
1591               sum2 -= *v2*tmp0;
1592               sum3 -= *v3*tmp0;
1593             }
1594             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
1595             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
1596             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
1597             break;
1598           case 4:
1599 
1600             sum1  = xb[row];
1601             sum2  = xb[row-1];
1602             sum3  = xb[row-2];
1603             sum4  = xb[row-3];
1604             v2    = a->a + diag[row-1] + 2;
1605             v3    = a->a + diag[row-2] + 3;
1606             v4    = a->a + diag[row-3] + 4;
1607             for(n = 0; n<sz-1; n+=2) {
1608               i1   = idx[0];
1609               i2   = idx[1];
1610               idx += 2;
1611               tmp0 = x[i1];
1612               tmp1 = x[i2];
1613               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1614               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1615               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1616               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1617             }
1618 
1619             if (n == sz-1){
1620               tmp0  = x[*idx];
1621               sum1 -= *v1*tmp0;
1622               sum2 -= *v2*tmp0;
1623               sum3 -= *v3*tmp0;
1624               sum4 -= *v4*tmp0;
1625             }
1626             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
1627             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
1628             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
1629             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
1630             break;
1631           case 5:
1632 
1633             sum1  = xb[row];
1634             sum2  = xb[row-1];
1635             sum3  = xb[row-2];
1636             sum4  = xb[row-3];
1637             sum5  = xb[row-4];
1638             v2    = a->a + diag[row-1] + 2;
1639             v3    = a->a + diag[row-2] + 3;
1640             v4    = a->a + diag[row-3] + 4;
1641             v5    = a->a + diag[row-4] + 5;
1642             for(n = 0; n<sz-1; n+=2) {
1643               i1   = idx[0];
1644               i2   = idx[1];
1645               idx += 2;
1646               tmp0 = x[i1];
1647               tmp1 = x[i2];
1648               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1649               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1650               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1651               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1652               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1653             }
1654 
1655             if (n == sz-1){
1656               tmp0  = x[*idx];
1657               sum1 -= *v1*tmp0;
1658               sum2 -= *v2*tmp0;
1659               sum3 -= *v3*tmp0;
1660               sum4 -= *v4*tmp0;
1661               sum5 -= *v5*tmp0;
1662             }
1663             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
1664             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
1665             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
1666             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
1667             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
1668             break;
1669 	  default:
1670    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1671         }
1672       }
1673 
1674       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1675     }
1676     its--;
1677     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1678     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
1679   }
1680   if (flag & SOR_EISENSTAT) {
1681     const PetscScalar *b;
1682     MatScalar         *t = a->inode.ssor_work;
1683 
1684     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1685     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1686     /*
1687           Apply  (U + D)^-1  where D is now the block diagonal
1688     */
1689     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1690     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1691       ibdiag -= sizes[i]*sizes[i];
1692       sz      = ii[row+1] - diag[row] - 1;
1693       v1      = a->a + diag[row] + 1;
1694       idx     = a->j + diag[row] + 1;
1695       CHKMEMQ;
1696       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1697       switch (sizes[i]){
1698         case 1:
1699 
1700 	  sum1  = b[row];
1701 	  for(n = 0; n<sz-1; n+=2) {
1702 	    i1   = idx[0];
1703 	    i2   = idx[1];
1704 	    idx += 2;
1705 	    tmp0 = x[i1];
1706 	    tmp1 = x[i2];
1707 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1708 	  }
1709 
1710 	  if (n == sz-1){
1711 	    tmp0  = x[*idx];
1712 	    sum1 -= *v1*tmp0;
1713 	  }
1714 	  x[row] = sum1*(*ibdiag);row--;
1715 	  break;
1716 
1717         case 2:
1718 
1719 	  sum1  = b[row];
1720 	  sum2  = b[row-1];
1721 	  /* note that sum1 is associated with the second of the two rows */
1722 	  v2    = a->a + diag[row-1] + 2;
1723 	  for(n = 0; n<sz-1; n+=2) {
1724 	    i1   = idx[0];
1725 	    i2   = idx[1];
1726 	    idx += 2;
1727 	    tmp0 = x[i1];
1728 	    tmp1 = x[i2];
1729 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1730 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1731 	  }
1732 
1733 	  if (n == sz-1){
1734 	    tmp0  = x[*idx];
1735 	    sum1 -= *v1*tmp0;
1736 	    sum2 -= *v2*tmp0;
1737 	  }
1738 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
1739 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
1740           row -= 2;
1741 	  break;
1742         case 3:
1743 
1744 	  sum1  = b[row];
1745 	  sum2  = b[row-1];
1746 	  sum3  = b[row-2];
1747 	  v2    = a->a + diag[row-1] + 2;
1748 	  v3    = a->a + diag[row-2] + 3;
1749 	  for(n = 0; n<sz-1; n+=2) {
1750 	    i1   = idx[0];
1751 	    i2   = idx[1];
1752 	    idx += 2;
1753 	    tmp0 = x[i1];
1754 	    tmp1 = x[i2];
1755 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1756 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1757 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1758 	  }
1759 
1760 	  if (n == sz-1){
1761 	    tmp0  = x[*idx];
1762 	    sum1 -= *v1*tmp0;
1763 	    sum2 -= *v2*tmp0;
1764 	    sum3 -= *v3*tmp0;
1765 	  }
1766 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
1767 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
1768 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
1769           row -= 3;
1770 	  break;
1771         case 4:
1772 
1773 	  sum1  = b[row];
1774 	  sum2  = b[row-1];
1775 	  sum3  = b[row-2];
1776 	  sum4  = b[row-3];
1777 	  v2    = a->a + diag[row-1] + 2;
1778 	  v3    = a->a + diag[row-2] + 3;
1779 	  v4    = a->a + diag[row-3] + 4;
1780 	  for(n = 0; n<sz-1; n+=2) {
1781 	    i1   = idx[0];
1782 	    i2   = idx[1];
1783 	    idx += 2;
1784 	    tmp0 = x[i1];
1785 	    tmp1 = x[i2];
1786 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1787 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1788 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1789 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1790 	  }
1791 
1792 	  if (n == sz-1){
1793 	    tmp0  = x[*idx];
1794 	    sum1 -= *v1*tmp0;
1795 	    sum2 -= *v2*tmp0;
1796 	    sum3 -= *v3*tmp0;
1797 	    sum4 -= *v4*tmp0;
1798 	  }
1799 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
1800 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
1801 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
1802 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
1803           row -= 4;
1804 	  break;
1805         case 5:
1806 
1807 	  sum1  = b[row];
1808 	  sum2  = b[row-1];
1809 	  sum3  = b[row-2];
1810 	  sum4  = b[row-3];
1811 	  sum5  = b[row-4];
1812 	  v2    = a->a + diag[row-1] + 2;
1813 	  v3    = a->a + diag[row-2] + 3;
1814 	  v4    = a->a + diag[row-3] + 4;
1815 	  v5    = a->a + diag[row-4] + 5;
1816 	  for(n = 0; n<sz-1; n+=2) {
1817 	    i1   = idx[0];
1818 	    i2   = idx[1];
1819 	    idx += 2;
1820 	    tmp0 = x[i1];
1821 	    tmp1 = x[i2];
1822 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1823 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1824 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1825 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1826 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1827 	  }
1828 
1829 	  if (n == sz-1){
1830 	    tmp0  = x[*idx];
1831 	    sum1 -= *v1*tmp0;
1832 	    sum2 -= *v2*tmp0;
1833 	    sum3 -= *v3*tmp0;
1834 	    sum4 -= *v4*tmp0;
1835 	    sum5 -= *v5*tmp0;
1836 	  }
1837 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
1838 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
1839 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
1840 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
1841 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
1842           row -= 5;
1843 	  break;
1844         default:
1845 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1846       }
1847       CHKMEMQ;
1848     }
1849     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1850 
1851     /*
1852            t = b - D x    where D is the block diagonal
1853     */
1854     cnt = 0;
1855     for (i=0, row=0; i<m; i++) {
1856       CHKMEMQ;
1857       switch (sizes[i]){
1858         case 1:
1859 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
1860 	  break;
1861         case 2:
1862 	  x1   = x[row]; x2 = x[row+1];
1863 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1864 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1865 	  t[row]   = b[row] - tmp1;
1866 	  t[row+1] = b[row+1] - tmp2; row += 2;
1867 	  cnt += 4;
1868 	  break;
1869         case 3:
1870 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1871 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1872 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1873 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1874 	  t[row] = b[row] - tmp1;
1875 	  t[row+1] = b[row+1] - tmp2;
1876 	  t[row+2] = b[row+2] - tmp3; row += 3;
1877 	  cnt += 9;
1878 	  break;
1879         case 4:
1880 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1881 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1882 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1883 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1884 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1885 	  t[row] = b[row] - tmp1;
1886 	  t[row+1] = b[row+1] - tmp2;
1887 	  t[row+2] = b[row+2] - tmp3;
1888 	  t[row+3] = b[row+3] - tmp4; row += 4;
1889 	  cnt += 16;
1890 	  break;
1891         case 5:
1892 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1893 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1894 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1895 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1896 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1897 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1898 	  t[row] = b[row] - tmp1;
1899 	  t[row+1] = b[row+1] - tmp2;
1900 	  t[row+2] = b[row+2] - tmp3;
1901 	  t[row+3] = b[row+3] - tmp4;
1902 	  t[row+4] = b[row+4] - tmp5;row += 5;
1903 	  cnt += 25;
1904 	  break;
1905         default:
1906 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1907       }
1908       CHKMEMQ;
1909     }
1910     ierr = PetscLogFlops(m);CHKERRQ(ierr);
1911 
1912 
1913 
1914     /*
1915           Apply (L + D)^-1 where D is the block diagonal
1916     */
1917     for (i=0, row=0; i<m; i++) {
1918       sz  = diag[row] - ii[row];
1919       v1  = a->a + ii[row];
1920       idx = a->j + ii[row];
1921       CHKMEMQ;
1922       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1923       switch (sizes[i]){
1924         case 1:
1925 
1926 	  sum1  = t[row];
1927 	  for(n = 0; n<sz-1; n+=2) {
1928 	    i1   = idx[0];
1929 	    i2   = idx[1];
1930 	    idx += 2;
1931 	    tmp0 = t[i1];
1932 	    tmp1 = t[i2];
1933 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1934 	  }
1935 
1936 	  if (n == sz-1){
1937 	    tmp0  = t[*idx];
1938 	    sum1 -= *v1 * tmp0;
1939 	  }
1940 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
1941 	  break;
1942         case 2:
1943 	  v2    = a->a + ii[row+1];
1944 	  sum1  = t[row];
1945 	  sum2  = t[row+1];
1946 	  for(n = 0; n<sz-1; n+=2) {
1947 	    i1   = idx[0];
1948 	    i2   = idx[1];
1949 	    idx += 2;
1950 	    tmp0 = t[i1];
1951 	    tmp1 = t[i2];
1952 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1953 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1954 	  }
1955 
1956 	  if (n == sz-1){
1957 	    tmp0  = t[*idx];
1958 	    sum1 -= v1[0] * tmp0;
1959 	    sum2 -= v2[0] * tmp0;
1960 	  }
1961 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
1962 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
1963 	  ibdiag  += 4; row += 2;
1964 	  break;
1965         case 3:
1966 	  v2    = a->a + ii[row+1];
1967 	  v3    = a->a + ii[row+2];
1968 	  sum1  = t[row];
1969 	  sum2  = t[row+1];
1970 	  sum3  = t[row+2];
1971 	  for(n = 0; n<sz-1; n+=2) {
1972 	    i1   = idx[0];
1973 	    i2   = idx[1];
1974 	    idx += 2;
1975 	    tmp0 = t[i1];
1976 	    tmp1 = t[i2];
1977 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1978 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1979 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1980 	  }
1981 
1982 	  if (n == sz-1){
1983 	    tmp0  = t[*idx];
1984 	    sum1 -= v1[0] * tmp0;
1985 	    sum2 -= v2[0] * tmp0;
1986 	    sum3 -= v3[0] * tmp0;
1987 	  }
1988 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1989 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1990 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1991 	  ibdiag  += 9; row += 3;
1992 	  break;
1993         case 4:
1994 	  v2    = a->a + ii[row+1];
1995 	  v3    = a->a + ii[row+2];
1996 	  v4    = a->a + ii[row+3];
1997 	  sum1  = t[row];
1998 	  sum2  = t[row+1];
1999 	  sum3  = t[row+2];
2000 	  sum4  = t[row+3];
2001 	  for(n = 0; n<sz-1; n+=2) {
2002 	    i1   = idx[0];
2003 	    i2   = idx[1];
2004 	    idx += 2;
2005 	    tmp0 = t[i1];
2006 	    tmp1 = t[i2];
2007 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2008 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2009 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2010 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2011 	  }
2012 
2013 	  if (n == sz-1){
2014 	    tmp0  = t[*idx];
2015 	    sum1 -= v1[0] * tmp0;
2016 	    sum2 -= v2[0] * tmp0;
2017 	    sum3 -= v3[0] * tmp0;
2018 	    sum4 -= v4[0] * tmp0;
2019 	  }
2020 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2021 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2022 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2023 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2024 	  ibdiag  += 16; row += 4;
2025 	  break;
2026         case 5:
2027 	  v2    = a->a + ii[row+1];
2028 	  v3    = a->a + ii[row+2];
2029 	  v4    = a->a + ii[row+3];
2030 	  v5    = a->a + ii[row+4];
2031 	  sum1  = t[row];
2032 	  sum2  = t[row+1];
2033 	  sum3  = t[row+2];
2034 	  sum4  = t[row+3];
2035 	  sum5  = t[row+4];
2036 	  for(n = 0; n<sz-1; n+=2) {
2037 	    i1   = idx[0];
2038 	    i2   = idx[1];
2039 	    idx += 2;
2040 	    tmp0 = t[i1];
2041 	    tmp1 = t[i2];
2042 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2043 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2044 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2045 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2046 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2047 	  }
2048 
2049 	  if (n == sz-1){
2050 	    tmp0  = t[*idx];
2051 	    sum1 -= v1[0] * tmp0;
2052 	    sum2 -= v2[0] * tmp0;
2053 	    sum3 -= v3[0] * tmp0;
2054 	    sum4 -= v4[0] * tmp0;
2055 	    sum5 -= v5[0] * tmp0;
2056 	  }
2057 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2058 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2059 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2060 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2061 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2062 	  ibdiag  += 25; row += 5;
2063 	  break;
2064         default:
2065 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2066       }
2067       CHKMEMQ;
2068     }
2069     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2070     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2071     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2072   }
2073   PetscFunctionReturn(0);
2074 }
2075 
2076 #undef __FUNCT__
2077 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
2078 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2079 {
2080   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2081   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
2082   const MatScalar    *bdiag = a->inode.bdiag;
2083   const PetscScalar  *b;
2084   PetscErrorCode      ierr;
2085   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
2086   const PetscInt      *sizes = a->inode.size;
2087 
2088   PetscFunctionBegin;
2089   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2090   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2091   cnt = 0;
2092   for (i=0, row=0; i<m; i++) {
2093     switch (sizes[i]){
2094       case 1:
2095 	x[row] = b[row]*bdiag[cnt++];row++;
2096 	break;
2097       case 2:
2098 	x1   = b[row]; x2 = b[row+1];
2099 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2100 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2101 	x[row++] = tmp1;
2102 	x[row++] = tmp2;
2103 	cnt += 4;
2104 	break;
2105       case 3:
2106 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
2107 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2108 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2109 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2110 	x[row++] = tmp1;
2111 	x[row++] = tmp2;
2112 	x[row++] = tmp3;
2113 	cnt += 9;
2114 	break;
2115       case 4:
2116 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
2117 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2118 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2119 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2120 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2121 	x[row++] = tmp1;
2122 	x[row++] = tmp2;
2123 	x[row++] = tmp3;
2124 	x[row++] = tmp4;
2125 	cnt += 16;
2126 	break;
2127       case 5:
2128 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
2129 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2130 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2131 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2132 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2133 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2134 	x[row++] = tmp1;
2135 	x[row++] = tmp2;
2136 	x[row++] = tmp3;
2137 	x[row++] = tmp4;
2138 	x[row++] = tmp5;
2139 	cnt += 25;
2140 	break;
2141       default:
2142 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2143     }
2144   }
2145   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
2146   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2147   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2148   PetscFunctionReturn(0);
2149 }
2150 
2151 /*
2152     samestructure indicates that the matrix has not changed its nonzero structure so we
2153     do not need to recompute the inodes
2154 */
2155 #undef __FUNCT__
2156 #define __FUNCT__ "Mat_CheckInode"
2157 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2158 {
2159   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2160   PetscErrorCode ierr;
2161   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2162   PetscTruth     flag;
2163 
2164   PetscFunctionBegin;
2165   if (!a->inode.use)                     PetscFunctionReturn(0);
2166   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2167 
2168 
2169   m = A->rmap->n;
2170   if (a->inode.size) {ns = a->inode.size;}
2171   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2172 
2173   i          = 0;
2174   node_count = 0;
2175   idx        = a->j;
2176   ii         = a->i;
2177   while (i < m){                /* For each row */
2178     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2179     /* Limits the number of elements in a node to 'a->inode.limit' */
2180     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2181       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2182       if (nzy != nzx) break;
2183       idy  += nzx;             /* Same nonzero pattern */
2184       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2185       if (!flag) break;
2186     }
2187     ns[node_count++] = blk_size;
2188     idx += blk_size*nzx;
2189     i    = j;
2190   }
2191   /* If not enough inodes found,, do not use inode version of the routines */
2192   if (!a->inode.size && m && node_count > .9*m) {
2193     ierr = PetscFree(ns);CHKERRQ(ierr);
2194     a->inode.node_count     = 0;
2195     a->inode.size           = PETSC_NULL;
2196     a->inode.use            = PETSC_FALSE;
2197     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2198   } else {
2199     A->ops->mult              = MatMult_SeqAIJ_Inode;
2200     A->ops->sor               = MatSOR_SeqAIJ_Inode;
2201     A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
2202     A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
2203     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
2204     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
2205     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
2206     A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
2207     A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
2208     a->inode.node_count       = node_count;
2209     a->inode.size             = ns;
2210     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2211   }
2212   PetscFunctionReturn(0);
2213 }
2214 
2215 /*
2216      This is really ugly. if inodes are used this replaces the
2217   permutations with ones that correspond to rows/cols of the matrix
2218   rather then inode blocks
2219 */
2220 #undef __FUNCT__
2221 #define __FUNCT__ "MatInodeAdjustForInodes"
2222 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2223 {
2224   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2225 
2226   PetscFunctionBegin;
2227   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2228   if (f) {
2229     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2230   }
2231   PetscFunctionReturn(0);
2232 }
2233 
2234 EXTERN_C_BEGIN
2235 #undef __FUNCT__
2236 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
2237 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
2238 {
2239   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2240   PetscErrorCode ierr;
2241   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
2242   const PetscInt *ridx,*cidx;
2243   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2244   PetscInt       nslim_col,*ns_col;
2245   IS             ris = *rperm,cis = *cperm;
2246 
2247   PetscFunctionBegin;
2248   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2249   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2250 
2251   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2252   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2253   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2254   permc = permr + m;
2255 
2256   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2257   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2258 
2259   /* Form the inode structure for the rows of permuted matric using inv perm*/
2260   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2261 
2262   /* Construct the permutations for rows*/
2263   for (i=0,row = 0; i<nslim_row; ++i){
2264     indx      = ridx[i];
2265     start_val = tns[indx];
2266     end_val   = tns[indx + 1];
2267     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2268   }
2269 
2270   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2271   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2272 
2273  /* Construct permutations for columns */
2274   for (i=0,col=0; i<nslim_col; ++i){
2275     indx      = cidx[i];
2276     start_val = tns[indx];
2277     end_val   = tns[indx + 1];
2278     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2279   }
2280 
2281   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2282   ISSetPermutation(*rperm);
2283   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2284   ISSetPermutation(*cperm);
2285 
2286   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2287   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2288 
2289   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2290   ierr = PetscFree(permr);CHKERRQ(ierr);
2291   ierr = ISDestroy(cis);CHKERRQ(ierr);
2292   ierr = ISDestroy(ris);CHKERRQ(ierr);
2293   ierr = PetscFree(tns);CHKERRQ(ierr);
2294   PetscFunctionReturn(0);
2295 }
2296 EXTERN_C_END
2297 
2298 #undef __FUNCT__
2299 #define __FUNCT__ "MatInodeGetInodeSizes"
2300 /*@C
2301    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2302 
2303    Collective on Mat
2304 
2305    Input Parameter:
2306 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2307 
2308    Output Parameter:
2309 +  node_count - no of inodes present in the matrix.
2310 .  sizes      - an array of size node_count,with sizes of each inode.
2311 -  limit      - the max size used to generate the inodes.
2312 
2313    Level: advanced
2314 
2315    Notes: This routine returns some internal storage information
2316    of the matrix, it is intended to be used by advanced users.
2317    It should be called after the matrix is assembled.
2318    The contents of the sizes[] array should not be changed.
2319    PETSC_NULL may be passed for information not requested.
2320 
2321 .keywords: matrix, seqaij, get, inode
2322 
2323 .seealso: MatGetInfo()
2324 @*/
2325 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2326 {
2327   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2328 
2329   PetscFunctionBegin;
2330   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2331   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2332   if (f) {
2333     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2334   }
2335   PetscFunctionReturn(0);
2336 }
2337 
2338 EXTERN_C_BEGIN
2339 #undef __FUNCT__
2340 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
2341 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2342 {
2343   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2344 
2345   PetscFunctionBegin;
2346   if (node_count) *node_count = a->inode.node_count;
2347   if (sizes)      *sizes      = a->inode.size;
2348   if (limit)      *limit      = a->inode.limit;
2349   PetscFunctionReturn(0);
2350 }
2351 EXTERN_C_END
2352