xref: /petsc/src/mat/impls/aij/seq/inode.c (revision d2d6cebe7c034aec5d7d675a271af3ae6a2becaf)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 /* ----------------------------------------------------------- */
1182 #undef __FUNCT__
1183 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
1184 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
1185 {
1186   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
1187   IS                iscol = a->col,isrow = a->row;
1188   PetscErrorCode    ierr;
1189   const PetscInt    *r,*c,*rout,*cout;
1190   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
1191   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
1192   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
1193   PetscScalar       sum1,sum2,sum3,sum4,sum5;
1194   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
1195   const PetscScalar *b;
1196 
1197   PetscFunctionBegin;
1198   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
1199   node_max = a->inode.node_count;
1200   ns       = a->inode.size;     /* Node Size array */
1201 
1202   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1203   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1204   tmp  = a->solve_work;
1205 
1206   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1207   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1208 
1209   /* forward solve the lower triangular */
1210   tmps = tmp ;
1211   aa   = a_a ;
1212   aj   = a_j ;
1213   ad   = a->diag;
1214 
1215   for (i = 0,row = 0; i< node_max; ++i){
1216     nsz = ns[i];
1217     aii = ai[row];
1218     v1  = aa + aii;
1219     vi  = aj + aii;
1220     nz  = ai[row+1]- ai[row];
1221 
1222     if (i < node_max-1) {
1223       /* Prefetch the indices for the next block */
1224       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
1225       /* Prefetch the data for the next block */
1226       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
1227     }
1228 
1229     switch (nsz){               /* Each loop in 'case' is unrolled */
1230     case 1 :
1231       sum1 = b[r[row]];
1232       for(j=0; j<nz-1; j+=2){
1233         i0   = vi[j];
1234         i1   = vi[j+1];
1235         tmp0 = tmps[i0];
1236         tmp1 = tmps[i1];
1237         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
1238       }
1239       if(j == nz-1){
1240         tmp0 = tmps[vi[j]];
1241         sum1 -= v1[j]*tmp0;
1242       }
1243       tmp[row++]=sum1;
1244       break;
1245     case 2:
1246       sum1 = b[r[row]];
1247       sum2 = b[r[row+1]];
1248       v2   = aa + ai[row+1];
1249 
1250       for(j=0; j<nz-1; j+=2){
1251         i0   = vi[j];
1252         i1   = vi[j+1];
1253         tmp0 = tmps[i0];
1254         tmp1 = tmps[i1];
1255         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1256         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1257       }
1258       if(j == nz-1){
1259         tmp0 = tmps[vi[j]];
1260         sum1 -= v1[j] *tmp0;
1261         sum2 -= v2[j] *tmp0;
1262       }
1263       sum2 -= v2[nz] * sum1;
1264       tmp[row ++]=sum1;
1265       tmp[row ++]=sum2;
1266       break;
1267     case 3:
1268       sum1 = b[r[row]];
1269       sum2 = b[r[row+1]];
1270       sum3 = b[r[row+2]];
1271       v2   = aa + ai[row+1];
1272       v3   = aa + ai[row+2];
1273 
1274       for (j=0; j<nz-1; j+=2){
1275         i0   = vi[j];
1276         i1   = vi[j+1];
1277         tmp0 = tmps[i0];
1278         tmp1 = tmps[i1];
1279         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1280         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1281         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1282       }
1283       if (j == nz-1){
1284         tmp0 = tmps[vi[j]];
1285         sum1 -= v1[j] *tmp0;
1286         sum2 -= v2[j] *tmp0;
1287         sum3 -= v3[j] *tmp0;
1288       }
1289       sum2 -= v2[nz] * sum1;
1290       sum3 -= v3[nz] * sum1;
1291       sum3 -= v3[nz+1] * sum2;
1292       tmp[row ++]=sum1;
1293       tmp[row ++]=sum2;
1294       tmp[row ++]=sum3;
1295       break;
1296 
1297     case 4:
1298       sum1 = b[r[row]];
1299       sum2 = b[r[row+1]];
1300       sum3 = b[r[row+2]];
1301       sum4 = b[r[row+3]];
1302       v2   = aa + ai[row+1];
1303       v3   = aa + ai[row+2];
1304       v4   = aa + ai[row+3];
1305 
1306       for (j=0; j<nz-1; j+=2){
1307         i0   = vi[j];
1308         i1   = vi[j+1];
1309         tmp0 = tmps[i0];
1310         tmp1 = tmps[i1];
1311         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1312         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1313         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1314         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
1315       }
1316       if (j == nz-1){
1317         tmp0 = tmps[vi[j]];
1318         sum1 -= v1[j] *tmp0;
1319         sum2 -= v2[j] *tmp0;
1320         sum3 -= v3[j] *tmp0;
1321         sum4 -= v4[j] *tmp0;
1322       }
1323       sum2 -= v2[nz] * sum1;
1324       sum3 -= v3[nz] * sum1;
1325       sum4 -= v4[nz] * sum1;
1326       sum3 -= v3[nz+1] * sum2;
1327       sum4 -= v4[nz+1] * sum2;
1328       sum4 -= v4[nz+2] * sum3;
1329 
1330       tmp[row ++]=sum1;
1331       tmp[row ++]=sum2;
1332       tmp[row ++]=sum3;
1333       tmp[row ++]=sum4;
1334       break;
1335     case 5:
1336       sum1 = b[r[row]];
1337       sum2 = b[r[row+1]];
1338       sum3 = b[r[row+2]];
1339       sum4 = b[r[row+3]];
1340       sum5 = b[r[row+4]];
1341       v2   = aa + ai[row+1];
1342       v3   = aa + ai[row+2];
1343       v4   = aa + ai[row+3];
1344       v5   = aa + ai[row+4];
1345 
1346       for (j=0; j<nz-1; j+=2){
1347         i0   = vi[j];
1348         i1   = vi[j+1];
1349         tmp0 = tmps[i0];
1350         tmp1 = tmps[i1];
1351         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1352         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
1353         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
1354         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
1355         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
1356       }
1357       if (j == nz-1){
1358         tmp0 = tmps[vi[j]];
1359         sum1 -= v1[j] *tmp0;
1360         sum2 -= v2[j] *tmp0;
1361         sum3 -= v3[j] *tmp0;
1362         sum4 -= v4[j] *tmp0;
1363         sum5 -= v5[j] *tmp0;
1364       }
1365 
1366       sum2 -= v2[nz] * sum1;
1367       sum3 -= v3[nz] * sum1;
1368       sum4 -= v4[nz] * sum1;
1369       sum5 -= v5[nz] * sum1;
1370       sum3 -= v3[nz+1] * sum2;
1371       sum4 -= v4[nz+1] * sum2;
1372       sum5 -= v5[nz+1] * sum2;
1373       sum4 -= v4[nz+2] * sum3;
1374       sum5 -= v5[nz+2] * sum3;
1375       sum5 -= v5[nz+3] * sum4;
1376 
1377       tmp[row ++]=sum1;
1378       tmp[row ++]=sum2;
1379       tmp[row ++]=sum3;
1380       tmp[row ++]=sum4;
1381       tmp[row ++]=sum5;
1382       break;
1383     default:
1384       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1385     }
1386   }
1387   /* backward solve the upper triangular */
1388   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
1389     nsz = ns[i];
1390     aii = ad[row+1] + 1;
1391     v1  = aa + aii;
1392     vi  = aj + aii;
1393     nz  = ad[row]- ad[row+1] - 1;
1394 
1395     if (i > 0) {
1396       /* Prefetch the indices for the next block */
1397       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
1398       /* Prefetch the data for the next block */
1399       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
1400     }
1401 
1402     switch (nsz){               /* Each loop in 'case' is unrolled */
1403     case 1 :
1404       sum1 = tmp[row];
1405 
1406       for(j=0 ; j<nz-1; j+=2){
1407         i0   = vi[j];
1408         i1   = vi[j+1];
1409         tmp0 = tmps[i0];
1410         tmp1 = tmps[i1];
1411         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1412       }
1413       if (j == nz-1){
1414         tmp0  = tmps[vi[j]];
1415         sum1 -= v1[j]*tmp0;
1416       }
1417       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1418       break;
1419     case 2 :
1420       sum1 = tmp[row];
1421       sum2 = tmp[row-1];
1422       v2   = aa + ad[row] + 1;
1423       for (j=0 ; j<nz-1; j+=2){
1424         i0   = vi[j];
1425         i1   = vi[j+1];
1426         tmp0 = tmps[i0];
1427         tmp1 = tmps[i1];
1428         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1429         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1430       }
1431       if (j == nz-1){
1432         tmp0  = tmps[vi[j]];
1433         sum1 -= v1[j]* tmp0;
1434         sum2 -= v2[j+1]* tmp0;
1435       }
1436 
1437       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1438       sum2   -= v2[0] * tmp0;
1439       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1440       break;
1441     case 3 :
1442       sum1 = tmp[row];
1443       sum2 = tmp[row -1];
1444       sum3 = tmp[row -2];
1445       v2   = aa + ad[row] + 1;
1446       v3   = aa + ad[row -1] + 1;
1447       for (j=0 ; j<nz-1; j+=2){
1448         i0   = vi[j];
1449         i1   = vi[j+1];
1450         tmp0 = tmps[i0];
1451         tmp1 = tmps[i1];
1452         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1453         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1454         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1455       }
1456       if (j== nz-1){
1457         tmp0  = tmps[vi[j]];
1458         sum1 -= v1[j] * tmp0;
1459         sum2 -= v2[j+1] * tmp0;
1460         sum3 -= v3[j+2] * tmp0;
1461       }
1462       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1463       sum2   -= v2[0]* tmp0;
1464       sum3   -= v3[1] * tmp0;
1465       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1466       sum3   -= v3[0]* tmp0;
1467       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1468 
1469       break;
1470     case 4 :
1471       sum1 = tmp[row];
1472       sum2 = tmp[row -1];
1473       sum3 = tmp[row -2];
1474       sum4 = tmp[row -3];
1475       v2   = aa + ad[row]+1;
1476       v3   = aa + ad[row -1]+1;
1477       v4   = aa + ad[row -2]+1;
1478 
1479       for (j=0 ; j<nz-1; j+=2){
1480         i0   = vi[j];
1481         i1   = vi[j+1];
1482         tmp0 = tmps[i0];
1483         tmp1 = tmps[i1];
1484         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
1485         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1486         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1487         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
1488       }
1489       if (j== nz-1){
1490         tmp0  = tmps[vi[j]];
1491         sum1 -= v1[j] * tmp0;
1492         sum2 -= v2[j+1] * tmp0;
1493         sum3 -= v3[j+2] * tmp0;
1494         sum4 -= v4[j+3] * tmp0;
1495       }
1496 
1497       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1498       sum2   -= v2[0] * tmp0;
1499       sum3   -= v3[1] * tmp0;
1500       sum4   -= v4[2] * tmp0;
1501       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1502       sum3   -= v3[0] * tmp0;
1503       sum4   -= v4[1] * tmp0;
1504       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1505       sum4   -= v4[0] * tmp0;
1506       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
1507       break;
1508     case 5 :
1509       sum1 = tmp[row];
1510       sum2 = tmp[row -1];
1511       sum3 = tmp[row -2];
1512       sum4 = tmp[row -3];
1513       sum5 = tmp[row -4];
1514       v2   = aa + ad[row]+1;
1515       v3   = aa + ad[row -1]+1;
1516       v4   = aa + ad[row -2]+1;
1517       v5   = aa + ad[row -3]+1;
1518       for (j=0 ; j<nz-1; j+=2){
1519         i0   = vi[j];
1520         i1   = vi[j+1];
1521         tmp0 = tmps[i0];
1522         tmp1 = tmps[i1];
1523         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
1524         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
1525         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
1526         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
1527         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
1528       }
1529       if (j==nz-1){
1530         tmp0  = tmps[vi[j]];
1531         sum1 -= v1[j] * tmp0;
1532         sum2 -= v2[j+1] * tmp0;
1533         sum3 -= v3[j+2] * tmp0;
1534         sum4 -= v4[j+3] * tmp0;
1535         sum5 -= v5[j+4] * tmp0;
1536       }
1537 
1538       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
1539       sum2   -= v2[0] * tmp0;
1540       sum3   -= v3[1] * tmp0;
1541       sum4   -= v4[2] * tmp0;
1542       sum5   -= v5[3] * tmp0;
1543       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
1544       sum3   -= v3[0] * tmp0;
1545       sum4   -= v4[1] * tmp0;
1546       sum5   -= v5[2] * tmp0;
1547       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
1548       sum4   -= v4[0] * tmp0;
1549       sum5   -= v5[1] * tmp0;
1550       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
1551       sum5   -= v5[0] * tmp0;
1552       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
1553       break;
1554     default:
1555       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1556     }
1557   }
1558   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1559   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1560   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1561   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1562   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1563   PetscFunctionReturn(0);
1564 }
1565 
1566 
1567 /*
1568      Makes a longer coloring[] array and calls the usual code with that
1569 */
1570 #undef __FUNCT__
1571 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
1572 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1573 {
1574   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1575   PetscErrorCode  ierr;
1576   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1577   PetscInt        *colorused,i;
1578   ISColoringValue *newcolor;
1579 
1580   PetscFunctionBegin;
1581   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1582   /* loop over inodes, marking a color for each column*/
1583   row = 0;
1584   for (i=0; i<m; i++){
1585     for (j=0; j<ns[i]; j++) {
1586       newcolor[row++] = coloring[i] + j*ncolors;
1587     }
1588   }
1589 
1590   /* eliminate unneeded colors */
1591   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1592   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1593   for (i=0; i<n; i++) {
1594     colorused[newcolor[i]] = 1;
1595   }
1596 
1597   for (i=1; i<5*ncolors; i++) {
1598     colorused[i] += colorused[i-1];
1599   }
1600   ncolors = colorused[5*ncolors-1];
1601   for (i=0; i<n; i++) {
1602     newcolor[i] = colorused[newcolor[i]]-1;
1603   }
1604   ierr = PetscFree(colorused);CHKERRQ(ierr);
1605   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1606   ierr = PetscFree(coloring);CHKERRQ(ierr);
1607   PetscFunctionReturn(0);
1608 }
1609 
1610 #include "../src/mat/blockinvert.h"
1611 
1612 #undef __FUNCT__
1613 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
1614 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1615 {
1616   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1617   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1618   MatScalar          *ibdiag,*bdiag,work[25];
1619   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1620   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
1621   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1622   PetscErrorCode     ierr;
1623   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1624   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
1625 
1626   PetscFunctionBegin;
1627   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1628   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1629   if (its > 1) {
1630     /* switch to non-inode version */
1631     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
1632     PetscFunctionReturn(0);
1633   }
1634 
1635   if (!a->inode.ibdiagvalid) {
1636     if (!a->inode.ibdiag) {
1637       /* calculate space needed for diagonal blocks */
1638       for (i=0; i<m; i++) {
1639 	cnt += sizes[i]*sizes[i];
1640       }
1641       a->inode.bdiagsize = cnt;
1642       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
1643     }
1644 
1645     /* copy over the diagonal blocks and invert them */
1646     ibdiag = a->inode.ibdiag;
1647     bdiag  = a->inode.bdiag;
1648     cnt = 0;
1649     for (i=0, row = 0; i<m; i++) {
1650       for (j=0; j<sizes[i]; j++) {
1651         for (k=0; k<sizes[i]; k++) {
1652           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1653         }
1654       }
1655       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
1656 
1657       switch(sizes[i]) {
1658         case 1:
1659           /* Create matrix data structure */
1660           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1661           ibdiag[cnt] = 1.0/ibdiag[cnt];
1662           break;
1663         case 2:
1664           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
1665           break;
1666         case 3:
1667           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
1668           break;
1669         case 4:
1670           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
1671           break;
1672         case 5:
1673           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
1674           break;
1675        default:
1676 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1677       }
1678       cnt += sizes[i]*sizes[i];
1679       row += sizes[i];
1680     }
1681     a->inode.ibdiagvalid = PETSC_TRUE;
1682   }
1683   ibdiag = a->inode.ibdiag;
1684   bdiag  = a->inode.bdiag;
1685 
1686 
1687   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1688   if (flag & SOR_ZERO_INITIAL_GUESS) {
1689     PetscScalar *b;
1690     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1691     if (xx != bb) {
1692       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1693     } else {
1694       b = x;
1695     }
1696     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1697 
1698       for (i=0, row=0; i<m; i++) {
1699         sz  = diag[row] - ii[row];
1700         v1  = a->a + ii[row];
1701         idx = a->j + ii[row];
1702 
1703         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1704         switch (sizes[i]){
1705           case 1:
1706 
1707             sum1  = b[row];
1708             for(n = 0; n<sz-1; n+=2) {
1709               i1   = idx[0];
1710               i2   = idx[1];
1711               idx += 2;
1712               tmp0 = x[i1];
1713               tmp1 = x[i2];
1714               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1715             }
1716 
1717             if (n == sz-1){
1718               tmp0  = x[*idx];
1719               sum1 -= *v1 * tmp0;
1720             }
1721             x[row++] = sum1*(*ibdiag++);
1722             break;
1723           case 2:
1724             v2    = a->a + ii[row+1];
1725             sum1  = b[row];
1726             sum2  = b[row+1];
1727             for(n = 0; n<sz-1; n+=2) {
1728               i1   = idx[0];
1729               i2   = idx[1];
1730               idx += 2;
1731               tmp0 = x[i1];
1732               tmp1 = x[i2];
1733               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1734               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1735             }
1736 
1737             if (n == sz-1){
1738               tmp0  = x[*idx];
1739               sum1 -= v1[0] * tmp0;
1740               sum2 -= v2[0] * tmp0;
1741             }
1742             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1743             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1744             ibdiag  += 4;
1745             break;
1746           case 3:
1747             v2    = a->a + ii[row+1];
1748             v3    = a->a + ii[row+2];
1749             sum1  = b[row];
1750             sum2  = b[row+1];
1751             sum3  = b[row+2];
1752             for(n = 0; n<sz-1; n+=2) {
1753               i1   = idx[0];
1754               i2   = idx[1];
1755               idx += 2;
1756               tmp0 = x[i1];
1757               tmp1 = x[i2];
1758               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1759               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1760               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1761             }
1762 
1763             if (n == sz-1){
1764               tmp0  = x[*idx];
1765               sum1 -= v1[0] * tmp0;
1766               sum2 -= v2[0] * tmp0;
1767               sum3 -= v3[0] * tmp0;
1768             }
1769             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1770             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1771             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1772             ibdiag  += 9;
1773             break;
1774           case 4:
1775             v2    = a->a + ii[row+1];
1776             v3    = a->a + ii[row+2];
1777             v4    = a->a + ii[row+3];
1778             sum1  = b[row];
1779             sum2  = b[row+1];
1780             sum3  = b[row+2];
1781             sum4  = b[row+3];
1782             for(n = 0; n<sz-1; n+=2) {
1783               i1   = idx[0];
1784               i2   = idx[1];
1785               idx += 2;
1786               tmp0 = x[i1];
1787               tmp1 = x[i2];
1788               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1789               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1790               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1791               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1792             }
1793 
1794             if (n == sz-1){
1795               tmp0  = x[*idx];
1796               sum1 -= v1[0] * tmp0;
1797               sum2 -= v2[0] * tmp0;
1798               sum3 -= v3[0] * tmp0;
1799               sum4 -= v4[0] * tmp0;
1800             }
1801             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1802             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1803             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1804             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1805             ibdiag  += 16;
1806             break;
1807           case 5:
1808             v2    = a->a + ii[row+1];
1809             v3    = a->a + ii[row+2];
1810             v4    = a->a + ii[row+3];
1811             v5    = a->a + ii[row+4];
1812             sum1  = b[row];
1813             sum2  = b[row+1];
1814             sum3  = b[row+2];
1815             sum4  = b[row+3];
1816             sum5  = b[row+4];
1817             for(n = 0; n<sz-1; n+=2) {
1818               i1   = idx[0];
1819               i2   = idx[1];
1820               idx += 2;
1821               tmp0 = x[i1];
1822               tmp1 = x[i2];
1823               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1824               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1825               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1826               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1827               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1828             }
1829 
1830             if (n == sz-1){
1831               tmp0  = x[*idx];
1832               sum1 -= v1[0] * tmp0;
1833               sum2 -= v2[0] * tmp0;
1834               sum3 -= v3[0] * tmp0;
1835               sum4 -= v4[0] * tmp0;
1836               sum5 -= v5[0] * tmp0;
1837             }
1838             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1839             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1840             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1841             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1842             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1843             ibdiag  += 25;
1844             break;
1845 	  default:
1846    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1847         }
1848       }
1849 
1850       xb = x;
1851       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1852     } else xb = b;
1853     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1854         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1855       cnt = 0;
1856       for (i=0, row=0; i<m; i++) {
1857 
1858         switch (sizes[i]){
1859           case 1:
1860             x[row++] *= bdiag[cnt++];
1861             break;
1862           case 2:
1863             x1   = x[row]; x2 = x[row+1];
1864             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1865             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1866             x[row++] = tmp1;
1867             x[row++] = tmp2;
1868             cnt += 4;
1869             break;
1870           case 3:
1871             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1872             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1873             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1874             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1875             x[row++] = tmp1;
1876             x[row++] = tmp2;
1877             x[row++] = tmp3;
1878             cnt += 9;
1879             break;
1880           case 4:
1881             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1882             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1883             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1884             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1885             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1886             x[row++] = tmp1;
1887             x[row++] = tmp2;
1888             x[row++] = tmp3;
1889             x[row++] = tmp4;
1890             cnt += 16;
1891             break;
1892           case 5:
1893             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1894             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1895             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1896             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1897             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1898             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1899             x[row++] = tmp1;
1900             x[row++] = tmp2;
1901             x[row++] = tmp3;
1902             x[row++] = tmp4;
1903             x[row++] = tmp5;
1904             cnt += 25;
1905             break;
1906 	  default:
1907    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1908         }
1909       }
1910       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1911     }
1912     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1913 
1914       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1915       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1916         ibdiag -= sizes[i]*sizes[i];
1917         sz      = ii[row+1] - diag[row] - 1;
1918         v1      = a->a + diag[row] + 1;
1919         idx     = a->j + diag[row] + 1;
1920 
1921         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
1922         switch (sizes[i]){
1923           case 1:
1924 
1925             sum1  = xb[row];
1926             for(n = 0; n<sz-1; n+=2) {
1927               i1   = idx[0];
1928               i2   = idx[1];
1929               idx += 2;
1930               tmp0 = x[i1];
1931               tmp1 = x[i2];
1932               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1933             }
1934 
1935             if (n == sz-1){
1936               tmp0  = x[*idx];
1937               sum1 -= *v1*tmp0;
1938             }
1939             x[row--] = sum1*(*ibdiag);
1940             break;
1941 
1942           case 2:
1943 
1944             sum1  = xb[row];
1945             sum2  = xb[row-1];
1946             /* note that sum1 is associated with the second of the two rows */
1947             v2    = a->a + diag[row-1] + 2;
1948             for(n = 0; n<sz-1; n+=2) {
1949               i1   = idx[0];
1950               i2   = idx[1];
1951               idx += 2;
1952               tmp0 = x[i1];
1953               tmp1 = x[i2];
1954               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1955               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1956             }
1957 
1958             if (n == sz-1){
1959               tmp0  = x[*idx];
1960               sum1 -= *v1*tmp0;
1961               sum2 -= *v2*tmp0;
1962             }
1963             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1964             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1965             break;
1966           case 3:
1967 
1968             sum1  = xb[row];
1969             sum2  = xb[row-1];
1970             sum3  = xb[row-2];
1971             v2    = a->a + diag[row-1] + 2;
1972             v3    = a->a + diag[row-2] + 3;
1973             for(n = 0; n<sz-1; n+=2) {
1974               i1   = idx[0];
1975               i2   = idx[1];
1976               idx += 2;
1977               tmp0 = x[i1];
1978               tmp1 = x[i2];
1979               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1980               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1981               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1982             }
1983 
1984             if (n == sz-1){
1985               tmp0  = x[*idx];
1986               sum1 -= *v1*tmp0;
1987               sum2 -= *v2*tmp0;
1988               sum3 -= *v3*tmp0;
1989             }
1990             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
1991             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
1992             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
1993             break;
1994           case 4:
1995 
1996             sum1  = xb[row];
1997             sum2  = xb[row-1];
1998             sum3  = xb[row-2];
1999             sum4  = xb[row-3];
2000             v2    = a->a + diag[row-1] + 2;
2001             v3    = a->a + diag[row-2] + 3;
2002             v4    = a->a + diag[row-3] + 4;
2003             for(n = 0; n<sz-1; n+=2) {
2004               i1   = idx[0];
2005               i2   = idx[1];
2006               idx += 2;
2007               tmp0 = x[i1];
2008               tmp1 = x[i2];
2009               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2010               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2011               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2012               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2013             }
2014 
2015             if (n == sz-1){
2016               tmp0  = x[*idx];
2017               sum1 -= *v1*tmp0;
2018               sum2 -= *v2*tmp0;
2019               sum3 -= *v3*tmp0;
2020               sum4 -= *v4*tmp0;
2021             }
2022             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2023             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2024             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2025             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2026             break;
2027           case 5:
2028 
2029             sum1  = xb[row];
2030             sum2  = xb[row-1];
2031             sum3  = xb[row-2];
2032             sum4  = xb[row-3];
2033             sum5  = xb[row-4];
2034             v2    = a->a + diag[row-1] + 2;
2035             v3    = a->a + diag[row-2] + 3;
2036             v4    = a->a + diag[row-3] + 4;
2037             v5    = a->a + diag[row-4] + 5;
2038             for(n = 0; n<sz-1; n+=2) {
2039               i1   = idx[0];
2040               i2   = idx[1];
2041               idx += 2;
2042               tmp0 = x[i1];
2043               tmp1 = x[i2];
2044               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2045               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2046               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2047               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2048               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2049             }
2050 
2051             if (n == sz-1){
2052               tmp0  = x[*idx];
2053               sum1 -= *v1*tmp0;
2054               sum2 -= *v2*tmp0;
2055               sum3 -= *v3*tmp0;
2056               sum4 -= *v4*tmp0;
2057               sum5 -= *v5*tmp0;
2058             }
2059             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2060             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2061             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2062             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2063             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2064             break;
2065 	  default:
2066    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2067         }
2068       }
2069 
2070       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2071     }
2072     its--;
2073     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2074     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2075   }
2076   if (flag & SOR_EISENSTAT) {
2077     const PetscScalar *b;
2078     MatScalar         *t = a->inode.ssor_work;
2079 
2080     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2081     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2082     /*
2083           Apply  (U + D)^-1  where D is now the block diagonal
2084     */
2085     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2086     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2087       ibdiag -= sizes[i]*sizes[i];
2088       sz      = ii[row+1] - diag[row] - 1;
2089       v1      = a->a + diag[row] + 1;
2090       idx     = a->j + diag[row] + 1;
2091       CHKMEMQ;
2092       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2093       switch (sizes[i]){
2094         case 1:
2095 
2096 	  sum1  = b[row];
2097 	  for(n = 0; n<sz-1; n+=2) {
2098 	    i1   = idx[0];
2099 	    i2   = idx[1];
2100 	    idx += 2;
2101 	    tmp0 = x[i1];
2102 	    tmp1 = x[i2];
2103 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2104 	  }
2105 
2106 	  if (n == sz-1){
2107 	    tmp0  = x[*idx];
2108 	    sum1 -= *v1*tmp0;
2109 	  }
2110 	  x[row] = sum1*(*ibdiag);row--;
2111 	  break;
2112 
2113         case 2:
2114 
2115 	  sum1  = b[row];
2116 	  sum2  = b[row-1];
2117 	  /* note that sum1 is associated with the second of the two rows */
2118 	  v2    = a->a + diag[row-1] + 2;
2119 	  for(n = 0; n<sz-1; n+=2) {
2120 	    i1   = idx[0];
2121 	    i2   = idx[1];
2122 	    idx += 2;
2123 	    tmp0 = x[i1];
2124 	    tmp1 = x[i2];
2125 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2126 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2127 	  }
2128 
2129 	  if (n == sz-1){
2130 	    tmp0  = x[*idx];
2131 	    sum1 -= *v1*tmp0;
2132 	    sum2 -= *v2*tmp0;
2133 	  }
2134 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
2135 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
2136           row -= 2;
2137 	  break;
2138         case 3:
2139 
2140 	  sum1  = b[row];
2141 	  sum2  = b[row-1];
2142 	  sum3  = b[row-2];
2143 	  v2    = a->a + diag[row-1] + 2;
2144 	  v3    = a->a + diag[row-2] + 3;
2145 	  for(n = 0; n<sz-1; n+=2) {
2146 	    i1   = idx[0];
2147 	    i2   = idx[1];
2148 	    idx += 2;
2149 	    tmp0 = x[i1];
2150 	    tmp1 = x[i2];
2151 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2152 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2153 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2154 	  }
2155 
2156 	  if (n == sz-1){
2157 	    tmp0  = x[*idx];
2158 	    sum1 -= *v1*tmp0;
2159 	    sum2 -= *v2*tmp0;
2160 	    sum3 -= *v3*tmp0;
2161 	  }
2162 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2163 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2164 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2165           row -= 3;
2166 	  break;
2167         case 4:
2168 
2169 	  sum1  = b[row];
2170 	  sum2  = b[row-1];
2171 	  sum3  = b[row-2];
2172 	  sum4  = b[row-3];
2173 	  v2    = a->a + diag[row-1] + 2;
2174 	  v3    = a->a + diag[row-2] + 3;
2175 	  v4    = a->a + diag[row-3] + 4;
2176 	  for(n = 0; n<sz-1; n+=2) {
2177 	    i1   = idx[0];
2178 	    i2   = idx[1];
2179 	    idx += 2;
2180 	    tmp0 = x[i1];
2181 	    tmp1 = x[i2];
2182 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2183 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2184 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2185 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2186 	  }
2187 
2188 	  if (n == sz-1){
2189 	    tmp0  = x[*idx];
2190 	    sum1 -= *v1*tmp0;
2191 	    sum2 -= *v2*tmp0;
2192 	    sum3 -= *v3*tmp0;
2193 	    sum4 -= *v4*tmp0;
2194 	  }
2195 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2196 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2197 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2198 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2199           row -= 4;
2200 	  break;
2201         case 5:
2202 
2203 	  sum1  = b[row];
2204 	  sum2  = b[row-1];
2205 	  sum3  = b[row-2];
2206 	  sum4  = b[row-3];
2207 	  sum5  = b[row-4];
2208 	  v2    = a->a + diag[row-1] + 2;
2209 	  v3    = a->a + diag[row-2] + 3;
2210 	  v4    = a->a + diag[row-3] + 4;
2211 	  v5    = a->a + diag[row-4] + 5;
2212 	  for(n = 0; n<sz-1; n+=2) {
2213 	    i1   = idx[0];
2214 	    i2   = idx[1];
2215 	    idx += 2;
2216 	    tmp0 = x[i1];
2217 	    tmp1 = x[i2];
2218 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2219 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2220 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2221 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2222 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2223 	  }
2224 
2225 	  if (n == sz-1){
2226 	    tmp0  = x[*idx];
2227 	    sum1 -= *v1*tmp0;
2228 	    sum2 -= *v2*tmp0;
2229 	    sum3 -= *v3*tmp0;
2230 	    sum4 -= *v4*tmp0;
2231 	    sum5 -= *v5*tmp0;
2232 	  }
2233 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2234 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2235 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2236 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2237 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2238           row -= 5;
2239 	  break;
2240         default:
2241 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2242       }
2243       CHKMEMQ;
2244     }
2245     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2246 
2247     /*
2248            t = b - D x    where D is the block diagonal
2249     */
2250     cnt = 0;
2251     for (i=0, row=0; i<m; i++) {
2252       CHKMEMQ;
2253       switch (sizes[i]){
2254         case 1:
2255 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
2256 	  break;
2257         case 2:
2258 	  x1   = x[row]; x2 = x[row+1];
2259 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2260 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2261 	  t[row]   = b[row] - tmp1;
2262 	  t[row+1] = b[row+1] - tmp2; row += 2;
2263 	  cnt += 4;
2264 	  break;
2265         case 3:
2266 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2267 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2268 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2269 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2270 	  t[row] = b[row] - tmp1;
2271 	  t[row+1] = b[row+1] - tmp2;
2272 	  t[row+2] = b[row+2] - tmp3; row += 3;
2273 	  cnt += 9;
2274 	  break;
2275         case 4:
2276 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2277 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2278 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2279 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2280 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2281 	  t[row] = b[row] - tmp1;
2282 	  t[row+1] = b[row+1] - tmp2;
2283 	  t[row+2] = b[row+2] - tmp3;
2284 	  t[row+3] = b[row+3] - tmp4; row += 4;
2285 	  cnt += 16;
2286 	  break;
2287         case 5:
2288 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2289 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2290 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2291 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2292 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2293 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2294 	  t[row] = b[row] - tmp1;
2295 	  t[row+1] = b[row+1] - tmp2;
2296 	  t[row+2] = b[row+2] - tmp3;
2297 	  t[row+3] = b[row+3] - tmp4;
2298 	  t[row+4] = b[row+4] - tmp5;row += 5;
2299 	  cnt += 25;
2300 	  break;
2301         default:
2302 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2303       }
2304       CHKMEMQ;
2305     }
2306     ierr = PetscLogFlops(m);CHKERRQ(ierr);
2307 
2308 
2309 
2310     /*
2311           Apply (L + D)^-1 where D is the block diagonal
2312     */
2313     for (i=0, row=0; i<m; i++) {
2314       sz  = diag[row] - ii[row];
2315       v1  = a->a + ii[row];
2316       idx = a->j + ii[row];
2317       CHKMEMQ;
2318       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2319       switch (sizes[i]){
2320         case 1:
2321 
2322 	  sum1  = t[row];
2323 	  for(n = 0; n<sz-1; n+=2) {
2324 	    i1   = idx[0];
2325 	    i2   = idx[1];
2326 	    idx += 2;
2327 	    tmp0 = t[i1];
2328 	    tmp1 = t[i2];
2329 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2330 	  }
2331 
2332 	  if (n == sz-1){
2333 	    tmp0  = t[*idx];
2334 	    sum1 -= *v1 * tmp0;
2335 	  }
2336 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
2337 	  break;
2338         case 2:
2339 	  v2    = a->a + ii[row+1];
2340 	  sum1  = t[row];
2341 	  sum2  = t[row+1];
2342 	  for(n = 0; n<sz-1; n+=2) {
2343 	    i1   = idx[0];
2344 	    i2   = idx[1];
2345 	    idx += 2;
2346 	    tmp0 = t[i1];
2347 	    tmp1 = t[i2];
2348 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2349 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2350 	  }
2351 
2352 	  if (n == sz-1){
2353 	    tmp0  = t[*idx];
2354 	    sum1 -= v1[0] * tmp0;
2355 	    sum2 -= v2[0] * tmp0;
2356 	  }
2357 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
2358 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
2359 	  ibdiag  += 4; row += 2;
2360 	  break;
2361         case 3:
2362 	  v2    = a->a + ii[row+1];
2363 	  v3    = a->a + ii[row+2];
2364 	  sum1  = t[row];
2365 	  sum2  = t[row+1];
2366 	  sum3  = t[row+2];
2367 	  for(n = 0; n<sz-1; n+=2) {
2368 	    i1   = idx[0];
2369 	    i2   = idx[1];
2370 	    idx += 2;
2371 	    tmp0 = t[i1];
2372 	    tmp1 = t[i2];
2373 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2374 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2375 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2376 	  }
2377 
2378 	  if (n == sz-1){
2379 	    tmp0  = t[*idx];
2380 	    sum1 -= v1[0] * tmp0;
2381 	    sum2 -= v2[0] * tmp0;
2382 	    sum3 -= v3[0] * tmp0;
2383 	  }
2384 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2385 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2386 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2387 	  ibdiag  += 9; row += 3;
2388 	  break;
2389         case 4:
2390 	  v2    = a->a + ii[row+1];
2391 	  v3    = a->a + ii[row+2];
2392 	  v4    = a->a + ii[row+3];
2393 	  sum1  = t[row];
2394 	  sum2  = t[row+1];
2395 	  sum3  = t[row+2];
2396 	  sum4  = t[row+3];
2397 	  for(n = 0; n<sz-1; n+=2) {
2398 	    i1   = idx[0];
2399 	    i2   = idx[1];
2400 	    idx += 2;
2401 	    tmp0 = t[i1];
2402 	    tmp1 = t[i2];
2403 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2404 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2405 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2406 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2407 	  }
2408 
2409 	  if (n == sz-1){
2410 	    tmp0  = t[*idx];
2411 	    sum1 -= v1[0] * tmp0;
2412 	    sum2 -= v2[0] * tmp0;
2413 	    sum3 -= v3[0] * tmp0;
2414 	    sum4 -= v4[0] * tmp0;
2415 	  }
2416 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2417 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2418 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2419 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2420 	  ibdiag  += 16; row += 4;
2421 	  break;
2422         case 5:
2423 	  v2    = a->a + ii[row+1];
2424 	  v3    = a->a + ii[row+2];
2425 	  v4    = a->a + ii[row+3];
2426 	  v5    = a->a + ii[row+4];
2427 	  sum1  = t[row];
2428 	  sum2  = t[row+1];
2429 	  sum3  = t[row+2];
2430 	  sum4  = t[row+3];
2431 	  sum5  = t[row+4];
2432 	  for(n = 0; n<sz-1; n+=2) {
2433 	    i1   = idx[0];
2434 	    i2   = idx[1];
2435 	    idx += 2;
2436 	    tmp0 = t[i1];
2437 	    tmp1 = t[i2];
2438 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2439 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2440 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2441 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2442 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2443 	  }
2444 
2445 	  if (n == sz-1){
2446 	    tmp0  = t[*idx];
2447 	    sum1 -= v1[0] * tmp0;
2448 	    sum2 -= v2[0] * tmp0;
2449 	    sum3 -= v3[0] * tmp0;
2450 	    sum4 -= v4[0] * tmp0;
2451 	    sum5 -= v5[0] * tmp0;
2452 	  }
2453 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2454 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2455 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2456 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2457 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2458 	  ibdiag  += 25; row += 5;
2459 	  break;
2460         default:
2461 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2462       }
2463       CHKMEMQ;
2464     }
2465     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2466     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2467     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2468   }
2469   PetscFunctionReturn(0);
2470 }
2471 
2472 #undef __FUNCT__
2473 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
2474 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2475 {
2476   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2477   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
2478   const MatScalar    *bdiag = a->inode.bdiag;
2479   const PetscScalar  *b;
2480   PetscErrorCode      ierr;
2481   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
2482   const PetscInt      *sizes = a->inode.size;
2483 
2484   PetscFunctionBegin;
2485   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2486   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2487   cnt = 0;
2488   for (i=0, row=0; i<m; i++) {
2489     switch (sizes[i]){
2490       case 1:
2491 	x[row] = b[row]*bdiag[cnt++];row++;
2492 	break;
2493       case 2:
2494 	x1   = b[row]; x2 = b[row+1];
2495 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2496 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2497 	x[row++] = tmp1;
2498 	x[row++] = tmp2;
2499 	cnt += 4;
2500 	break;
2501       case 3:
2502 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
2503 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2504 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2505 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2506 	x[row++] = tmp1;
2507 	x[row++] = tmp2;
2508 	x[row++] = tmp3;
2509 	cnt += 9;
2510 	break;
2511       case 4:
2512 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
2513 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2514 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2515 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2516 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2517 	x[row++] = tmp1;
2518 	x[row++] = tmp2;
2519 	x[row++] = tmp3;
2520 	x[row++] = tmp4;
2521 	cnt += 16;
2522 	break;
2523       case 5:
2524 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
2525 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2526 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2527 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2528 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2529 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2530 	x[row++] = tmp1;
2531 	x[row++] = tmp2;
2532 	x[row++] = tmp3;
2533 	x[row++] = tmp4;
2534 	x[row++] = tmp5;
2535 	cnt += 25;
2536 	break;
2537       default:
2538 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2539     }
2540   }
2541   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
2542   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2543   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2544   PetscFunctionReturn(0);
2545 }
2546 
2547 /*
2548     samestructure indicates that the matrix has not changed its nonzero structure so we
2549     do not need to recompute the inodes
2550 */
2551 #undef __FUNCT__
2552 #define __FUNCT__ "Mat_CheckInode"
2553 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2554 {
2555   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2556   PetscErrorCode ierr;
2557   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2558   PetscTruth     flag;
2559 
2560   PetscFunctionBegin;
2561   if (!a->inode.use)                     PetscFunctionReturn(0);
2562   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2563 
2564 
2565   m = A->rmap->n;
2566   if (a->inode.size) {ns = a->inode.size;}
2567   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2568 
2569   i          = 0;
2570   node_count = 0;
2571   idx        = a->j;
2572   ii         = a->i;
2573   while (i < m){                /* For each row */
2574     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2575     /* Limits the number of elements in a node to 'a->inode.limit' */
2576     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2577       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2578       if (nzy != nzx) break;
2579       idy  += nzx;             /* Same nonzero pattern */
2580       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2581       if (!flag) break;
2582     }
2583     ns[node_count++] = blk_size;
2584     idx += blk_size*nzx;
2585     i    = j;
2586   }
2587   /* If not enough inodes found,, do not use inode version of the routines */
2588   if (!a->inode.size && m && node_count > .9*m) {
2589     ierr = PetscFree(ns);CHKERRQ(ierr);
2590     a->inode.node_count     = 0;
2591     a->inode.size           = PETSC_NULL;
2592     a->inode.use            = PETSC_FALSE;
2593     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2594   } else {
2595     A->ops->mult              = MatMult_SeqAIJ_Inode;
2596     A->ops->sor               = MatSOR_SeqAIJ_Inode;
2597     A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
2598     A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
2599     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
2600     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
2601     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
2602     A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
2603     A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
2604     a->inode.node_count       = node_count;
2605     a->inode.size             = ns;
2606     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2607   }
2608   PetscFunctionReturn(0);
2609 }
2610 
2611 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
2612 PetscInt __k, *__vi; \
2613 __vi = aj + ai[row];				\
2614 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
2615 __vi = aj + adiag[row];				\
2616 cols[nzl] = __vi[0];\
2617 __vi = aj + adiag[row+1]+1;\
2618 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
2619 
2620 #undef __FUNCT__
2621 #define __FUNCT__ "Mat_CheckInode_FactorLU"
2622 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
2623 {
2624   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2625   PetscErrorCode ierr;
2626   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
2627   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
2628   PetscTruth     flag;
2629 
2630   PetscFunctionBegin;
2631   if (!a->inode.use)                     PetscFunctionReturn(0);
2632   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2633 
2634   m = A->rmap->n;
2635   if (a->inode.size) {ns = a->inode.size;}
2636   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2637 
2638   i          = 0;
2639   node_count = 0;
2640   while (i < m){                /* For each row */
2641     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
2642     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
2643     nzx  = nzl1 + nzu1 + 1;
2644     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
2645     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
2646 
2647     /* Limits the number of elements in a node to 'a->inode.limit' */
2648     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2649       nzl2    = ai[j+1] - ai[j];
2650       nzu2    = adiag[j] - adiag[j+1] - 1;
2651       nzy     = nzl2 + nzu2 + 1;
2652       if( nzy != nzx) break;
2653       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
2654       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
2655       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2656       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
2657       ierr = PetscFree(cols2);CHKERRQ(ierr);
2658     }
2659     ns[node_count++] = blk_size;
2660     ierr = PetscFree(cols1);CHKERRQ(ierr);
2661     i    = j;
2662   }
2663   /* If not enough inodes found,, do not use inode version of the routines */
2664   if (!a->inode.size && m && node_count > .9*m) {
2665     ierr = PetscFree(ns);CHKERRQ(ierr);
2666     a->inode.node_count     = 0;
2667     a->inode.size           = PETSC_NULL;
2668     a->inode.use            = PETSC_FALSE;
2669     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2670   } else {
2671     A->ops->mult              = 0;
2672     A->ops->sor               = 0;
2673     A->ops->multadd           = 0;
2674     A->ops->getrowij          = 0;
2675     A->ops->restorerowij      = 0;
2676     A->ops->getcolumnij       = 0;
2677     A->ops->restorecolumnij   = 0;
2678     A->ops->coloringpatch     = 0;
2679     A->ops->multdiagonalblock = 0;
2680     a->inode.node_count       = node_count;
2681     a->inode.size             = ns;
2682     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2683   }
2684   PetscFunctionReturn(0);
2685 }
2686 
2687 /*
2688      This is really ugly. if inodes are used this replaces the
2689   permutations with ones that correspond to rows/cols of the matrix
2690   rather then inode blocks
2691 */
2692 #undef __FUNCT__
2693 #define __FUNCT__ "MatInodeAdjustForInodes"
2694 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2695 {
2696   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2697 
2698   PetscFunctionBegin;
2699   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2700   if (f) {
2701     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2702   }
2703   PetscFunctionReturn(0);
2704 }
2705 
2706 EXTERN_C_BEGIN
2707 #undef __FUNCT__
2708 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
2709 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
2710 {
2711   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2712   PetscErrorCode ierr;
2713   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
2714   const PetscInt *ridx,*cidx;
2715   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2716   PetscInt       nslim_col,*ns_col;
2717   IS             ris = *rperm,cis = *cperm;
2718 
2719   PetscFunctionBegin;
2720   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2721   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2722 
2723   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2724   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2725   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
2726 
2727   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2728   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2729 
2730   /* Form the inode structure for the rows of permuted matric using inv perm*/
2731   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2732 
2733   /* Construct the permutations for rows*/
2734   for (i=0,row = 0; i<nslim_row; ++i){
2735     indx      = ridx[i];
2736     start_val = tns[indx];
2737     end_val   = tns[indx + 1];
2738     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2739   }
2740 
2741   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2742   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2743 
2744  /* Construct permutations for columns */
2745   for (i=0,col=0; i<nslim_col; ++i){
2746     indx      = cidx[i];
2747     start_val = tns[indx];
2748     end_val   = tns[indx + 1];
2749     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2750   }
2751 
2752   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2753   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
2754   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2755   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
2756 
2757   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2758   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2759 
2760   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2761   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
2762   ierr = ISDestroy(cis);CHKERRQ(ierr);
2763   ierr = ISDestroy(ris);CHKERRQ(ierr);
2764   ierr = PetscFree(tns);CHKERRQ(ierr);
2765   PetscFunctionReturn(0);
2766 }
2767 EXTERN_C_END
2768 
2769 #undef __FUNCT__
2770 #define __FUNCT__ "MatInodeGetInodeSizes"
2771 /*@C
2772    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2773 
2774    Collective on Mat
2775 
2776    Input Parameter:
2777 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2778 
2779    Output Parameter:
2780 +  node_count - no of inodes present in the matrix.
2781 .  sizes      - an array of size node_count,with sizes of each inode.
2782 -  limit      - the max size used to generate the inodes.
2783 
2784    Level: advanced
2785 
2786    Notes: This routine returns some internal storage information
2787    of the matrix, it is intended to be used by advanced users.
2788    It should be called after the matrix is assembled.
2789    The contents of the sizes[] array should not be changed.
2790    PETSC_NULL may be passed for information not requested.
2791 
2792 .keywords: matrix, seqaij, get, inode
2793 
2794 .seealso: MatGetInfo()
2795 @*/
2796 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2797 {
2798   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2799 
2800   PetscFunctionBegin;
2801   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2802   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2803   if (f) {
2804     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2805   }
2806   PetscFunctionReturn(0);
2807 }
2808 
2809 EXTERN_C_BEGIN
2810 #undef __FUNCT__
2811 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
2812 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2813 {
2814   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2815 
2816   PetscFunctionBegin;
2817   if (node_count) *node_count = a->inode.node_count;
2818   if (sizes)      *sizes      = a->inode.size;
2819   if (limit)      *limit      = a->inode.limit;
2820   PetscFunctionReturn(0);
2821 }
2822 EXTERN_C_END
2823