xref: /petsc/src/mat/impls/aij/seq/inode.c (revision d4bf62d159b4eec940d9872c577cf78ea329f539)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_Inode"
397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     sz   = n;                   /* No of non zeros in this row */
427                                 /* Switch on the size of Node */
428     switch (nsz){               /* Each loop in 'case' is unrolled */
429     case 1 :
430       sum1  = 0;
431 
432       for(n = 0; n< sz-1; n+=2) {
433         i1   = idx[0];          /* The instructions are ordered to */
434         i2   = idx[1];          /* make the compiler's job easy */
435         idx += 2;
436         tmp0 = x[i1];
437         tmp1 = x[i2];
438         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
439        }
440 
441       if (n == sz-1){          /* Take care of the last nonzero  */
442         tmp0  = x[*idx++];
443         sum1 += *v1++ * tmp0;
444       }
445       y[row++]=sum1;
446       break;
447     case 2:
448       sum1  = 0;
449       sum2  = 0;
450       v2    = v1 + n;
451 
452       for (n = 0; n< sz-1; n+=2) {
453         i1   = idx[0];
454         i2   = idx[1];
455         idx += 2;
456         tmp0 = x[i1];
457         tmp1 = x[i2];
458         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
459         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
460       }
461       if (n == sz-1){
462         tmp0  = x[*idx++];
463         sum1 += *v1++ * tmp0;
464         sum2 += *v2++ * tmp0;
465       }
466       y[row++]=sum1;
467       y[row++]=sum2;
468       v1      =v2;              /* Since the next block to be processed starts there*/
469       idx    +=sz;
470       break;
471     case 3:
472       sum1  = 0;
473       sum2  = 0;
474       sum3  = 0;
475       v2    = v1 + n;
476       v3    = v2 + n;
477 
478       for (n = 0; n< sz-1; n+=2) {
479         i1   = idx[0];
480         i2   = idx[1];
481         idx += 2;
482         tmp0 = x[i1];
483         tmp1 = x[i2];
484         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
485         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
486         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
487       }
488       if (n == sz-1){
489         tmp0  = x[*idx++];
490         sum1 += *v1++ * tmp0;
491         sum2 += *v2++ * tmp0;
492         sum3 += *v3++ * tmp0;
493       }
494       y[row++]=sum1;
495       y[row++]=sum2;
496       y[row++]=sum3;
497       v1       =v3;             /* Since the next block to be processed starts there*/
498       idx     +=2*sz;
499       break;
500     case 4:
501       sum1  = 0;
502       sum2  = 0;
503       sum3  = 0;
504       sum4  = 0;
505       v2    = v1 + n;
506       v3    = v2 + n;
507       v4    = v3 + n;
508 
509       for (n = 0; n< sz-1; n+=2) {
510         i1   = idx[0];
511         i2   = idx[1];
512         idx += 2;
513         tmp0 = x[i1];
514         tmp1 = x[i2];
515         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
516         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
517         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
518         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
519       }
520       if (n == sz-1){
521         tmp0  = x[*idx++];
522         sum1 += *v1++ * tmp0;
523         sum2 += *v2++ * tmp0;
524         sum3 += *v3++ * tmp0;
525         sum4 += *v4++ * tmp0;
526       }
527       y[row++]=sum1;
528       y[row++]=sum2;
529       y[row++]=sum3;
530       y[row++]=sum4;
531       v1      =v4;              /* Since the next block to be processed starts there*/
532       idx    +=3*sz;
533       break;
534     case 5:
535       sum1  = 0;
536       sum2  = 0;
537       sum3  = 0;
538       sum4  = 0;
539       sum5  = 0;
540       v2    = v1 + n;
541       v3    = v2 + n;
542       v4    = v3 + n;
543       v5    = v4 + n;
544 
545       for (n = 0; n<sz-1; n+=2) {
546         i1   = idx[0];
547         i2   = idx[1];
548         idx += 2;
549         tmp0 = x[i1];
550         tmp1 = x[i2];
551         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
552         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
553         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
554         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
555         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
556       }
557       if (n == sz-1){
558         tmp0  = x[*idx++];
559         sum1 += *v1++ * tmp0;
560         sum2 += *v2++ * tmp0;
561         sum3 += *v3++ * tmp0;
562         sum4 += *v4++ * tmp0;
563         sum5 += *v5++ * tmp0;
564       }
565       y[row++]=sum1;
566       y[row++]=sum2;
567       y[row++]=sum3;
568       y[row++]=sum4;
569       y[row++]=sum5;
570       v1      =v5;       /* Since the next block to be processed starts there */
571       idx    +=4*sz;
572       break;
573     default :
574       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
575     }
576   }
577   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
578   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
579   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
580   PetscFunctionReturn(0);
581 }
582 /* ----------------------------------------------------------- */
583 /* Almost same code as the MatMult_Inode() */
584 #undef __FUNCT__
585 #define __FUNCT__ "MatMultAdd_Inode"
586 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
587 {
588   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
589   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
590   MatScalar      *v1,*v2,*v3,*v4,*v5;
591   PetscScalar    *x,*y,*z,*zt;
592   PetscErrorCode ierr;
593   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
594 
595   PetscFunctionBegin;
596   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
597   node_max = a->inode.node_count;
598   ns       = a->inode.size;     /* Node Size array */
599   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
600   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
601   if (zz != yy) {
602     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
603   } else {
604     z = y;
605   }
606   zt = z;
607 
608   idx  = a->j;
609   v1   = a->a;
610   ii   = a->i;
611 
612   for (i = 0,row = 0; i< node_max; ++i){
613     nsz  = ns[i];
614     n    = ii[1] - ii[0];
615     ii  += nsz;
616     sz   = n;                   /* No of non zeros in this row */
617                                 /* Switch on the size of Node */
618     switch (nsz){               /* Each loop in 'case' is unrolled */
619     case 1 :
620       sum1  = *zt++;
621 
622       for(n = 0; n< sz-1; n+=2) {
623         i1   = idx[0];          /* The instructions are ordered to */
624         i2   = idx[1];          /* make the compiler's job easy */
625         idx += 2;
626         tmp0 = x[i1];
627         tmp1 = x[i2];
628         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
629        }
630 
631       if(n   == sz-1){          /* Take care of the last nonzero  */
632         tmp0  = x[*idx++];
633         sum1 += *v1++ * tmp0;
634       }
635       y[row++]=sum1;
636       break;
637     case 2:
638       sum1  = *zt++;
639       sum2  = *zt++;
640       v2    = v1 + n;
641 
642       for(n = 0; n< sz-1; n+=2) {
643         i1   = idx[0];
644         i2   = idx[1];
645         idx += 2;
646         tmp0 = x[i1];
647         tmp1 = x[i2];
648         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
649         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
650       }
651       if(n   == sz-1){
652         tmp0  = x[*idx++];
653         sum1 += *v1++ * tmp0;
654         sum2 += *v2++ * tmp0;
655       }
656       y[row++]=sum1;
657       y[row++]=sum2;
658       v1      =v2;              /* Since the next block to be processed starts there*/
659       idx    +=sz;
660       break;
661     case 3:
662       sum1  = *zt++;
663       sum2  = *zt++;
664       sum3  = *zt++;
665       v2    = v1 + n;
666       v3    = v2 + n;
667 
668       for (n = 0; n< sz-1; n+=2) {
669         i1   = idx[0];
670         i2   = idx[1];
671         idx += 2;
672         tmp0 = x[i1];
673         tmp1 = x[i2];
674         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
675         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
676         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
677       }
678       if (n == sz-1){
679         tmp0  = x[*idx++];
680         sum1 += *v1++ * tmp0;
681         sum2 += *v2++ * tmp0;
682         sum3 += *v3++ * tmp0;
683       }
684       y[row++]=sum1;
685       y[row++]=sum2;
686       y[row++]=sum3;
687       v1       =v3;             /* Since the next block to be processed starts there*/
688       idx     +=2*sz;
689       break;
690     case 4:
691       sum1  = *zt++;
692       sum2  = *zt++;
693       sum3  = *zt++;
694       sum4  = *zt++;
695       v2    = v1 + n;
696       v3    = v2 + n;
697       v4    = v3 + n;
698 
699       for (n = 0; n< sz-1; n+=2) {
700         i1   = idx[0];
701         i2   = idx[1];
702         idx += 2;
703         tmp0 = x[i1];
704         tmp1 = x[i2];
705         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
706         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
707         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
708         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
709       }
710       if (n == sz-1){
711         tmp0  = x[*idx++];
712         sum1 += *v1++ * tmp0;
713         sum2 += *v2++ * tmp0;
714         sum3 += *v3++ * tmp0;
715         sum4 += *v4++ * tmp0;
716       }
717       y[row++]=sum1;
718       y[row++]=sum2;
719       y[row++]=sum3;
720       y[row++]=sum4;
721       v1      =v4;              /* Since the next block to be processed starts there*/
722       idx    +=3*sz;
723       break;
724     case 5:
725       sum1  = *zt++;
726       sum2  = *zt++;
727       sum3  = *zt++;
728       sum4  = *zt++;
729       sum5  = *zt++;
730       v2    = v1 + n;
731       v3    = v2 + n;
732       v4    = v3 + n;
733       v5    = v4 + n;
734 
735       for (n = 0; n<sz-1; n+=2) {
736         i1   = idx[0];
737         i2   = idx[1];
738         idx += 2;
739         tmp0 = x[i1];
740         tmp1 = x[i2];
741         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
742         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
743         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
744         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
745         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
746       }
747       if(n   == sz-1){
748         tmp0  = x[*idx++];
749         sum1 += *v1++ * tmp0;
750         sum2 += *v2++ * tmp0;
751         sum3 += *v3++ * tmp0;
752         sum4 += *v4++ * tmp0;
753         sum5 += *v5++ * tmp0;
754       }
755       y[row++]=sum1;
756       y[row++]=sum2;
757       y[row++]=sum3;
758       y[row++]=sum4;
759       y[row++]=sum5;
760       v1      =v5;       /* Since the next block to be processed starts there */
761       idx    +=4*sz;
762       break;
763     default :
764       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
765     }
766   }
767   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
768   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
769   if (zz != yy) {
770     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
771   }
772   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
773   PetscFunctionReturn(0);
774 }
775 
776 /* ----------------------------------------------------------- */
777 #undef __FUNCT__
778 #define __FUNCT__ "MatSolve_Inode"
779 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
780 {
781   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
782   IS                iscol = a->col,isrow = a->row;
783   PetscErrorCode    ierr;
784   const PetscInt    *r,*c,*rout,*cout;
785   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
786   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
787   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790   const PetscScalar *b;
791 
792   PetscFunctionBegin;
793   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
794   node_max = a->inode.node_count;
795   ns       = a->inode.size;     /* Node Size array */
796 
797   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   tmp  = a->solve_work;
800 
801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
803 
804   /* forward solve the lower triangular */
805   tmps = tmp ;
806   aa   = a_a ;
807   aj   = a_j ;
808   ad   = a->diag;
809 
810   for (i = 0,row = 0; i< node_max; ++i){
811     nsz = ns[i];
812     aii = ai[row];
813     v1  = aa + aii;
814     vi  = aj + aii;
815     nz  = ad[row]- aii;
816 
817     switch (nsz){               /* Each loop in 'case' is unrolled */
818     case 1 :
819       sum1 = b[*r++];
820       /*      while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
821       for(j=0; j<nz-1; j+=2){
822         i0   = vi[0];
823         i1   = vi[1];
824         vi  +=2;
825         tmp0 = tmps[i0];
826         tmp1 = tmps[i1];
827         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828       }
829       if(j == nz-1){
830         tmp0 = tmps[*vi++];
831         sum1 -= *v1++ *tmp0;
832       }
833       tmp[row ++]=sum1;
834       break;
835     case 2:
836       sum1 = b[*r++];
837       sum2 = b[*r++];
838       v2   = aa + ai[row+1];
839 
840       for(j=0; j<nz-1; j+=2){
841         i0   = vi[0];
842         i1   = vi[1];
843         vi  +=2;
844         tmp0 = tmps[i0];
845         tmp1 = tmps[i1];
846         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
847         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
848       }
849       if(j == nz-1){
850         tmp0 = tmps[*vi++];
851         sum1 -= *v1++ *tmp0;
852         sum2 -= *v2++ *tmp0;
853       }
854       sum2 -= *v2++ * sum1;
855       tmp[row ++]=sum1;
856       tmp[row ++]=sum2;
857       break;
858     case 3:
859       sum1 = b[*r++];
860       sum2 = b[*r++];
861       sum3 = b[*r++];
862       v2   = aa + ai[row+1];
863       v3   = aa + ai[row+2];
864 
865       for (j=0; j<nz-1; j+=2){
866         i0   = vi[0];
867         i1   = vi[1];
868         vi  +=2;
869         tmp0 = tmps[i0];
870         tmp1 = tmps[i1];
871         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
872         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
873         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
874       }
875       if (j == nz-1){
876         tmp0 = tmps[*vi++];
877         sum1 -= *v1++ *tmp0;
878         sum2 -= *v2++ *tmp0;
879         sum3 -= *v3++ *tmp0;
880       }
881       sum2 -= *v2++ * sum1;
882       sum3 -= *v3++ * sum1;
883       sum3 -= *v3++ * sum2;
884       tmp[row ++]=sum1;
885       tmp[row ++]=sum2;
886       tmp[row ++]=sum3;
887       break;
888 
889     case 4:
890       sum1 = b[*r++];
891       sum2 = b[*r++];
892       sum3 = b[*r++];
893       sum4 = b[*r++];
894       v2   = aa + ai[row+1];
895       v3   = aa + ai[row+2];
896       v4   = aa + ai[row+3];
897 
898       for (j=0; j<nz-1; j+=2){
899         i0   = vi[0];
900         i1   = vi[1];
901         vi  +=2;
902         tmp0 = tmps[i0];
903         tmp1 = tmps[i1];
904         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908       }
909       if (j == nz-1){
910         tmp0 = tmps[*vi++];
911         sum1 -= *v1++ *tmp0;
912         sum2 -= *v2++ *tmp0;
913         sum3 -= *v3++ *tmp0;
914         sum4 -= *v4++ *tmp0;
915       }
916       sum2 -= *v2++ * sum1;
917       sum3 -= *v3++ * sum1;
918       sum4 -= *v4++ * sum1;
919       sum3 -= *v3++ * sum2;
920       sum4 -= *v4++ * sum2;
921       sum4 -= *v4++ * sum3;
922 
923       tmp[row ++]=sum1;
924       tmp[row ++]=sum2;
925       tmp[row ++]=sum3;
926       tmp[row ++]=sum4;
927       break;
928     case 5:
929       sum1 = b[*r++];
930       sum2 = b[*r++];
931       sum3 = b[*r++];
932       sum4 = b[*r++];
933       sum5 = b[*r++];
934       v2   = aa + ai[row+1];
935       v3   = aa + ai[row+2];
936       v4   = aa + ai[row+3];
937       v5   = aa + ai[row+4];
938 
939       for (j=0; j<nz-1; j+=2){
940         i0   = vi[0];
941         i1   = vi[1];
942         vi  +=2;
943         tmp0 = tmps[i0];
944         tmp1 = tmps[i1];
945         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950       }
951       if (j == nz-1){
952         tmp0 = tmps[*vi++];
953         sum1 -= *v1++ *tmp0;
954         sum2 -= *v2++ *tmp0;
955         sum3 -= *v3++ *tmp0;
956         sum4 -= *v4++ *tmp0;
957         sum5 -= *v5++ *tmp0;
958       }
959 
960       sum2 -= *v2++ * sum1;
961       sum3 -= *v3++ * sum1;
962       sum4 -= *v4++ * sum1;
963       sum5 -= *v5++ * sum1;
964       sum3 -= *v3++ * sum2;
965       sum4 -= *v4++ * sum2;
966       sum5 -= *v5++ * sum2;
967       sum4 -= *v4++ * sum3;
968       sum5 -= *v5++ * sum3;
969       sum5 -= *v5++ * sum4;
970 
971       tmp[row ++]=sum1;
972       tmp[row ++]=sum2;
973       tmp[row ++]=sum3;
974       tmp[row ++]=sum4;
975       tmp[row ++]=sum5;
976       break;
977     default:
978       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
979     }
980   }
981   /* backward solve the upper triangular */
982   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
983     nsz = ns[i];
984     aii = ai[row+1] -1;
985     v1  = aa + aii;
986     vi  = aj + aii;
987     nz  = aii- ad[row];
988     switch (nsz){               /* Each loop in 'case' is unrolled */
989     case 1 :
990       sum1 = tmp[row];
991 
992       for(j=nz ; j>1; j-=2){
993         vi  -=2;
994         i0   = vi[2];
995         i1   = vi[1];
996         tmp0 = tmps[i0];
997         tmp1 = tmps[i1];
998         v1   -= 2;
999         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000       }
1001       if (j==1){
1002         tmp0  = tmps[*vi--];
1003         sum1 -= *v1-- * tmp0;
1004       }
1005       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006       break;
1007     case 2 :
1008       sum1 = tmp[row];
1009       sum2 = tmp[row -1];
1010       v2   = aa + ai[row]-1;
1011       for (j=nz ; j>1; j-=2){
1012         vi  -=2;
1013         i0   = vi[2];
1014         i1   = vi[1];
1015         tmp0 = tmps[i0];
1016         tmp1 = tmps[i1];
1017         v1   -= 2;
1018         v2   -= 2;
1019         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021       }
1022       if (j==1){
1023         tmp0  = tmps[*vi--];
1024         sum1 -= *v1-- * tmp0;
1025         sum2 -= *v2-- * tmp0;
1026       }
1027 
1028       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029       sum2   -= *v2-- * tmp0;
1030       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031       break;
1032     case 3 :
1033       sum1 = tmp[row];
1034       sum2 = tmp[row -1];
1035       sum3 = tmp[row -2];
1036       v2   = aa + ai[row]-1;
1037       v3   = aa + ai[row -1]-1;
1038       for (j=nz ; j>1; j-=2){
1039         vi  -=2;
1040         i0   = vi[2];
1041         i1   = vi[1];
1042         tmp0 = tmps[i0];
1043         tmp1 = tmps[i1];
1044         v1   -= 2;
1045         v2   -= 2;
1046         v3   -= 2;
1047         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050       }
1051       if (j==1){
1052         tmp0  = tmps[*vi--];
1053         sum1 -= *v1-- * tmp0;
1054         sum2 -= *v2-- * tmp0;
1055         sum3 -= *v3-- * tmp0;
1056       }
1057       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058       sum2   -= *v2-- * tmp0;
1059       sum3   -= *v3-- * tmp0;
1060       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061       sum3   -= *v3-- * tmp0;
1062       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1063 
1064       break;
1065     case 4 :
1066       sum1 = tmp[row];
1067       sum2 = tmp[row -1];
1068       sum3 = tmp[row -2];
1069       sum4 = tmp[row -3];
1070       v2   = aa + ai[row]-1;
1071       v3   = aa + ai[row -1]-1;
1072       v4   = aa + ai[row -2]-1;
1073 
1074       for (j=nz ; j>1; j-=2){
1075         vi  -=2;
1076         i0   = vi[2];
1077         i1   = vi[1];
1078         tmp0 = tmps[i0];
1079         tmp1 = tmps[i1];
1080         v1  -= 2;
1081         v2  -= 2;
1082         v3  -= 2;
1083         v4  -= 2;
1084         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088       }
1089       if (j==1){
1090         tmp0  = tmps[*vi--];
1091         sum1 -= *v1-- * tmp0;
1092         sum2 -= *v2-- * tmp0;
1093         sum3 -= *v3-- * tmp0;
1094         sum4 -= *v4-- * tmp0;
1095       }
1096 
1097       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098       sum2   -= *v2-- * tmp0;
1099       sum3   -= *v3-- * tmp0;
1100       sum4   -= *v4-- * tmp0;
1101       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102       sum3   -= *v3-- * tmp0;
1103       sum4   -= *v4-- * tmp0;
1104       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105       sum4   -= *v4-- * tmp0;
1106       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107       break;
1108     case 5 :
1109       sum1 = tmp[row];
1110       sum2 = tmp[row -1];
1111       sum3 = tmp[row -2];
1112       sum4 = tmp[row -3];
1113       sum5 = tmp[row -4];
1114       v2   = aa + ai[row]-1;
1115       v3   = aa + ai[row -1]-1;
1116       v4   = aa + ai[row -2]-1;
1117       v5   = aa + ai[row -3]-1;
1118       for (j=nz ; j>1; j-=2){
1119         vi  -= 2;
1120         i0   = vi[2];
1121         i1   = vi[1];
1122         tmp0 = tmps[i0];
1123         tmp1 = tmps[i1];
1124         v1   -= 2;
1125         v2   -= 2;
1126         v3   -= 2;
1127         v4   -= 2;
1128         v5   -= 2;
1129         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134       }
1135       if (j==1){
1136         tmp0  = tmps[*vi--];
1137         sum1 -= *v1-- * tmp0;
1138         sum2 -= *v2-- * tmp0;
1139         sum3 -= *v3-- * tmp0;
1140         sum4 -= *v4-- * tmp0;
1141         sum5 -= *v5-- * tmp0;
1142       }
1143 
1144       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145       sum2   -= *v2-- * tmp0;
1146       sum3   -= *v3-- * tmp0;
1147       sum4   -= *v4-- * tmp0;
1148       sum5   -= *v5-- * tmp0;
1149       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150       sum3   -= *v3-- * tmp0;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154       sum4   -= *v4-- * tmp0;
1155       sum5   -= *v5-- * tmp0;
1156       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157       sum5   -= *v5-- * tmp0;
1158       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159       break;
1160     default:
1161       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1162     }
1163   }
1164   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1165   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1166   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1168   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1169   PetscFunctionReturn(0);
1170 }
1171 
1172 #undef __FUNCT__
1173 #define __FUNCT__ "MatLUFactorNumeric_Inode"
1174 PetscErrorCode MatLUFactorNumeric_Inode(Mat B,Mat A,const MatFactorInfo *info)
1175 {
1176   Mat               C = B;
1177   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1178   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1179   PetscErrorCode    ierr;
1180   const PetscInt    *r,*ic,*c,*ics;
1181   PetscInt          n = A->rmap->n,*bi = b->i;
1182   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1183   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1184   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1185   PetscScalar       mul1,mul2,mul3,tmp;
1186   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1187   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1188   PetscReal         rs=0.0;
1189   LUShift_Ctx       sctx;
1190   PetscInt          newshift;
1191 
1192   PetscFunctionBegin;
1193   sctx.shift_top      = 0;
1194   sctx.nshift_max     = 0;
1195   sctx.shift_lo       = 0;
1196   sctx.shift_hi       = 0;
1197   sctx.shift_fraction = 0;
1198 
1199   /* if both shift schemes are chosen by user, only use info->shiftpd */
1200   if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1201     sctx.shift_top = 0;
1202     for (i=0; i<n; i++) {
1203       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1204       rs    = 0.0;
1205       ajtmp = aj + ai[i];
1206       rtmp1 = aa + ai[i];
1207       nz = ai[i+1] - ai[i];
1208       for (j=0; j<nz; j++){
1209         if (*ajtmp != i){
1210           rs += PetscAbsScalar(*rtmp1++);
1211         } else {
1212           rs -= PetscRealPart(*rtmp1++);
1213         }
1214         ajtmp++;
1215       }
1216       if (rs>sctx.shift_top) sctx.shift_top = rs;
1217     }
1218     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1219     sctx.shift_top *= 1.1;
1220     sctx.nshift_max = 5;
1221     sctx.shift_lo   = 0.;
1222     sctx.shift_hi   = 1.;
1223   }
1224   sctx.shift_amount = 0;
1225   sctx.nshift       = 0;
1226 
1227   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1228   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1229   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1230   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1231   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1232   ics   = ic ;
1233   rtmp22 = rtmp11 + n;
1234   rtmp33 = rtmp22 + n;
1235 
1236   node_max = a->inode.node_count;
1237   ns       = a->inode.size;
1238   if (!ns){
1239     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1240   }
1241 
1242   /* If max inode size > 3, split it into two inodes.*/
1243   /* also map the inode sizes according to the ordering */
1244   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1245   for (i=0,j=0; i<node_max; ++i,++j){
1246     if (ns[i]>3) {
1247       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1248       ++j;
1249       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1250     } else {
1251       tmp_vec1[j] = ns[i];
1252     }
1253   }
1254   /* Use the correct node_max */
1255   node_max = j;
1256 
1257   /* Now reorder the inode info based on mat re-ordering info */
1258   /* First create a row -> inode_size_array_index map */
1259   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1260   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1261   for (i=0,row=0; i<node_max; i++) {
1262     nodesz = tmp_vec1[i];
1263     for (j=0; j<nodesz; j++,row++) {
1264       nsmap[row] = i;
1265     }
1266   }
1267   /* Using nsmap, create a reordered ns structure */
1268   for (i=0,j=0; i< node_max; i++) {
1269     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1270     tmp_vec2[i]  = nodesz;
1271     j           += nodesz;
1272   }
1273   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1274   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1275   /* Now use the correct ns */
1276   ns = tmp_vec2;
1277 
1278   do {
1279     sctx.lushift = PETSC_FALSE;
1280     /* Now loop over each block-row, and do the factorization */
1281     for (i=0,row=0; i<node_max; i++) {
1282       nodesz = ns[i];
1283       nz     = bi[row+1] - bi[row];
1284       bjtmp  = bj + bi[row];
1285 
1286       switch (nodesz){
1287       case 1:
1288         for  (j=0; j<nz; j++){
1289           idx        = bjtmp[j];
1290           rtmp11[idx] = 0.0;
1291         }
1292 
1293         /* load in initial (unfactored row) */
1294         idx    = r[row];
1295         nz_tmp = ai[idx+1] - ai[idx];
1296         ajtmp  = aj + ai[idx];
1297         v1     = aa + ai[idx];
1298 
1299         for (j=0; j<nz_tmp; j++) {
1300           idx        = ics[ajtmp[j]];
1301           rtmp11[idx] = v1[j];
1302         }
1303         rtmp11[ics[r[row]]] += sctx.shift_amount;
1304 
1305         prow = *bjtmp++ ;
1306         while (prow < row) {
1307           pc1 = rtmp11 + prow;
1308           if (*pc1 != 0.0){
1309             pv   = ba + bd[prow];
1310             pj   = nbj + bd[prow];
1311             mul1 = *pc1 * *pv++;
1312             *pc1 = mul1;
1313             nz_tmp = bi[prow+1] - bd[prow] - 1;
1314             ierr = PetscLogFlops(2.0*nz_tmp);CHKERRQ(ierr);
1315             for (j=0; j<nz_tmp; j++) {
1316               tmp = pv[j];
1317               idx = pj[j];
1318               rtmp11[idx] -= mul1 * tmp;
1319             }
1320           }
1321           prow = *bjtmp++ ;
1322         }
1323         pj  = bj + bi[row];
1324         pc1 = ba + bi[row];
1325 
1326         sctx.pv    = rtmp11[row];
1327         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1328         rs         = 0.0;
1329         for (j=0; j<nz; j++) {
1330           idx    = pj[j];
1331           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1332           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1333         }
1334         sctx.rs  = rs;
1335         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1336         if (newshift == 1) goto endofwhile;
1337         break;
1338 
1339       case 2:
1340         for (j=0; j<nz; j++) {
1341           idx        = bjtmp[j];
1342           rtmp11[idx] = 0.0;
1343           rtmp22[idx] = 0.0;
1344         }
1345 
1346         /* load in initial (unfactored row) */
1347         idx    = r[row];
1348         nz_tmp = ai[idx+1] - ai[idx];
1349         ajtmp  = aj + ai[idx];
1350         v1     = aa + ai[idx];
1351         v2     = aa + ai[idx+1];
1352         for (j=0; j<nz_tmp; j++) {
1353           idx        = ics[ajtmp[j]];
1354           rtmp11[idx] = v1[j];
1355           rtmp22[idx] = v2[j];
1356         }
1357         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1358         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1359 
1360         prow = *bjtmp++ ;
1361         while (prow < row) {
1362           pc1 = rtmp11 + prow;
1363           pc2 = rtmp22 + prow;
1364           if (*pc1 != 0.0 || *pc2 != 0.0){
1365             pv   = ba + bd[prow];
1366             pj   = nbj + bd[prow];
1367             mul1 = *pc1 * *pv;
1368             mul2 = *pc2 * *pv;
1369             ++pv;
1370             *pc1 = mul1;
1371             *pc2 = mul2;
1372 
1373             nz_tmp = bi[prow+1] - bd[prow] - 1;
1374             for (j=0; j<nz_tmp; j++) {
1375               tmp = pv[j];
1376               idx = pj[j];
1377               rtmp11[idx] -= mul1 * tmp;
1378               rtmp22[idx] -= mul2 * tmp;
1379             }
1380             ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr);
1381           }
1382           prow = *bjtmp++ ;
1383         }
1384 
1385         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1386         pc1 = rtmp11 + prow;
1387         pc2 = rtmp22 + prow;
1388 
1389         sctx.pv = *pc1;
1390         pj      = bj + bi[prow];
1391         rs      = 0.0;
1392         for (j=0; j<nz; j++){
1393           idx = pj[j];
1394           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1395         }
1396         sctx.rs = rs;
1397         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1398         if (newshift == 1) goto endofwhile;
1399 
1400         if (*pc2 != 0.0){
1401           pj     = nbj + bd[prow];
1402           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1403           *pc2   = mul2;
1404           nz_tmp = bi[prow+1] - bd[prow] - 1;
1405           for (j=0; j<nz_tmp; j++) {
1406             idx = pj[j] ;
1407             tmp = rtmp11[idx];
1408             rtmp22[idx] -= mul2 * tmp;
1409           }
1410           ierr = PetscLogFlops(2.0*nz_tmp);CHKERRQ(ierr);
1411         }
1412 
1413         pj  = bj + bi[row];
1414         pc1 = ba + bi[row];
1415         pc2 = ba + bi[row+1];
1416 
1417         sctx.pv = rtmp22[row+1];
1418         rs = 0.0;
1419         rtmp11[row]   = 1.0/rtmp11[row];
1420         rtmp22[row+1] = 1.0/rtmp22[row+1];
1421         /* copy row entries from dense representation to sparse */
1422         for (j=0; j<nz; j++) {
1423           idx    = pj[j];
1424           pc1[j] = rtmp11[idx];
1425           pc2[j] = rtmp22[idx];
1426           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1427         }
1428         sctx.rs = rs;
1429         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1430         if (newshift == 1) goto endofwhile;
1431         break;
1432 
1433       case 3:
1434         for  (j=0; j<nz; j++) {
1435           idx        = bjtmp[j];
1436           rtmp11[idx] = 0.0;
1437           rtmp22[idx] = 0.0;
1438           rtmp33[idx] = 0.0;
1439         }
1440         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1441         idx    = r[row];
1442         nz_tmp = ai[idx+1] - ai[idx];
1443         ajtmp = aj + ai[idx];
1444         v1    = aa + ai[idx];
1445         v2    = aa + ai[idx+1];
1446         v3    = aa + ai[idx+2];
1447         for (j=0; j<nz_tmp; j++) {
1448           idx        = ics[ajtmp[j]];
1449           rtmp11[idx] = v1[j];
1450           rtmp22[idx] = v2[j];
1451           rtmp33[idx] = v3[j];
1452         }
1453         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1454         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1455         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1456 
1457         /* loop over all pivot row blocks above this row block */
1458         prow = *bjtmp++ ;
1459         while (prow < row) {
1460           pc1 = rtmp11 + prow;
1461           pc2 = rtmp22 + prow;
1462           pc3 = rtmp33 + prow;
1463           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1464             pv   = ba  + bd[prow];
1465             pj   = nbj + bd[prow];
1466             mul1 = *pc1 * *pv;
1467             mul2 = *pc2 * *pv;
1468             mul3 = *pc3 * *pv;
1469             ++pv;
1470             *pc1 = mul1;
1471             *pc2 = mul2;
1472             *pc3 = mul3;
1473 
1474             nz_tmp = bi[prow+1] - bd[prow] - 1;
1475             /* update this row based on pivot row */
1476             for (j=0; j<nz_tmp; j++) {
1477               tmp = pv[j];
1478               idx = pj[j];
1479               rtmp11[idx] -= mul1 * tmp;
1480               rtmp22[idx] -= mul2 * tmp;
1481               rtmp33[idx] -= mul3 * tmp;
1482             }
1483             ierr = PetscLogFlops(6.0*nz_tmp);CHKERRQ(ierr);
1484           }
1485           prow = *bjtmp++ ;
1486         }
1487 
1488         /* Now take care of diagonal 3x3 block in this set of rows */
1489         /* note: prow = row here */
1490         pc1 = rtmp11 + prow;
1491         pc2 = rtmp22 + prow;
1492         pc3 = rtmp33 + prow;
1493 
1494         sctx.pv = *pc1;
1495         pj      = bj + bi[prow];
1496         rs      = 0.0;
1497         for (j=0; j<nz; j++){
1498           idx = pj[j];
1499           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
1500         }
1501         sctx.rs = rs;
1502         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1503         if (newshift == 1) goto endofwhile;
1504 
1505         if (*pc2 != 0.0 || *pc3 != 0.0){
1506           mul2 = (*pc2)/(*pc1);
1507           mul3 = (*pc3)/(*pc1);
1508           *pc2 = mul2;
1509           *pc3 = mul3;
1510           nz_tmp = bi[prow+1] - bd[prow] - 1;
1511           pj     = nbj + bd[prow];
1512           for (j=0; j<nz_tmp; j++) {
1513             idx = pj[j] ;
1514             tmp = rtmp11[idx];
1515             rtmp22[idx] -= mul2 * tmp;
1516             rtmp33[idx] -= mul3 * tmp;
1517           }
1518           ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr);
1519         }
1520         ++prow;
1521 
1522         pc2 = rtmp22 + prow;
1523         pc3 = rtmp33 + prow;
1524         sctx.pv = *pc2;
1525         pj      = bj + bi[prow];
1526         rs      = 0.0;
1527         for (j=0; j<nz; j++){
1528           idx = pj[j];
1529           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
1530         }
1531         sctx.rs = rs;
1532         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1533         if (newshift == 1) goto endofwhile;
1534 
1535         if (*pc3 != 0.0){
1536           mul3   = (*pc3)/(*pc2);
1537           *pc3   = mul3;
1538           pj     = nbj + bd[prow];
1539           nz_tmp = bi[prow+1] - bd[prow] - 1;
1540           for (j=0; j<nz_tmp; j++) {
1541             idx = pj[j] ;
1542             tmp = rtmp22[idx];
1543             rtmp33[idx] -= mul3 * tmp;
1544           }
1545           ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr);
1546         }
1547 
1548         pj  = bj + bi[row];
1549         pc1 = ba + bi[row];
1550         pc2 = ba + bi[row+1];
1551         pc3 = ba + bi[row+2];
1552 
1553         sctx.pv = rtmp33[row+2];
1554         rs = 0.0;
1555         rtmp11[row]   = 1.0/rtmp11[row];
1556         rtmp22[row+1] = 1.0/rtmp22[row+1];
1557         rtmp33[row+2] = 1.0/rtmp33[row+2];
1558         /* copy row entries from dense representation to sparse */
1559         for (j=0; j<nz; j++) {
1560           idx    = pj[j];
1561           pc1[j] = rtmp11[idx];
1562           pc2[j] = rtmp22[idx];
1563           pc3[j] = rtmp33[idx];
1564           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1565         }
1566 
1567         sctx.rs = rs;
1568         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
1569         if (newshift == 1) goto endofwhile;
1570         break;
1571 
1572       default:
1573         SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1574       }
1575       row += nodesz;                 /* Update the row */
1576     }
1577     endofwhile:;
1578   } while (sctx.lushift);
1579   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
1580   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1581   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1582   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1583   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
1584   (B)->ops->solve           = MatSolve_Inode;
1585   /* do not set solve add, since MatSolve_Inode + Add is faster */
1586   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1587   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1588   C->assembled   = PETSC_TRUE;
1589   C->preallocated = PETSC_TRUE;
1590   if (sctx.nshift) {
1591     if (info->shiftpd) {
1592       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1593     } else if (info->shiftnz) {
1594       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1595     }
1596   }
1597   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1598   PetscFunctionReturn(0);
1599 }
1600 
1601 /*
1602      Makes a longer coloring[] array and calls the usual code with that
1603 */
1604 #undef __FUNCT__
1605 #define __FUNCT__ "MatColoringPatch_Inode"
1606 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1607 {
1608   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
1609   PetscErrorCode  ierr;
1610   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1611   PetscInt        *colorused,i;
1612   ISColoringValue *newcolor;
1613 
1614   PetscFunctionBegin;
1615   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
1616   /* loop over inodes, marking a color for each column*/
1617   row = 0;
1618   for (i=0; i<m; i++){
1619     for (j=0; j<ns[i]; j++) {
1620       newcolor[row++] = coloring[i] + j*ncolors;
1621     }
1622   }
1623 
1624   /* eliminate unneeded colors */
1625   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
1626   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
1627   for (i=0; i<n; i++) {
1628     colorused[newcolor[i]] = 1;
1629   }
1630 
1631   for (i=1; i<5*ncolors; i++) {
1632     colorused[i] += colorused[i-1];
1633   }
1634   ncolors = colorused[5*ncolors-1];
1635   for (i=0; i<n; i++) {
1636     newcolor[i] = colorused[newcolor[i]]-1;
1637   }
1638   ierr = PetscFree(colorused);CHKERRQ(ierr);
1639   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
1640   ierr = PetscFree(coloring);CHKERRQ(ierr);
1641   PetscFunctionReturn(0);
1642 }
1643 
1644 #include "../src/mat/blockinvert.h"
1645 
1646 #undef __FUNCT__
1647 #define __FUNCT__ "MatRelax_Inode"
1648 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1649 {
1650   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1651   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
1652   MatScalar          *ibdiag,*bdiag;
1653   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
1654   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
1655   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
1656   PetscErrorCode     ierr;
1657   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
1658   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k;
1659 
1660   PetscFunctionBegin;
1661   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
1662   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
1663   if (its > 1) {
1664     /* switch to non-inode version */
1665     ierr = MatRelax_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
1666     PetscFunctionReturn(0);
1667   }
1668 
1669   if (!a->inode.ibdiagvalid) {
1670     if (!a->inode.ibdiag) {
1671       /* calculate space needed for diagonal blocks */
1672       for (i=0; i<m; i++) {
1673 	cnt += sizes[i]*sizes[i];
1674       }
1675       a->inode.bdiagsize = cnt;
1676       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
1677     }
1678 
1679     /* copy over the diagonal blocks and invert them */
1680     ibdiag = a->inode.ibdiag;
1681     bdiag  = a->inode.bdiag;
1682     cnt = 0;
1683     for (i=0, row = 0; i<m; i++) {
1684       for (j=0; j<sizes[i]; j++) {
1685         for (k=0; k<sizes[i]; k++) {
1686           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
1687         }
1688       }
1689       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
1690 
1691       switch(sizes[i]) {
1692         case 1:
1693           /* Create matrix data structure */
1694           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
1695           ibdiag[cnt] = 1.0/ibdiag[cnt];
1696           break;
1697         case 2:
1698           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
1699           break;
1700         case 3:
1701           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
1702           break;
1703         case 4:
1704           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
1705           break;
1706         case 5:
1707           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr);
1708           break;
1709        default:
1710 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1711       }
1712       cnt += sizes[i]*sizes[i];
1713       row += sizes[i];
1714     }
1715     a->inode.ibdiagvalid = PETSC_TRUE;
1716   }
1717   ibdiag = a->inode.ibdiag;
1718   bdiag  = a->inode.bdiag;
1719 
1720 
1721   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1722   if (flag & SOR_ZERO_INITIAL_GUESS) {
1723     PetscScalar *b;
1724     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1725     if (xx != bb) {
1726       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1727     } else {
1728       b = x;
1729     }
1730     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
1731 
1732       for (i=0, row=0; i<m; i++) {
1733         sz  = diag[row] - ii[row];
1734         v1  = a->a + ii[row];
1735         idx = a->j + ii[row];
1736 
1737         /* see comments for MatMult_Inode() for how this is coded */
1738         switch (sizes[i]){
1739           case 1:
1740 
1741             sum1  = b[row];
1742             for(n = 0; n<sz-1; n+=2) {
1743               i1   = idx[0];
1744               i2   = idx[1];
1745               idx += 2;
1746               tmp0 = x[i1];
1747               tmp1 = x[i2];
1748               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1749             }
1750 
1751             if (n == sz-1){
1752               tmp0  = x[*idx];
1753               sum1 -= *v1 * tmp0;
1754             }
1755             x[row++] = sum1*(*ibdiag++);
1756             break;
1757           case 2:
1758             v2    = a->a + ii[row+1];
1759             sum1  = b[row];
1760             sum2  = b[row+1];
1761             for(n = 0; n<sz-1; n+=2) {
1762               i1   = idx[0];
1763               i2   = idx[1];
1764               idx += 2;
1765               tmp0 = x[i1];
1766               tmp1 = x[i2];
1767               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1768               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1769             }
1770 
1771             if (n == sz-1){
1772               tmp0  = x[*idx];
1773               sum1 -= v1[0] * tmp0;
1774               sum2 -= v2[0] * tmp0;
1775             }
1776             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
1777             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
1778             ibdiag  += 4;
1779             break;
1780           case 3:
1781             v2    = a->a + ii[row+1];
1782             v3    = a->a + ii[row+2];
1783             sum1  = b[row];
1784             sum2  = b[row+1];
1785             sum3  = b[row+2];
1786             for(n = 0; n<sz-1; n+=2) {
1787               i1   = idx[0];
1788               i2   = idx[1];
1789               idx += 2;
1790               tmp0 = x[i1];
1791               tmp1 = x[i2];
1792               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1793               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1794               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1795             }
1796 
1797             if (n == sz-1){
1798               tmp0  = x[*idx];
1799               sum1 -= v1[0] * tmp0;
1800               sum2 -= v2[0] * tmp0;
1801               sum3 -= v3[0] * tmp0;
1802             }
1803             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
1804             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
1805             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
1806             ibdiag  += 9;
1807             break;
1808           case 4:
1809             v2    = a->a + ii[row+1];
1810             v3    = a->a + ii[row+2];
1811             v4    = a->a + ii[row+3];
1812             sum1  = b[row];
1813             sum2  = b[row+1];
1814             sum3  = b[row+2];
1815             sum4  = b[row+3];
1816             for(n = 0; n<sz-1; n+=2) {
1817               i1   = idx[0];
1818               i2   = idx[1];
1819               idx += 2;
1820               tmp0 = x[i1];
1821               tmp1 = x[i2];
1822               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1823               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1824               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1825               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1826             }
1827 
1828             if (n == sz-1){
1829               tmp0  = x[*idx];
1830               sum1 -= v1[0] * tmp0;
1831               sum2 -= v2[0] * tmp0;
1832               sum3 -= v3[0] * tmp0;
1833               sum4 -= v4[0] * tmp0;
1834             }
1835             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
1836             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
1837             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
1838             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
1839             ibdiag  += 16;
1840             break;
1841           case 5:
1842             v2    = a->a + ii[row+1];
1843             v3    = a->a + ii[row+2];
1844             v4    = a->a + ii[row+3];
1845             v5    = a->a + ii[row+4];
1846             sum1  = b[row];
1847             sum2  = b[row+1];
1848             sum3  = b[row+2];
1849             sum4  = b[row+3];
1850             sum5  = b[row+4];
1851             for(n = 0; n<sz-1; n+=2) {
1852               i1   = idx[0];
1853               i2   = idx[1];
1854               idx += 2;
1855               tmp0 = x[i1];
1856               tmp1 = x[i2];
1857               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1858               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1859               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
1860               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
1861               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
1862             }
1863 
1864             if (n == sz-1){
1865               tmp0  = x[*idx];
1866               sum1 -= v1[0] * tmp0;
1867               sum2 -= v2[0] * tmp0;
1868               sum3 -= v3[0] * tmp0;
1869               sum4 -= v4[0] * tmp0;
1870               sum5 -= v5[0] * tmp0;
1871             }
1872             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
1873             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
1874             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
1875             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
1876             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
1877             ibdiag  += 25;
1878             break;
1879 	  default:
1880    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1881         }
1882       }
1883 
1884       xb = x;
1885       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1886     } else xb = b;
1887     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
1888         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1889       cnt = 0;
1890       for (i=0, row=0; i<m; i++) {
1891 
1892         switch (sizes[i]){
1893           case 1:
1894             x[row++] *= bdiag[cnt++];
1895             break;
1896           case 2:
1897             x1   = x[row]; x2 = x[row+1];
1898             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
1899             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
1900             x[row++] = tmp1;
1901             x[row++] = tmp2;
1902             cnt += 4;
1903             break;
1904           case 3:
1905             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
1906             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
1907             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
1908             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
1909             x[row++] = tmp1;
1910             x[row++] = tmp2;
1911             x[row++] = tmp3;
1912             cnt += 9;
1913             break;
1914           case 4:
1915             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
1916             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
1917             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
1918             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
1919             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
1920             x[row++] = tmp1;
1921             x[row++] = tmp2;
1922             x[row++] = tmp3;
1923             x[row++] = tmp4;
1924             cnt += 16;
1925             break;
1926           case 5:
1927             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
1928             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
1929             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
1930             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
1931             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
1932             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
1933             x[row++] = tmp1;
1934             x[row++] = tmp2;
1935             x[row++] = tmp3;
1936             x[row++] = tmp4;
1937             x[row++] = tmp5;
1938             cnt += 25;
1939             break;
1940 	  default:
1941    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
1942         }
1943       }
1944       ierr = PetscLogFlops(m);CHKERRQ(ierr);
1945     }
1946     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
1947 
1948       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
1949       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
1950         ibdiag -= sizes[i]*sizes[i];
1951         sz      = ii[row+1] - diag[row] - 1;
1952         v1      = a->a + diag[row] + 1;
1953         idx     = a->j + diag[row] + 1;
1954 
1955         /* see comments for MatMult_Inode() for how this is coded */
1956         switch (sizes[i]){
1957           case 1:
1958 
1959             sum1  = xb[row];
1960             for(n = 0; n<sz-1; n+=2) {
1961               i1   = idx[0];
1962               i2   = idx[1];
1963               idx += 2;
1964               tmp0 = x[i1];
1965               tmp1 = x[i2];
1966               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1967             }
1968 
1969             if (n == sz-1){
1970               tmp0  = x[*idx];
1971               sum1 -= *v1*tmp0;
1972             }
1973             x[row--] = sum1*(*ibdiag);
1974             break;
1975 
1976           case 2:
1977 
1978             sum1  = xb[row];
1979             sum2  = xb[row-1];
1980             /* note that sum1 is associated with the second of the two rows */
1981             v2    = a->a + diag[row-1] + 2;
1982             for(n = 0; n<sz-1; n+=2) {
1983               i1   = idx[0];
1984               i2   = idx[1];
1985               idx += 2;
1986               tmp0 = x[i1];
1987               tmp1 = x[i2];
1988               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
1989               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
1990             }
1991 
1992             if (n == sz-1){
1993               tmp0  = x[*idx];
1994               sum1 -= *v1*tmp0;
1995               sum2 -= *v2*tmp0;
1996             }
1997             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
1998             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
1999             break;
2000           case 3:
2001 
2002             sum1  = xb[row];
2003             sum2  = xb[row-1];
2004             sum3  = xb[row-2];
2005             v2    = a->a + diag[row-1] + 2;
2006             v3    = a->a + diag[row-2] + 3;
2007             for(n = 0; n<sz-1; n+=2) {
2008               i1   = idx[0];
2009               i2   = idx[1];
2010               idx += 2;
2011               tmp0 = x[i1];
2012               tmp1 = x[i2];
2013               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2014               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2015               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2016             }
2017 
2018             if (n == sz-1){
2019               tmp0  = x[*idx];
2020               sum1 -= *v1*tmp0;
2021               sum2 -= *v2*tmp0;
2022               sum3 -= *v3*tmp0;
2023             }
2024             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2025             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2026             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2027             break;
2028           case 4:
2029 
2030             sum1  = xb[row];
2031             sum2  = xb[row-1];
2032             sum3  = xb[row-2];
2033             sum4  = xb[row-3];
2034             v2    = a->a + diag[row-1] + 2;
2035             v3    = a->a + diag[row-2] + 3;
2036             v4    = a->a + diag[row-3] + 4;
2037             for(n = 0; n<sz-1; n+=2) {
2038               i1   = idx[0];
2039               i2   = idx[1];
2040               idx += 2;
2041               tmp0 = x[i1];
2042               tmp1 = x[i2];
2043               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2044               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2045               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2046               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2047             }
2048 
2049             if (n == sz-1){
2050               tmp0  = x[*idx];
2051               sum1 -= *v1*tmp0;
2052               sum2 -= *v2*tmp0;
2053               sum3 -= *v3*tmp0;
2054               sum4 -= *v4*tmp0;
2055             }
2056             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2057             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2058             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2059             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2060             break;
2061           case 5:
2062 
2063             sum1  = xb[row];
2064             sum2  = xb[row-1];
2065             sum3  = xb[row-2];
2066             sum4  = xb[row-3];
2067             sum5  = xb[row-4];
2068             v2    = a->a + diag[row-1] + 2;
2069             v3    = a->a + diag[row-2] + 3;
2070             v4    = a->a + diag[row-3] + 4;
2071             v5    = a->a + diag[row-4] + 5;
2072             for(n = 0; n<sz-1; n+=2) {
2073               i1   = idx[0];
2074               i2   = idx[1];
2075               idx += 2;
2076               tmp0 = x[i1];
2077               tmp1 = x[i2];
2078               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2079               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2080               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2081               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2082               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2083             }
2084 
2085             if (n == sz-1){
2086               tmp0  = x[*idx];
2087               sum1 -= *v1*tmp0;
2088               sum2 -= *v2*tmp0;
2089               sum3 -= *v3*tmp0;
2090               sum4 -= *v4*tmp0;
2091               sum5 -= *v5*tmp0;
2092             }
2093             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2094             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2095             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2096             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2097             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2098             break;
2099 	  default:
2100    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2101         }
2102       }
2103 
2104       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2105     }
2106     its--;
2107     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2108     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
2109   }
2110   if (flag & SOR_EISENSTAT) {
2111     const PetscScalar *b;
2112     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2113     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2114     MatScalar *t = a->inode.ssor_work;
2115     /*
2116           Apply  (U + D)^-1  where D is now the block diagonal
2117     */
2118     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2119     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2120       ibdiag -= sizes[i]*sizes[i];
2121       sz      = ii[row+1] - diag[row] - 1;
2122       v1      = a->a + diag[row] + 1;
2123       idx     = a->j + diag[row] + 1;
2124       CHKMEMQ;
2125       /* see comments for MatMult_Inode() for how this is coded */
2126       switch (sizes[i]){
2127         case 1:
2128 
2129 	  sum1  = b[row];
2130 	  for(n = 0; n<sz-1; n+=2) {
2131 	    i1   = idx[0];
2132 	    i2   = idx[1];
2133 	    idx += 2;
2134 	    tmp0 = x[i1];
2135 	    tmp1 = x[i2];
2136 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2137 	  }
2138 
2139 	  if (n == sz-1){
2140 	    tmp0  = x[*idx];
2141 	    sum1 -= *v1*tmp0;
2142 	  }
2143 	  x[row] = sum1*(*ibdiag);row--;
2144 	  break;
2145 
2146         case 2:
2147 
2148 	  sum1  = b[row];
2149 	  sum2  = b[row-1];
2150 	  /* note that sum1 is associated with the second of the two rows */
2151 	  v2    = a->a + diag[row-1] + 2;
2152 	  for(n = 0; n<sz-1; n+=2) {
2153 	    i1   = idx[0];
2154 	    i2   = idx[1];
2155 	    idx += 2;
2156 	    tmp0 = x[i1];
2157 	    tmp1 = x[i2];
2158 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2159 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2160 	  }
2161 
2162 	  if (n == sz-1){
2163 	    tmp0  = x[*idx];
2164 	    sum1 -= *v1*tmp0;
2165 	    sum2 -= *v2*tmp0;
2166 	  }
2167 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
2168 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
2169           row -= 2;
2170 	  break;
2171         case 3:
2172 
2173 	  sum1  = b[row];
2174 	  sum2  = b[row-1];
2175 	  sum3  = b[row-2];
2176 	  v2    = a->a + diag[row-1] + 2;
2177 	  v3    = a->a + diag[row-2] + 3;
2178 	  for(n = 0; n<sz-1; n+=2) {
2179 	    i1   = idx[0];
2180 	    i2   = idx[1];
2181 	    idx += 2;
2182 	    tmp0 = x[i1];
2183 	    tmp1 = x[i2];
2184 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2185 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2186 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2187 	  }
2188 
2189 	  if (n == sz-1){
2190 	    tmp0  = x[*idx];
2191 	    sum1 -= *v1*tmp0;
2192 	    sum2 -= *v2*tmp0;
2193 	    sum3 -= *v3*tmp0;
2194 	  }
2195 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2196 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2197 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2198           row -= 3;
2199 	  break;
2200         case 4:
2201 
2202 	  sum1  = b[row];
2203 	  sum2  = b[row-1];
2204 	  sum3  = b[row-2];
2205 	  sum4  = b[row-3];
2206 	  v2    = a->a + diag[row-1] + 2;
2207 	  v3    = a->a + diag[row-2] + 3;
2208 	  v4    = a->a + diag[row-3] + 4;
2209 	  for(n = 0; n<sz-1; n+=2) {
2210 	    i1   = idx[0];
2211 	    i2   = idx[1];
2212 	    idx += 2;
2213 	    tmp0 = x[i1];
2214 	    tmp1 = x[i2];
2215 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2216 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2217 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2218 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2219 	  }
2220 
2221 	  if (n == sz-1){
2222 	    tmp0  = x[*idx];
2223 	    sum1 -= *v1*tmp0;
2224 	    sum2 -= *v2*tmp0;
2225 	    sum3 -= *v3*tmp0;
2226 	    sum4 -= *v4*tmp0;
2227 	  }
2228 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2229 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2230 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2231 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2232           row -= 4;
2233 	  break;
2234         case 5:
2235 
2236 	  sum1  = b[row];
2237 	  sum2  = b[row-1];
2238 	  sum3  = b[row-2];
2239 	  sum4  = b[row-3];
2240 	  sum5  = b[row-4];
2241 	  v2    = a->a + diag[row-1] + 2;
2242 	  v3    = a->a + diag[row-2] + 3;
2243 	  v4    = a->a + diag[row-3] + 4;
2244 	  v5    = a->a + diag[row-4] + 5;
2245 	  for(n = 0; n<sz-1; n+=2) {
2246 	    i1   = idx[0];
2247 	    i2   = idx[1];
2248 	    idx += 2;
2249 	    tmp0 = x[i1];
2250 	    tmp1 = x[i2];
2251 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2252 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2253 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2254 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2255 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2256 	  }
2257 
2258 	  if (n == sz-1){
2259 	    tmp0  = x[*idx];
2260 	    sum1 -= *v1*tmp0;
2261 	    sum2 -= *v2*tmp0;
2262 	    sum3 -= *v3*tmp0;
2263 	    sum4 -= *v4*tmp0;
2264 	    sum5 -= *v5*tmp0;
2265 	  }
2266 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
2267 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
2268 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
2269 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
2270 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
2271           row -= 5;
2272 	  break;
2273         default:
2274 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2275       }
2276       CHKMEMQ;
2277     }
2278     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2279 
2280     /*
2281            t = b - D x    where D is the block diagonal
2282     */
2283     cnt = 0;
2284     for (i=0, row=0; i<m; i++) {
2285       CHKMEMQ;
2286       switch (sizes[i]){
2287         case 1:
2288 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
2289 	  break;
2290         case 2:
2291 	  x1   = x[row]; x2 = x[row+1];
2292 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2293 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2294 	  t[row]   = b[row] - tmp1;
2295 	  t[row+1] = b[row+1] - tmp2; row += 2;
2296 	  cnt += 4;
2297 	  break;
2298         case 3:
2299 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2300 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2301 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2302 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2303 	  t[row] = b[row] - tmp1;
2304 	  t[row+1] = b[row+1] - tmp2;
2305 	  t[row+2] = b[row+2] - tmp3; row += 3;
2306 	  cnt += 9;
2307 	  break;
2308         case 4:
2309 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2310 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2311 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2312 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2313 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2314 	  t[row] = b[row] - tmp1;
2315 	  t[row+1] = b[row+1] - tmp2;
2316 	  t[row+2] = b[row+2] - tmp3;
2317 	  t[row+3] = b[row+3] - tmp4; row += 4;
2318 	  cnt += 16;
2319 	  break;
2320         case 5:
2321 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2322 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2323 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2324 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2325 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2326 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2327 	  t[row] = b[row] - tmp1;
2328 	  t[row+1] = b[row+1] - tmp2;
2329 	  t[row+2] = b[row+2] - tmp3;
2330 	  t[row+3] = b[row+3] - tmp4;
2331 	  t[row+4] = b[row+4] - tmp5;row += 5;
2332 	  cnt += 25;
2333 	  break;
2334         default:
2335 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2336       }
2337       CHKMEMQ;
2338     }
2339     ierr = PetscLogFlops(m);CHKERRQ(ierr);
2340 
2341 
2342 
2343     /*
2344           Apply (L + D)^-1 where D is the block diagonal
2345     */
2346     for (i=0, row=0; i<m; i++) {
2347       sz  = diag[row] - ii[row];
2348       v1  = a->a + ii[row];
2349       idx = a->j + ii[row];
2350       CHKMEMQ;
2351       /* see comments for MatMult_Inode() for how this is coded */
2352       switch (sizes[i]){
2353         case 1:
2354 
2355 	  sum1  = t[row];
2356 	  for(n = 0; n<sz-1; n+=2) {
2357 	    i1   = idx[0];
2358 	    i2   = idx[1];
2359 	    idx += 2;
2360 	    tmp0 = t[i1];
2361 	    tmp1 = t[i2];
2362 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2363 	  }
2364 
2365 	  if (n == sz-1){
2366 	    tmp0  = t[*idx];
2367 	    sum1 -= *v1 * tmp0;
2368 	  }
2369 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
2370 	  break;
2371         case 2:
2372 	  v2    = a->a + ii[row+1];
2373 	  sum1  = t[row];
2374 	  sum2  = t[row+1];
2375 	  for(n = 0; n<sz-1; n+=2) {
2376 	    i1   = idx[0];
2377 	    i2   = idx[1];
2378 	    idx += 2;
2379 	    tmp0 = t[i1];
2380 	    tmp1 = t[i2];
2381 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2382 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2383 	  }
2384 
2385 	  if (n == sz-1){
2386 	    tmp0  = t[*idx];
2387 	    sum1 -= v1[0] * tmp0;
2388 	    sum2 -= v2[0] * tmp0;
2389 	  }
2390 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
2391 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
2392 	  ibdiag  += 4; row += 2;
2393 	  break;
2394         case 3:
2395 	  v2    = a->a + ii[row+1];
2396 	  v3    = a->a + ii[row+2];
2397 	  sum1  = t[row];
2398 	  sum2  = t[row+1];
2399 	  sum3  = t[row+2];
2400 	  for(n = 0; n<sz-1; n+=2) {
2401 	    i1   = idx[0];
2402 	    i2   = idx[1];
2403 	    idx += 2;
2404 	    tmp0 = t[i1];
2405 	    tmp1 = t[i2];
2406 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2407 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2408 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2409 	  }
2410 
2411 	  if (n == sz-1){
2412 	    tmp0  = t[*idx];
2413 	    sum1 -= v1[0] * tmp0;
2414 	    sum2 -= v2[0] * tmp0;
2415 	    sum3 -= v3[0] * tmp0;
2416 	  }
2417 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2418 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2419 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2420 	  ibdiag  += 9; row += 3;
2421 	  break;
2422         case 4:
2423 	  v2    = a->a + ii[row+1];
2424 	  v3    = a->a + ii[row+2];
2425 	  v4    = a->a + ii[row+3];
2426 	  sum1  = t[row];
2427 	  sum2  = t[row+1];
2428 	  sum3  = t[row+2];
2429 	  sum4  = t[row+3];
2430 	  for(n = 0; n<sz-1; n+=2) {
2431 	    i1   = idx[0];
2432 	    i2   = idx[1];
2433 	    idx += 2;
2434 	    tmp0 = t[i1];
2435 	    tmp1 = t[i2];
2436 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2437 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2438 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2439 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2440 	  }
2441 
2442 	  if (n == sz-1){
2443 	    tmp0  = t[*idx];
2444 	    sum1 -= v1[0] * tmp0;
2445 	    sum2 -= v2[0] * tmp0;
2446 	    sum3 -= v3[0] * tmp0;
2447 	    sum4 -= v4[0] * tmp0;
2448 	  }
2449 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2450 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2451 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2452 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2453 	  ibdiag  += 16; row += 4;
2454 	  break;
2455         case 5:
2456 	  v2    = a->a + ii[row+1];
2457 	  v3    = a->a + ii[row+2];
2458 	  v4    = a->a + ii[row+3];
2459 	  v5    = a->a + ii[row+4];
2460 	  sum1  = t[row];
2461 	  sum2  = t[row+1];
2462 	  sum3  = t[row+2];
2463 	  sum4  = t[row+3];
2464 	  sum5  = t[row+4];
2465 	  for(n = 0; n<sz-1; n+=2) {
2466 	    i1   = idx[0];
2467 	    i2   = idx[1];
2468 	    idx += 2;
2469 	    tmp0 = t[i1];
2470 	    tmp1 = t[i2];
2471 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2472 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2473 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2474 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2475 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2476 	  }
2477 
2478 	  if (n == sz-1){
2479 	    tmp0  = t[*idx];
2480 	    sum1 -= v1[0] * tmp0;
2481 	    sum2 -= v2[0] * tmp0;
2482 	    sum3 -= v3[0] * tmp0;
2483 	    sum4 -= v4[0] * tmp0;
2484 	    sum5 -= v5[0] * tmp0;
2485 	  }
2486 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2487 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2488 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2489 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2490 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2491 	  ibdiag  += 25; row += 5;
2492 	  break;
2493         default:
2494 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2495       }
2496       CHKMEMQ;
2497     }
2498     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2499     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2500     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2501   }
2502   PetscFunctionReturn(0);
2503 }
2504 
2505 #undef __FUNCT__
2506 #define __FUNCT__ "MatMultDiagonalBlock_Inode"
2507 PetscErrorCode MatMultDiagonalBlock_Inode(Mat A,Vec bb,Vec xx)
2508 {
2509   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2510   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
2511   const MatScalar    *bdiag = a->inode.bdiag;
2512   const PetscScalar  *b;
2513   PetscErrorCode      ierr;
2514   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
2515   const PetscInt      *sizes = a->inode.size;
2516 
2517   PetscFunctionBegin;
2518   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2519   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2520   cnt = 0;
2521   for (i=0, row=0; i<m; i++) {
2522     switch (sizes[i]){
2523       case 1:
2524 	x[row] = b[row]*bdiag[cnt++];row++;
2525 	break;
2526       case 2:
2527 	x1   = b[row]; x2 = b[row+1];
2528 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2529 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2530 	x[row++] = tmp1;
2531 	x[row++] = tmp2;
2532 	cnt += 4;
2533 	break;
2534       case 3:
2535 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
2536 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2537 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2538 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2539 	x[row++] = tmp1;
2540 	x[row++] = tmp2;
2541 	x[row++] = tmp3;
2542 	cnt += 9;
2543 	break;
2544       case 4:
2545 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
2546 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2547 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2548 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2549 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2550 	x[row++] = tmp1;
2551 	x[row++] = tmp2;
2552 	x[row++] = tmp3;
2553 	x[row++] = tmp4;
2554 	cnt += 16;
2555 	break;
2556       case 5:
2557 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
2558 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2559 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2560 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2561 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2562 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2563 	x[row++] = tmp1;
2564 	x[row++] = tmp2;
2565 	x[row++] = tmp3;
2566 	x[row++] = tmp4;
2567 	x[row++] = tmp5;
2568 	cnt += 25;
2569 	break;
2570       default:
2571 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2572     }
2573   }
2574   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
2575   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2576   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2577   PetscFunctionReturn(0);
2578 }
2579 
2580 extern PetscErrorCode MatMultDiagonalBlock_Inode(Mat,Vec,Vec);
2581 /*
2582     samestructure indicates that the matrix has not changed its nonzero structure so we
2583     do not need to recompute the inodes
2584 */
2585 #undef __FUNCT__
2586 #define __FUNCT__ "Mat_CheckInode"
2587 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
2588 {
2589   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2590   PetscErrorCode ierr;
2591   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
2592   PetscTruth     flag;
2593 
2594   PetscFunctionBegin;
2595   if (!a->inode.use)                     PetscFunctionReturn(0);
2596   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
2597 
2598 
2599   m = A->rmap->n;
2600   if (a->inode.size) {ns = a->inode.size;}
2601   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
2602 
2603   i          = 0;
2604   node_count = 0;
2605   idx        = a->j;
2606   ii         = a->i;
2607   while (i < m){                /* For each row */
2608     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
2609     /* Limits the number of elements in a node to 'a->inode.limit' */
2610     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
2611       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
2612       if (nzy != nzx) break;
2613       idy  += nzx;             /* Same nonzero pattern */
2614       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
2615       if (!flag) break;
2616     }
2617     ns[node_count++] = blk_size;
2618     idx += blk_size*nzx;
2619     i    = j;
2620   }
2621   /* If not enough inodes found,, do not use inode version of the routines */
2622   if (!a->inode.size && m && node_count > .9*m) {
2623     ierr = PetscFree(ns);CHKERRQ(ierr);
2624     a->inode.node_count     = 0;
2625     a->inode.size           = PETSC_NULL;
2626     a->inode.use            = PETSC_FALSE;
2627     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
2628   } else {
2629     A->ops->mult              = MatMult_Inode;
2630     A->ops->relax             = MatRelax_Inode;
2631     A->ops->multadd           = MatMultAdd_Inode;
2632     A->ops->getrowij          = MatGetRowIJ_Inode;
2633     A->ops->restorerowij      = MatRestoreRowIJ_Inode;
2634     A->ops->getcolumnij       = MatGetColumnIJ_Inode;
2635     A->ops->restorecolumnij   = MatRestoreColumnIJ_Inode;
2636     A->ops->coloringpatch     = MatColoringPatch_Inode;
2637     A->ops->multdiagonalblock = MatMultDiagonalBlock_Inode;
2638     a->inode.node_count       = node_count;
2639     a->inode.size             = ns;
2640     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
2641   }
2642   PetscFunctionReturn(0);
2643 }
2644 
2645 /*
2646      This is really ugly. if inodes are used this replaces the
2647   permutations with ones that correspond to rows/cols of the matrix
2648   rather then inode blocks
2649 */
2650 #undef __FUNCT__
2651 #define __FUNCT__ "MatInodeAdjustForInodes"
2652 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
2653 {
2654   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
2655 
2656   PetscFunctionBegin;
2657   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
2658   if (f) {
2659     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
2660   }
2661   PetscFunctionReturn(0);
2662 }
2663 
2664 EXTERN_C_BEGIN
2665 #undef __FUNCT__
2666 #define __FUNCT__ "MatAdjustForInodes_Inode"
2667 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
2668 {
2669   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
2670   PetscErrorCode ierr;
2671   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
2672   const PetscInt *ridx,*cidx;
2673   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
2674   PetscInt       nslim_col,*ns_col;
2675   IS             ris = *rperm,cis = *cperm;
2676 
2677   PetscFunctionBegin;
2678   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
2679   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
2680 
2681   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
2682   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
2683   ierr  = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr);
2684   permc = permr + m;
2685 
2686   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
2687   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
2688 
2689   /* Form the inode structure for the rows of permuted matric using inv perm*/
2690   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
2691 
2692   /* Construct the permutations for rows*/
2693   for (i=0,row = 0; i<nslim_row; ++i){
2694     indx      = ridx[i];
2695     start_val = tns[indx];
2696     end_val   = tns[indx + 1];
2697     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
2698   }
2699 
2700   /* Form the inode structure for the columns of permuted matrix using inv perm*/
2701   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
2702 
2703  /* Construct permutations for columns */
2704   for (i=0,col=0; i<nslim_col; ++i){
2705     indx      = cidx[i];
2706     start_val = tns[indx];
2707     end_val   = tns[indx + 1];
2708     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
2709   }
2710 
2711   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
2712   ISSetPermutation(*rperm);
2713   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
2714   ISSetPermutation(*cperm);
2715 
2716   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
2717   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
2718 
2719   ierr = PetscFree(ns_col);CHKERRQ(ierr);
2720   ierr = PetscFree(permr);CHKERRQ(ierr);
2721   ierr = ISDestroy(cis);CHKERRQ(ierr);
2722   ierr = ISDestroy(ris);CHKERRQ(ierr);
2723   ierr = PetscFree(tns);CHKERRQ(ierr);
2724   PetscFunctionReturn(0);
2725 }
2726 EXTERN_C_END
2727 
2728 #undef __FUNCT__
2729 #define __FUNCT__ "MatInodeGetInodeSizes"
2730 /*@C
2731    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
2732 
2733    Collective on Mat
2734 
2735    Input Parameter:
2736 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
2737 
2738    Output Parameter:
2739 +  node_count - no of inodes present in the matrix.
2740 .  sizes      - an array of size node_count,with sizes of each inode.
2741 -  limit      - the max size used to generate the inodes.
2742 
2743    Level: advanced
2744 
2745    Notes: This routine returns some internal storage information
2746    of the matrix, it is intended to be used by advanced users.
2747    It should be called after the matrix is assembled.
2748    The contents of the sizes[] array should not be changed.
2749    PETSC_NULL may be passed for information not requested.
2750 
2751 .keywords: matrix, seqaij, get, inode
2752 
2753 .seealso: MatGetInfo()
2754 @*/
2755 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2756 {
2757   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
2758 
2759   PetscFunctionBegin;
2760   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
2761   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
2762   if (f) {
2763     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
2764   }
2765   PetscFunctionReturn(0);
2766 }
2767 
2768 EXTERN_C_BEGIN
2769 #undef __FUNCT__
2770 #define __FUNCT__ "MatInodeGetInodeSizes_Inode"
2771 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
2772 {
2773   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2774 
2775   PetscFunctionBegin;
2776   if (node_count) *node_count = a->inode.node_count;
2777   if (sizes)      *sizes      = a->inode.size;
2778   if (limit)      *limit      = a->inode.limit;
2779   PetscFunctionReturn(0);
2780 }
2781 EXTERN_C_END
2782