xref: /petsc/src/mat/impls/aij/seq/inode.c (revision db32a24517e7892d23ded433de58be0568ffb6aa)
1 #define PETSCMAT_DLL
2 
3 /*
4   This file provides high performance routines for the Inode format (compressed sparse row)
5   by taking advantage of rows with identical nonzero structure (I-nodes).
6 */
7 #include "../src/mat/impls/aij/seq/aij.h"
8 
9 #undef __FUNCT__
10 #define __FUNCT__ "Mat_CreateColInode"
11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12 {
13   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
14   PetscErrorCode ierr;
15   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
16 
17   PetscFunctionBegin;
18   n      = A->cmap->n;
19   m      = A->rmap->n;
20   ns_row = a->inode.size;
21 
22   min_mn = (m < n) ? m : n;
23   if (!ns) {
24     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25     for(; count+1 < n; count++,i++);
26     if (count < n)  {
27       i++;
28     }
29     *size = i;
30     PetscFunctionReturn(0);
31   }
32   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
33 
34   /* Use the same row structure wherever feasible. */
35   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36     ns_col[i] = ns_row[i];
37   }
38 
39   /* if m < n; pad up the remainder with inode_limit */
40   for(; count+1 < n; count++,i++) {
41     ns_col[i] = 1;
42   }
43   /* The last node is the odd ball. padd it up with the remaining rows; */
44   if (count < n)  {
45     ns_col[i] = n - count;
46     i++;
47   } else if (count > n) {
48     /* Adjust for the over estimation */
49     ns_col[i-1] += n - count;
50   }
51   *size = i;
52   *ns   = ns_col;
53   PetscFunctionReturn(0);
54 }
55 
56 
57 /*
58       This builds symmetric version of nonzero structure,
59 */
60 #undef __FUNCT__
61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63 {
64   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
65   PetscErrorCode ierr;
66   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1){
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col)
87       tvc[col] = i1;
88   }
89   /* allocate space for row pointers */
90   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
91   *iia = ia;
92   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
93   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
94 
95   /* determine the number of columns in each row */
96   ia[0] = oshift;
97   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
98 
99     j    = aj + ai[row] + ishift;
100     jmax = aj + ai[row+1] + ishift;
101     i2   = 0;
102     col  = *j++ + ishift;
103     i2   = tvc[col];
104     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105       ia[i1+1]++;
106       ia[i2+1]++;
107       i2++;                     /* Start col of next node */
108       while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109       i2 = tvc[col];
110     }
111     if(i2 == i1) ia[i2+1]++;    /* now the diagonal element */
112   }
113 
114   /* shift ia[i] to point to next row */
115   for (i1=1; i1<nslim_row+1; i1++) {
116     row        = ia[i1-1];
117     ia[i1]    += row;
118     work[i1-1] = row - oshift;
119   }
120 
121   /* allocate space for column pointers */
122   nz   = ia[nslim_row] + (!ishift);
123   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
124   *jja = ja;
125 
126  /* loop over lower triangular part putting into ja */
127   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128     j    = aj + ai[row] + ishift;
129     jmax = aj + ai[row+1] + ishift;
130     i2   = 0;                     /* Col inode index */
131     col  = *j++ + ishift;
132     i2   = tvc[col];
133     while (i2<i1 && j<jmax) {
134       ja[work[i2]++] = i1 + oshift;
135       ja[work[i1]++] = i2 + oshift;
136       ++i2;
137       while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138       i2 = tvc[col];
139     }
140     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
141 
142   }
143   ierr = PetscFree(work);CHKERRQ(ierr);
144   ierr = PetscFree(tns);CHKERRQ(ierr);
145   ierr = PetscFree(tvc);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 /*
150       This builds nonsymmetric version of nonzero structure,
151 */
152 #undef __FUNCT__
153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155 {
156   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
157   PetscErrorCode ierr;
158   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
160 
161   PetscFunctionBegin;
162   nslim_row = a->inode.node_count;
163   n         = A->cmap->n;
164 
165   /* Create The column_inode for this matrix */
166   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
167 
168   /* allocate space for reformated column_inode structure */
169   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
170   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
171   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
172 
173   for (i1=0,col=0; i1<nslim_col; ++i1){
174     nsz = ns_col[i1];
175     for (i2=0; i2<nsz; ++i2,++col)
176       tvc[col] = i1;
177   }
178   /* allocate space for row pointers */
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
180   *iia = ia;
181   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
182   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
183 
184   /* determine the number of columns in each row */
185   ia[0] = oshift;
186   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187     j   = aj + ai[row] + ishift;
188     col = *j++ + ishift;
189     i2  = tvc[col];
190     nz  = ai[row+1] - ai[row];
191     while (nz-- > 0) {           /* off-diagonal elemets */
192       ia[i1+1]++;
193       i2++;                     /* Start col of next node */
194       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195       if (nz > 0) i2 = tvc[col];
196     }
197   }
198 
199   /* shift ia[i] to point to next row */
200   for (i1=1; i1<nslim_row+1; i1++) {
201     row        = ia[i1-1];
202     ia[i1]    += row;
203     work[i1-1] = row - oshift;
204   }
205 
206   /* allocate space for column pointers */
207   nz   = ia[nslim_row] + (!ishift);
208   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
209   *jja = ja;
210 
211  /* loop over matrix putting into ja */
212   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213     j   = aj + ai[row] + ishift;
214     i2  = 0;                     /* Col inode index */
215     col = *j++ + ishift;
216     i2  = tvc[col];
217     nz  = ai[row+1] - ai[row];
218     while (nz-- > 0) {
219       ja[work[i1]++] = i2 + oshift;
220       ++i2;
221       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222       if (nz > 0) i2 = tvc[col];
223     }
224   }
225   ierr = PetscFree(ns_col);CHKERRQ(ierr);
226   ierr = PetscFree(work);CHKERRQ(ierr);
227   ierr = PetscFree(tns);CHKERRQ(ierr);
228   ierr = PetscFree(tvc);CHKERRQ(ierr);
229   PetscFunctionReturn(0);
230 }
231 
232 #undef __FUNCT__
233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235 {
236   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
237   PetscErrorCode ierr;
238 
239   PetscFunctionBegin;
240   *n     = a->inode.node_count;
241   if (!ia) PetscFunctionReturn(0);
242   if (!blockcompressed) {
243     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
244   } else if (symmetric) {
245     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
246   } else {
247     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #undef __FUNCT__
253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
255 {
256   PetscErrorCode ierr;
257 
258   PetscFunctionBegin;
259   if (!ia) PetscFunctionReturn(0);
260 
261   if (!blockcompressed) {
262     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
263   } else {
264     ierr = PetscFree(*ia);CHKERRQ(ierr);
265     ierr = PetscFree(*ja);CHKERRQ(ierr);
266   }
267 
268   PetscFunctionReturn(0);
269 }
270 
271 /* ----------------------------------------------------------- */
272 
273 #undef __FUNCT__
274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
276 {
277   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
278   PetscErrorCode ierr;
279   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
280   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
281 
282   PetscFunctionBegin;
283   nslim_row = a->inode.node_count;
284   n         = A->cmap->n;
285 
286   /* Create The column_inode for this matrix */
287   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
288 
289   /* allocate space for reformated column_inode structure */
290   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
291   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
292   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
293 
294   for (i1=0,col=0; i1<nslim_col; ++i1){
295     nsz = ns_col[i1];
296     for (i2=0; i2<nsz; ++i2,++col)
297       tvc[col] = i1;
298   }
299   /* allocate space for column pointers */
300   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
301   *iia = ia;
302   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
303   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
304 
305   /* determine the number of columns in each row */
306   ia[0] = oshift;
307   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
308     j   = aj + ai[row] + ishift;
309     col = *j++ + ishift;
310     i2  = tvc[col];
311     nz  = ai[row+1] - ai[row];
312     while (nz-- > 0) {           /* off-diagonal elemets */
313       /* ia[i1+1]++; */
314       ia[i2+1]++;
315       i2++;
316       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
317       if (nz > 0) i2 = tvc[col];
318     }
319   }
320 
321   /* shift ia[i] to point to next col */
322   for (i1=1; i1<nslim_col+1; i1++) {
323     col        = ia[i1-1];
324     ia[i1]    += col;
325     work[i1-1] = col - oshift;
326   }
327 
328   /* allocate space for column pointers */
329   nz   = ia[nslim_col] + (!ishift);
330   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
331   *jja = ja;
332 
333  /* loop over matrix putting into ja */
334   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
335     j   = aj + ai[row] + ishift;
336     i2  = 0;                     /* Col inode index */
337     col = *j++ + ishift;
338     i2  = tvc[col];
339     nz  = ai[row+1] - ai[row];
340     while (nz-- > 0) {
341       /* ja[work[i1]++] = i2 + oshift; */
342       ja[work[i2]++] = i1 + oshift;
343       i2++;
344       while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
345       if (nz > 0) i2 = tvc[col];
346     }
347   }
348   ierr = PetscFree(ns_col);CHKERRQ(ierr);
349   ierr = PetscFree(work);CHKERRQ(ierr);
350   ierr = PetscFree(tns);CHKERRQ(ierr);
351   ierr = PetscFree(tvc);CHKERRQ(ierr);
352   PetscFunctionReturn(0);
353 }
354 
355 #undef __FUNCT__
356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
363   if (!ia) PetscFunctionReturn(0);
364 
365   if (!blockcompressed) {
366     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
367   } else if (symmetric) {
368     /* Since the indices are symmetric it does'nt matter */
369     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
370   } else {
371     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
379 {
380   PetscErrorCode ierr;
381 
382   PetscFunctionBegin;
383   if (!ia) PetscFunctionReturn(0);
384   if (!blockcompressed) {
385     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
386   } else {
387     ierr = PetscFree(*ia);CHKERRQ(ierr);
388     ierr = PetscFree(*ja);CHKERRQ(ierr);
389   }
390   PetscFunctionReturn(0);
391 }
392 
393 /* ----------------------------------------------------------- */
394 
395 #undef __FUNCT__
396 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
398 {
399   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
400   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
401   PetscScalar       *y;
402   const PetscScalar *x;
403   const MatScalar   *v1,*v2,*v3,*v4,*v5;
404   PetscErrorCode    ierr;
405   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
406 
407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
409 #endif
410 
411   PetscFunctionBegin;
412   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
413   node_max = a->inode.node_count;
414   ns       = a->inode.size;     /* Node Size array */
415   ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
416   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
417   idx  = a->j;
418   v1   = a->a;
419   ii   = a->i;
420 
421   for (i = 0,row = 0; i< node_max; ++i){
422     nsz  = ns[i];
423     n    = ii[1] - ii[0];
424     nonzerorow += (n>0)*nsz;
425     ii  += nsz;
426     PetscPrefetchBlock(idx+nsz*n,n,0,0);    /* Prefetch the indices for the block row after the current one */
427     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one  */
428     sz   = n;                   /* No of non zeros in this row */
429                                 /* Switch on the size of Node */
430     switch (nsz){               /* Each loop in 'case' is unrolled */
431     case 1 :
432       sum1  = 0.;
433 
434       for(n = 0; n< sz-1; n+=2) {
435         i1   = idx[0];          /* The instructions are ordered to */
436         i2   = idx[1];          /* make the compiler's job easy */
437         idx += 2;
438         tmp0 = x[i1];
439         tmp1 = x[i2];
440         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
441        }
442 
443       if (n == sz-1){          /* Take care of the last nonzero  */
444         tmp0  = x[*idx++];
445         sum1 += *v1++ * tmp0;
446       }
447       y[row++]=sum1;
448       break;
449     case 2:
450       sum1  = 0.;
451       sum2  = 0.;
452       v2    = v1 + n;
453 
454       for (n = 0; n< sz-1; n+=2) {
455         i1   = idx[0];
456         i2   = idx[1];
457         idx += 2;
458         tmp0 = x[i1];
459         tmp1 = x[i2];
460         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
461         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
462       }
463       if (n == sz-1){
464         tmp0  = x[*idx++];
465         sum1 += *v1++ * tmp0;
466         sum2 += *v2++ * tmp0;
467       }
468       y[row++]=sum1;
469       y[row++]=sum2;
470       v1      =v2;              /* Since the next block to be processed starts there*/
471       idx    +=sz;
472       break;
473     case 3:
474       sum1  = 0.;
475       sum2  = 0.;
476       sum3  = 0.;
477       v2    = v1 + n;
478       v3    = v2 + n;
479 
480       for (n = 0; n< sz-1; n+=2) {
481         i1   = idx[0];
482         i2   = idx[1];
483         idx += 2;
484         tmp0 = x[i1];
485         tmp1 = x[i2];
486         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
487         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
488         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
489       }
490       if (n == sz-1){
491         tmp0  = x[*idx++];
492         sum1 += *v1++ * tmp0;
493         sum2 += *v2++ * tmp0;
494         sum3 += *v3++ * tmp0;
495       }
496       y[row++]=sum1;
497       y[row++]=sum2;
498       y[row++]=sum3;
499       v1       =v3;             /* Since the next block to be processed starts there*/
500       idx     +=2*sz;
501       break;
502     case 4:
503       sum1  = 0.;
504       sum2  = 0.;
505       sum3  = 0.;
506       sum4  = 0.;
507       v2    = v1 + n;
508       v3    = v2 + n;
509       v4    = v3 + n;
510 
511       for (n = 0; n< sz-1; n+=2) {
512         i1   = idx[0];
513         i2   = idx[1];
514         idx += 2;
515         tmp0 = x[i1];
516         tmp1 = x[i2];
517         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
518         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
519         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
520         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
521       }
522       if (n == sz-1){
523         tmp0  = x[*idx++];
524         sum1 += *v1++ * tmp0;
525         sum2 += *v2++ * tmp0;
526         sum3 += *v3++ * tmp0;
527         sum4 += *v4++ * tmp0;
528       }
529       y[row++]=sum1;
530       y[row++]=sum2;
531       y[row++]=sum3;
532       y[row++]=sum4;
533       v1      =v4;              /* Since the next block to be processed starts there*/
534       idx    +=3*sz;
535       break;
536     case 5:
537       sum1  = 0.;
538       sum2  = 0.;
539       sum3  = 0.;
540       sum4  = 0.;
541       sum5  = 0.;
542       v2    = v1 + n;
543       v3    = v2 + n;
544       v4    = v3 + n;
545       v5    = v4 + n;
546 
547       for (n = 0; n<sz-1; n+=2) {
548         i1   = idx[0];
549         i2   = idx[1];
550         idx += 2;
551         tmp0 = x[i1];
552         tmp1 = x[i2];
553         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
554         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
555         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
556         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
557         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
558       }
559       if (n == sz-1){
560         tmp0  = x[*idx++];
561         sum1 += *v1++ * tmp0;
562         sum2 += *v2++ * tmp0;
563         sum3 += *v3++ * tmp0;
564         sum4 += *v4++ * tmp0;
565         sum5 += *v5++ * tmp0;
566       }
567       y[row++]=sum1;
568       y[row++]=sum2;
569       y[row++]=sum3;
570       y[row++]=sum4;
571       y[row++]=sum5;
572       v1      =v5;       /* Since the next block to be processed starts there */
573       idx    +=4*sz;
574       break;
575     default :
576       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
577     }
578   }
579   ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr);
580   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
581   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
582   PetscFunctionReturn(0);
583 }
584 /* ----------------------------------------------------------- */
585 /* Almost same code as the MatMult_SeqAIJ_Inode() */
586 #undef __FUNCT__
587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
589 {
590   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
591   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
592   MatScalar      *v1,*v2,*v3,*v4,*v5;
593   PetscScalar    *x,*y,*z,*zt;
594   PetscErrorCode ierr;
595   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
596 
597   PetscFunctionBegin;
598   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
599   node_max = a->inode.node_count;
600   ns       = a->inode.size;     /* Node Size array */
601   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
602   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
603   if (zz != yy) {
604     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
605   } else {
606     z = y;
607   }
608   zt = z;
609 
610   idx  = a->j;
611   v1   = a->a;
612   ii   = a->i;
613 
614   for (i = 0,row = 0; i< node_max; ++i){
615     nsz  = ns[i];
616     n    = ii[1] - ii[0];
617     ii  += nsz;
618     sz   = n;                   /* No of non zeros in this row */
619                                 /* Switch on the size of Node */
620     switch (nsz){               /* Each loop in 'case' is unrolled */
621     case 1 :
622       sum1  = *zt++;
623 
624       for(n = 0; n< sz-1; n+=2) {
625         i1   = idx[0];          /* The instructions are ordered to */
626         i2   = idx[1];          /* make the compiler's job easy */
627         idx += 2;
628         tmp0 = x[i1];
629         tmp1 = x[i2];
630         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
631        }
632 
633       if(n   == sz-1){          /* Take care of the last nonzero  */
634         tmp0  = x[*idx++];
635         sum1 += *v1++ * tmp0;
636       }
637       y[row++]=sum1;
638       break;
639     case 2:
640       sum1  = *zt++;
641       sum2  = *zt++;
642       v2    = v1 + n;
643 
644       for(n = 0; n< sz-1; n+=2) {
645         i1   = idx[0];
646         i2   = idx[1];
647         idx += 2;
648         tmp0 = x[i1];
649         tmp1 = x[i2];
650         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
651         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
652       }
653       if(n   == sz-1){
654         tmp0  = x[*idx++];
655         sum1 += *v1++ * tmp0;
656         sum2 += *v2++ * tmp0;
657       }
658       y[row++]=sum1;
659       y[row++]=sum2;
660       v1      =v2;              /* Since the next block to be processed starts there*/
661       idx    +=sz;
662       break;
663     case 3:
664       sum1  = *zt++;
665       sum2  = *zt++;
666       sum3  = *zt++;
667       v2    = v1 + n;
668       v3    = v2 + n;
669 
670       for (n = 0; n< sz-1; n+=2) {
671         i1   = idx[0];
672         i2   = idx[1];
673         idx += 2;
674         tmp0 = x[i1];
675         tmp1 = x[i2];
676         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
677         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
678         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
679       }
680       if (n == sz-1){
681         tmp0  = x[*idx++];
682         sum1 += *v1++ * tmp0;
683         sum2 += *v2++ * tmp0;
684         sum3 += *v3++ * tmp0;
685       }
686       y[row++]=sum1;
687       y[row++]=sum2;
688       y[row++]=sum3;
689       v1       =v3;             /* Since the next block to be processed starts there*/
690       idx     +=2*sz;
691       break;
692     case 4:
693       sum1  = *zt++;
694       sum2  = *zt++;
695       sum3  = *zt++;
696       sum4  = *zt++;
697       v2    = v1 + n;
698       v3    = v2 + n;
699       v4    = v3 + n;
700 
701       for (n = 0; n< sz-1; n+=2) {
702         i1   = idx[0];
703         i2   = idx[1];
704         idx += 2;
705         tmp0 = x[i1];
706         tmp1 = x[i2];
707         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
708         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
709         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
710         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
711       }
712       if (n == sz-1){
713         tmp0  = x[*idx++];
714         sum1 += *v1++ * tmp0;
715         sum2 += *v2++ * tmp0;
716         sum3 += *v3++ * tmp0;
717         sum4 += *v4++ * tmp0;
718       }
719       y[row++]=sum1;
720       y[row++]=sum2;
721       y[row++]=sum3;
722       y[row++]=sum4;
723       v1      =v4;              /* Since the next block to be processed starts there*/
724       idx    +=3*sz;
725       break;
726     case 5:
727       sum1  = *zt++;
728       sum2  = *zt++;
729       sum3  = *zt++;
730       sum4  = *zt++;
731       sum5  = *zt++;
732       v2    = v1 + n;
733       v3    = v2 + n;
734       v4    = v3 + n;
735       v5    = v4 + n;
736 
737       for (n = 0; n<sz-1; n+=2) {
738         i1   = idx[0];
739         i2   = idx[1];
740         idx += 2;
741         tmp0 = x[i1];
742         tmp1 = x[i2];
743         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
744         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
745         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
746         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
747         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
748       }
749       if(n   == sz-1){
750         tmp0  = x[*idx++];
751         sum1 += *v1++ * tmp0;
752         sum2 += *v2++ * tmp0;
753         sum3 += *v3++ * tmp0;
754         sum4 += *v4++ * tmp0;
755         sum5 += *v5++ * tmp0;
756       }
757       y[row++]=sum1;
758       y[row++]=sum2;
759       y[row++]=sum3;
760       y[row++]=sum4;
761       y[row++]=sum5;
762       v1      =v5;       /* Since the next block to be processed starts there */
763       idx    +=4*sz;
764       break;
765     default :
766       SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
767     }
768   }
769   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
770   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
771   if (zz != yy) {
772     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
773   }
774   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
775   PetscFunctionReturn(0);
776 }
777 
778 /* ----------------------------------------------------------- */
779 #undef __FUNCT__
780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
782 {
783   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
784   IS                iscol = a->col,isrow = a->row;
785   PetscErrorCode    ierr;
786   const PetscInt    *r,*c,*rout,*cout;
787   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
788   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
789   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
790   PetscScalar       sum1,sum2,sum3,sum4,sum5;
791   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
792   const PetscScalar *b;
793 
794   PetscFunctionBegin;
795   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
796   node_max = a->inode.node_count;
797   ns       = a->inode.size;     /* Node Size array */
798 
799   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
800   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
801   tmp  = a->solve_work;
802 
803   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
804   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
805 
806   /* forward solve the lower triangular */
807   tmps = tmp ;
808   aa   = a_a ;
809   aj   = a_j ;
810   ad   = a->diag;
811 
812   for (i = 0,row = 0; i< node_max; ++i){
813     nsz = ns[i];
814     aii = ai[row];
815     v1  = aa + aii;
816     vi  = aj + aii;
817     nz  = ad[row]- aii;
818     if (i < node_max-1) {
819       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
820       * but our indexing to determine it's size could. */
821       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */
822       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
823       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0);
824       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
825     }
826 
827     switch (nsz){               /* Each loop in 'case' is unrolled */
828     case 1 :
829       sum1 = b[*r++];
830       for(j=0; j<nz-1; j+=2){
831         i0   = vi[0];
832         i1   = vi[1];
833         vi  +=2;
834         tmp0 = tmps[i0];
835         tmp1 = tmps[i1];
836         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
837       }
838       if(j == nz-1){
839         tmp0 = tmps[*vi++];
840         sum1 -= *v1++ *tmp0;
841       }
842       tmp[row ++]=sum1;
843       break;
844     case 2:
845       sum1 = b[*r++];
846       sum2 = b[*r++];
847       v2   = aa + ai[row+1];
848 
849       for(j=0; j<nz-1; j+=2){
850         i0   = vi[0];
851         i1   = vi[1];
852         vi  +=2;
853         tmp0 = tmps[i0];
854         tmp1 = tmps[i1];
855         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
856         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
857       }
858       if(j == nz-1){
859         tmp0 = tmps[*vi++];
860         sum1 -= *v1++ *tmp0;
861         sum2 -= *v2++ *tmp0;
862       }
863       sum2 -= *v2++ * sum1;
864       tmp[row ++]=sum1;
865       tmp[row ++]=sum2;
866       break;
867     case 3:
868       sum1 = b[*r++];
869       sum2 = b[*r++];
870       sum3 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873 
874       for (j=0; j<nz-1; j+=2){
875         i0   = vi[0];
876         i1   = vi[1];
877         vi  +=2;
878         tmp0 = tmps[i0];
879         tmp1 = tmps[i1];
880         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
881         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
882         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
883       }
884       if (j == nz-1){
885         tmp0 = tmps[*vi++];
886         sum1 -= *v1++ *tmp0;
887         sum2 -= *v2++ *tmp0;
888         sum3 -= *v3++ *tmp0;
889       }
890       sum2 -= *v2++ * sum1;
891       sum3 -= *v3++ * sum1;
892       sum3 -= *v3++ * sum2;
893       tmp[row ++]=sum1;
894       tmp[row ++]=sum2;
895       tmp[row ++]=sum3;
896       break;
897 
898     case 4:
899       sum1 = b[*r++];
900       sum2 = b[*r++];
901       sum3 = b[*r++];
902       sum4 = b[*r++];
903       v2   = aa + ai[row+1];
904       v3   = aa + ai[row+2];
905       v4   = aa + ai[row+3];
906 
907       for (j=0; j<nz-1; j+=2){
908         i0   = vi[0];
909         i1   = vi[1];
910         vi  +=2;
911         tmp0 = tmps[i0];
912         tmp1 = tmps[i1];
913         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
914         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
915         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
916         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
917       }
918       if (j == nz-1){
919         tmp0 = tmps[*vi++];
920         sum1 -= *v1++ *tmp0;
921         sum2 -= *v2++ *tmp0;
922         sum3 -= *v3++ *tmp0;
923         sum4 -= *v4++ *tmp0;
924       }
925       sum2 -= *v2++ * sum1;
926       sum3 -= *v3++ * sum1;
927       sum4 -= *v4++ * sum1;
928       sum3 -= *v3++ * sum2;
929       sum4 -= *v4++ * sum2;
930       sum4 -= *v4++ * sum3;
931 
932       tmp[row ++]=sum1;
933       tmp[row ++]=sum2;
934       tmp[row ++]=sum3;
935       tmp[row ++]=sum4;
936       break;
937     case 5:
938       sum1 = b[*r++];
939       sum2 = b[*r++];
940       sum3 = b[*r++];
941       sum4 = b[*r++];
942       sum5 = b[*r++];
943       v2   = aa + ai[row+1];
944       v3   = aa + ai[row+2];
945       v4   = aa + ai[row+3];
946       v5   = aa + ai[row+4];
947 
948       for (j=0; j<nz-1; j+=2){
949         i0   = vi[0];
950         i1   = vi[1];
951         vi  +=2;
952         tmp0 = tmps[i0];
953         tmp1 = tmps[i1];
954         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
955         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
956         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
957         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
958         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
959       }
960       if (j == nz-1){
961         tmp0 = tmps[*vi++];
962         sum1 -= *v1++ *tmp0;
963         sum2 -= *v2++ *tmp0;
964         sum3 -= *v3++ *tmp0;
965         sum4 -= *v4++ *tmp0;
966         sum5 -= *v5++ *tmp0;
967       }
968 
969       sum2 -= *v2++ * sum1;
970       sum3 -= *v3++ * sum1;
971       sum4 -= *v4++ * sum1;
972       sum5 -= *v5++ * sum1;
973       sum3 -= *v3++ * sum2;
974       sum4 -= *v4++ * sum2;
975       sum5 -= *v5++ * sum2;
976       sum4 -= *v4++ * sum3;
977       sum5 -= *v5++ * sum3;
978       sum5 -= *v5++ * sum4;
979 
980       tmp[row ++]=sum1;
981       tmp[row ++]=sum2;
982       tmp[row ++]=sum3;
983       tmp[row ++]=sum4;
984       tmp[row ++]=sum5;
985       break;
986     default:
987       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
988     }
989   }
990   /* backward solve the upper triangular */
991   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
992     nsz = ns[i];
993     aii = ai[row+1] -1;
994     v1  = aa + aii;
995     vi  = aj + aii;
996     nz  = aii- ad[row];
997     switch (nsz){               /* Each loop in 'case' is unrolled */
998     case 1 :
999       sum1 = tmp[row];
1000 
1001       for(j=nz ; j>1; j-=2){
1002         vi  -=2;
1003         i0   = vi[2];
1004         i1   = vi[1];
1005         tmp0 = tmps[i0];
1006         tmp1 = tmps[i1];
1007         v1   -= 2;
1008         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1009       }
1010       if (j==1){
1011         tmp0  = tmps[*vi--];
1012         sum1 -= *v1-- * tmp0;
1013       }
1014       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1015       break;
1016     case 2 :
1017       sum1 = tmp[row];
1018       sum2 = tmp[row -1];
1019       v2   = aa + ai[row]-1;
1020       for (j=nz ; j>1; j-=2){
1021         vi  -=2;
1022         i0   = vi[2];
1023         i1   = vi[1];
1024         tmp0 = tmps[i0];
1025         tmp1 = tmps[i1];
1026         v1   -= 2;
1027         v2   -= 2;
1028         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030       }
1031       if (j==1){
1032         tmp0  = tmps[*vi--];
1033         sum1 -= *v1-- * tmp0;
1034         sum2 -= *v2-- * tmp0;
1035       }
1036 
1037       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1038       sum2   -= *v2-- * tmp0;
1039       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1040       break;
1041     case 3 :
1042       sum1 = tmp[row];
1043       sum2 = tmp[row -1];
1044       sum3 = tmp[row -2];
1045       v2   = aa + ai[row]-1;
1046       v3   = aa + ai[row -1]-1;
1047       for (j=nz ; j>1; j-=2){
1048         vi  -=2;
1049         i0   = vi[2];
1050         i1   = vi[1];
1051         tmp0 = tmps[i0];
1052         tmp1 = tmps[i1];
1053         v1   -= 2;
1054         v2   -= 2;
1055         v3   -= 2;
1056         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1057         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1058         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1059       }
1060       if (j==1){
1061         tmp0  = tmps[*vi--];
1062         sum1 -= *v1-- * tmp0;
1063         sum2 -= *v2-- * tmp0;
1064         sum3 -= *v3-- * tmp0;
1065       }
1066       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1067       sum2   -= *v2-- * tmp0;
1068       sum3   -= *v3-- * tmp0;
1069       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1070       sum3   -= *v3-- * tmp0;
1071       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1072 
1073       break;
1074     case 4 :
1075       sum1 = tmp[row];
1076       sum2 = tmp[row -1];
1077       sum3 = tmp[row -2];
1078       sum4 = tmp[row -3];
1079       v2   = aa + ai[row]-1;
1080       v3   = aa + ai[row -1]-1;
1081       v4   = aa + ai[row -2]-1;
1082 
1083       for (j=nz ; j>1; j-=2){
1084         vi  -=2;
1085         i0   = vi[2];
1086         i1   = vi[1];
1087         tmp0 = tmps[i0];
1088         tmp1 = tmps[i1];
1089         v1  -= 2;
1090         v2  -= 2;
1091         v3  -= 2;
1092         v4  -= 2;
1093         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1094         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1095         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1096         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1097       }
1098       if (j==1){
1099         tmp0  = tmps[*vi--];
1100         sum1 -= *v1-- * tmp0;
1101         sum2 -= *v2-- * tmp0;
1102         sum3 -= *v3-- * tmp0;
1103         sum4 -= *v4-- * tmp0;
1104       }
1105 
1106       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1107       sum2   -= *v2-- * tmp0;
1108       sum3   -= *v3-- * tmp0;
1109       sum4   -= *v4-- * tmp0;
1110       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1111       sum3   -= *v3-- * tmp0;
1112       sum4   -= *v4-- * tmp0;
1113       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1114       sum4   -= *v4-- * tmp0;
1115       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1116       break;
1117     case 5 :
1118       sum1 = tmp[row];
1119       sum2 = tmp[row -1];
1120       sum3 = tmp[row -2];
1121       sum4 = tmp[row -3];
1122       sum5 = tmp[row -4];
1123       v2   = aa + ai[row]-1;
1124       v3   = aa + ai[row -1]-1;
1125       v4   = aa + ai[row -2]-1;
1126       v5   = aa + ai[row -3]-1;
1127       for (j=nz ; j>1; j-=2){
1128         vi  -= 2;
1129         i0   = vi[2];
1130         i1   = vi[1];
1131         tmp0 = tmps[i0];
1132         tmp1 = tmps[i1];
1133         v1   -= 2;
1134         v2   -= 2;
1135         v3   -= 2;
1136         v4   -= 2;
1137         v5   -= 2;
1138         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1139         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1140         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1141         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1142         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1143       }
1144       if (j==1){
1145         tmp0  = tmps[*vi--];
1146         sum1 -= *v1-- * tmp0;
1147         sum2 -= *v2-- * tmp0;
1148         sum3 -= *v3-- * tmp0;
1149         sum4 -= *v4-- * tmp0;
1150         sum5 -= *v5-- * tmp0;
1151       }
1152 
1153       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1154       sum2   -= *v2-- * tmp0;
1155       sum3   -= *v3-- * tmp0;
1156       sum4   -= *v4-- * tmp0;
1157       sum5   -= *v5-- * tmp0;
1158       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1159       sum3   -= *v3-- * tmp0;
1160       sum4   -= *v4-- * tmp0;
1161       sum5   -= *v5-- * tmp0;
1162       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1163       sum4   -= *v4-- * tmp0;
1164       sum5   -= *v5-- * tmp0;
1165       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1166       sum5   -= *v5-- * tmp0;
1167       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1168       break;
1169     default:
1170       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1171     }
1172   }
1173   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1174   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1176   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1177   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1178   PetscFunctionReturn(0);
1179 }
1180 
1181 #undef __FUNCT__
1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1184 {
1185   Mat              C=B;
1186   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1187   IS               isrow = b->row,isicol = b->icol;
1188   PetscErrorCode   ierr;
1189   const PetscInt   *r,*ic,*ics;
1190   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1191   PetscInt         i,j,k,nz,nzL,row,*pj;
1192   const PetscInt   *ajtmp,*bjtmp;
1193   MatScalar        *rtmp,*pc,*pc1,*pc2,*pc3,multiplier,multiplier1,multiplier2,multiplier3,*pv,*rtmp11,*rtmp22,*rtmp33;
1194   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3;
1195   FactorShiftCtx   sctx;
1196   PetscInt         *ddiag;
1197   PetscReal        rs;
1198   MatScalar        d;
1199   PetscInt         inod,nodesz,k1,k2,k3,node_max,*ns,col;
1200   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1201 
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++)
1216 	rs += PetscAbsScalar(v[j]);
1217       if (rs>sctx.shift_top) sctx.shift_top = rs;
1218     }
1219     sctx.shift_top   *= 1.1;
1220     sctx.nshift_max   = 5;
1221     sctx.shift_lo     = 0.;
1222     sctx.shift_hi     = 1.;
1223   }
1224 
1225   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1226   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1227 
1228   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1229   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1230   rtmp   = rtmp11;
1231   rtmp22 = rtmp11 + n;
1232   rtmp33 = rtmp22 + n;
1233   ics  = ic;
1234 
1235   node_max = a->inode.node_count;
1236   ns       = a->inode.size;
1237   if (!ns){
1238     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1239   }
1240 
1241   /* If max inode size > 3, split it into two inodes.*/
1242   /* also map the inode sizes according to the ordering */
1243   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1244   for (i=0,j=0; i<node_max; ++i,++j){
1245     if (ns[i]>3) {
1246       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1247       ++j;
1248       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1249     } else {
1250       tmp_vec1[j] = ns[i];
1251     }
1252   }
1253   /* Use the correct node_max */
1254   node_max = j;
1255 
1256   /* Now reorder the inode info based on mat re-ordering info */
1257   /* First create a row -> inode_size_array_index map */
1258   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1259   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1260   for (i=0,row=0; i<node_max; i++) {
1261     nodesz = tmp_vec1[i];
1262     for (j=0; j<nodesz; j++,row++) {
1263       nsmap[row] = i;
1264     }
1265   }
1266   /* Using nsmap, create a reordered ns structure */
1267   for (i=0,j=0; i< node_max; i++) {
1268     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1269     tmp_vec2[i]  = nodesz;
1270     j           += nodesz;
1271   }
1272   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1273   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1274   /* Now use the correct ns */
1275   ns = tmp_vec2;
1276 
1277   do {
1278     sctx.useshift = PETSC_FALSE;
1279     /* Now loop over each block-row, and do the factorization */
1280     for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */
1281       nodesz = ns[inod];
1282 
1283       switch (nodesz){
1284       case 1:
1285       /*----------*/
1286       /* zero rtmp */
1287       /* L part */
1288       nz    = bi[i+1] - bi[i];
1289       bjtmp = bj + bi[i];
1290       for  (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
1291 
1292       /* U part */
1293       nz = bdiag[i]-bdiag[i+1];
1294       bjtmp = bj + bdiag[i+1]+1;
1295       for  (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0;
1296 
1297       /* load in initial (unfactored row) */
1298       nz    = ai[r[i]+1] - ai[r[i]];
1299       ajtmp = aj + ai[r[i]];
1300       v     = aa + ai[r[i]];
1301       for (j=0; j<nz; j++) {
1302         rtmp[ics[ajtmp[j]]] = v[j];
1303       }
1304       /* ZeropivotApply() */
1305       rtmp[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1306 
1307       /* elimination */
1308       bjtmp = bj + bi[i];
1309       row   = *bjtmp++;
1310       nzL   = bi[i+1] - bi[i];
1311       for(k=0; k < nzL;k++) {
1312         pc = rtmp + row;
1313         if (*pc != 0.0) {
1314           pv         = b->a + bdiag[row];
1315           multiplier = *pc * (*pv);
1316           *pc        = multiplier;
1317           pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1318 	  pv = b->a + bdiag[row+1]+1;
1319 	  nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1320           for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
1321           ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
1322         }
1323         row = *bjtmp++;
1324       }
1325 
1326       /* finished row so stick it into b->a */
1327       rs = 0.0;
1328       /* L part */
1329       pv   = b->a + bi[i] ;
1330       pj   = b->j + bi[i] ;
1331       nz   = bi[i+1] - bi[i];
1332       for (j=0; j<nz; j++) {
1333         pv[j] = rtmp[pj[j]]; rs += PetscAbsScalar(pv[j]);
1334       }
1335 
1336       /* U part */
1337       pv = b->a + bdiag[i+1]+1;
1338       pj = b->j + bdiag[i+1]+1;
1339       nz = bdiag[i] - bdiag[i+1]-1;
1340       for (j=0; j<nz; j++) {
1341         pv[j] = rtmp[pj[j]]; rs += PetscAbsScalar(pv[j]);
1342       }
1343 
1344       /* MatPivotCheck() */
1345       sctx.rs  = rs;
1346       sctx.pv  = rtmp[i];
1347       if (info->shifttype == MAT_SHIFT_NONZERO){
1348         ierr = MatPivotCheck_nz(info,sctx,i);CHKERRQ(ierr);
1349       } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){
1350         ierr = MatPivotCheck_pd(info,sctx,i);CHKERRQ(ierr);
1351       } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1352         ierr = MatPivotCheck_inblocks(info,sctx,i);CHKERRQ(ierr);
1353       } else {
1354         ierr = MatPivotCheck_none(info,sctx,i);CHKERRQ(ierr);
1355       }
1356       rtmp[i] = sctx.pv;
1357 
1358       /* Mark diagonal and invert diagonal for simplier triangular solves */
1359       pv  = b->a + bdiag[i];
1360       *pv = 1.0/rtmp[i];
1361 
1362       break;
1363 
1364       case 2:
1365       /*----------*/
1366         /* zero rtmp */
1367         /* L part */
1368         nz    = bi[i+1] - bi[i];
1369         bjtmp = bj + bi[i];
1370         for  (j=0; j<nz; j++) {
1371           col = bjtmp[j];
1372           rtmp11[col] = 0.0; rtmp22[col] = 0.0;
1373         }
1374 
1375         /* U part */
1376         nz = bdiag[i]-bdiag[i+1];
1377         bjtmp = bj + bdiag[i+1]+1;
1378         for  (j=0; j<nz; j++) {
1379           col = bjtmp[j];
1380           rtmp11[col] = 0.0; rtmp22[col] = 0.0;
1381         }
1382 
1383         /* load in initial (unfactored row) */
1384         nz    = ai[r[i]+1] - ai[r[i]];
1385         ajtmp = aj + ai[r[i]];
1386         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1387         for (j=0; j<nz; j++) {
1388           col = ics[ajtmp[j]];
1389           rtmp11[col] = v1[j]; rtmp22[col] = v2[j];
1390         }
1391         /* ZeropivotApply(): shift the diagonal of the matrix  */
1392         rtmp11[i] += sctx.shift_amount; rtmp22[i+1] += sctx.shift_amount;
1393 
1394         /* elimination */
1395         bjtmp = bj + bi[i];
1396         row   = *bjtmp++; /* pivot row */
1397         nzL   = bi[i+1] - bi[i];
1398         for(k=0; k < nzL;k++) {
1399           pc1 = rtmp11 + row;
1400           pc2 = rtmp22 + row;
1401           if (*pc1 != 0.0 || *pc2 != 0.0) {
1402             pv         = b->a + bdiag[row];
1403             multiplier1 = *pc1*(*pv);    multiplier2 = *pc2*(*pv);
1404             *pc1        = multiplier1;   *pc2 = multiplier2;
1405 
1406             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1407             pv = b->a + bdiag[row+1]+1;
1408             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1409             for (j=0; j<nz; j++){
1410               col = pj[j];
1411               rtmp11[col] -= multiplier1 * pv[j];
1412               rtmp22[col] -= multiplier2 * pv[j];
1413             }
1414             ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1415           }
1416           row = *bjtmp++;
1417         }
1418 
1419         /* Now take care of diagonal 2x2 block. */
1420         pc1 = rtmp11 + i;
1421         pc2 = rtmp22 + i;
1422 
1423         if (*pc2 != 0.0){
1424           multiplier = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1425           *pc2       = multiplier;    /* insert L entry */
1426           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1427           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1428           for (j=0; j<nz; j++) {
1429             col = pj[j];
1430             rtmp22[col] -= multiplier * rtmp11[col];
1431           }
1432           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1433         }
1434 
1435 
1436         /* finished rows so stick it into b->a */
1437         rs = 0.0;
1438         /* L part */
1439         k1 = k2 =0;
1440         pc1 = b->a + bi[i]; pc2 = b->a + bi[i+1];
1441         pj  = b->j + bi[i] ;
1442         nz  = bi[i+1] - bi[i];
1443         for (j=0; j<nz; j++) {
1444           col = pj[j];
1445           if (col < i)  {pc1[k1] = rtmp11[col]; k1++; rs += PetscAbsScalar(pc1[j]);}
1446           if (col < i+1){pc2[k2] = rtmp22[col]; k2++;}
1447         }
1448 
1449         /* U part */
1450         pc1 = b->a + bdiag[i+1]+1; pc2 = b->a + bdiag[i+2]+1;
1451         pj = b->j + bdiag[i+1]+1;
1452         nz = bdiag[i] - bdiag[i+1]; /* inlcude diagonal */
1453         k1 = k2 =0;
1454         for (j=0; j<nz; j++) {
1455           col = pj[j];
1456           if (col > i)  {pc1[k1] = rtmp11[col]; k1++; rs += PetscAbsScalar(pc1[j]);}
1457           if (col > i+1){pc2[k2] = rtmp22[col]; k2++;}
1458         }
1459 
1460 #if defined(TMP)
1461       /* MatPivotCheck() */
1462       sctx.rs  = rs;
1463       sctx.pv  = rtmp11[i];
1464       if (info->shifttype == MAT_SHIFT_NONZERO){
1465         ierr = MatPivotCheck_nz(info,sctx,i);CHKERRQ(ierr);
1466       } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){
1467         ierr = MatPivotCheck_pd(info,sctx,i);CHKERRQ(ierr);
1468       } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1469         ierr = MatPivotCheck_inblocks(info,sctx,i);CHKERRQ(ierr);
1470       } else {
1471         ierr = MatPivotCheck_none(info,sctx,i);CHKERRQ(ierr);
1472       }
1473       rtmp11[i] = sctx.pv;
1474 #endif
1475       /* Mark diagonal and invert diagonal for simplier triangular solves */
1476       pc1  = b->a + bdiag[i];
1477       *pc1 = 1.0/rtmp11[i];
1478 #if defined(TMP)
1479       sctx.rs  = rs;
1480       sctx.pv  = rtmp22[i+1];
1481       if (info->shifttype == MAT_SHIFT_NONZERO){
1482         ierr = MatPivotCheck_nz(info,sctx,i+1);CHKERRQ(ierr);
1483       } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){
1484         ierr = MatPivotCheck_pd(info,sctx,i+1);CHKERRQ(ierr);
1485       } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1486         ierr = MatPivotCheck_inblocks(info,sctx,i+1);CHKERRQ(ierr);
1487       } else {
1488         ierr = MatPivotCheck_none(info,sctx,i+1);CHKERRQ(ierr);
1489       }
1490       rtmp22[i+1] = sctx.pv;
1491 #endif
1492       /* Mark diagonal and invert diagonal for simplier triangular solves */
1493       pc2  = b->a + bdiag[i+1];
1494       *pc2 = 1.0/rtmp22[i+1];
1495 
1496         break;
1497       case 3:
1498       /*----------*/
1499         /* zero rtmp */
1500         /* L part */
1501         nz    = bi[i+1] - bi[i];
1502         bjtmp = bj + bi[i];
1503         for  (j=0; j<nz; j++) {
1504           col = bjtmp[j];
1505           rtmp11[col] = 0.0; rtmp22[col] = 0.0; rtmp33[col] = 0.0;
1506         }
1507 
1508         /* U part */
1509         nz = bdiag[i]-bdiag[i+1];
1510         bjtmp = bj + bdiag[i+1]+1;
1511         for  (j=0; j<nz; j++) {
1512           col = bjtmp[j];
1513           rtmp11[col] = 0.0; rtmp22[col] = 0.0; rtmp33[col] = 0.0;
1514         }
1515 
1516         /* load in initial (unfactored row) */
1517         nz    = ai[r[i]+1] - ai[r[i]];
1518         ajtmp = aj + ai[r[i]];
1519         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1520         for (j=0; j<nz; j++) {
1521           col = ics[ajtmp[j]];
1522           rtmp11[col] = v1[j]; rtmp22[col] = v2[j]; rtmp33[col] = v3[j];
1523         }
1524         /* ZeropivotApply(): shift the diagonal of the matrix  */
1525         rtmp11[i] += sctx.shift_amount; rtmp22[i+1] += sctx.shift_amount; rtmp33[i+2] += sctx.shift_amount;
1526 
1527         /* elimination */
1528         bjtmp = bj + bi[i];
1529         row   = *bjtmp++; /* pivot row */
1530         nzL   = bi[i+1] - bi[i];
1531         for(k=0; k < nzL;k++) {
1532           pc1 = rtmp11 + row;
1533           pc2 = rtmp22 + row;
1534           pc3 = rtmp33 + row;
1535           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1536             pv  = b->a + bdiag[row];
1537             multiplier1 = *pc1*(*pv); multiplier2 = *pc2*(*pv); multiplier3 = *pc3*(*pv);
1538             *pc1 = multiplier1; *pc2 = multiplier2; *pc3 = multiplier3;
1539 
1540             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1541             pv = b->a + bdiag[row+1]+1;
1542             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1543             for (j=0; j<nz; j++){
1544               col = pj[j];
1545               rtmp11[col] -= multiplier1 * pv[j];
1546               rtmp22[col] -= multiplier2 * pv[j];
1547               rtmp33[col] -= multiplier3 * pv[j];
1548             }
1549             ierr = PetscLogFlops(6*nz);CHKERRQ(ierr);
1550           }
1551           row = *bjtmp++;
1552         }
1553 
1554         /* Now take care of diagonal 3x3 block. */
1555         pc1 = rtmp11 + i;
1556         pc2 = rtmp22 + i;
1557         pc3 = rtmp33 + i;
1558         if (*pc2 != 0.0 || *pc3 != 0.0){
1559           multiplier2 = (*pc2)/(*pc1);
1560           multiplier3 = (*pc3)/(*pc1);
1561 
1562           *pc2       = multiplier2;
1563           *pc3       = multiplier3;
1564           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1565           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1566           for (j=0; j<nz; j++) {
1567             col = pj[j];
1568             rtmp22[col] -= multiplier2 * rtmp11[col];
1569             rtmp33[col] -= multiplier3 * rtmp11[col];
1570           }
1571           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1572         }
1573 
1574         pc2 = rtmp22 + i+1;
1575         pc3 = rtmp33 + i+1;
1576         if (*pc3 != 0.0){
1577           multiplier3 = (*pc3)/(*pc2);
1578           *pc3        = multiplier3;
1579           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1580           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1581           for (j=0; j<nz; j++) {
1582             col = pj[j];
1583             rtmp33[col] -= multiplier3 * rtmp22[col];
1584           }
1585           ierr = PetscLogFlops(2*nz);CHKERRQ(ierr);
1586         }
1587 
1588         /* finished rows so stick it into b->a */
1589         rs = 0.0;
1590         /* L part */
1591         k1 = k2 = k3 = 0;
1592         pc1 = b->a + bi[i]; pc2 = b->a + bi[i+1]; pc3 = b->a + bi[i+2];
1593         pj  = b->j + bi[i] ;
1594         nz  = bi[i+1] - bi[i];
1595         for (j=0; j<nz; j++) {
1596           col = pj[j];
1597           if (col < i)  {pc1[k1] = rtmp11[col]; k1++; rs += PetscAbsScalar(pc1[j]);}
1598           if (col < i+1){pc2[k2] = rtmp22[col]; k2++;}
1599           if (col < i+2){pc3[k3] = rtmp33[col]; k3++;}
1600         }
1601 
1602         /* U part */
1603         pc1 = b->a + bdiag[i+1]+1; pc2 = b->a + bdiag[i+2]+1; pc3 = b->a + bdiag[i+3]+1;
1604         pj  = b->j + bdiag[i+1]+1;
1605         nz  = bdiag[i] - bdiag[i+1]; /* inlcude diagonal */
1606         k1 = k2 = k3 = 0;
1607         for (j=0; j<nz; j++) {
1608           col = pj[j];
1609           if (col > i)  {pc1[k1] = rtmp11[col]; k1++; rs += PetscAbsScalar(pc1[j]);}
1610           if (col > i+1){pc2[k2] = rtmp22[col]; k2++;}
1611           if (col > i+2){pc3[k3] = rtmp33[col]; k3++;}
1612         }
1613 
1614 #if defined(TMP)
1615       /* MatPivotCheck() */
1616       sctx.rs  = rs;
1617       sctx.pv  = rtmp11[i];
1618       if (info->shifttype == MAT_SHIFT_NONZERO){
1619         ierr = MatPivotCheck_nz(info,sctx,i);CHKERRQ(ierr);
1620       } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){
1621         ierr = MatPivotCheck_pd(info,sctx,i);CHKERRQ(ierr);
1622       } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1623         ierr = MatPivotCheck_inblocks(info,sctx,i);CHKERRQ(ierr);
1624       } else {
1625         ierr = MatPivotCheck_none(info,sctx,i);CHKERRQ(ierr);
1626       }
1627       rtmp11[i] = sctx.pv;
1628 #endif
1629       /* Mark diagonal and invert diagonal for simplier triangular solves */
1630       pc1  = b->a + bdiag[i];
1631       *pc1 = 1.0/rtmp11[i];
1632 #if defined(TMP)
1633       sctx.rs  = rs;
1634       sctx.pv  = rtmp22[i+1];
1635       if (info->shifttype == MAT_SHIFT_NONZERO){
1636         ierr = MatPivotCheck_nz(info,sctx,i+1);CHKERRQ(ierr);
1637       } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){
1638         ierr = MatPivotCheck_pd(info,sctx,i+1);CHKERRQ(ierr);
1639       } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1640         ierr = MatPivotCheck_inblocks(info,sctx,i+1);CHKERRQ(ierr);
1641       } else {
1642         ierr = MatPivotCheck_none(info,sctx,i+1);CHKERRQ(ierr);
1643       }
1644       rtmp22[i+1] = sctx.pv;
1645 #endif
1646       /* Mark diagonal and invert diagonal for simplier triangular solves */
1647       pc2  = b->a + bdiag[i+1];
1648       *pc2 = 1.0/rtmp22[i+1];
1649 
1650       pc3  = b->a + bdiag[i+2];
1651       *pc3 = 1.0/rtmp33[i+2];
1652         break;
1653 
1654       default:
1655         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
1656       }
1657       i += nodesz;                 /* Update the row */
1658     }
1659 
1660     /* MatPivotRefine() */
1661     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){
1662       /*
1663        * if no shift in this attempt & shifting & started shifting & can refine,
1664        * then try lower shift
1665        */
1666       sctx.shift_hi       = sctx.shift_fraction;
1667       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1668       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1669       sctx.useshift       = PETSC_TRUE;
1670       sctx.nshift++;
1671     }
1672   } while (sctx.useshift);
1673 
1674   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
1675   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1676   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1677   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1678 
1679   C->ops->solve              = MatSolve_SeqAIJ_Inode;
1680   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1681   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1682   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1683   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1684   C->assembled    = PETSC_TRUE;
1685   C->preallocated = PETSC_TRUE;
1686   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1687 
1688   /* MatShiftView(A,info,&sctx) */
1689   if (sctx.nshift){
1690     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) {
1691       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1692     } else if (info->shifttype == MAT_SHIFT_NONZERO) {
1693       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1694     } else if (info->shifttype == MAT_SHIFT_INBLOCKS){
1695       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1696     }
1697   }
1698   PetscFunctionReturn(0);
1699 }
1700 
1701 #undef __FUNCT__
1702 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1703 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1704 {
1705   Mat               C = B;
1706   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1707   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1708   PetscErrorCode    ierr;
1709   const PetscInt    *r,*ic,*c,*ics;
1710   PetscInt          n = A->rmap->n,*bi = b->i;
1711   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1712   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1713   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1714   PetscScalar       mul1,mul2,mul3,tmp;
1715   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1716   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1717   PetscReal         rs=0.0;
1718   FactorShiftCtx    sctx;
1719   PetscInt          newshift;
1720 
1721   PetscFunctionBegin;
1722   sctx.shift_top      = 0;
1723   sctx.nshift_max     = 0;
1724   sctx.shift_lo       = 0;
1725   sctx.shift_hi       = 0;
1726   sctx.shift_fraction = 0;
1727 
1728   /* if both shift schemes are chosen by user, only use info->shiftpd */
1729   if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1730     sctx.shift_top = 0;
1731     for (i=0; i<n; i++) {
1732       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1733       rs    = 0.0;
1734       ajtmp = aj + ai[i];
1735       rtmp1 = aa + ai[i];
1736       nz = ai[i+1] - ai[i];
1737       for (j=0; j<nz; j++){
1738         if (*ajtmp != i){
1739           rs += PetscAbsScalar(*rtmp1++);
1740         } else {
1741           rs -= PetscRealPart(*rtmp1++);
1742         }
1743         ajtmp++;
1744       }
1745       if (rs>sctx.shift_top) sctx.shift_top = rs;
1746     }
1747     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1748     sctx.shift_top *= 1.1;
1749     sctx.nshift_max = 5;
1750     sctx.shift_lo   = 0.;
1751     sctx.shift_hi   = 1.;
1752   }
1753   sctx.shift_amount = 0;
1754   sctx.nshift       = 0;
1755 
1756   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1757   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1758   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1759   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1760   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1761   ics   = ic ;
1762   rtmp22 = rtmp11 + n;
1763   rtmp33 = rtmp22 + n;
1764 
1765   node_max = a->inode.node_count;
1766   ns       = a->inode.size;
1767   if (!ns){
1768     SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1769   }
1770 
1771   /* If max inode size > 3, split it into two inodes.*/
1772   /* also map the inode sizes according to the ordering */
1773   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1774   for (i=0,j=0; i<node_max; ++i,++j){
1775     if (ns[i]>3) {
1776       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1777       ++j;
1778       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1779     } else {
1780       tmp_vec1[j] = ns[i];
1781     }
1782   }
1783   /* Use the correct node_max */
1784   node_max = j;
1785 
1786   /* Now reorder the inode info based on mat re-ordering info */
1787   /* First create a row -> inode_size_array_index map */
1788   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1789   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1790   for (i=0,row=0; i<node_max; i++) {
1791     nodesz = tmp_vec1[i];
1792     for (j=0; j<nodesz; j++,row++) {
1793       nsmap[row] = i;
1794     }
1795   }
1796   /* Using nsmap, create a reordered ns structure */
1797   for (i=0,j=0; i< node_max; i++) {
1798     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1799     tmp_vec2[i]  = nodesz;
1800     j           += nodesz;
1801   }
1802   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1803   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1804   /* Now use the correct ns */
1805   ns = tmp_vec2;
1806 
1807   do {
1808     sctx.useshift = PETSC_FALSE;
1809     /* Now loop over each block-row, and do the factorization */
1810     for (i=0,row=0; i<node_max; i++) {
1811       nodesz = ns[i];
1812       nz     = bi[row+1] - bi[row];
1813       bjtmp  = bj + bi[row];
1814 
1815       switch (nodesz){
1816       case 1:
1817         for  (j=0; j<nz; j++){
1818           idx        = bjtmp[j];
1819           rtmp11[idx] = 0.0;
1820         }
1821 
1822         /* load in initial (unfactored row) */
1823         idx    = r[row];
1824         nz_tmp = ai[idx+1] - ai[idx];
1825         ajtmp  = aj + ai[idx];
1826         v1     = aa + ai[idx];
1827 
1828         for (j=0; j<nz_tmp; j++) {
1829           idx        = ics[ajtmp[j]];
1830           rtmp11[idx] = v1[j];
1831         }
1832         rtmp11[ics[r[row]]] += sctx.shift_amount;
1833 
1834         prow = *bjtmp++ ;
1835         while (prow < row) {
1836           pc1 = rtmp11 + prow;
1837           if (*pc1 != 0.0){
1838             pv   = ba + bd[prow];
1839             pj   = nbj + bd[prow];
1840             mul1 = *pc1 * *pv++;
1841             *pc1 = mul1;
1842             nz_tmp = bi[prow+1] - bd[prow] - 1;
1843             ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1844             for (j=0; j<nz_tmp; j++) {
1845               tmp = pv[j];
1846               idx = pj[j];
1847               rtmp11[idx] -= mul1 * tmp;
1848             }
1849           }
1850           prow = *bjtmp++ ;
1851         }
1852         pj  = bj + bi[row];
1853         pc1 = ba + bi[row];
1854 
1855         sctx.pv    = rtmp11[row];
1856         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
1857         rs         = 0.0;
1858         for (j=0; j<nz; j++) {
1859           idx    = pj[j];
1860           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
1861           if (idx != row) rs += PetscAbsScalar(pc1[j]);
1862         }
1863         sctx.rs  = rs;
1864         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1865         if (newshift == 1) goto endofwhile;
1866         break;
1867 
1868       case 2:
1869         for (j=0; j<nz; j++) {
1870           idx        = bjtmp[j];
1871           rtmp11[idx] = 0.0;
1872           rtmp22[idx] = 0.0;
1873         }
1874 
1875         /* load in initial (unfactored row) */
1876         idx    = r[row];
1877         nz_tmp = ai[idx+1] - ai[idx];
1878         ajtmp  = aj + ai[idx];
1879         v1     = aa + ai[idx];
1880         v2     = aa + ai[idx+1];
1881         for (j=0; j<nz_tmp; j++) {
1882           idx        = ics[ajtmp[j]];
1883           rtmp11[idx] = v1[j];
1884           rtmp22[idx] = v2[j];
1885         }
1886         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1887         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1888 
1889         prow = *bjtmp++ ;
1890         while (prow < row) {
1891           pc1 = rtmp11 + prow;
1892           pc2 = rtmp22 + prow;
1893           if (*pc1 != 0.0 || *pc2 != 0.0){
1894             pv   = ba + bd[prow];
1895             pj   = nbj + bd[prow];
1896             mul1 = *pc1 * *pv;
1897             mul2 = *pc2 * *pv;
1898             ++pv;
1899             *pc1 = mul1;
1900             *pc2 = mul2;
1901 
1902             nz_tmp = bi[prow+1] - bd[prow] - 1;
1903             for (j=0; j<nz_tmp; j++) {
1904               tmp = pv[j];
1905               idx = pj[j];
1906               rtmp11[idx] -= mul1 * tmp;
1907               rtmp22[idx] -= mul2 * tmp;
1908             }
1909             ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
1910           }
1911           prow = *bjtmp++ ;
1912         }
1913 
1914         /* Now take care of diagonal 2x2 block. Note: prow = row here */
1915         pc1 = rtmp11 + prow;
1916         pc2 = rtmp22 + prow;
1917 
1918         sctx.pv = *pc1;
1919         pj      = bj + bi[prow];
1920         rs      = 0.0;
1921         for (j=0; j<nz; j++){
1922           idx = pj[j];
1923           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
1924         }
1925         sctx.rs = rs;
1926         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
1927         if (newshift == 1) goto endofwhile;
1928 
1929         if (*pc2 != 0.0){
1930           pj     = nbj + bd[prow];
1931           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1932           *pc2   = mul2;
1933           nz_tmp = bi[prow+1] - bd[prow] - 1;
1934           for (j=0; j<nz_tmp; j++) {
1935             idx = pj[j] ;
1936             tmp = rtmp11[idx];
1937             rtmp22[idx] -= mul2 * tmp;
1938           }
1939           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
1940         }
1941 
1942         pj  = bj + bi[row];
1943         pc1 = ba + bi[row];
1944         pc2 = ba + bi[row+1];
1945 
1946         sctx.pv = rtmp22[row+1];
1947         rs = 0.0;
1948         rtmp11[row]   = 1.0/rtmp11[row];
1949         rtmp22[row+1] = 1.0/rtmp22[row+1];
1950         /* copy row entries from dense representation to sparse */
1951         for (j=0; j<nz; j++) {
1952           idx    = pj[j];
1953           pc1[j] = rtmp11[idx];
1954           pc2[j] = rtmp22[idx];
1955           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1956         }
1957         sctx.rs = rs;
1958         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
1959         if (newshift == 1) goto endofwhile;
1960         break;
1961 
1962       case 3:
1963         for  (j=0; j<nz; j++) {
1964           idx        = bjtmp[j];
1965           rtmp11[idx] = 0.0;
1966           rtmp22[idx] = 0.0;
1967           rtmp33[idx] = 0.0;
1968         }
1969         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1970         idx    = r[row];
1971         nz_tmp = ai[idx+1] - ai[idx];
1972         ajtmp = aj + ai[idx];
1973         v1    = aa + ai[idx];
1974         v2    = aa + ai[idx+1];
1975         v3    = aa + ai[idx+2];
1976         for (j=0; j<nz_tmp; j++) {
1977           idx        = ics[ajtmp[j]];
1978           rtmp11[idx] = v1[j];
1979           rtmp22[idx] = v2[j];
1980           rtmp33[idx] = v3[j];
1981         }
1982         rtmp11[ics[r[row]]]   += sctx.shift_amount;
1983         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
1984         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
1985 
1986         /* loop over all pivot row blocks above this row block */
1987         prow = *bjtmp++ ;
1988         while (prow < row) {
1989           pc1 = rtmp11 + prow;
1990           pc2 = rtmp22 + prow;
1991           pc3 = rtmp33 + prow;
1992           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1993             pv   = ba  + bd[prow];
1994             pj   = nbj + bd[prow];
1995             mul1 = *pc1 * *pv;
1996             mul2 = *pc2 * *pv;
1997             mul3 = *pc3 * *pv;
1998             ++pv;
1999             *pc1 = mul1;
2000             *pc2 = mul2;
2001             *pc3 = mul3;
2002 
2003             nz_tmp = bi[prow+1] - bd[prow] - 1;
2004             /* update this row based on pivot row */
2005             for (j=0; j<nz_tmp; j++) {
2006               tmp = pv[j];
2007               idx = pj[j];
2008               rtmp11[idx] -= mul1 * tmp;
2009               rtmp22[idx] -= mul2 * tmp;
2010               rtmp33[idx] -= mul3 * tmp;
2011             }
2012             ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr);
2013           }
2014           prow = *bjtmp++ ;
2015         }
2016 
2017         /* Now take care of diagonal 3x3 block in this set of rows */
2018         /* note: prow = row here */
2019         pc1 = rtmp11 + prow;
2020         pc2 = rtmp22 + prow;
2021         pc3 = rtmp33 + prow;
2022 
2023         sctx.pv = *pc1;
2024         pj      = bj + bi[prow];
2025         rs      = 0.0;
2026         for (j=0; j<nz; j++){
2027           idx = pj[j];
2028           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2029         }
2030         sctx.rs = rs;
2031         ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr);
2032         if (newshift == 1) goto endofwhile;
2033 
2034         if (*pc2 != 0.0 || *pc3 != 0.0){
2035           mul2 = (*pc2)/(*pc1);
2036           mul3 = (*pc3)/(*pc1);
2037           *pc2 = mul2;
2038           *pc3 = mul3;
2039           nz_tmp = bi[prow+1] - bd[prow] - 1;
2040           pj     = nbj + bd[prow];
2041           for (j=0; j<nz_tmp; j++) {
2042             idx = pj[j] ;
2043             tmp = rtmp11[idx];
2044             rtmp22[idx] -= mul2 * tmp;
2045             rtmp33[idx] -= mul3 * tmp;
2046           }
2047           ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr);
2048         }
2049         ++prow;
2050 
2051         pc2 = rtmp22 + prow;
2052         pc3 = rtmp33 + prow;
2053         sctx.pv = *pc2;
2054         pj      = bj + bi[prow];
2055         rs      = 0.0;
2056         for (j=0; j<nz; j++){
2057           idx = pj[j];
2058           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2059         }
2060         sctx.rs = rs;
2061         ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr);
2062         if (newshift == 1) goto endofwhile;
2063 
2064         if (*pc3 != 0.0){
2065           mul3   = (*pc3)/(*pc2);
2066           *pc3   = mul3;
2067           pj     = nbj + bd[prow];
2068           nz_tmp = bi[prow+1] - bd[prow] - 1;
2069           for (j=0; j<nz_tmp; j++) {
2070             idx = pj[j] ;
2071             tmp = rtmp22[idx];
2072             rtmp33[idx] -= mul3 * tmp;
2073           }
2074           ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr);
2075         }
2076 
2077         pj  = bj + bi[row];
2078         pc1 = ba + bi[row];
2079         pc2 = ba + bi[row+1];
2080         pc3 = ba + bi[row+2];
2081 
2082         sctx.pv = rtmp33[row+2];
2083         rs = 0.0;
2084         rtmp11[row]   = 1.0/rtmp11[row];
2085         rtmp22[row+1] = 1.0/rtmp22[row+1];
2086         rtmp33[row+2] = 1.0/rtmp33[row+2];
2087         /* copy row entries from dense representation to sparse */
2088         for (j=0; j<nz; j++) {
2089           idx    = pj[j];
2090           pc1[j] = rtmp11[idx];
2091           pc2[j] = rtmp22[idx];
2092           pc3[j] = rtmp33[idx];
2093           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2094         }
2095 
2096         sctx.rs = rs;
2097         ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr);
2098         if (newshift == 1) goto endofwhile;
2099         break;
2100 
2101       default:
2102         SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n");
2103       }
2104       row += nodesz;                 /* Update the row */
2105     }
2106     endofwhile:;
2107   } while (sctx.useshift);
2108   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2109   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2110   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2111   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2112   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2113   (B)->ops->solve           = MatSolve_SeqAIJ_Inode_inplace;
2114   /* do not set solve add, since MatSolve_Inode + Add is faster */
2115   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2116   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2117   C->assembled   = PETSC_TRUE;
2118   C->preallocated = PETSC_TRUE;
2119   if (sctx.nshift) {
2120     if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) {
2121       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2122     } else if (info->shifttype == MAT_SHIFT_NONZERO) {
2123       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2124     }
2125   }
2126   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2127   PetscFunctionReturn(0);
2128 }
2129 
2130 
2131 /* ----------------------------------------------------------- */
2132 #undef __FUNCT__
2133 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2134 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2135 {
2136   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2137   IS                iscol = a->col,isrow = a->row;
2138   PetscErrorCode    ierr;
2139   const PetscInt    *r,*c,*rout,*cout;
2140   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2141   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2142   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2143   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2144   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2145   const PetscScalar *b;
2146 
2147   PetscFunctionBegin;
2148   if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
2149   node_max = a->inode.node_count;
2150   ns       = a->inode.size;     /* Node Size array */
2151 
2152   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2153   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2154   tmp  = a->solve_work;
2155 
2156   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2157   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2158 
2159   /* forward solve the lower triangular */
2160   tmps = tmp ;
2161   aa   = a_a ;
2162   aj   = a_j ;
2163   ad   = a->diag;
2164 
2165   for (i = 0,row = 0; i< node_max; ++i){
2166     nsz = ns[i];
2167     aii = ai[row];
2168     v1  = aa + aii;
2169     vi  = aj + aii;
2170     nz  = ai[row+1]- ai[row];
2171 
2172     if (i < node_max-1) {
2173       /* Prefetch the indices for the next block */
2174       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */
2175       /* Prefetch the data for the next block */
2176       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0);
2177     }
2178 
2179     switch (nsz){               /* Each loop in 'case' is unrolled */
2180     case 1 :
2181       sum1 = b[r[row]];
2182       for(j=0; j<nz-1; j+=2){
2183         i0   = vi[j];
2184         i1   = vi[j+1];
2185         tmp0 = tmps[i0];
2186         tmp1 = tmps[i1];
2187         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2188       }
2189       if(j == nz-1){
2190         tmp0 = tmps[vi[j]];
2191         sum1 -= v1[j]*tmp0;
2192       }
2193       tmp[row++]=sum1;
2194       break;
2195     case 2:
2196       sum1 = b[r[row]];
2197       sum2 = b[r[row+1]];
2198       v2   = aa + ai[row+1];
2199 
2200       for(j=0; j<nz-1; j+=2){
2201         i0   = vi[j];
2202         i1   = vi[j+1];
2203         tmp0 = tmps[i0];
2204         tmp1 = tmps[i1];
2205         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2206         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2207       }
2208       if(j == nz-1){
2209         tmp0 = tmps[vi[j]];
2210         sum1 -= v1[j] *tmp0;
2211         sum2 -= v2[j] *tmp0;
2212       }
2213       sum2 -= v2[nz] * sum1;
2214       tmp[row ++]=sum1;
2215       tmp[row ++]=sum2;
2216       break;
2217     case 3:
2218       sum1 = b[r[row]];
2219       sum2 = b[r[row+1]];
2220       sum3 = b[r[row+2]];
2221       v2   = aa + ai[row+1];
2222       v3   = aa + ai[row+2];
2223 
2224       for (j=0; j<nz-1; j+=2){
2225         i0   = vi[j];
2226         i1   = vi[j+1];
2227         tmp0 = tmps[i0];
2228         tmp1 = tmps[i1];
2229         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2230         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2231         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2232       }
2233       if (j == nz-1){
2234         tmp0 = tmps[vi[j]];
2235         sum1 -= v1[j] *tmp0;
2236         sum2 -= v2[j] *tmp0;
2237         sum3 -= v3[j] *tmp0;
2238       }
2239       sum2 -= v2[nz] * sum1;
2240       sum3 -= v3[nz] * sum1;
2241       sum3 -= v3[nz+1] * sum2;
2242       tmp[row ++]=sum1;
2243       tmp[row ++]=sum2;
2244       tmp[row ++]=sum3;
2245       break;
2246 
2247     case 4:
2248       sum1 = b[r[row]];
2249       sum2 = b[r[row+1]];
2250       sum3 = b[r[row+2]];
2251       sum4 = b[r[row+3]];
2252       v2   = aa + ai[row+1];
2253       v3   = aa + ai[row+2];
2254       v4   = aa + ai[row+3];
2255 
2256       for (j=0; j<nz-1; j+=2){
2257         i0   = vi[j];
2258         i1   = vi[j+1];
2259         tmp0 = tmps[i0];
2260         tmp1 = tmps[i1];
2261         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2262         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2263         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2264         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2265       }
2266       if (j == nz-1){
2267         tmp0 = tmps[vi[j]];
2268         sum1 -= v1[j] *tmp0;
2269         sum2 -= v2[j] *tmp0;
2270         sum3 -= v3[j] *tmp0;
2271         sum4 -= v4[j] *tmp0;
2272       }
2273       sum2 -= v2[nz] * sum1;
2274       sum3 -= v3[nz] * sum1;
2275       sum4 -= v4[nz] * sum1;
2276       sum3 -= v3[nz+1] * sum2;
2277       sum4 -= v4[nz+1] * sum2;
2278       sum4 -= v4[nz+2] * sum3;
2279 
2280       tmp[row ++]=sum1;
2281       tmp[row ++]=sum2;
2282       tmp[row ++]=sum3;
2283       tmp[row ++]=sum4;
2284       break;
2285     case 5:
2286       sum1 = b[r[row]];
2287       sum2 = b[r[row+1]];
2288       sum3 = b[r[row+2]];
2289       sum4 = b[r[row+3]];
2290       sum5 = b[r[row+4]];
2291       v2   = aa + ai[row+1];
2292       v3   = aa + ai[row+2];
2293       v4   = aa + ai[row+3];
2294       v5   = aa + ai[row+4];
2295 
2296       for (j=0; j<nz-1; j+=2){
2297         i0   = vi[j];
2298         i1   = vi[j+1];
2299         tmp0 = tmps[i0];
2300         tmp1 = tmps[i1];
2301         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2302         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2303         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2304         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2305         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2306       }
2307       if (j == nz-1){
2308         tmp0 = tmps[vi[j]];
2309         sum1 -= v1[j] *tmp0;
2310         sum2 -= v2[j] *tmp0;
2311         sum3 -= v3[j] *tmp0;
2312         sum4 -= v4[j] *tmp0;
2313         sum5 -= v5[j] *tmp0;
2314       }
2315 
2316       sum2 -= v2[nz] * sum1;
2317       sum3 -= v3[nz] * sum1;
2318       sum4 -= v4[nz] * sum1;
2319       sum5 -= v5[nz] * sum1;
2320       sum3 -= v3[nz+1] * sum2;
2321       sum4 -= v4[nz+1] * sum2;
2322       sum5 -= v5[nz+1] * sum2;
2323       sum4 -= v4[nz+2] * sum3;
2324       sum5 -= v5[nz+2] * sum3;
2325       sum5 -= v5[nz+3] * sum4;
2326 
2327       tmp[row ++]=sum1;
2328       tmp[row ++]=sum2;
2329       tmp[row ++]=sum3;
2330       tmp[row ++]=sum4;
2331       tmp[row ++]=sum5;
2332       break;
2333     default:
2334       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2335     }
2336   }
2337   /* backward solve the upper triangular */
2338   for (i=node_max -1 ,row = n-1 ; i>=0; i--){
2339     nsz = ns[i];
2340     aii = ad[row+1] + 1;
2341     v1  = aa + aii;
2342     vi  = aj + aii;
2343     nz  = ad[row]- ad[row+1] - 1;
2344 
2345     if (i > 0) {
2346       /* Prefetch the indices for the next block */
2347       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */
2348       /* Prefetch the data for the next block */
2349       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0);
2350     }
2351 
2352     switch (nsz){               /* Each loop in 'case' is unrolled */
2353     case 1 :
2354       sum1 = tmp[row];
2355 
2356       for(j=0 ; j<nz-1; j+=2){
2357         i0   = vi[j];
2358         i1   = vi[j+1];
2359         tmp0 = tmps[i0];
2360         tmp1 = tmps[i1];
2361         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2362       }
2363       if (j == nz-1){
2364         tmp0  = tmps[vi[j]];
2365         sum1 -= v1[j]*tmp0;
2366       }
2367       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2368       break;
2369     case 2 :
2370       sum1 = tmp[row];
2371       sum2 = tmp[row-1];
2372       v2   = aa + ad[row] + 1;
2373       for (j=0 ; j<nz-1; j+=2){
2374         i0   = vi[j];
2375         i1   = vi[j+1];
2376         tmp0 = tmps[i0];
2377         tmp1 = tmps[i1];
2378         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2379         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2380       }
2381       if (j == nz-1){
2382         tmp0  = tmps[vi[j]];
2383         sum1 -= v1[j]* tmp0;
2384         sum2 -= v2[j+1]* tmp0;
2385       }
2386 
2387       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2388       sum2   -= v2[0] * tmp0;
2389       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2390       break;
2391     case 3 :
2392       sum1 = tmp[row];
2393       sum2 = tmp[row -1];
2394       sum3 = tmp[row -2];
2395       v2   = aa + ad[row] + 1;
2396       v3   = aa + ad[row -1] + 1;
2397       for (j=0 ; j<nz-1; j+=2){
2398         i0   = vi[j];
2399         i1   = vi[j+1];
2400         tmp0 = tmps[i0];
2401         tmp1 = tmps[i1];
2402         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2403         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2404         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2405       }
2406       if (j== nz-1){
2407         tmp0  = tmps[vi[j]];
2408         sum1 -= v1[j] * tmp0;
2409         sum2 -= v2[j+1] * tmp0;
2410         sum3 -= v3[j+2] * tmp0;
2411       }
2412       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2413       sum2   -= v2[0]* tmp0;
2414       sum3   -= v3[1] * tmp0;
2415       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2416       sum3   -= v3[0]* tmp0;
2417       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2418 
2419       break;
2420     case 4 :
2421       sum1 = tmp[row];
2422       sum2 = tmp[row -1];
2423       sum3 = tmp[row -2];
2424       sum4 = tmp[row -3];
2425       v2   = aa + ad[row]+1;
2426       v3   = aa + ad[row -1]+1;
2427       v4   = aa + ad[row -2]+1;
2428 
2429       for (j=0 ; j<nz-1; j+=2){
2430         i0   = vi[j];
2431         i1   = vi[j+1];
2432         tmp0 = tmps[i0];
2433         tmp1 = tmps[i1];
2434         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2435         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2436         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2437         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2438       }
2439       if (j== nz-1){
2440         tmp0  = tmps[vi[j]];
2441         sum1 -= v1[j] * tmp0;
2442         sum2 -= v2[j+1] * tmp0;
2443         sum3 -= v3[j+2] * tmp0;
2444         sum4 -= v4[j+3] * tmp0;
2445       }
2446 
2447       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2448       sum2   -= v2[0] * tmp0;
2449       sum3   -= v3[1] * tmp0;
2450       sum4   -= v4[2] * tmp0;
2451       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2452       sum3   -= v3[0] * tmp0;
2453       sum4   -= v4[1] * tmp0;
2454       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2455       sum4   -= v4[0] * tmp0;
2456       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2457       break;
2458     case 5 :
2459       sum1 = tmp[row];
2460       sum2 = tmp[row -1];
2461       sum3 = tmp[row -2];
2462       sum4 = tmp[row -3];
2463       sum5 = tmp[row -4];
2464       v2   = aa + ad[row]+1;
2465       v3   = aa + ad[row -1]+1;
2466       v4   = aa + ad[row -2]+1;
2467       v5   = aa + ad[row -3]+1;
2468       for (j=0 ; j<nz-1; j+=2){
2469         i0   = vi[j];
2470         i1   = vi[j+1];
2471         tmp0 = tmps[i0];
2472         tmp1 = tmps[i1];
2473         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2474         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2475         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2476         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2477         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2478       }
2479       if (j==nz-1){
2480         tmp0  = tmps[vi[j]];
2481         sum1 -= v1[j] * tmp0;
2482         sum2 -= v2[j+1] * tmp0;
2483         sum3 -= v3[j+2] * tmp0;
2484         sum4 -= v4[j+3] * tmp0;
2485         sum5 -= v5[j+4] * tmp0;
2486       }
2487 
2488       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2489       sum2   -= v2[0] * tmp0;
2490       sum3   -= v3[1] * tmp0;
2491       sum4   -= v4[2] * tmp0;
2492       sum5   -= v5[3] * tmp0;
2493       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2494       sum3   -= v3[0] * tmp0;
2495       sum4   -= v4[1] * tmp0;
2496       sum5   -= v5[2] * tmp0;
2497       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2498       sum4   -= v4[0] * tmp0;
2499       sum5   -= v5[1] * tmp0;
2500       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2501       sum5   -= v5[0] * tmp0;
2502       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2503       break;
2504     default:
2505       SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
2506     }
2507   }
2508   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2509   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2510   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2511   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2512   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2513   PetscFunctionReturn(0);
2514 }
2515 
2516 
2517 /*
2518      Makes a longer coloring[] array and calls the usual code with that
2519 */
2520 #undef __FUNCT__
2521 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2522 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2523 {
2524   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2525   PetscErrorCode  ierr;
2526   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2527   PetscInt        *colorused,i;
2528   ISColoringValue *newcolor;
2529 
2530   PetscFunctionBegin;
2531   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2532   /* loop over inodes, marking a color for each column*/
2533   row = 0;
2534   for (i=0; i<m; i++){
2535     for (j=0; j<ns[i]; j++) {
2536       newcolor[row++] = coloring[i] + j*ncolors;
2537     }
2538   }
2539 
2540   /* eliminate unneeded colors */
2541   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2542   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2543   for (i=0; i<n; i++) {
2544     colorused[newcolor[i]] = 1;
2545   }
2546 
2547   for (i=1; i<5*ncolors; i++) {
2548     colorused[i] += colorused[i-1];
2549   }
2550   ncolors = colorused[5*ncolors-1];
2551   for (i=0; i<n; i++) {
2552     newcolor[i] = colorused[newcolor[i]]-1;
2553   }
2554   ierr = PetscFree(colorused);CHKERRQ(ierr);
2555   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2556   ierr = PetscFree(coloring);CHKERRQ(ierr);
2557   PetscFunctionReturn(0);
2558 }
2559 
2560 #include "../src/mat/blockinvert.h"
2561 
2562 #undef __FUNCT__
2563 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2564 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2565 {
2566   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2567   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2568   MatScalar          *ibdiag,*bdiag,work[25];
2569   PetscScalar        *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5;
2570   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2571   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2572   PetscErrorCode     ierr;
2573   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2574   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2575 
2576   PetscFunctionBegin;
2577   if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2578   if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2579   if (its > 1) {
2580     /* switch to non-inode version */
2581     ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr);
2582     PetscFunctionReturn(0);
2583   }
2584 
2585   if (!a->inode.ibdiagvalid) {
2586     if (!a->inode.ibdiag) {
2587       /* calculate space needed for diagonal blocks */
2588       for (i=0; i<m; i++) {
2589 	cnt += sizes[i]*sizes[i];
2590       }
2591       a->inode.bdiagsize = cnt;
2592       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2593     }
2594 
2595     /* copy over the diagonal blocks and invert them */
2596     ibdiag = a->inode.ibdiag;
2597     bdiag  = a->inode.bdiag;
2598     cnt = 0;
2599     for (i=0, row = 0; i<m; i++) {
2600       for (j=0; j<sizes[i]; j++) {
2601         for (k=0; k<sizes[i]; k++) {
2602           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2603         }
2604       }
2605       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2606 
2607       switch(sizes[i]) {
2608         case 1:
2609           /* Create matrix data structure */
2610           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2611           ibdiag[cnt] = 1.0/ibdiag[cnt];
2612           break;
2613         case 2:
2614           ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2615           break;
2616         case 3:
2617           ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2618           break;
2619         case 4:
2620           ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2621           break;
2622         case 5:
2623           ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2624           break;
2625        default:
2626 	 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2627       }
2628       cnt += sizes[i]*sizes[i];
2629       row += sizes[i];
2630     }
2631     a->inode.ibdiagvalid = PETSC_TRUE;
2632   }
2633   ibdiag = a->inode.ibdiag;
2634   bdiag  = a->inode.bdiag;
2635 
2636 
2637   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2638   if (flag & SOR_ZERO_INITIAL_GUESS) {
2639     PetscScalar *b;
2640     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2641     if (xx != bb) {
2642       ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2643     } else {
2644       b = x;
2645     }
2646     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){
2647 
2648       for (i=0, row=0; i<m; i++) {
2649         sz  = diag[row] - ii[row];
2650         v1  = a->a + ii[row];
2651         idx = a->j + ii[row];
2652 
2653         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2654         switch (sizes[i]){
2655           case 1:
2656 
2657             sum1  = b[row];
2658             for(n = 0; n<sz-1; n+=2) {
2659               i1   = idx[0];
2660               i2   = idx[1];
2661               idx += 2;
2662               tmp0 = x[i1];
2663               tmp1 = x[i2];
2664               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2665             }
2666 
2667             if (n == sz-1){
2668               tmp0  = x[*idx];
2669               sum1 -= *v1 * tmp0;
2670             }
2671             x[row++] = sum1*(*ibdiag++);
2672             break;
2673           case 2:
2674             v2    = a->a + ii[row+1];
2675             sum1  = b[row];
2676             sum2  = b[row+1];
2677             for(n = 0; n<sz-1; n+=2) {
2678               i1   = idx[0];
2679               i2   = idx[1];
2680               idx += 2;
2681               tmp0 = x[i1];
2682               tmp1 = x[i2];
2683               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2684               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2685             }
2686 
2687             if (n == sz-1){
2688               tmp0  = x[*idx];
2689               sum1 -= v1[0] * tmp0;
2690               sum2 -= v2[0] * tmp0;
2691             }
2692             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2693             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2694             ibdiag  += 4;
2695             break;
2696           case 3:
2697             v2    = a->a + ii[row+1];
2698             v3    = a->a + ii[row+2];
2699             sum1  = b[row];
2700             sum2  = b[row+1];
2701             sum3  = b[row+2];
2702             for(n = 0; n<sz-1; n+=2) {
2703               i1   = idx[0];
2704               i2   = idx[1];
2705               idx += 2;
2706               tmp0 = x[i1];
2707               tmp1 = x[i2];
2708               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2709               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2710               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2711             }
2712 
2713             if (n == sz-1){
2714               tmp0  = x[*idx];
2715               sum1 -= v1[0] * tmp0;
2716               sum2 -= v2[0] * tmp0;
2717               sum3 -= v3[0] * tmp0;
2718             }
2719             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2720             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2721             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2722             ibdiag  += 9;
2723             break;
2724           case 4:
2725             v2    = a->a + ii[row+1];
2726             v3    = a->a + ii[row+2];
2727             v4    = a->a + ii[row+3];
2728             sum1  = b[row];
2729             sum2  = b[row+1];
2730             sum3  = b[row+2];
2731             sum4  = b[row+3];
2732             for(n = 0; n<sz-1; n+=2) {
2733               i1   = idx[0];
2734               i2   = idx[1];
2735               idx += 2;
2736               tmp0 = x[i1];
2737               tmp1 = x[i2];
2738               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2739               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2740               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2741               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2742             }
2743 
2744             if (n == sz-1){
2745               tmp0  = x[*idx];
2746               sum1 -= v1[0] * tmp0;
2747               sum2 -= v2[0] * tmp0;
2748               sum3 -= v3[0] * tmp0;
2749               sum4 -= v4[0] * tmp0;
2750             }
2751             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2752             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2753             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2754             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2755             ibdiag  += 16;
2756             break;
2757           case 5:
2758             v2    = a->a + ii[row+1];
2759             v3    = a->a + ii[row+2];
2760             v4    = a->a + ii[row+3];
2761             v5    = a->a + ii[row+4];
2762             sum1  = b[row];
2763             sum2  = b[row+1];
2764             sum3  = b[row+2];
2765             sum4  = b[row+3];
2766             sum5  = b[row+4];
2767             for(n = 0; n<sz-1; n+=2) {
2768               i1   = idx[0];
2769               i2   = idx[1];
2770               idx += 2;
2771               tmp0 = x[i1];
2772               tmp1 = x[i2];
2773               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2774               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2775               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2776               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2777               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2778             }
2779 
2780             if (n == sz-1){
2781               tmp0  = x[*idx];
2782               sum1 -= v1[0] * tmp0;
2783               sum2 -= v2[0] * tmp0;
2784               sum3 -= v3[0] * tmp0;
2785               sum4 -= v4[0] * tmp0;
2786               sum5 -= v5[0] * tmp0;
2787             }
2788             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2789             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2790             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2791             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2792             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2793             ibdiag  += 25;
2794             break;
2795 	  default:
2796    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2797         }
2798       }
2799 
2800       xb = x;
2801       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2802     } else xb = b;
2803     if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) &&
2804         (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) {
2805       cnt = 0;
2806       for (i=0, row=0; i<m; i++) {
2807 
2808         switch (sizes[i]){
2809           case 1:
2810             x[row++] *= bdiag[cnt++];
2811             break;
2812           case 2:
2813             x1   = x[row]; x2 = x[row+1];
2814             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
2815             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
2816             x[row++] = tmp1;
2817             x[row++] = tmp2;
2818             cnt += 4;
2819             break;
2820           case 3:
2821             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
2822             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
2823             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
2824             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
2825             x[row++] = tmp1;
2826             x[row++] = tmp2;
2827             x[row++] = tmp3;
2828             cnt += 9;
2829             break;
2830           case 4:
2831             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
2832             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
2833             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
2834             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
2835             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
2836             x[row++] = tmp1;
2837             x[row++] = tmp2;
2838             x[row++] = tmp3;
2839             x[row++] = tmp4;
2840             cnt += 16;
2841             break;
2842           case 5:
2843             x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
2844             tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
2845             tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
2846             tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
2847             tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
2848             tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
2849             x[row++] = tmp1;
2850             x[row++] = tmp2;
2851             x[row++] = tmp3;
2852             x[row++] = tmp4;
2853             x[row++] = tmp5;
2854             cnt += 25;
2855             break;
2856 	  default:
2857    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2858         }
2859       }
2860       ierr = PetscLogFlops(m);CHKERRQ(ierr);
2861     }
2862     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){
2863 
2864       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2865       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2866         ibdiag -= sizes[i]*sizes[i];
2867         sz      = ii[row+1] - diag[row] - 1;
2868         v1      = a->a + diag[row] + 1;
2869         idx     = a->j + diag[row] + 1;
2870 
2871         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2872         switch (sizes[i]){
2873           case 1:
2874 
2875             sum1  = xb[row];
2876             for(n = 0; n<sz-1; n+=2) {
2877               i1   = idx[0];
2878               i2   = idx[1];
2879               idx += 2;
2880               tmp0 = x[i1];
2881               tmp1 = x[i2];
2882               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2883             }
2884 
2885             if (n == sz-1){
2886               tmp0  = x[*idx];
2887               sum1 -= *v1*tmp0;
2888             }
2889             x[row--] = sum1*(*ibdiag);
2890             break;
2891 
2892           case 2:
2893 
2894             sum1  = xb[row];
2895             sum2  = xb[row-1];
2896             /* note that sum1 is associated with the second of the two rows */
2897             v2    = a->a + diag[row-1] + 2;
2898             for(n = 0; n<sz-1; n+=2) {
2899               i1   = idx[0];
2900               i2   = idx[1];
2901               idx += 2;
2902               tmp0 = x[i1];
2903               tmp1 = x[i2];
2904               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2905               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2906             }
2907 
2908             if (n == sz-1){
2909               tmp0  = x[*idx];
2910               sum1 -= *v1*tmp0;
2911               sum2 -= *v2*tmp0;
2912             }
2913             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
2914             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
2915             break;
2916           case 3:
2917 
2918             sum1  = xb[row];
2919             sum2  = xb[row-1];
2920             sum3  = xb[row-2];
2921             v2    = a->a + diag[row-1] + 2;
2922             v3    = a->a + diag[row-2] + 3;
2923             for(n = 0; n<sz-1; n+=2) {
2924               i1   = idx[0];
2925               i2   = idx[1];
2926               idx += 2;
2927               tmp0 = x[i1];
2928               tmp1 = x[i2];
2929               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2930               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2931               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2932             }
2933 
2934             if (n == sz-1){
2935               tmp0  = x[*idx];
2936               sum1 -= *v1*tmp0;
2937               sum2 -= *v2*tmp0;
2938               sum3 -= *v3*tmp0;
2939             }
2940             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
2941             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
2942             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
2943             break;
2944           case 4:
2945 
2946             sum1  = xb[row];
2947             sum2  = xb[row-1];
2948             sum3  = xb[row-2];
2949             sum4  = xb[row-3];
2950             v2    = a->a + diag[row-1] + 2;
2951             v3    = a->a + diag[row-2] + 3;
2952             v4    = a->a + diag[row-3] + 4;
2953             for(n = 0; n<sz-1; n+=2) {
2954               i1   = idx[0];
2955               i2   = idx[1];
2956               idx += 2;
2957               tmp0 = x[i1];
2958               tmp1 = x[i2];
2959               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2960               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2961               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2962               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2963             }
2964 
2965             if (n == sz-1){
2966               tmp0  = x[*idx];
2967               sum1 -= *v1*tmp0;
2968               sum2 -= *v2*tmp0;
2969               sum3 -= *v3*tmp0;
2970               sum4 -= *v4*tmp0;
2971             }
2972             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
2973             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
2974             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
2975             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
2976             break;
2977           case 5:
2978 
2979             sum1  = xb[row];
2980             sum2  = xb[row-1];
2981             sum3  = xb[row-2];
2982             sum4  = xb[row-3];
2983             sum5  = xb[row-4];
2984             v2    = a->a + diag[row-1] + 2;
2985             v3    = a->a + diag[row-2] + 3;
2986             v4    = a->a + diag[row-3] + 4;
2987             v5    = a->a + diag[row-4] + 5;
2988             for(n = 0; n<sz-1; n+=2) {
2989               i1   = idx[0];
2990               i2   = idx[1];
2991               idx += 2;
2992               tmp0 = x[i1];
2993               tmp1 = x[i2];
2994               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2995               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2996               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2997               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2998               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2999             }
3000 
3001             if (n == sz-1){
3002               tmp0  = x[*idx];
3003               sum1 -= *v1*tmp0;
3004               sum2 -= *v2*tmp0;
3005               sum3 -= *v3*tmp0;
3006               sum4 -= *v4*tmp0;
3007               sum5 -= *v5*tmp0;
3008             }
3009             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3010             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3011             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3012             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3013             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3014             break;
3015 	  default:
3016    	    SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3017         }
3018       }
3019 
3020       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3021     }
3022     its--;
3023     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3024     if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);}
3025   }
3026   if (flag & SOR_EISENSTAT) {
3027     const PetscScalar *b;
3028     MatScalar         *t = a->inode.ssor_work;
3029 
3030     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3031     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3032     /*
3033           Apply  (U + D)^-1  where D is now the block diagonal
3034     */
3035     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3036     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3037       ibdiag -= sizes[i]*sizes[i];
3038       sz      = ii[row+1] - diag[row] - 1;
3039       v1      = a->a + diag[row] + 1;
3040       idx     = a->j + diag[row] + 1;
3041       CHKMEMQ;
3042       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3043       switch (sizes[i]){
3044         case 1:
3045 
3046 	  sum1  = b[row];
3047 	  for(n = 0; n<sz-1; n+=2) {
3048 	    i1   = idx[0];
3049 	    i2   = idx[1];
3050 	    idx += 2;
3051 	    tmp0 = x[i1];
3052 	    tmp1 = x[i2];
3053 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3054 	  }
3055 
3056 	  if (n == sz-1){
3057 	    tmp0  = x[*idx];
3058 	    sum1 -= *v1*tmp0;
3059 	  }
3060 	  x[row] = sum1*(*ibdiag);row--;
3061 	  break;
3062 
3063         case 2:
3064 
3065 	  sum1  = b[row];
3066 	  sum2  = b[row-1];
3067 	  /* note that sum1 is associated with the second of the two rows */
3068 	  v2    = a->a + diag[row-1] + 2;
3069 	  for(n = 0; n<sz-1; n+=2) {
3070 	    i1   = idx[0];
3071 	    i2   = idx[1];
3072 	    idx += 2;
3073 	    tmp0 = x[i1];
3074 	    tmp1 = x[i2];
3075 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3076 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3077 	  }
3078 
3079 	  if (n == sz-1){
3080 	    tmp0  = x[*idx];
3081 	    sum1 -= *v1*tmp0;
3082 	    sum2 -= *v2*tmp0;
3083 	  }
3084 	  x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3085 	  x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3086           row -= 2;
3087 	  break;
3088         case 3:
3089 
3090 	  sum1  = b[row];
3091 	  sum2  = b[row-1];
3092 	  sum3  = b[row-2];
3093 	  v2    = a->a + diag[row-1] + 2;
3094 	  v3    = a->a + diag[row-2] + 3;
3095 	  for(n = 0; n<sz-1; n+=2) {
3096 	    i1   = idx[0];
3097 	    i2   = idx[1];
3098 	    idx += 2;
3099 	    tmp0 = x[i1];
3100 	    tmp1 = x[i2];
3101 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3102 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3103 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3104 	  }
3105 
3106 	  if (n == sz-1){
3107 	    tmp0  = x[*idx];
3108 	    sum1 -= *v1*tmp0;
3109 	    sum2 -= *v2*tmp0;
3110 	    sum3 -= *v3*tmp0;
3111 	  }
3112 	  x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3113 	  x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3114 	  x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3115           row -= 3;
3116 	  break;
3117         case 4:
3118 
3119 	  sum1  = b[row];
3120 	  sum2  = b[row-1];
3121 	  sum3  = b[row-2];
3122 	  sum4  = b[row-3];
3123 	  v2    = a->a + diag[row-1] + 2;
3124 	  v3    = a->a + diag[row-2] + 3;
3125 	  v4    = a->a + diag[row-3] + 4;
3126 	  for(n = 0; n<sz-1; n+=2) {
3127 	    i1   = idx[0];
3128 	    i2   = idx[1];
3129 	    idx += 2;
3130 	    tmp0 = x[i1];
3131 	    tmp1 = x[i2];
3132 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3133 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3134 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3135 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3136 	  }
3137 
3138 	  if (n == sz-1){
3139 	    tmp0  = x[*idx];
3140 	    sum1 -= *v1*tmp0;
3141 	    sum2 -= *v2*tmp0;
3142 	    sum3 -= *v3*tmp0;
3143 	    sum4 -= *v4*tmp0;
3144 	  }
3145 	  x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3146 	  x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3147 	  x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3148 	  x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3149           row -= 4;
3150 	  break;
3151         case 5:
3152 
3153 	  sum1  = b[row];
3154 	  sum2  = b[row-1];
3155 	  sum3  = b[row-2];
3156 	  sum4  = b[row-3];
3157 	  sum5  = b[row-4];
3158 	  v2    = a->a + diag[row-1] + 2;
3159 	  v3    = a->a + diag[row-2] + 3;
3160 	  v4    = a->a + diag[row-3] + 4;
3161 	  v5    = a->a + diag[row-4] + 5;
3162 	  for(n = 0; n<sz-1; n+=2) {
3163 	    i1   = idx[0];
3164 	    i2   = idx[1];
3165 	    idx += 2;
3166 	    tmp0 = x[i1];
3167 	    tmp1 = x[i2];
3168 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3169 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3170 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3171 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3172 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3173 	  }
3174 
3175 	  if (n == sz-1){
3176 	    tmp0  = x[*idx];
3177 	    sum1 -= *v1*tmp0;
3178 	    sum2 -= *v2*tmp0;
3179 	    sum3 -= *v3*tmp0;
3180 	    sum4 -= *v4*tmp0;
3181 	    sum5 -= *v5*tmp0;
3182 	  }
3183 	  x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3184 	  x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3185 	  x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3186 	  x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3187 	  x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3188           row -= 5;
3189 	  break;
3190         default:
3191 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3192       }
3193       CHKMEMQ;
3194     }
3195     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3196 
3197     /*
3198            t = b - D x    where D is the block diagonal
3199     */
3200     cnt = 0;
3201     for (i=0, row=0; i<m; i++) {
3202       CHKMEMQ;
3203       switch (sizes[i]){
3204         case 1:
3205 	  t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3206 	  break;
3207         case 2:
3208 	  x1   = x[row]; x2 = x[row+1];
3209 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3210 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3211 	  t[row]   = b[row] - tmp1;
3212 	  t[row+1] = b[row+1] - tmp2; row += 2;
3213 	  cnt += 4;
3214 	  break;
3215         case 3:
3216 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3217 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3218 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3219 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3220 	  t[row] = b[row] - tmp1;
3221 	  t[row+1] = b[row+1] - tmp2;
3222 	  t[row+2] = b[row+2] - tmp3; row += 3;
3223 	  cnt += 9;
3224 	  break;
3225         case 4:
3226 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3227 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3228 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3229 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3230 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3231 	  t[row] = b[row] - tmp1;
3232 	  t[row+1] = b[row+1] - tmp2;
3233 	  t[row+2] = b[row+2] - tmp3;
3234 	  t[row+3] = b[row+3] - tmp4; row += 4;
3235 	  cnt += 16;
3236 	  break;
3237         case 5:
3238 	  x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3239 	  tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3240 	  tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3241 	  tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3242 	  tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3243 	  tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3244 	  t[row] = b[row] - tmp1;
3245 	  t[row+1] = b[row+1] - tmp2;
3246 	  t[row+2] = b[row+2] - tmp3;
3247 	  t[row+3] = b[row+3] - tmp4;
3248 	  t[row+4] = b[row+4] - tmp5;row += 5;
3249 	  cnt += 25;
3250 	  break;
3251         default:
3252 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3253       }
3254       CHKMEMQ;
3255     }
3256     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3257 
3258 
3259 
3260     /*
3261           Apply (L + D)^-1 where D is the block diagonal
3262     */
3263     for (i=0, row=0; i<m; i++) {
3264       sz  = diag[row] - ii[row];
3265       v1  = a->a + ii[row];
3266       idx = a->j + ii[row];
3267       CHKMEMQ;
3268       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3269       switch (sizes[i]){
3270         case 1:
3271 
3272 	  sum1  = t[row];
3273 	  for(n = 0; n<sz-1; n+=2) {
3274 	    i1   = idx[0];
3275 	    i2   = idx[1];
3276 	    idx += 2;
3277 	    tmp0 = t[i1];
3278 	    tmp1 = t[i2];
3279 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3280 	  }
3281 
3282 	  if (n == sz-1){
3283 	    tmp0  = t[*idx];
3284 	    sum1 -= *v1 * tmp0;
3285 	  }
3286 	  x[row] += t[row] = sum1*(*ibdiag++); row++;
3287 	  break;
3288         case 2:
3289 	  v2    = a->a + ii[row+1];
3290 	  sum1  = t[row];
3291 	  sum2  = t[row+1];
3292 	  for(n = 0; n<sz-1; n+=2) {
3293 	    i1   = idx[0];
3294 	    i2   = idx[1];
3295 	    idx += 2;
3296 	    tmp0 = t[i1];
3297 	    tmp1 = t[i2];
3298 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3299 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3300 	  }
3301 
3302 	  if (n == sz-1){
3303 	    tmp0  = t[*idx];
3304 	    sum1 -= v1[0] * tmp0;
3305 	    sum2 -= v2[0] * tmp0;
3306 	  }
3307 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3308 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3309 	  ibdiag  += 4; row += 2;
3310 	  break;
3311         case 3:
3312 	  v2    = a->a + ii[row+1];
3313 	  v3    = a->a + ii[row+2];
3314 	  sum1  = t[row];
3315 	  sum2  = t[row+1];
3316 	  sum3  = t[row+2];
3317 	  for(n = 0; n<sz-1; n+=2) {
3318 	    i1   = idx[0];
3319 	    i2   = idx[1];
3320 	    idx += 2;
3321 	    tmp0 = t[i1];
3322 	    tmp1 = t[i2];
3323 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3324 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3325 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3326 	  }
3327 
3328 	  if (n == sz-1){
3329 	    tmp0  = t[*idx];
3330 	    sum1 -= v1[0] * tmp0;
3331 	    sum2 -= v2[0] * tmp0;
3332 	    sum3 -= v3[0] * tmp0;
3333 	  }
3334 	  x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3335 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3336 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3337 	  ibdiag  += 9; row += 3;
3338 	  break;
3339         case 4:
3340 	  v2    = a->a + ii[row+1];
3341 	  v3    = a->a + ii[row+2];
3342 	  v4    = a->a + ii[row+3];
3343 	  sum1  = t[row];
3344 	  sum2  = t[row+1];
3345 	  sum3  = t[row+2];
3346 	  sum4  = t[row+3];
3347 	  for(n = 0; n<sz-1; n+=2) {
3348 	    i1   = idx[0];
3349 	    i2   = idx[1];
3350 	    idx += 2;
3351 	    tmp0 = t[i1];
3352 	    tmp1 = t[i2];
3353 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3354 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3355 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3356 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3357 	  }
3358 
3359 	  if (n == sz-1){
3360 	    tmp0  = t[*idx];
3361 	    sum1 -= v1[0] * tmp0;
3362 	    sum2 -= v2[0] * tmp0;
3363 	    sum3 -= v3[0] * tmp0;
3364 	    sum4 -= v4[0] * tmp0;
3365 	  }
3366 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3367 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3368 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3369 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3370 	  ibdiag  += 16; row += 4;
3371 	  break;
3372         case 5:
3373 	  v2    = a->a + ii[row+1];
3374 	  v3    = a->a + ii[row+2];
3375 	  v4    = a->a + ii[row+3];
3376 	  v5    = a->a + ii[row+4];
3377 	  sum1  = t[row];
3378 	  sum2  = t[row+1];
3379 	  sum3  = t[row+2];
3380 	  sum4  = t[row+3];
3381 	  sum5  = t[row+4];
3382 	  for(n = 0; n<sz-1; n+=2) {
3383 	    i1   = idx[0];
3384 	    i2   = idx[1];
3385 	    idx += 2;
3386 	    tmp0 = t[i1];
3387 	    tmp1 = t[i2];
3388 	    sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3389 	    sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3390 	    sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3391 	    sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3392 	    sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3393 	  }
3394 
3395 	  if (n == sz-1){
3396 	    tmp0  = t[*idx];
3397 	    sum1 -= v1[0] * tmp0;
3398 	    sum2 -= v2[0] * tmp0;
3399 	    sum3 -= v3[0] * tmp0;
3400 	    sum4 -= v4[0] * tmp0;
3401 	    sum5 -= v5[0] * tmp0;
3402 	  }
3403 	  x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3404 	  x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3405 	  x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3406 	  x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3407 	  x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3408 	  ibdiag  += 25; row += 5;
3409 	  break;
3410         default:
3411 	  SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3412       }
3413       CHKMEMQ;
3414     }
3415     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3416     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3417     ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3418   }
3419   PetscFunctionReturn(0);
3420 }
3421 
3422 #undef __FUNCT__
3423 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3424 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3425 {
3426   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3427   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3428   const MatScalar    *bdiag = a->inode.bdiag;
3429   const PetscScalar  *b;
3430   PetscErrorCode      ierr;
3431   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3432   const PetscInt      *sizes = a->inode.size;
3433 
3434   PetscFunctionBegin;
3435   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3436   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3437   cnt = 0;
3438   for (i=0, row=0; i<m; i++) {
3439     switch (sizes[i]){
3440       case 1:
3441 	x[row] = b[row]*bdiag[cnt++];row++;
3442 	break;
3443       case 2:
3444 	x1   = b[row]; x2 = b[row+1];
3445 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3446 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3447 	x[row++] = tmp1;
3448 	x[row++] = tmp2;
3449 	cnt += 4;
3450 	break;
3451       case 3:
3452 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3453 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3454 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3455 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3456 	x[row++] = tmp1;
3457 	x[row++] = tmp2;
3458 	x[row++] = tmp3;
3459 	cnt += 9;
3460 	break;
3461       case 4:
3462 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3463 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3464 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3465 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3466 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3467 	x[row++] = tmp1;
3468 	x[row++] = tmp2;
3469 	x[row++] = tmp3;
3470 	x[row++] = tmp4;
3471 	cnt += 16;
3472 	break;
3473       case 5:
3474 	x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3475 	tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3476 	tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3477 	tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3478 	tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3479 	tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3480 	x[row++] = tmp1;
3481 	x[row++] = tmp2;
3482 	x[row++] = tmp3;
3483 	x[row++] = tmp4;
3484 	x[row++] = tmp5;
3485 	cnt += 25;
3486 	break;
3487       default:
3488 	SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3489     }
3490   }
3491   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3492   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3493   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3494   PetscFunctionReturn(0);
3495 }
3496 
3497 /*
3498     samestructure indicates that the matrix has not changed its nonzero structure so we
3499     do not need to recompute the inodes
3500 */
3501 #undef __FUNCT__
3502 #define __FUNCT__ "Mat_CheckInode"
3503 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
3504 {
3505   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3506   PetscErrorCode ierr;
3507   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3508   PetscTruth     flag;
3509 
3510   PetscFunctionBegin;
3511   if (!a->inode.use)                     PetscFunctionReturn(0);
3512   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3513 
3514 
3515   m = A->rmap->n;
3516   if (a->inode.size) {ns = a->inode.size;}
3517   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3518 
3519   i          = 0;
3520   node_count = 0;
3521   idx        = a->j;
3522   ii         = a->i;
3523   while (i < m){                /* For each row */
3524     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3525     /* Limits the number of elements in a node to 'a->inode.limit' */
3526     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3527       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3528       if (nzy != nzx) break;
3529       idy  += nzx;             /* Same nonzero pattern */
3530       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3531       if (!flag) break;
3532     }
3533     ns[node_count++] = blk_size;
3534     idx += blk_size*nzx;
3535     i    = j;
3536   }
3537   /* If not enough inodes found,, do not use inode version of the routines */
3538   if (!a->inode.size && m && node_count > .9*m) {
3539     ierr = PetscFree(ns);CHKERRQ(ierr);
3540     a->inode.node_count     = 0;
3541     a->inode.size           = PETSC_NULL;
3542     a->inode.use            = PETSC_FALSE;
3543     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3544   } else {
3545     A->ops->mult              = MatMult_SeqAIJ_Inode;
3546     A->ops->sor               = MatSOR_SeqAIJ_Inode;
3547     A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
3548     A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
3549     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
3550     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
3551     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
3552     A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
3553     A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3554     A->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ_Inode_inplace;
3555     a->inode.node_count       = node_count;
3556     a->inode.size             = ns;
3557     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3558   }
3559   PetscFunctionReturn(0);
3560 }
3561 
3562 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) {	\
3563 PetscInt __k, *__vi; \
3564 __vi = aj + ai[row];				\
3565 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \
3566 __vi = aj + adiag[row];				\
3567 cols[nzl] = __vi[0];\
3568 __vi = aj + adiag[row+1]+1;\
3569 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];}
3570 
3571 
3572 /*
3573    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
3574    Modified from Mat_CheckInode().
3575 
3576    Input Parameters:
3577 +  Mat A - ILU or LU matrix factor
3578 -  samestructure - TURE indicates that the matrix has not changed its nonzero structure so we
3579     do not need to recompute the inodes
3580 */
3581 #undef __FUNCT__
3582 #define __FUNCT__ "Mat_CheckInode_FactorLU"
3583 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure)
3584 {
3585   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3586   PetscErrorCode ierr;
3587   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
3588   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
3589   PetscTruth     flag;
3590 
3591   PetscFunctionBegin;
3592   if (!a->inode.use)                     PetscFunctionReturn(0);
3593   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3594 
3595   m = A->rmap->n;
3596   if (a->inode.size) {ns = a->inode.size;}
3597   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3598 
3599   i          = 0;
3600   node_count = 0;
3601   while (i < m){                /* For each row */
3602     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
3603     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
3604     nzx  = nzl1 + nzu1 + 1;
3605     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
3606     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
3607 
3608     /* Limits the number of elements in a node to 'a->inode.limit' */
3609     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3610       nzl2    = ai[j+1] - ai[j];
3611       nzu2    = adiag[j] - adiag[j+1] - 1;
3612       nzy     = nzl2 + nzu2 + 1;
3613       if( nzy != nzx) break;
3614       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
3615       MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);
3616       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3617       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
3618       ierr = PetscFree(cols2);CHKERRQ(ierr);
3619     }
3620     ns[node_count++] = blk_size;
3621     ierr = PetscFree(cols1);CHKERRQ(ierr);
3622     i    = j;
3623   }
3624   /* If not enough inodes found,, do not use inode version of the routines */
3625   if (!a->inode.size && m && node_count > .9*m) {
3626     ierr = PetscFree(ns);CHKERRQ(ierr);
3627     a->inode.node_count     = 0;
3628     a->inode.size           = PETSC_NULL;
3629     a->inode.use            = PETSC_FALSE;
3630     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
3631   } else {
3632     A->ops->mult              = 0;
3633     A->ops->sor               = 0;
3634     A->ops->multadd           = 0;
3635     A->ops->getrowij          = 0;
3636     A->ops->restorerowij      = 0;
3637     A->ops->getcolumnij       = 0;
3638     A->ops->restorecolumnij   = 0;
3639     A->ops->coloringpatch     = 0;
3640     A->ops->multdiagonalblock = 0;
3641     A->ops->lufactornumeric   = MatLUFactorNumeric_SeqAIJ_Inode; /* not done yet */
3642     a->inode.node_count       = node_count;
3643     a->inode.size             = ns;
3644     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
3645   }
3646   PetscFunctionReturn(0);
3647 }
3648 
3649 /*
3650      This is really ugly. if inodes are used this replaces the
3651   permutations with ones that correspond to rows/cols of the matrix
3652   rather then inode blocks
3653 */
3654 #undef __FUNCT__
3655 #define __FUNCT__ "MatInodeAdjustForInodes"
3656 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
3657 {
3658   PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
3659 
3660   PetscFunctionBegin;
3661   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr);
3662   if (f) {
3663     ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr);
3664   }
3665   PetscFunctionReturn(0);
3666 }
3667 
3668 EXTERN_C_BEGIN
3669 #undef __FUNCT__
3670 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode"
3671 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
3672 {
3673   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
3674   PetscErrorCode ierr;
3675   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
3676   const PetscInt *ridx,*cidx;
3677   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
3678   PetscInt       nslim_col,*ns_col;
3679   IS             ris = *rperm,cis = *cperm;
3680 
3681   PetscFunctionBegin;
3682   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
3683   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
3684 
3685   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
3686   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
3687   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
3688 
3689   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
3690   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
3691 
3692   /* Form the inode structure for the rows of permuted matric using inv perm*/
3693   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
3694 
3695   /* Construct the permutations for rows*/
3696   for (i=0,row = 0; i<nslim_row; ++i){
3697     indx      = ridx[i];
3698     start_val = tns[indx];
3699     end_val   = tns[indx + 1];
3700     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
3701   }
3702 
3703   /* Form the inode structure for the columns of permuted matrix using inv perm*/
3704   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
3705 
3706  /* Construct permutations for columns */
3707   for (i=0,col=0; i<nslim_col; ++i){
3708     indx      = cidx[i];
3709     start_val = tns[indx];
3710     end_val   = tns[indx + 1];
3711     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
3712   }
3713 
3714   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr);
3715   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
3716   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr);
3717   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
3718 
3719   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
3720   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
3721 
3722   ierr = PetscFree(ns_col);CHKERRQ(ierr);
3723   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
3724   ierr = ISDestroy(cis);CHKERRQ(ierr);
3725   ierr = ISDestroy(ris);CHKERRQ(ierr);
3726   ierr = PetscFree(tns);CHKERRQ(ierr);
3727   PetscFunctionReturn(0);
3728 }
3729 EXTERN_C_END
3730 
3731 #undef __FUNCT__
3732 #define __FUNCT__ "MatInodeGetInodeSizes"
3733 /*@C
3734    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
3735 
3736    Collective on Mat
3737 
3738    Input Parameter:
3739 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
3740 
3741    Output Parameter:
3742 +  node_count - no of inodes present in the matrix.
3743 .  sizes      - an array of size node_count,with sizes of each inode.
3744 -  limit      - the max size used to generate the inodes.
3745 
3746    Level: advanced
3747 
3748    Notes: This routine returns some internal storage information
3749    of the matrix, it is intended to be used by advanced users.
3750    It should be called after the matrix is assembled.
3751    The contents of the sizes[] array should not be changed.
3752    PETSC_NULL may be passed for information not requested.
3753 
3754 .keywords: matrix, seqaij, get, inode
3755 
3756 .seealso: MatGetInfo()
3757 @*/
3758 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3759 {
3760   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
3761 
3762   PetscFunctionBegin;
3763   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
3764   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
3765   if (f) {
3766     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
3767   }
3768   PetscFunctionReturn(0);
3769 }
3770 
3771 EXTERN_C_BEGIN
3772 #undef __FUNCT__
3773 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
3774 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
3775 {
3776   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3777 
3778   PetscFunctionBegin;
3779   if (node_count) *node_count = a->inode.node_count;
3780   if (sizes)      *sizes      = a->inode.size;
3781   if (limit)      *limit      = a->inode.limit;
3782   PetscFunctionReturn(0);
3783 }
3784 EXTERN_C_END
3785