xref: /petsc/src/mat/impls/aij/seq/inode.c (revision fe998a80077c9ee0917a39496df43fc256e1b478)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1) {
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     col  = *j++ + ishift;
101     i2   = tvc[col];
102     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103       ia[i1+1]++;
104       ia[i2+1]++;
105       i2++;                     /* Start col of next node */
106       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107       i2 = tvc[col];
108     }
109     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110   }
111 
112   /* shift ia[i] to point to next row */
113   for (i1=1; i1<nslim_row+1; i1++) {
114     row        = ia[i1-1];
115     ia[i1]    += row;
116     work[i1-1] = row - oshift;
117   }
118 
119   /* allocate space for column pointers */
120   nz   = ia[nslim_row] + (!ishift);
121   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
122   *jja = ja;
123 
124   /* loop over lower triangular part putting into ja */
125   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126     j    = aj + ai[row] + ishift;
127     jmax = aj + ai[row+1] + ishift;
128     col  = *j++ + ishift;
129     i2   = tvc[col];
130     while (i2<i1 && j<jmax) {
131       ja[work[i2]++] = i1 + oshift;
132       ja[work[i1]++] = i2 + oshift;
133       ++i2;
134       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135       i2 = tvc[col];
136     }
137     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138 
139   }
140   ierr = PetscFree(work);CHKERRQ(ierr);
141   ierr = PetscFree(tns);CHKERRQ(ierr);
142   ierr = PetscFree(tvc);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*
147       This builds nonsymmetric version of nonzero structure,
148 */
149 #undef __FUNCT__
150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152 {
153   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
154   PetscErrorCode ierr;
155   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156   PetscInt       *tns,*tvc,nsz,i1,i2;
157   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
158 
159   PetscFunctionBegin;
160   nslim_row = a->inode.node_count;
161   n         = A->cmap->n;
162 
163   /* Create The column_inode for this matrix */
164   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
165 
166   /* allocate space for reformated column_inode structure */
167   ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr);
168   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
169   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
170 
171   for (i1=0,col=0; i1<nslim_col; ++i1) {
172     nsz = ns_col[i1];
173     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174   }
175   /* allocate space for row pointers */
176   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
177   *iia = ia;
178   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
179   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
180 
181   /* determine the number of columns in each row */
182   ia[0] = oshift;
183   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184     j   = aj + ai[row] + ishift;
185     col = *j++ + ishift;
186     i2  = tvc[col];
187     nz  = ai[row+1] - ai[row];
188     while (nz-- > 0) {           /* off-diagonal elemets */
189       ia[i1+1]++;
190       i2++;                     /* Start col of next node */
191       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192       if (nz > 0) i2 = tvc[col];
193     }
194   }
195 
196   /* shift ia[i] to point to next row */
197   for (i1=1; i1<nslim_row+1; i1++) {
198     row        = ia[i1-1];
199     ia[i1]    += row;
200     work[i1-1] = row - oshift;
201   }
202 
203   /* allocate space for column pointers */
204   nz   = ia[nslim_row] + (!ishift);
205   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
206   *jja = ja;
207 
208   /* loop over matrix putting into ja */
209   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210     j   = aj + ai[row] + ishift;
211     col = *j++ + ishift;
212     i2  = tvc[col];
213     nz  = ai[row+1] - ai[row];
214     while (nz-- > 0) {
215       ja[work[i1]++] = i2 + oshift;
216       ++i2;
217       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218       if (nz > 0) i2 = tvc[col];
219     }
220   }
221   ierr = PetscFree(ns_col);CHKERRQ(ierr);
222   ierr = PetscFree(work);CHKERRQ(ierr);
223   ierr = PetscFree(tns);CHKERRQ(ierr);
224   ierr = PetscFree(tvc);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231 {
232   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   *n = a->inode.node_count;
237   if (!ia) PetscFunctionReturn(0);
238   if (!blockcompressed) {
239     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
240   } else if (symmetric) {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   } else {
243     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   if (!ia) PetscFunctionReturn(0);
256 
257   if (!blockcompressed) {
258     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
259   } else {
260     ierr = PetscFree(*ia);CHKERRQ(ierr);
261     ierr = PetscFree(*ja);CHKERRQ(ierr);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /* ----------------------------------------------------------- */
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271 {
272   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
273   PetscErrorCode ierr;
274   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276 
277   PetscFunctionBegin;
278   nslim_row = a->inode.node_count;
279   n         = A->cmap->n;
280 
281   /* Create The column_inode for this matrix */
282   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
283 
284   /* allocate space for reformated column_inode structure */
285   ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr);
286   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
287   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
288 
289   for (i1=0,col=0; i1<nslim_col; ++i1) {
290     nsz = ns_col[i1];
291     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292   }
293   /* allocate space for column pointers */
294   ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
295   *iia = ia;
296   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
297   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
298 
299   /* determine the number of columns in each row */
300   ia[0] = oshift;
301   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302     j   = aj + ai[row] + ishift;
303     col = *j++ + ishift;
304     i2  = tvc[col];
305     nz  = ai[row+1] - ai[row];
306     while (nz-- > 0) {           /* off-diagonal elemets */
307       /* ia[i1+1]++; */
308       ia[i2+1]++;
309       i2++;
310       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311       if (nz > 0) i2 = tvc[col];
312     }
313   }
314 
315   /* shift ia[i] to point to next col */
316   for (i1=1; i1<nslim_col+1; i1++) {
317     col        = ia[i1-1];
318     ia[i1]    += col;
319     work[i1-1] = col - oshift;
320   }
321 
322   /* allocate space for column pointers */
323   nz   = ia[nslim_col] + (!ishift);
324   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
325   *jja = ja;
326 
327   /* loop over matrix putting into ja */
328   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329     j   = aj + ai[row] + ishift;
330     col = *j++ + ishift;
331     i2  = tvc[col];
332     nz  = ai[row+1] - ai[row];
333     while (nz-- > 0) {
334       /* ja[work[i1]++] = i2 + oshift; */
335       ja[work[i2]++] = i1 + oshift;
336       i2++;
337       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338       if (nz > 0) i2 = tvc[col];
339     }
340   }
341   ierr = PetscFree(ns_col);CHKERRQ(ierr);
342   ierr = PetscFree(work);CHKERRQ(ierr);
343   ierr = PetscFree(tns);CHKERRQ(ierr);
344   ierr = PetscFree(tvc);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
356   if (!ia) PetscFunctionReturn(0);
357 
358   if (!blockcompressed) {
359     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
360   } else if (symmetric) {
361     /* Since the indices are symmetric it does'nt matter */
362     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   } else {
364     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
365   }
366   PetscFunctionReturn(0);
367 }
368 
369 #undef __FUNCT__
370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   if (!ia) PetscFunctionReturn(0);
377   if (!blockcompressed) {
378     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
379   } else {
380     ierr = PetscFree(*ia);CHKERRQ(ierr);
381     ierr = PetscFree(*ja);CHKERRQ(ierr);
382   }
383   PetscFunctionReturn(0);
384 }
385 
386 /* ----------------------------------------------------------- */
387 
388 #undef __FUNCT__
389 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391 {
392   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394   PetscScalar       *y;
395   const PetscScalar *x;
396   const MatScalar   *v1,*v2,*v3,*v4,*v5;
397   PetscErrorCode    ierr;
398   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399   const PetscInt    *idx,*ns,*ii;
400 
401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403 #endif
404 
405   PetscFunctionBegin;
406   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407   node_max = a->inode.node_count;
408   ns       = a->inode.size;     /* Node Size array */
409   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
411   idx      = a->j;
412   v1       = a->a;
413   ii       = a->i;
414 
415   for (i = 0,row = 0; i< node_max; ++i) {
416     nsz         = ns[i];
417     n           = ii[1] - ii[0];
418     nonzerorow += (n>0)*nsz;
419     ii         += nsz;
420     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422     sz = n;                     /* No of non zeros in this row */
423                                 /* Switch on the size of Node */
424     switch (nsz) {               /* Each loop in 'case' is unrolled */
425     case 1:
426       sum1 = 0.;
427 
428       for (n = 0; n< sz-1; n+=2) {
429         i1    = idx[0];         /* The instructions are ordered to */
430         i2    = idx[1];         /* make the compiler's job easy */
431         idx  += 2;
432         tmp0  = x[i1];
433         tmp1  = x[i2];
434         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435       }
436 
437       if (n == sz-1) {          /* Take care of the last nonzero  */
438         tmp0  = x[*idx++];
439         sum1 += *v1++ *tmp0;
440       }
441       y[row++]=sum1;
442       break;
443     case 2:
444       sum1 = 0.;
445       sum2 = 0.;
446       v2   = v1 + n;
447 
448       for (n = 0; n< sz-1; n+=2) {
449         i1    = idx[0];
450         i2    = idx[1];
451         idx  += 2;
452         tmp0  = x[i1];
453         tmp1  = x[i2];
454         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456       }
457       if (n == sz-1) {
458         tmp0  = x[*idx++];
459         sum1 += *v1++ * tmp0;
460         sum2 += *v2++ * tmp0;
461       }
462       y[row++]=sum1;
463       y[row++]=sum2;
464       v1      =v2;              /* Since the next block to be processed starts there*/
465       idx    +=sz;
466       break;
467     case 3:
468       sum1 = 0.;
469       sum2 = 0.;
470       sum3 = 0.;
471       v2   = v1 + n;
472       v3   = v2 + n;
473 
474       for (n = 0; n< sz-1; n+=2) {
475         i1    = idx[0];
476         i2    = idx[1];
477         idx  += 2;
478         tmp0  = x[i1];
479         tmp1  = x[i2];
480         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483       }
484       if (n == sz-1) {
485         tmp0  = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489       }
490       y[row++]=sum1;
491       y[row++]=sum2;
492       y[row++]=sum3;
493       v1      =v3;              /* Since the next block to be processed starts there*/
494       idx    +=2*sz;
495       break;
496     case 4:
497       sum1 = 0.;
498       sum2 = 0.;
499       sum3 = 0.;
500       sum4 = 0.;
501       v2   = v1 + n;
502       v3   = v2 + n;
503       v4   = v3 + n;
504 
505       for (n = 0; n< sz-1; n+=2) {
506         i1    = idx[0];
507         i2    = idx[1];
508         idx  += 2;
509         tmp0  = x[i1];
510         tmp1  = x[i2];
511         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515       }
516       if (n == sz-1) {
517         tmp0  = x[*idx++];
518         sum1 += *v1++ * tmp0;
519         sum2 += *v2++ * tmp0;
520         sum3 += *v3++ * tmp0;
521         sum4 += *v4++ * tmp0;
522       }
523       y[row++]=sum1;
524       y[row++]=sum2;
525       y[row++]=sum3;
526       y[row++]=sum4;
527       v1      =v4;              /* Since the next block to be processed starts there*/
528       idx    +=3*sz;
529       break;
530     case 5:
531       sum1 = 0.;
532       sum2 = 0.;
533       sum3 = 0.;
534       sum4 = 0.;
535       sum5 = 0.;
536       v2   = v1 + n;
537       v3   = v2 + n;
538       v4   = v3 + n;
539       v5   = v4 + n;
540 
541       for (n = 0; n<sz-1; n+=2) {
542         i1    = idx[0];
543         i2    = idx[1];
544         idx  += 2;
545         tmp0  = x[i1];
546         tmp1  = x[i2];
547         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552       }
553       if (n == sz-1) {
554         tmp0  = x[*idx++];
555         sum1 += *v1++ * tmp0;
556         sum2 += *v2++ * tmp0;
557         sum3 += *v3++ * tmp0;
558         sum4 += *v4++ * tmp0;
559         sum5 += *v5++ * tmp0;
560       }
561       y[row++]=sum1;
562       y[row++]=sum2;
563       y[row++]=sum3;
564       y[row++]=sum4;
565       y[row++]=sum5;
566       v1      =v5;       /* Since the next block to be processed starts there */
567       idx    +=4*sz;
568       break;
569     default:
570       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571     }
572   }
573   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
575   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 /* ----------------------------------------------------------- */
579 /* Almost same code as the MatMult_SeqAIJ_Inode() */
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583 {
584   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586   const MatScalar   *v1,*v2,*v3,*v4,*v5;
587   const PetscScalar *x;
588   PetscScalar       *y,*z,*zt;
589   PetscErrorCode    ierr;
590   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
591   const PetscInt    *idx,*ns,*ii;
592 
593   PetscFunctionBegin;
594   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595   node_max = a->inode.node_count;
596   ns       = a->inode.size;     /* Node Size array */
597 
598   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
600   if (zz != yy) {
601     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
602   } else {
603     z = y;
604   }
605   zt = z;
606 
607   idx = a->j;
608   v1  = a->a;
609   ii  = a->i;
610 
611   for (i = 0,row = 0; i< node_max; ++i) {
612     nsz = ns[i];
613     n   = ii[1] - ii[0];
614     ii += nsz;
615     sz  = n;                    /* No of non zeros in this row */
616                                 /* Switch on the size of Node */
617     switch (nsz) {               /* Each loop in 'case' is unrolled */
618     case 1:
619       sum1 = *zt++;
620 
621       for (n = 0; n< sz-1; n+=2) {
622         i1    = idx[0];         /* The instructions are ordered to */
623         i2    = idx[1];         /* make the compiler's job easy */
624         idx  += 2;
625         tmp0  = x[i1];
626         tmp1  = x[i2];
627         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
628       }
629 
630       if (n   == sz-1) {          /* Take care of the last nonzero  */
631         tmp0  = x[*idx++];
632         sum1 += *v1++ * tmp0;
633       }
634       y[row++]=sum1;
635       break;
636     case 2:
637       sum1 = *zt++;
638       sum2 = *zt++;
639       v2   = v1 + n;
640 
641       for (n = 0; n< sz-1; n+=2) {
642         i1    = idx[0];
643         i2    = idx[1];
644         idx  += 2;
645         tmp0  = x[i1];
646         tmp1  = x[i2];
647         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649       }
650       if (n   == sz-1) {
651         tmp0  = x[*idx++];
652         sum1 += *v1++ * tmp0;
653         sum2 += *v2++ * tmp0;
654       }
655       y[row++]=sum1;
656       y[row++]=sum2;
657       v1      =v2;              /* Since the next block to be processed starts there*/
658       idx    +=sz;
659       break;
660     case 3:
661       sum1 = *zt++;
662       sum2 = *zt++;
663       sum3 = *zt++;
664       v2   = v1 + n;
665       v3   = v2 + n;
666 
667       for (n = 0; n< sz-1; n+=2) {
668         i1    = idx[0];
669         i2    = idx[1];
670         idx  += 2;
671         tmp0  = x[i1];
672         tmp1  = x[i2];
673         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
674         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
675         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
676       }
677       if (n == sz-1) {
678         tmp0  = x[*idx++];
679         sum1 += *v1++ * tmp0;
680         sum2 += *v2++ * tmp0;
681         sum3 += *v3++ * tmp0;
682       }
683       y[row++]=sum1;
684       y[row++]=sum2;
685       y[row++]=sum3;
686       v1      =v3;              /* Since the next block to be processed starts there*/
687       idx    +=2*sz;
688       break;
689     case 4:
690       sum1 = *zt++;
691       sum2 = *zt++;
692       sum3 = *zt++;
693       sum4 = *zt++;
694       v2   = v1 + n;
695       v3   = v2 + n;
696       v4   = v3 + n;
697 
698       for (n = 0; n< sz-1; n+=2) {
699         i1    = idx[0];
700         i2    = idx[1];
701         idx  += 2;
702         tmp0  = x[i1];
703         tmp1  = x[i2];
704         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
705         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
706         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
707         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
708       }
709       if (n == sz-1) {
710         tmp0  = x[*idx++];
711         sum1 += *v1++ * tmp0;
712         sum2 += *v2++ * tmp0;
713         sum3 += *v3++ * tmp0;
714         sum4 += *v4++ * tmp0;
715       }
716       y[row++]=sum1;
717       y[row++]=sum2;
718       y[row++]=sum3;
719       y[row++]=sum4;
720       v1      =v4;              /* Since the next block to be processed starts there*/
721       idx    +=3*sz;
722       break;
723     case 5:
724       sum1 = *zt++;
725       sum2 = *zt++;
726       sum3 = *zt++;
727       sum4 = *zt++;
728       sum5 = *zt++;
729       v2   = v1 + n;
730       v3   = v2 + n;
731       v4   = v3 + n;
732       v5   = v4 + n;
733 
734       for (n = 0; n<sz-1; n+=2) {
735         i1    = idx[0];
736         i2    = idx[1];
737         idx  += 2;
738         tmp0  = x[i1];
739         tmp1  = x[i2];
740         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
741         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
742         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
743         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
744         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
745       }
746       if (n   == sz-1) {
747         tmp0  = x[*idx++];
748         sum1 += *v1++ * tmp0;
749         sum2 += *v2++ * tmp0;
750         sum3 += *v3++ * tmp0;
751         sum4 += *v4++ * tmp0;
752         sum5 += *v5++ * tmp0;
753       }
754       y[row++]=sum1;
755       y[row++]=sum2;
756       y[row++]=sum3;
757       y[row++]=sum4;
758       y[row++]=sum5;
759       v1      =v5;       /* Since the next block to be processed starts there */
760       idx    +=4*sz;
761       break;
762     default:
763       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
764     }
765   }
766   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
767   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
768   if (zz != yy) {
769     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
770   }
771   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
772   PetscFunctionReturn(0);
773 }
774 
775 /* ----------------------------------------------------------- */
776 #undef __FUNCT__
777 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
778 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
779 {
780   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
781   IS                iscol = a->col,isrow = a->row;
782   PetscErrorCode    ierr;
783   const PetscInt    *r,*c,*rout,*cout;
784   PetscInt          i,j,n = A->rmap->n,nz;
785   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
786   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
787   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790   const PetscScalar *b;
791 
792   PetscFunctionBegin;
793   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
794   node_max = a->inode.node_count;
795   ns       = a->inode.size;     /* Node Size array */
796 
797   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   tmp  = a->solve_work;
800 
801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
803 
804   /* forward solve the lower triangular */
805   tmps = tmp;
806   aa   = a_a;
807   aj   = a_j;
808   ad   = a->diag;
809 
810   for (i = 0,row = 0; i< node_max; ++i) {
811     nsz = ns[i];
812     aii = ai[row];
813     v1  = aa + aii;
814     vi  = aj + aii;
815     nz  = ad[row]- aii;
816     if (i < node_max-1) {
817       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
818       * but our indexing to determine it's size could. */
819       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
820       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
821       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
822       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
823     }
824 
825     switch (nsz) {               /* Each loop in 'case' is unrolled */
826     case 1:
827       sum1 = b[*r++];
828       for (j=0; j<nz-1; j+=2) {
829         i0    = vi[0];
830         i1    = vi[1];
831         vi   +=2;
832         tmp0  = tmps[i0];
833         tmp1  = tmps[i1];
834         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
835       }
836       if (j == nz-1) {
837         tmp0  = tmps[*vi++];
838         sum1 -= *v1++ *tmp0;
839       }
840       tmp[row++]=sum1;
841       break;
842     case 2:
843       sum1 = b[*r++];
844       sum2 = b[*r++];
845       v2   = aa + ai[row+1];
846 
847       for (j=0; j<nz-1; j+=2) {
848         i0    = vi[0];
849         i1    = vi[1];
850         vi   +=2;
851         tmp0  = tmps[i0];
852         tmp1  = tmps[i1];
853         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
854         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
855       }
856       if (j == nz-1) {
857         tmp0  = tmps[*vi++];
858         sum1 -= *v1++ *tmp0;
859         sum2 -= *v2++ *tmp0;
860       }
861       sum2     -= *v2++ *sum1;
862       tmp[row++]=sum1;
863       tmp[row++]=sum2;
864       break;
865     case 3:
866       sum1 = b[*r++];
867       sum2 = b[*r++];
868       sum3 = b[*r++];
869       v2   = aa + ai[row+1];
870       v3   = aa + ai[row+2];
871 
872       for (j=0; j<nz-1; j+=2) {
873         i0    = vi[0];
874         i1    = vi[1];
875         vi   +=2;
876         tmp0  = tmps[i0];
877         tmp1  = tmps[i1];
878         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
879         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
880         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
881       }
882       if (j == nz-1) {
883         tmp0  = tmps[*vi++];
884         sum1 -= *v1++ *tmp0;
885         sum2 -= *v2++ *tmp0;
886         sum3 -= *v3++ *tmp0;
887       }
888       sum2 -= *v2++ * sum1;
889       sum3 -= *v3++ * sum1;
890       sum3 -= *v3++ * sum2;
891 
892       tmp[row++]=sum1;
893       tmp[row++]=sum2;
894       tmp[row++]=sum3;
895       break;
896 
897     case 4:
898       sum1 = b[*r++];
899       sum2 = b[*r++];
900       sum3 = b[*r++];
901       sum4 = b[*r++];
902       v2   = aa + ai[row+1];
903       v3   = aa + ai[row+2];
904       v4   = aa + ai[row+3];
905 
906       for (j=0; j<nz-1; j+=2) {
907         i0    = vi[0];
908         i1    = vi[1];
909         vi   +=2;
910         tmp0  = tmps[i0];
911         tmp1  = tmps[i1];
912         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916       }
917       if (j == nz-1) {
918         tmp0  = tmps[*vi++];
919         sum1 -= *v1++ *tmp0;
920         sum2 -= *v2++ *tmp0;
921         sum3 -= *v3++ *tmp0;
922         sum4 -= *v4++ *tmp0;
923       }
924       sum2 -= *v2++ * sum1;
925       sum3 -= *v3++ * sum1;
926       sum4 -= *v4++ * sum1;
927       sum3 -= *v3++ * sum2;
928       sum4 -= *v4++ * sum2;
929       sum4 -= *v4++ * sum3;
930 
931       tmp[row++]=sum1;
932       tmp[row++]=sum2;
933       tmp[row++]=sum3;
934       tmp[row++]=sum4;
935       break;
936     case 5:
937       sum1 = b[*r++];
938       sum2 = b[*r++];
939       sum3 = b[*r++];
940       sum4 = b[*r++];
941       sum5 = b[*r++];
942       v2   = aa + ai[row+1];
943       v3   = aa + ai[row+2];
944       v4   = aa + ai[row+3];
945       v5   = aa + ai[row+4];
946 
947       for (j=0; j<nz-1; j+=2) {
948         i0    = vi[0];
949         i1    = vi[1];
950         vi   +=2;
951         tmp0  = tmps[i0];
952         tmp1  = tmps[i1];
953         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958       }
959       if (j == nz-1) {
960         tmp0  = tmps[*vi++];
961         sum1 -= *v1++ *tmp0;
962         sum2 -= *v2++ *tmp0;
963         sum3 -= *v3++ *tmp0;
964         sum4 -= *v4++ *tmp0;
965         sum5 -= *v5++ *tmp0;
966       }
967 
968       sum2 -= *v2++ * sum1;
969       sum3 -= *v3++ * sum1;
970       sum4 -= *v4++ * sum1;
971       sum5 -= *v5++ * sum1;
972       sum3 -= *v3++ * sum2;
973       sum4 -= *v4++ * sum2;
974       sum5 -= *v5++ * sum2;
975       sum4 -= *v4++ * sum3;
976       sum5 -= *v5++ * sum3;
977       sum5 -= *v5++ * sum4;
978 
979       tmp[row++]=sum1;
980       tmp[row++]=sum2;
981       tmp[row++]=sum3;
982       tmp[row++]=sum4;
983       tmp[row++]=sum5;
984       break;
985     default:
986       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987     }
988   }
989   /* backward solve the upper triangular */
990   for (i=node_max -1,row = n-1; i>=0; i--) {
991     nsz = ns[i];
992     aii = ai[row+1] -1;
993     v1  = aa + aii;
994     vi  = aj + aii;
995     nz  = aii- ad[row];
996     switch (nsz) {               /* Each loop in 'case' is unrolled */
997     case 1:
998       sum1 = tmp[row];
999 
1000       for (j=nz; j>1; j-=2) {
1001         vi   -=2;
1002         i0    = vi[2];
1003         i1    = vi[1];
1004         tmp0  = tmps[i0];
1005         tmp1  = tmps[i1];
1006         v1   -= 2;
1007         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008       }
1009       if (j==1) {
1010         tmp0  = tmps[*vi--];
1011         sum1 -= *v1-- * tmp0;
1012       }
1013       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014       break;
1015     case 2:
1016       sum1 = tmp[row];
1017       sum2 = tmp[row -1];
1018       v2   = aa + ai[row]-1;
1019       for (j=nz; j>1; j-=2) {
1020         vi   -=2;
1021         i0    = vi[2];
1022         i1    = vi[1];
1023         tmp0  = tmps[i0];
1024         tmp1  = tmps[i1];
1025         v1   -= 2;
1026         v2   -= 2;
1027         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029       }
1030       if (j==1) {
1031         tmp0  = tmps[*vi--];
1032         sum1 -= *v1-- * tmp0;
1033         sum2 -= *v2-- * tmp0;
1034       }
1035 
1036       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037       sum2   -= *v2-- * tmp0;
1038       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039       break;
1040     case 3:
1041       sum1 = tmp[row];
1042       sum2 = tmp[row -1];
1043       sum3 = tmp[row -2];
1044       v2   = aa + ai[row]-1;
1045       v3   = aa + ai[row -1]-1;
1046       for (j=nz; j>1; j-=2) {
1047         vi   -=2;
1048         i0    = vi[2];
1049         i1    = vi[1];
1050         tmp0  = tmps[i0];
1051         tmp1  = tmps[i1];
1052         v1   -= 2;
1053         v2   -= 2;
1054         v3   -= 2;
1055         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058       }
1059       if (j==1) {
1060         tmp0  = tmps[*vi--];
1061         sum1 -= *v1-- * tmp0;
1062         sum2 -= *v2-- * tmp0;
1063         sum3 -= *v3-- * tmp0;
1064       }
1065       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066       sum2   -= *v2-- * tmp0;
1067       sum3   -= *v3-- * tmp0;
1068       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069       sum3   -= *v3-- * tmp0;
1070       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1071 
1072       break;
1073     case 4:
1074       sum1 = tmp[row];
1075       sum2 = tmp[row -1];
1076       sum3 = tmp[row -2];
1077       sum4 = tmp[row -3];
1078       v2   = aa + ai[row]-1;
1079       v3   = aa + ai[row -1]-1;
1080       v4   = aa + ai[row -2]-1;
1081 
1082       for (j=nz; j>1; j-=2) {
1083         vi   -=2;
1084         i0    = vi[2];
1085         i1    = vi[1];
1086         tmp0  = tmps[i0];
1087         tmp1  = tmps[i1];
1088         v1   -= 2;
1089         v2   -= 2;
1090         v3   -= 2;
1091         v4   -= 2;
1092         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096       }
1097       if (j==1) {
1098         tmp0  = tmps[*vi--];
1099         sum1 -= *v1-- * tmp0;
1100         sum2 -= *v2-- * tmp0;
1101         sum3 -= *v3-- * tmp0;
1102         sum4 -= *v4-- * tmp0;
1103       }
1104 
1105       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106       sum2   -= *v2-- * tmp0;
1107       sum3   -= *v3-- * tmp0;
1108       sum4   -= *v4-- * tmp0;
1109       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110       sum3   -= *v3-- * tmp0;
1111       sum4   -= *v4-- * tmp0;
1112       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113       sum4   -= *v4-- * tmp0;
1114       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115       break;
1116     case 5:
1117       sum1 = tmp[row];
1118       sum2 = tmp[row -1];
1119       sum3 = tmp[row -2];
1120       sum4 = tmp[row -3];
1121       sum5 = tmp[row -4];
1122       v2   = aa + ai[row]-1;
1123       v3   = aa + ai[row -1]-1;
1124       v4   = aa + ai[row -2]-1;
1125       v5   = aa + ai[row -3]-1;
1126       for (j=nz; j>1; j-=2) {
1127         vi   -= 2;
1128         i0    = vi[2];
1129         i1    = vi[1];
1130         tmp0  = tmps[i0];
1131         tmp1  = tmps[i1];
1132         v1   -= 2;
1133         v2   -= 2;
1134         v3   -= 2;
1135         v4   -= 2;
1136         v5   -= 2;
1137         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142       }
1143       if (j==1) {
1144         tmp0  = tmps[*vi--];
1145         sum1 -= *v1-- * tmp0;
1146         sum2 -= *v2-- * tmp0;
1147         sum3 -= *v3-- * tmp0;
1148         sum4 -= *v4-- * tmp0;
1149         sum5 -= *v5-- * tmp0;
1150       }
1151 
1152       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153       sum2   -= *v2-- * tmp0;
1154       sum3   -= *v3-- * tmp0;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158       sum3   -= *v3-- * tmp0;
1159       sum4   -= *v4-- * tmp0;
1160       sum5   -= *v5-- * tmp0;
1161       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162       sum4   -= *v4-- * tmp0;
1163       sum5   -= *v5-- * tmp0;
1164       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165       sum5   -= *v5-- * tmp0;
1166       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167       break;
1168     default:
1169       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170     }
1171   }
1172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1174   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1176   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #undef __FUNCT__
1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183 {
1184   Mat             C     =B;
1185   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1186   IS              isrow = b->row,isicol = b->icol;
1187   PetscErrorCode  ierr;
1188   const PetscInt  *r,*ic,*ics;
1189   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190   PetscInt        i,j,k,nz,nzL,row,*pj;
1191   const PetscInt  *ajtmp,*bjtmp;
1192   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194   FactorShiftCtx  sctx;
1195   const PetscInt  *ddiag;
1196   PetscReal       rs;
1197   MatScalar       d;
1198   PetscInt        inod,nodesz,node_max,col;
1199   const PetscInt  *ns;
1200   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1201 
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1216       if (rs>sctx.shift_top) sctx.shift_top = rs;
1217     }
1218     sctx.shift_top *= 1.1;
1219     sctx.nshift_max = 5;
1220     sctx.shift_lo   = 0.;
1221     sctx.shift_hi   = 1.;
1222   }
1223 
1224   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1225   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1226 
1227   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1228   ics   = ic;
1229 
1230   node_max = a->inode.node_count;
1231   ns       = a->inode.size;
1232   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1233 
1234   /* If max inode size > 4, split it into two inodes.*/
1235   /* also map the inode sizes according to the ordering */
1236   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1237   for (i=0,j=0; i<node_max; ++i,++j) {
1238     if (ns[i] > 4) {
1239       tmp_vec1[j] = 4;
1240       ++j;
1241       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1242     } else {
1243       tmp_vec1[j] = ns[i];
1244     }
1245   }
1246   /* Use the correct node_max */
1247   node_max = j;
1248 
1249   /* Now reorder the inode info based on mat re-ordering info */
1250   /* First create a row -> inode_size_array_index map */
1251   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1252   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1253   for (i=0,row=0; i<node_max; i++) {
1254     nodesz = tmp_vec1[i];
1255     for (j=0; j<nodesz; j++,row++) {
1256       nsmap[row] = i;
1257     }
1258   }
1259   /* Using nsmap, create a reordered ns structure */
1260   for (i=0,j=0; i< node_max; i++) {
1261     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1262     tmp_vec2[i] = nodesz;
1263     j          += nodesz;
1264   }
1265   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1266   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1267 
1268   /* Now use the correct ns */
1269   ns = tmp_vec2;
1270 
1271   do {
1272     sctx.newshift = PETSC_FALSE;
1273     /* Now loop over each block-row, and do the factorization */
1274     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1275       nodesz = ns[inod];
1276 
1277       switch (nodesz) {
1278       case 1:
1279         /*----------*/
1280         /* zero rtmp1 */
1281         /* L part */
1282         nz    = bi[i+1] - bi[i];
1283         bjtmp = bj + bi[i];
1284         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1285 
1286         /* U part */
1287         nz    = bdiag[i]-bdiag[i+1];
1288         bjtmp = bj + bdiag[i+1]+1;
1289         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1290 
1291         /* load in initial (unfactored row) */
1292         nz    = ai[r[i]+1] - ai[r[i]];
1293         ajtmp = aj + ai[r[i]];
1294         v     = aa + ai[r[i]];
1295         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1296 
1297         /* ZeropivotApply() */
1298         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1299 
1300         /* elimination */
1301         bjtmp = bj + bi[i];
1302         row   = *bjtmp++;
1303         nzL   = bi[i+1] - bi[i];
1304         for (k=0; k < nzL; k++) {
1305           pc = rtmp1 + row;
1306           if (*pc != 0.0) {
1307             pv   = b->a + bdiag[row];
1308             mul1 = *pc * (*pv);
1309             *pc  = mul1;
1310             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1311             pv   = b->a + bdiag[row+1]+1;
1312             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1313             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1314             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1315           }
1316           row = *bjtmp++;
1317         }
1318 
1319         /* finished row so stick it into b->a */
1320         rs = 0.0;
1321         /* L part */
1322         pv = b->a + bi[i];
1323         pj = b->j + bi[i];
1324         nz = bi[i+1] - bi[i];
1325         for (j=0; j<nz; j++) {
1326           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327         }
1328 
1329         /* U part */
1330         pv = b->a + bdiag[i+1]+1;
1331         pj = b->j + bdiag[i+1]+1;
1332         nz = bdiag[i] - bdiag[i+1]-1;
1333         for (j=0; j<nz; j++) {
1334           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1335         }
1336 
1337         /* Check zero pivot */
1338         sctx.rs = rs;
1339         sctx.pv = rtmp1[i];
1340         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1341         if (sctx.newshift) break;
1342 
1343         /* Mark diagonal and invert diagonal for simplier triangular solves */
1344         pv  = b->a + bdiag[i];
1345         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1346         break;
1347 
1348       case 2:
1349         /*----------*/
1350         /* zero rtmp1 and rtmp2 */
1351         /* L part */
1352         nz    = bi[i+1] - bi[i];
1353         bjtmp = bj + bi[i];
1354         for  (j=0; j<nz; j++) {
1355           col        = bjtmp[j];
1356           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357         }
1358 
1359         /* U part */
1360         nz    = bdiag[i]-bdiag[i+1];
1361         bjtmp = bj + bdiag[i+1]+1;
1362         for  (j=0; j<nz; j++) {
1363           col        = bjtmp[j];
1364           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1365         }
1366 
1367         /* load in initial (unfactored row) */
1368         nz    = ai[r[i]+1] - ai[r[i]];
1369         ajtmp = aj + ai[r[i]];
1370         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1371         for (j=0; j<nz; j++) {
1372           col        = ics[ajtmp[j]];
1373           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1374         }
1375         /* ZeropivotApply(): shift the diagonal of the matrix  */
1376         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1377 
1378         /* elimination */
1379         bjtmp = bj + bi[i];
1380         row   = *bjtmp++; /* pivot row */
1381         nzL   = bi[i+1] - bi[i];
1382         for (k=0; k < nzL; k++) {
1383           pc1 = rtmp1 + row;
1384           pc2 = rtmp2 + row;
1385           if (*pc1 != 0.0 || *pc2 != 0.0) {
1386             pv   = b->a + bdiag[row];
1387             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1388             *pc1 = mul1;       *pc2 = mul2;
1389 
1390             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1391             pv = b->a + bdiag[row+1]+1;
1392             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1393             for (j=0; j<nz; j++) {
1394               col         = pj[j];
1395               rtmp1[col] -= mul1 * pv[j];
1396               rtmp2[col] -= mul2 * pv[j];
1397             }
1398             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1399           }
1400           row = *bjtmp++;
1401         }
1402 
1403         /* finished row i; check zero pivot, then stick row i into b->a */
1404         rs = 0.0;
1405         /* L part */
1406         pc1 = b->a + bi[i];
1407         pj  = b->j + bi[i];
1408         nz  = bi[i+1] - bi[i];
1409         for (j=0; j<nz; j++) {
1410           col    = pj[j];
1411           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412         }
1413         /* U part */
1414         pc1 = b->a + bdiag[i+1]+1;
1415         pj  = b->j + bdiag[i+1]+1;
1416         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1417         for (j=0; j<nz; j++) {
1418           col    = pj[j];
1419           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1420         }
1421 
1422         sctx.rs = rs;
1423         sctx.pv = rtmp1[i];
1424         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1425         if (sctx.newshift) break;
1426         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1427         *pc1 = 1.0/sctx.pv;
1428 
1429         /* Now take care of diagonal 2x2 block. */
1430         pc2 = rtmp2 + i;
1431         if (*pc2 != 0.0) {
1432           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1433           *pc2 = mul1;          /* insert L entry */
1434           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1435           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1436           for (j=0; j<nz; j++) {
1437             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1438           }
1439           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1440         }
1441 
1442         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1443         rs = 0.0;
1444         /* L part */
1445         pc2 = b->a + bi[i+1];
1446         pj  = b->j + bi[i+1];
1447         nz  = bi[i+2] - bi[i+1];
1448         for (j=0; j<nz; j++) {
1449           col    = pj[j];
1450           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451         }
1452         /* U part */
1453         pc2 = b->a + bdiag[i+2]+1;
1454         pj  = b->j + bdiag[i+2]+1;
1455         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1456         for (j=0; j<nz; j++) {
1457           col    = pj[j];
1458           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1459         }
1460 
1461         sctx.rs = rs;
1462         sctx.pv = rtmp2[i+1];
1463         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1464         if (sctx.newshift) break;
1465         pc2  = b->a + bdiag[i+1];
1466         *pc2 = 1.0/sctx.pv;
1467         break;
1468 
1469       case 3:
1470         /*----------*/
1471         /* zero rtmp */
1472         /* L part */
1473         nz    = bi[i+1] - bi[i];
1474         bjtmp = bj + bi[i];
1475         for  (j=0; j<nz; j++) {
1476           col        = bjtmp[j];
1477           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478         }
1479 
1480         /* U part */
1481         nz    = bdiag[i]-bdiag[i+1];
1482         bjtmp = bj + bdiag[i+1]+1;
1483         for  (j=0; j<nz; j++) {
1484           col        = bjtmp[j];
1485           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1486         }
1487 
1488         /* load in initial (unfactored row) */
1489         nz    = ai[r[i]+1] - ai[r[i]];
1490         ajtmp = aj + ai[r[i]];
1491         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1492         for (j=0; j<nz; j++) {
1493           col        = ics[ajtmp[j]];
1494           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1495         }
1496         /* ZeropivotApply(): shift the diagonal of the matrix  */
1497         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1498 
1499         /* elimination */
1500         bjtmp = bj + bi[i];
1501         row   = *bjtmp++; /* pivot row */
1502         nzL   = bi[i+1] - bi[i];
1503         for (k=0; k < nzL; k++) {
1504           pc1 = rtmp1 + row;
1505           pc2 = rtmp2 + row;
1506           pc3 = rtmp3 + row;
1507           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1508             pv   = b->a + bdiag[row];
1509             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1510             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1511 
1512             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1513             pv = b->a + bdiag[row+1]+1;
1514             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1515             for (j=0; j<nz; j++) {
1516               col         = pj[j];
1517               rtmp1[col] -= mul1 * pv[j];
1518               rtmp2[col] -= mul2 * pv[j];
1519               rtmp3[col] -= mul3 * pv[j];
1520             }
1521             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1522           }
1523           row = *bjtmp++;
1524         }
1525 
1526         /* finished row i; check zero pivot, then stick row i into b->a */
1527         rs = 0.0;
1528         /* L part */
1529         pc1 = b->a + bi[i];
1530         pj  = b->j + bi[i];
1531         nz  = bi[i+1] - bi[i];
1532         for (j=0; j<nz; j++) {
1533           col    = pj[j];
1534           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535         }
1536         /* U part */
1537         pc1 = b->a + bdiag[i+1]+1;
1538         pj  = b->j + bdiag[i+1]+1;
1539         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1540         for (j=0; j<nz; j++) {
1541           col    = pj[j];
1542           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1543         }
1544 
1545         sctx.rs = rs;
1546         sctx.pv = rtmp1[i];
1547         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1548         if (sctx.newshift) break;
1549         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1550         *pc1 = 1.0/sctx.pv;
1551 
1552         /* Now take care of 1st column of diagonal 3x3 block. */
1553         pc2 = rtmp2 + i;
1554         pc3 = rtmp3 + i;
1555         if (*pc2 != 0.0 || *pc3 != 0.0) {
1556           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1557           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1558           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1559           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1560           for (j=0; j<nz; j++) {
1561             col         = pj[j];
1562             rtmp2[col] -= mul2 * rtmp1[col];
1563             rtmp3[col] -= mul3 * rtmp1[col];
1564           }
1565           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1566         }
1567 
1568         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1569         rs = 0.0;
1570         /* L part */
1571         pc2 = b->a + bi[i+1];
1572         pj  = b->j + bi[i+1];
1573         nz  = bi[i+2] - bi[i+1];
1574         for (j=0; j<nz; j++) {
1575           col    = pj[j];
1576           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577         }
1578         /* U part */
1579         pc2 = b->a + bdiag[i+2]+1;
1580         pj  = b->j + bdiag[i+2]+1;
1581         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1582         for (j=0; j<nz; j++) {
1583           col    = pj[j];
1584           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1585         }
1586 
1587         sctx.rs = rs;
1588         sctx.pv = rtmp2[i+1];
1589         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1590         if (sctx.newshift) break;
1591         pc2  = b->a + bdiag[i+1];
1592         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1593 
1594         /* Now take care of 2nd column of diagonal 3x3 block. */
1595         pc3 = rtmp3 + i+1;
1596         if (*pc3 != 0.0) {
1597           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1598           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1599           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1600           for (j=0; j<nz; j++) {
1601             col         = pj[j];
1602             rtmp3[col] -= mul3 * rtmp2[col];
1603           }
1604           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1605         }
1606 
1607         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1608         rs = 0.0;
1609         /* L part */
1610         pc3 = b->a + bi[i+2];
1611         pj  = b->j + bi[i+2];
1612         nz  = bi[i+3] - bi[i+2];
1613         for (j=0; j<nz; j++) {
1614           col    = pj[j];
1615           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616         }
1617         /* U part */
1618         pc3 = b->a + bdiag[i+3]+1;
1619         pj  = b->j + bdiag[i+3]+1;
1620         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1621         for (j=0; j<nz; j++) {
1622           col    = pj[j];
1623           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1624         }
1625 
1626         sctx.rs = rs;
1627         sctx.pv = rtmp3[i+2];
1628         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1629         if (sctx.newshift) break;
1630         pc3  = b->a + bdiag[i+2];
1631         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1632         break;
1633       case 4:
1634         /*----------*/
1635         /* zero rtmp */
1636         /* L part */
1637         nz    = bi[i+1] - bi[i];
1638         bjtmp = bj + bi[i];
1639         for  (j=0; j<nz; j++) {
1640           col        = bjtmp[j];
1641           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1642         }
1643 
1644         /* U part */
1645         nz    = bdiag[i]-bdiag[i+1];
1646         bjtmp = bj + bdiag[i+1]+1;
1647         for  (j=0; j<nz; j++) {
1648           col        = bjtmp[j];
1649           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1650         }
1651 
1652         /* load in initial (unfactored row) */
1653         nz    = ai[r[i]+1] - ai[r[i]];
1654         ajtmp = aj + ai[r[i]];
1655         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1656         for (j=0; j<nz; j++) {
1657           col        = ics[ajtmp[j]];
1658           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1659         }
1660         /* ZeropivotApply(): shift the diagonal of the matrix  */
1661         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1662 
1663         /* elimination */
1664         bjtmp = bj + bi[i];
1665         row   = *bjtmp++; /* pivot row */
1666         nzL   = bi[i+1] - bi[i];
1667         for (k=0; k < nzL; k++) {
1668           pc1 = rtmp1 + row;
1669           pc2 = rtmp2 + row;
1670           pc3 = rtmp3 + row;
1671           pc4 = rtmp4 + row;
1672           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1673             pv   = b->a + bdiag[row];
1674             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1675             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1676 
1677             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1678             pv = b->a + bdiag[row+1]+1;
1679             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1680             for (j=0; j<nz; j++) {
1681               col         = pj[j];
1682               rtmp1[col] -= mul1 * pv[j];
1683               rtmp2[col] -= mul2 * pv[j];
1684               rtmp3[col] -= mul3 * pv[j];
1685               rtmp4[col] -= mul4 * pv[j];
1686             }
1687             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1688           }
1689           row = *bjtmp++;
1690         }
1691 
1692         /* finished row i; check zero pivot, then stick row i into b->a */
1693         rs = 0.0;
1694         /* L part */
1695         pc1 = b->a + bi[i];
1696         pj  = b->j + bi[i];
1697         nz  = bi[i+1] - bi[i];
1698         for (j=0; j<nz; j++) {
1699           col    = pj[j];
1700           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701         }
1702         /* U part */
1703         pc1 = b->a + bdiag[i+1]+1;
1704         pj  = b->j + bdiag[i+1]+1;
1705         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1706         for (j=0; j<nz; j++) {
1707           col    = pj[j];
1708           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1709         }
1710 
1711         sctx.rs = rs;
1712         sctx.pv = rtmp1[i];
1713         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1714         if (sctx.newshift) break;
1715         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1716         *pc1 = 1.0/sctx.pv;
1717 
1718         /* Now take care of 1st column of diagonal 4x4 block. */
1719         pc2 = rtmp2 + i;
1720         pc3 = rtmp3 + i;
1721         pc4 = rtmp4 + i;
1722         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1723           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1724           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1725           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1726           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1727           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1728           for (j=0; j<nz; j++) {
1729             col         = pj[j];
1730             rtmp2[col] -= mul2 * rtmp1[col];
1731             rtmp3[col] -= mul3 * rtmp1[col];
1732             rtmp4[col] -= mul4 * rtmp1[col];
1733           }
1734           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1735         }
1736 
1737         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1738         rs = 0.0;
1739         /* L part */
1740         pc2 = b->a + bi[i+1];
1741         pj  = b->j + bi[i+1];
1742         nz  = bi[i+2] - bi[i+1];
1743         for (j=0; j<nz; j++) {
1744           col    = pj[j];
1745           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746         }
1747         /* U part */
1748         pc2 = b->a + bdiag[i+2]+1;
1749         pj  = b->j + bdiag[i+2]+1;
1750         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1751         for (j=0; j<nz; j++) {
1752           col    = pj[j];
1753           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1754         }
1755 
1756         sctx.rs = rs;
1757         sctx.pv = rtmp2[i+1];
1758         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1759         if (sctx.newshift) break;
1760         pc2  = b->a + bdiag[i+1];
1761         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1762 
1763         /* Now take care of 2nd column of diagonal 4x4 block. */
1764         pc3 = rtmp3 + i+1;
1765         pc4 = rtmp4 + i+1;
1766         if (*pc3 != 0.0 || *pc4 != 0.0) {
1767           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1768           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1769           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1770           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1771           for (j=0; j<nz; j++) {
1772             col         = pj[j];
1773             rtmp3[col] -= mul3 * rtmp2[col];
1774             rtmp4[col] -= mul4 * rtmp2[col];
1775           }
1776           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1777         }
1778 
1779         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1780         rs = 0.0;
1781         /* L part */
1782         pc3 = b->a + bi[i+2];
1783         pj  = b->j + bi[i+2];
1784         nz  = bi[i+3] - bi[i+2];
1785         for (j=0; j<nz; j++) {
1786           col    = pj[j];
1787           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788         }
1789         /* U part */
1790         pc3 = b->a + bdiag[i+3]+1;
1791         pj  = b->j + bdiag[i+3]+1;
1792         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1793         for (j=0; j<nz; j++) {
1794           col    = pj[j];
1795           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1796         }
1797 
1798         sctx.rs = rs;
1799         sctx.pv = rtmp3[i+2];
1800         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1801         if (sctx.newshift) break;
1802         pc3  = b->a + bdiag[i+2];
1803         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1804 
1805         /* Now take care of 3rd column of diagonal 4x4 block. */
1806         pc4 = rtmp4 + i+2;
1807         if (*pc4 != 0.0) {
1808           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1809           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1810           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1811           for (j=0; j<nz; j++) {
1812             col         = pj[j];
1813             rtmp4[col] -= mul4 * rtmp3[col];
1814           }
1815           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1816         }
1817 
1818         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1819         rs = 0.0;
1820         /* L part */
1821         pc4 = b->a + bi[i+3];
1822         pj  = b->j + bi[i+3];
1823         nz  = bi[i+4] - bi[i+3];
1824         for (j=0; j<nz; j++) {
1825           col    = pj[j];
1826           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827         }
1828         /* U part */
1829         pc4 = b->a + bdiag[i+4]+1;
1830         pj  = b->j + bdiag[i+4]+1;
1831         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1832         for (j=0; j<nz; j++) {
1833           col    = pj[j];
1834           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1835         }
1836 
1837         sctx.rs = rs;
1838         sctx.pv = rtmp4[i+3];
1839         ierr    = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1840         if (sctx.newshift) break;
1841         pc4  = b->a + bdiag[i+3];
1842         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1843         break;
1844 
1845       default:
1846         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1847       }
1848       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1849       i += nodesz;                 /* Update the row */
1850     }
1851 
1852     /* MatPivotRefine() */
1853     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1854       /*
1855        * if no shift in this attempt & shifting & started shifting & can refine,
1856        * then try lower shift
1857        */
1858       sctx.shift_hi       = sctx.shift_fraction;
1859       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1860       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1861       sctx.newshift       = PETSC_TRUE;
1862       sctx.nshift++;
1863     }
1864   } while (sctx.newshift);
1865 
1866   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1867   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1868   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1869   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1870 
1871   if (b->inode.size) {
1872     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1873   } else {
1874     C->ops->solve           = MatSolve_SeqAIJ;
1875   }
1876   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1877   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1878   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1879   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1880   C->assembled              = PETSC_TRUE;
1881   C->preallocated           = PETSC_TRUE;
1882 
1883   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1884 
1885   /* MatShiftView(A,info,&sctx) */
1886   if (sctx.nshift) {
1887     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1888       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1889     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1890       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1891     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1892       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1893     }
1894   }
1895   PetscFunctionReturn(0);
1896 }
1897 
1898 #undef __FUNCT__
1899 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1900 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1901 {
1902   Mat             C     = B;
1903   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1904   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1905   PetscErrorCode  ierr;
1906   const PetscInt  *r,*ic,*c,*ics;
1907   PetscInt        n   = A->rmap->n,*bi = b->i;
1908   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1909   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1910   PetscInt        *ai = a->i,*aj = a->j;
1911   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1912   PetscScalar     mul1,mul2,mul3,tmp;
1913   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1914   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1915   PetscReal       rs=0.0;
1916   FactorShiftCtx  sctx;
1917 
1918   PetscFunctionBegin;
1919   sctx.shift_top      = 0;
1920   sctx.nshift_max     = 0;
1921   sctx.shift_lo       = 0;
1922   sctx.shift_hi       = 0;
1923   sctx.shift_fraction = 0;
1924 
1925   /* if both shift schemes are chosen by user, only use info->shiftpd */
1926   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1927     sctx.shift_top = 0;
1928     for (i=0; i<n; i++) {
1929       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1930       rs    = 0.0;
1931       ajtmp = aj + ai[i];
1932       rtmp1 = aa + ai[i];
1933       nz    = ai[i+1] - ai[i];
1934       for (j=0; j<nz; j++) {
1935         if (*ajtmp != i) {
1936           rs += PetscAbsScalar(*rtmp1++);
1937         } else {
1938           rs -= PetscRealPart(*rtmp1++);
1939         }
1940         ajtmp++;
1941       }
1942       if (rs>sctx.shift_top) sctx.shift_top = rs;
1943     }
1944     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1945     sctx.shift_top *= 1.1;
1946     sctx.nshift_max = 5;
1947     sctx.shift_lo   = 0.;
1948     sctx.shift_hi   = 1.;
1949   }
1950   sctx.shift_amount = 0;
1951   sctx.nshift       = 0;
1952 
1953   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1954   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1955   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1956   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1957   ics    = ic;
1958 
1959   node_max = a->inode.node_count;
1960   ns       = a->inode.size;
1961   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1962 
1963   /* If max inode size > 3, split it into two inodes.*/
1964   /* also map the inode sizes according to the ordering */
1965   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1966   for (i=0,j=0; i<node_max; ++i,++j) {
1967     if (ns[i]>3) {
1968       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1969       ++j;
1970       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1971     } else {
1972       tmp_vec1[j] = ns[i];
1973     }
1974   }
1975   /* Use the correct node_max */
1976   node_max = j;
1977 
1978   /* Now reorder the inode info based on mat re-ordering info */
1979   /* First create a row -> inode_size_array_index map */
1980   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1981   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1982   for (i=0,row=0; i<node_max; i++) {
1983     nodesz = tmp_vec1[i];
1984     for (j=0; j<nodesz; j++,row++) {
1985       nsmap[row] = i;
1986     }
1987   }
1988   /* Using nsmap, create a reordered ns structure */
1989   for (i=0,j=0; i< node_max; i++) {
1990     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1991     tmp_vec2[i] = nodesz;
1992     j          += nodesz;
1993   }
1994   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1995   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1996   /* Now use the correct ns */
1997   ns = tmp_vec2;
1998 
1999   do {
2000     sctx.newshift = PETSC_FALSE;
2001     /* Now loop over each block-row, and do the factorization */
2002     for (i=0,row=0; i<node_max; i++) {
2003       nodesz = ns[i];
2004       nz     = bi[row+1] - bi[row];
2005       bjtmp  = bj + bi[row];
2006 
2007       switch (nodesz) {
2008       case 1:
2009         for  (j=0; j<nz; j++) {
2010           idx         = bjtmp[j];
2011           rtmp11[idx] = 0.0;
2012         }
2013 
2014         /* load in initial (unfactored row) */
2015         idx    = r[row];
2016         nz_tmp = ai[idx+1] - ai[idx];
2017         ajtmp  = aj + ai[idx];
2018         v1     = aa + ai[idx];
2019 
2020         for (j=0; j<nz_tmp; j++) {
2021           idx         = ics[ajtmp[j]];
2022           rtmp11[idx] = v1[j];
2023         }
2024         rtmp11[ics[r[row]]] += sctx.shift_amount;
2025 
2026         prow = *bjtmp++;
2027         while (prow < row) {
2028           pc1 = rtmp11 + prow;
2029           if (*pc1 != 0.0) {
2030             pv     = ba + bd[prow];
2031             pj     = nbj + bd[prow];
2032             mul1   = *pc1 * *pv++;
2033             *pc1   = mul1;
2034             nz_tmp = bi[prow+1] - bd[prow] - 1;
2035             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2036             for (j=0; j<nz_tmp; j++) {
2037               tmp          = pv[j];
2038               idx          = pj[j];
2039               rtmp11[idx] -= mul1 * tmp;
2040             }
2041           }
2042           prow = *bjtmp++;
2043         }
2044         pj  = bj + bi[row];
2045         pc1 = ba + bi[row];
2046 
2047         sctx.pv     = rtmp11[row];
2048         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2049         rs          = 0.0;
2050         for (j=0; j<nz; j++) {
2051           idx    = pj[j];
2052           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2053           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2054         }
2055         sctx.rs = rs;
2056         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2057         if (sctx.newshift) goto endofwhile;
2058         break;
2059 
2060       case 2:
2061         for (j=0; j<nz; j++) {
2062           idx         = bjtmp[j];
2063           rtmp11[idx] = 0.0;
2064           rtmp22[idx] = 0.0;
2065         }
2066 
2067         /* load in initial (unfactored row) */
2068         idx    = r[row];
2069         nz_tmp = ai[idx+1] - ai[idx];
2070         ajtmp  = aj + ai[idx];
2071         v1     = aa + ai[idx];
2072         v2     = aa + ai[idx+1];
2073         for (j=0; j<nz_tmp; j++) {
2074           idx         = ics[ajtmp[j]];
2075           rtmp11[idx] = v1[j];
2076           rtmp22[idx] = v2[j];
2077         }
2078         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2079         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2080 
2081         prow = *bjtmp++;
2082         while (prow < row) {
2083           pc1 = rtmp11 + prow;
2084           pc2 = rtmp22 + prow;
2085           if (*pc1 != 0.0 || *pc2 != 0.0) {
2086             pv   = ba + bd[prow];
2087             pj   = nbj + bd[prow];
2088             mul1 = *pc1 * *pv;
2089             mul2 = *pc2 * *pv;
2090             ++pv;
2091             *pc1 = mul1;
2092             *pc2 = mul2;
2093 
2094             nz_tmp = bi[prow+1] - bd[prow] - 1;
2095             for (j=0; j<nz_tmp; j++) {
2096               tmp          = pv[j];
2097               idx          = pj[j];
2098               rtmp11[idx] -= mul1 * tmp;
2099               rtmp22[idx] -= mul2 * tmp;
2100             }
2101             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2102           }
2103           prow = *bjtmp++;
2104         }
2105 
2106         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2107         pc1 = rtmp11 + prow;
2108         pc2 = rtmp22 + prow;
2109 
2110         sctx.pv = *pc1;
2111         pj      = bj + bi[prow];
2112         rs      = 0.0;
2113         for (j=0; j<nz; j++) {
2114           idx = pj[j];
2115           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2116         }
2117         sctx.rs = rs;
2118         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2119         if (sctx.newshift) goto endofwhile;
2120 
2121         if (*pc2 != 0.0) {
2122           pj     = nbj + bd[prow];
2123           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2124           *pc2   = mul2;
2125           nz_tmp = bi[prow+1] - bd[prow] - 1;
2126           for (j=0; j<nz_tmp; j++) {
2127             idx          = pj[j];
2128             tmp          = rtmp11[idx];
2129             rtmp22[idx] -= mul2 * tmp;
2130           }
2131           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2132         }
2133 
2134         pj  = bj + bi[row];
2135         pc1 = ba + bi[row];
2136         pc2 = ba + bi[row+1];
2137 
2138         sctx.pv       = rtmp22[row+1];
2139         rs            = 0.0;
2140         rtmp11[row]   = 1.0/rtmp11[row];
2141         rtmp22[row+1] = 1.0/rtmp22[row+1];
2142         /* copy row entries from dense representation to sparse */
2143         for (j=0; j<nz; j++) {
2144           idx    = pj[j];
2145           pc1[j] = rtmp11[idx];
2146           pc2[j] = rtmp22[idx];
2147           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2148         }
2149         sctx.rs = rs;
2150         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2151         if (sctx.newshift) goto endofwhile;
2152         break;
2153 
2154       case 3:
2155         for  (j=0; j<nz; j++) {
2156           idx         = bjtmp[j];
2157           rtmp11[idx] = 0.0;
2158           rtmp22[idx] = 0.0;
2159           rtmp33[idx] = 0.0;
2160         }
2161         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2162         idx    = r[row];
2163         nz_tmp = ai[idx+1] - ai[idx];
2164         ajtmp  = aj + ai[idx];
2165         v1     = aa + ai[idx];
2166         v2     = aa + ai[idx+1];
2167         v3     = aa + ai[idx+2];
2168         for (j=0; j<nz_tmp; j++) {
2169           idx         = ics[ajtmp[j]];
2170           rtmp11[idx] = v1[j];
2171           rtmp22[idx] = v2[j];
2172           rtmp33[idx] = v3[j];
2173         }
2174         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2175         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2176         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2177 
2178         /* loop over all pivot row blocks above this row block */
2179         prow = *bjtmp++;
2180         while (prow < row) {
2181           pc1 = rtmp11 + prow;
2182           pc2 = rtmp22 + prow;
2183           pc3 = rtmp33 + prow;
2184           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2185             pv   = ba  + bd[prow];
2186             pj   = nbj + bd[prow];
2187             mul1 = *pc1 * *pv;
2188             mul2 = *pc2 * *pv;
2189             mul3 = *pc3 * *pv;
2190             ++pv;
2191             *pc1 = mul1;
2192             *pc2 = mul2;
2193             *pc3 = mul3;
2194 
2195             nz_tmp = bi[prow+1] - bd[prow] - 1;
2196             /* update this row based on pivot row */
2197             for (j=0; j<nz_tmp; j++) {
2198               tmp          = pv[j];
2199               idx          = pj[j];
2200               rtmp11[idx] -= mul1 * tmp;
2201               rtmp22[idx] -= mul2 * tmp;
2202               rtmp33[idx] -= mul3 * tmp;
2203             }
2204             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2205           }
2206           prow = *bjtmp++;
2207         }
2208 
2209         /* Now take care of diagonal 3x3 block in this set of rows */
2210         /* note: prow = row here */
2211         pc1 = rtmp11 + prow;
2212         pc2 = rtmp22 + prow;
2213         pc3 = rtmp33 + prow;
2214 
2215         sctx.pv = *pc1;
2216         pj      = bj + bi[prow];
2217         rs      = 0.0;
2218         for (j=0; j<nz; j++) {
2219           idx = pj[j];
2220           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2221         }
2222         sctx.rs = rs;
2223         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2224         if (sctx.newshift) goto endofwhile;
2225 
2226         if (*pc2 != 0.0 || *pc3 != 0.0) {
2227           mul2   = (*pc2)/(*pc1);
2228           mul3   = (*pc3)/(*pc1);
2229           *pc2   = mul2;
2230           *pc3   = mul3;
2231           nz_tmp = bi[prow+1] - bd[prow] - 1;
2232           pj     = nbj + bd[prow];
2233           for (j=0; j<nz_tmp; j++) {
2234             idx          = pj[j];
2235             tmp          = rtmp11[idx];
2236             rtmp22[idx] -= mul2 * tmp;
2237             rtmp33[idx] -= mul3 * tmp;
2238           }
2239           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2240         }
2241         ++prow;
2242 
2243         pc2     = rtmp22 + prow;
2244         pc3     = rtmp33 + prow;
2245         sctx.pv = *pc2;
2246         pj      = bj + bi[prow];
2247         rs      = 0.0;
2248         for (j=0; j<nz; j++) {
2249           idx = pj[j];
2250           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2251         }
2252         sctx.rs = rs;
2253         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2254         if (sctx.newshift) goto endofwhile;
2255 
2256         if (*pc3 != 0.0) {
2257           mul3   = (*pc3)/(*pc2);
2258           *pc3   = mul3;
2259           pj     = nbj + bd[prow];
2260           nz_tmp = bi[prow+1] - bd[prow] - 1;
2261           for (j=0; j<nz_tmp; j++) {
2262             idx          = pj[j];
2263             tmp          = rtmp22[idx];
2264             rtmp33[idx] -= mul3 * tmp;
2265           }
2266           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2267         }
2268 
2269         pj  = bj + bi[row];
2270         pc1 = ba + bi[row];
2271         pc2 = ba + bi[row+1];
2272         pc3 = ba + bi[row+2];
2273 
2274         sctx.pv       = rtmp33[row+2];
2275         rs            = 0.0;
2276         rtmp11[row]   = 1.0/rtmp11[row];
2277         rtmp22[row+1] = 1.0/rtmp22[row+1];
2278         rtmp33[row+2] = 1.0/rtmp33[row+2];
2279         /* copy row entries from dense representation to sparse */
2280         for (j=0; j<nz; j++) {
2281           idx    = pj[j];
2282           pc1[j] = rtmp11[idx];
2283           pc2[j] = rtmp22[idx];
2284           pc3[j] = rtmp33[idx];
2285           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2286         }
2287 
2288         sctx.rs = rs;
2289         ierr    = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2290         if (sctx.newshift) goto endofwhile;
2291         break;
2292 
2293       default:
2294         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2295       }
2296       row += nodesz;                 /* Update the row */
2297     }
2298 endofwhile:;
2299   } while (sctx.newshift);
2300   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2301   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2302   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2303   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2304   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2305 
2306   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2307   /* do not set solve add, since MatSolve_Inode + Add is faster */
2308   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2309   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2310   C->assembled              = PETSC_TRUE;
2311   C->preallocated           = PETSC_TRUE;
2312   if (sctx.nshift) {
2313     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2314       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2315     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2316       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2317     }
2318   }
2319   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2320   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2321   PetscFunctionReturn(0);
2322 }
2323 
2324 
2325 /* ----------------------------------------------------------- */
2326 #undef __FUNCT__
2327 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2328 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2329 {
2330   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2331   IS                iscol = a->col,isrow = a->row;
2332   PetscErrorCode    ierr;
2333   const PetscInt    *r,*c,*rout,*cout;
2334   PetscInt          i,j,n = A->rmap->n;
2335   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2336   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2337   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2338   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2339   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2340   const PetscScalar *b;
2341 
2342   PetscFunctionBegin;
2343   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2344   node_max = a->inode.node_count;
2345   ns       = a->inode.size;     /* Node Size array */
2346 
2347   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2348   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2349   tmp  = a->solve_work;
2350 
2351   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2352   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2353 
2354   /* forward solve the lower triangular */
2355   tmps = tmp;
2356   aa   = a_a;
2357   aj   = a_j;
2358   ad   = a->diag;
2359 
2360   for (i = 0,row = 0; i< node_max; ++i) {
2361     nsz = ns[i];
2362     aii = ai[row];
2363     v1  = aa + aii;
2364     vi  = aj + aii;
2365     nz  = ai[row+1]- ai[row];
2366 
2367     if (i < node_max-1) {
2368       /* Prefetch the indices for the next block */
2369       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2370       /* Prefetch the data for the next block */
2371       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2372     }
2373 
2374     switch (nsz) {               /* Each loop in 'case' is unrolled */
2375     case 1:
2376       sum1 = b[r[row]];
2377       for (j=0; j<nz-1; j+=2) {
2378         i0    = vi[j];
2379         i1    = vi[j+1];
2380         tmp0  = tmps[i0];
2381         tmp1  = tmps[i1];
2382         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2383       }
2384       if (j == nz-1) {
2385         tmp0  = tmps[vi[j]];
2386         sum1 -= v1[j]*tmp0;
2387       }
2388       tmp[row++]=sum1;
2389       break;
2390     case 2:
2391       sum1 = b[r[row]];
2392       sum2 = b[r[row+1]];
2393       v2   = aa + ai[row+1];
2394 
2395       for (j=0; j<nz-1; j+=2) {
2396         i0    = vi[j];
2397         i1    = vi[j+1];
2398         tmp0  = tmps[i0];
2399         tmp1  = tmps[i1];
2400         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2401         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2402       }
2403       if (j == nz-1) {
2404         tmp0  = tmps[vi[j]];
2405         sum1 -= v1[j] *tmp0;
2406         sum2 -= v2[j] *tmp0;
2407       }
2408       sum2     -= v2[nz] * sum1;
2409       tmp[row++]=sum1;
2410       tmp[row++]=sum2;
2411       break;
2412     case 3:
2413       sum1 = b[r[row]];
2414       sum2 = b[r[row+1]];
2415       sum3 = b[r[row+2]];
2416       v2   = aa + ai[row+1];
2417       v3   = aa + ai[row+2];
2418 
2419       for (j=0; j<nz-1; j+=2) {
2420         i0    = vi[j];
2421         i1    = vi[j+1];
2422         tmp0  = tmps[i0];
2423         tmp1  = tmps[i1];
2424         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427       }
2428       if (j == nz-1) {
2429         tmp0  = tmps[vi[j]];
2430         sum1 -= v1[j] *tmp0;
2431         sum2 -= v2[j] *tmp0;
2432         sum3 -= v3[j] *tmp0;
2433       }
2434       sum2     -= v2[nz] * sum1;
2435       sum3     -= v3[nz] * sum1;
2436       sum3     -= v3[nz+1] * sum2;
2437       tmp[row++]=sum1;
2438       tmp[row++]=sum2;
2439       tmp[row++]=sum3;
2440       break;
2441 
2442     case 4:
2443       sum1 = b[r[row]];
2444       sum2 = b[r[row+1]];
2445       sum3 = b[r[row+2]];
2446       sum4 = b[r[row+3]];
2447       v2   = aa + ai[row+1];
2448       v3   = aa + ai[row+2];
2449       v4   = aa + ai[row+3];
2450 
2451       for (j=0; j<nz-1; j+=2) {
2452         i0    = vi[j];
2453         i1    = vi[j+1];
2454         tmp0  = tmps[i0];
2455         tmp1  = tmps[i1];
2456         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2457         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2458         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2459         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2460       }
2461       if (j == nz-1) {
2462         tmp0  = tmps[vi[j]];
2463         sum1 -= v1[j] *tmp0;
2464         sum2 -= v2[j] *tmp0;
2465         sum3 -= v3[j] *tmp0;
2466         sum4 -= v4[j] *tmp0;
2467       }
2468       sum2 -= v2[nz] * sum1;
2469       sum3 -= v3[nz] * sum1;
2470       sum4 -= v4[nz] * sum1;
2471       sum3 -= v3[nz+1] * sum2;
2472       sum4 -= v4[nz+1] * sum2;
2473       sum4 -= v4[nz+2] * sum3;
2474 
2475       tmp[row++]=sum1;
2476       tmp[row++]=sum2;
2477       tmp[row++]=sum3;
2478       tmp[row++]=sum4;
2479       break;
2480     case 5:
2481       sum1 = b[r[row]];
2482       sum2 = b[r[row+1]];
2483       sum3 = b[r[row+2]];
2484       sum4 = b[r[row+3]];
2485       sum5 = b[r[row+4]];
2486       v2   = aa + ai[row+1];
2487       v3   = aa + ai[row+2];
2488       v4   = aa + ai[row+3];
2489       v5   = aa + ai[row+4];
2490 
2491       for (j=0; j<nz-1; j+=2) {
2492         i0    = vi[j];
2493         i1    = vi[j+1];
2494         tmp0  = tmps[i0];
2495         tmp1  = tmps[i1];
2496         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2497         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2498         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2499         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2500         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2501       }
2502       if (j == nz-1) {
2503         tmp0  = tmps[vi[j]];
2504         sum1 -= v1[j] *tmp0;
2505         sum2 -= v2[j] *tmp0;
2506         sum3 -= v3[j] *tmp0;
2507         sum4 -= v4[j] *tmp0;
2508         sum5 -= v5[j] *tmp0;
2509       }
2510 
2511       sum2 -= v2[nz] * sum1;
2512       sum3 -= v3[nz] * sum1;
2513       sum4 -= v4[nz] * sum1;
2514       sum5 -= v5[nz] * sum1;
2515       sum3 -= v3[nz+1] * sum2;
2516       sum4 -= v4[nz+1] * sum2;
2517       sum5 -= v5[nz+1] * sum2;
2518       sum4 -= v4[nz+2] * sum3;
2519       sum5 -= v5[nz+2] * sum3;
2520       sum5 -= v5[nz+3] * sum4;
2521 
2522       tmp[row++]=sum1;
2523       tmp[row++]=sum2;
2524       tmp[row++]=sum3;
2525       tmp[row++]=sum4;
2526       tmp[row++]=sum5;
2527       break;
2528     default:
2529       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2530     }
2531   }
2532   /* backward solve the upper triangular */
2533   for (i=node_max -1,row = n-1; i>=0; i--) {
2534     nsz = ns[i];
2535     aii = ad[row+1] + 1;
2536     v1  = aa + aii;
2537     vi  = aj + aii;
2538     nz  = ad[row]- ad[row+1] - 1;
2539 
2540     if (i > 0) {
2541       /* Prefetch the indices for the next block */
2542       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2543       /* Prefetch the data for the next block */
2544       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2545     }
2546 
2547     switch (nsz) {               /* Each loop in 'case' is unrolled */
2548     case 1:
2549       sum1 = tmp[row];
2550 
2551       for (j=0; j<nz-1; j+=2) {
2552         i0    = vi[j];
2553         i1    = vi[j+1];
2554         tmp0  = tmps[i0];
2555         tmp1  = tmps[i1];
2556         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2557       }
2558       if (j == nz-1) {
2559         tmp0  = tmps[vi[j]];
2560         sum1 -= v1[j]*tmp0;
2561       }
2562       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2563       break;
2564     case 2:
2565       sum1 = tmp[row];
2566       sum2 = tmp[row-1];
2567       v2   = aa + ad[row] + 1;
2568       for (j=0; j<nz-1; j+=2) {
2569         i0    = vi[j];
2570         i1    = vi[j+1];
2571         tmp0  = tmps[i0];
2572         tmp1  = tmps[i1];
2573         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2574         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2575       }
2576       if (j == nz-1) {
2577         tmp0  = tmps[vi[j]];
2578         sum1 -= v1[j]* tmp0;
2579         sum2 -= v2[j+1]* tmp0;
2580       }
2581 
2582       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2583       sum2     -= v2[0] * tmp0;
2584       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2585       break;
2586     case 3:
2587       sum1 = tmp[row];
2588       sum2 = tmp[row -1];
2589       sum3 = tmp[row -2];
2590       v2   = aa + ad[row] + 1;
2591       v3   = aa + ad[row -1] + 1;
2592       for (j=0; j<nz-1; j+=2) {
2593         i0    = vi[j];
2594         i1    = vi[j+1];
2595         tmp0  = tmps[i0];
2596         tmp1  = tmps[i1];
2597         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2598         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2599         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2600       }
2601       if (j== nz-1) {
2602         tmp0  = tmps[vi[j]];
2603         sum1 -= v1[j] * tmp0;
2604         sum2 -= v2[j+1] * tmp0;
2605         sum3 -= v3[j+2] * tmp0;
2606       }
2607       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2608       sum2     -= v2[0]* tmp0;
2609       sum3     -= v3[1] * tmp0;
2610       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2611       sum3     -= v3[0]* tmp0;
2612       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2613 
2614       break;
2615     case 4:
2616       sum1 = tmp[row];
2617       sum2 = tmp[row -1];
2618       sum3 = tmp[row -2];
2619       sum4 = tmp[row -3];
2620       v2   = aa + ad[row]+1;
2621       v3   = aa + ad[row -1]+1;
2622       v4   = aa + ad[row -2]+1;
2623 
2624       for (j=0; j<nz-1; j+=2) {
2625         i0    = vi[j];
2626         i1    = vi[j+1];
2627         tmp0  = tmps[i0];
2628         tmp1  = tmps[i1];
2629         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2630         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2631         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2632         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2633       }
2634       if (j== nz-1) {
2635         tmp0  = tmps[vi[j]];
2636         sum1 -= v1[j] * tmp0;
2637         sum2 -= v2[j+1] * tmp0;
2638         sum3 -= v3[j+2] * tmp0;
2639         sum4 -= v4[j+3] * tmp0;
2640       }
2641 
2642       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2643       sum2     -= v2[0] * tmp0;
2644       sum3     -= v3[1] * tmp0;
2645       sum4     -= v4[2] * tmp0;
2646       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2647       sum3     -= v3[0] * tmp0;
2648       sum4     -= v4[1] * tmp0;
2649       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2650       sum4     -= v4[0] * tmp0;
2651       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2652       break;
2653     case 5:
2654       sum1 = tmp[row];
2655       sum2 = tmp[row -1];
2656       sum3 = tmp[row -2];
2657       sum4 = tmp[row -3];
2658       sum5 = tmp[row -4];
2659       v2   = aa + ad[row]+1;
2660       v3   = aa + ad[row -1]+1;
2661       v4   = aa + ad[row -2]+1;
2662       v5   = aa + ad[row -3]+1;
2663       for (j=0; j<nz-1; j+=2) {
2664         i0    = vi[j];
2665         i1    = vi[j+1];
2666         tmp0  = tmps[i0];
2667         tmp1  = tmps[i1];
2668         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2669         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2670         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2671         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2672         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2673       }
2674       if (j==nz-1) {
2675         tmp0  = tmps[vi[j]];
2676         sum1 -= v1[j] * tmp0;
2677         sum2 -= v2[j+1] * tmp0;
2678         sum3 -= v3[j+2] * tmp0;
2679         sum4 -= v4[j+3] * tmp0;
2680         sum5 -= v5[j+4] * tmp0;
2681       }
2682 
2683       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2684       sum2     -= v2[0] * tmp0;
2685       sum3     -= v3[1] * tmp0;
2686       sum4     -= v4[2] * tmp0;
2687       sum5     -= v5[3] * tmp0;
2688       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2689       sum3     -= v3[0] * tmp0;
2690       sum4     -= v4[1] * tmp0;
2691       sum5     -= v5[2] * tmp0;
2692       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2693       sum4     -= v4[0] * tmp0;
2694       sum5     -= v5[1] * tmp0;
2695       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2696       sum5     -= v5[0] * tmp0;
2697       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2698       break;
2699     default:
2700       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2701     }
2702   }
2703   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2704   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2705   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2706   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2707   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2708   PetscFunctionReturn(0);
2709 }
2710 
2711 
2712 /*
2713      Makes a longer coloring[] array and calls the usual code with that
2714 */
2715 #undef __FUNCT__
2716 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2717 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2718 {
2719   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2720   PetscErrorCode  ierr;
2721   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2722   PetscInt        *colorused,i;
2723   ISColoringValue *newcolor;
2724 
2725   PetscFunctionBegin;
2726   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2727   /* loop over inodes, marking a color for each column*/
2728   row = 0;
2729   for (i=0; i<m; i++) {
2730     for (j=0; j<ns[i]; j++) {
2731       newcolor[row++] = coloring[i] + j*ncolors;
2732     }
2733   }
2734 
2735   /* eliminate unneeded colors */
2736   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2737   for (i=0; i<n; i++) {
2738     colorused[newcolor[i]] = 1;
2739   }
2740 
2741   for (i=1; i<5*ncolors; i++) {
2742     colorused[i] += colorused[i-1];
2743   }
2744   ncolors = colorused[5*ncolors-1];
2745   for (i=0; i<n; i++) {
2746     newcolor[i] = colorused[newcolor[i]]-1;
2747   }
2748   ierr = PetscFree(colorused);CHKERRQ(ierr);
2749   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2750   ierr = PetscFree(coloring);CHKERRQ(ierr);
2751   PetscFunctionReturn(0);
2752 }
2753 
2754 #include <petsc-private/kernels/blockinvert.h>
2755 
2756 #undef __FUNCT__
2757 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2758 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2759 {
2760   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2761   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2762   MatScalar         *ibdiag,*bdiag,work[25],*t;
2763   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2764   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2765   const PetscScalar *xb, *b;
2766   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2767   PetscErrorCode    ierr;
2768   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2769   PetscInt          sz,k,ipvt[5];
2770   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2771 
2772   PetscFunctionBegin;
2773   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2774   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2775 
2776   if (!a->inode.ibdiagvalid) {
2777     if (!a->inode.ibdiag) {
2778       /* calculate space needed for diagonal blocks */
2779       for (i=0; i<m; i++) {
2780         cnt += sizes[i]*sizes[i];
2781       }
2782       a->inode.bdiagsize = cnt;
2783 
2784       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2785     }
2786 
2787     /* copy over the diagonal blocks and invert them */
2788     ibdiag = a->inode.ibdiag;
2789     bdiag  = a->inode.bdiag;
2790     cnt    = 0;
2791     for (i=0, row = 0; i<m; i++) {
2792       for (j=0; j<sizes[i]; j++) {
2793         for (k=0; k<sizes[i]; k++) {
2794           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2795         }
2796       }
2797       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2798 
2799       switch (sizes[i]) {
2800       case 1:
2801         /* Create matrix data structure */
2802         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2803         ibdiag[cnt] = 1.0/ibdiag[cnt];
2804         break;
2805       case 2:
2806         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2807         break;
2808       case 3:
2809         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2810         break;
2811       case 4:
2812         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2813         break;
2814       case 5:
2815         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2816         break;
2817       default:
2818         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2819       }
2820       cnt += sizes[i]*sizes[i];
2821       row += sizes[i];
2822     }
2823     a->inode.ibdiagvalid = PETSC_TRUE;
2824   }
2825   ibdiag = a->inode.ibdiag;
2826   bdiag  = a->inode.bdiag;
2827   t      = a->inode.ssor_work;
2828 
2829   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2830   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2831   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2832   if (flag & SOR_ZERO_INITIAL_GUESS) {
2833     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2834 
2835       for (i=0, row=0; i<m; i++) {
2836         sz  = diag[row] - ii[row];
2837         v1  = a->a + ii[row];
2838         idx = a->j + ii[row];
2839 
2840         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2841         switch (sizes[i]) {
2842         case 1:
2843 
2844           sum1 = b[row];
2845           for (n = 0; n<sz-1; n+=2) {
2846             i1    = idx[0];
2847             i2    = idx[1];
2848             idx  += 2;
2849             tmp0  = x[i1];
2850             tmp1  = x[i2];
2851             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2852           }
2853 
2854           if (n == sz-1) {
2855             tmp0  = x[*idx];
2856             sum1 -= *v1 * tmp0;
2857           }
2858           t[row]   = sum1;
2859           x[row++] = sum1*(*ibdiag++);
2860           break;
2861         case 2:
2862           v2   = a->a + ii[row+1];
2863           sum1 = b[row];
2864           sum2 = b[row+1];
2865           for (n = 0; n<sz-1; n+=2) {
2866             i1    = idx[0];
2867             i2    = idx[1];
2868             idx  += 2;
2869             tmp0  = x[i1];
2870             tmp1  = x[i2];
2871             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2872             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2873           }
2874 
2875           if (n == sz-1) {
2876             tmp0  = x[*idx];
2877             sum1 -= v1[0] * tmp0;
2878             sum2 -= v2[0] * tmp0;
2879           }
2880           t[row]   = sum1;
2881           t[row+1] = sum2;
2882           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2883           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2884           ibdiag  += 4;
2885           break;
2886         case 3:
2887           v2   = a->a + ii[row+1];
2888           v3   = a->a + ii[row+2];
2889           sum1 = b[row];
2890           sum2 = b[row+1];
2891           sum3 = b[row+2];
2892           for (n = 0; n<sz-1; n+=2) {
2893             i1    = idx[0];
2894             i2    = idx[1];
2895             idx  += 2;
2896             tmp0  = x[i1];
2897             tmp1  = x[i2];
2898             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2899             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2900             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2901           }
2902 
2903           if (n == sz-1) {
2904             tmp0  = x[*idx];
2905             sum1 -= v1[0] * tmp0;
2906             sum2 -= v2[0] * tmp0;
2907             sum3 -= v3[0] * tmp0;
2908           }
2909           t[row]   = sum1;
2910           t[row+1] = sum2;
2911           t[row+2] = sum3;
2912           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2913           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2914           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2915           ibdiag  += 9;
2916           break;
2917         case 4:
2918           v2   = a->a + ii[row+1];
2919           v3   = a->a + ii[row+2];
2920           v4   = a->a + ii[row+3];
2921           sum1 = b[row];
2922           sum2 = b[row+1];
2923           sum3 = b[row+2];
2924           sum4 = b[row+3];
2925           for (n = 0; n<sz-1; n+=2) {
2926             i1    = idx[0];
2927             i2    = idx[1];
2928             idx  += 2;
2929             tmp0  = x[i1];
2930             tmp1  = x[i2];
2931             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2932             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2933             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2934             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2935           }
2936 
2937           if (n == sz-1) {
2938             tmp0  = x[*idx];
2939             sum1 -= v1[0] * tmp0;
2940             sum2 -= v2[0] * tmp0;
2941             sum3 -= v3[0] * tmp0;
2942             sum4 -= v4[0] * tmp0;
2943           }
2944           t[row]   = sum1;
2945           t[row+1] = sum2;
2946           t[row+2] = sum3;
2947           t[row+3] = sum4;
2948           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2949           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2950           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2951           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2952           ibdiag  += 16;
2953           break;
2954         case 5:
2955           v2   = a->a + ii[row+1];
2956           v3   = a->a + ii[row+2];
2957           v4   = a->a + ii[row+3];
2958           v5   = a->a + ii[row+4];
2959           sum1 = b[row];
2960           sum2 = b[row+1];
2961           sum3 = b[row+2];
2962           sum4 = b[row+3];
2963           sum5 = b[row+4];
2964           for (n = 0; n<sz-1; n+=2) {
2965             i1    = idx[0];
2966             i2    = idx[1];
2967             idx  += 2;
2968             tmp0  = x[i1];
2969             tmp1  = x[i2];
2970             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2971             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2972             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2973             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2974             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2975           }
2976 
2977           if (n == sz-1) {
2978             tmp0  = x[*idx];
2979             sum1 -= v1[0] * tmp0;
2980             sum2 -= v2[0] * tmp0;
2981             sum3 -= v3[0] * tmp0;
2982             sum4 -= v4[0] * tmp0;
2983             sum5 -= v5[0] * tmp0;
2984           }
2985           t[row]   = sum1;
2986           t[row+1] = sum2;
2987           t[row+2] = sum3;
2988           t[row+3] = sum4;
2989           t[row+4] = sum5;
2990           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2991           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2992           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2993           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2994           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2995           ibdiag  += 25;
2996           break;
2997         default:
2998           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2999         }
3000       }
3001 
3002       xb   = t;
3003       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3004     } else xb = b;
3005     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3006 
3007       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3008       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3009         ibdiag -= sizes[i]*sizes[i];
3010         sz      = ii[row+1] - diag[row] - 1;
3011         v1      = a->a + diag[row] + 1;
3012         idx     = a->j + diag[row] + 1;
3013 
3014         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3015         switch (sizes[i]) {
3016         case 1:
3017 
3018           sum1 = xb[row];
3019           for (n = 0; n<sz-1; n+=2) {
3020             i1    = idx[0];
3021             i2    = idx[1];
3022             idx  += 2;
3023             tmp0  = x[i1];
3024             tmp1  = x[i2];
3025             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3026           }
3027 
3028           if (n == sz-1) {
3029             tmp0  = x[*idx];
3030             sum1 -= *v1*tmp0;
3031           }
3032           x[row--] = sum1*(*ibdiag);
3033           break;
3034 
3035         case 2:
3036 
3037           sum1 = xb[row];
3038           sum2 = xb[row-1];
3039           /* note that sum1 is associated with the second of the two rows */
3040           v2 = a->a + diag[row-1] + 2;
3041           for (n = 0; n<sz-1; n+=2) {
3042             i1    = idx[0];
3043             i2    = idx[1];
3044             idx  += 2;
3045             tmp0  = x[i1];
3046             tmp1  = x[i2];
3047             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3048             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3049           }
3050 
3051           if (n == sz-1) {
3052             tmp0  = x[*idx];
3053             sum1 -= *v1*tmp0;
3054             sum2 -= *v2*tmp0;
3055           }
3056           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3057           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3058           break;
3059         case 3:
3060 
3061           sum1 = xb[row];
3062           sum2 = xb[row-1];
3063           sum3 = xb[row-2];
3064           v2   = a->a + diag[row-1] + 2;
3065           v3   = a->a + diag[row-2] + 3;
3066           for (n = 0; n<sz-1; n+=2) {
3067             i1    = idx[0];
3068             i2    = idx[1];
3069             idx  += 2;
3070             tmp0  = x[i1];
3071             tmp1  = x[i2];
3072             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3073             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3074             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3075           }
3076 
3077           if (n == sz-1) {
3078             tmp0  = x[*idx];
3079             sum1 -= *v1*tmp0;
3080             sum2 -= *v2*tmp0;
3081             sum3 -= *v3*tmp0;
3082           }
3083           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3084           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3085           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3086           break;
3087         case 4:
3088 
3089           sum1 = xb[row];
3090           sum2 = xb[row-1];
3091           sum3 = xb[row-2];
3092           sum4 = xb[row-3];
3093           v2   = a->a + diag[row-1] + 2;
3094           v3   = a->a + diag[row-2] + 3;
3095           v4   = a->a + diag[row-3] + 4;
3096           for (n = 0; n<sz-1; n+=2) {
3097             i1    = idx[0];
3098             i2    = idx[1];
3099             idx  += 2;
3100             tmp0  = x[i1];
3101             tmp1  = x[i2];
3102             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3103             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3104             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3105             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3106           }
3107 
3108           if (n == sz-1) {
3109             tmp0  = x[*idx];
3110             sum1 -= *v1*tmp0;
3111             sum2 -= *v2*tmp0;
3112             sum3 -= *v3*tmp0;
3113             sum4 -= *v4*tmp0;
3114           }
3115           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3116           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3117           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3118           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3119           break;
3120         case 5:
3121 
3122           sum1 = xb[row];
3123           sum2 = xb[row-1];
3124           sum3 = xb[row-2];
3125           sum4 = xb[row-3];
3126           sum5 = xb[row-4];
3127           v2   = a->a + diag[row-1] + 2;
3128           v3   = a->a + diag[row-2] + 3;
3129           v4   = a->a + diag[row-3] + 4;
3130           v5   = a->a + diag[row-4] + 5;
3131           for (n = 0; n<sz-1; n+=2) {
3132             i1    = idx[0];
3133             i2    = idx[1];
3134             idx  += 2;
3135             tmp0  = x[i1];
3136             tmp1  = x[i2];
3137             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3138             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3139             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3140             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3141             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3142           }
3143 
3144           if (n == sz-1) {
3145             tmp0  = x[*idx];
3146             sum1 -= *v1*tmp0;
3147             sum2 -= *v2*tmp0;
3148             sum3 -= *v3*tmp0;
3149             sum4 -= *v4*tmp0;
3150             sum5 -= *v5*tmp0;
3151           }
3152           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3153           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3154           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3155           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3156           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3157           break;
3158         default:
3159           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3160         }
3161       }
3162 
3163       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3164     }
3165     its--;
3166   }
3167   while (its--) {
3168 
3169     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3170       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3171            i<m;
3172            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3173 
3174         sz  = diag[row] - ii[row];
3175         v1  = a->a + ii[row];
3176         idx = a->j + ii[row];
3177         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3178         switch (sizes[i]) {
3179         case 1:
3180           sum1 = b[row];
3181           for (n = 0; n<sz-1; n+=2) {
3182             i1    = idx[0];
3183             i2    = idx[1];
3184             idx  += 2;
3185             tmp0  = x[i1];
3186             tmp1  = x[i2];
3187             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3188           }
3189           if (n == sz-1) {
3190             tmp0  = x[*idx++];
3191             sum1 -= *v1 * tmp0;
3192             v1++;
3193           }
3194           t[row]   = sum1;
3195           sz      = ii[row+1] - diag[row] - 1;
3196           idx     = a->j + diag[row] + 1;
3197           v1 += 1;
3198           for (n = 0; n<sz-1; n+=2) {
3199             i1    = idx[0];
3200             i2    = idx[1];
3201             idx  += 2;
3202             tmp0  = x[i1];
3203             tmp1  = x[i2];
3204             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3205           }
3206           if (n == sz-1) {
3207             tmp0  = x[*idx++];
3208             sum1 -= *v1 * tmp0;
3209           }
3210           /* in MatSOR_SeqAIJ this line would be
3211            *
3212            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3213            *
3214            * but omega == 1, so this becomes
3215            *
3216            * x[row] = sum1*(*ibdiag++);
3217            *
3218            */
3219           x[row] = sum1*(*ibdiag);
3220           break;
3221         case 2:
3222           v2   = a->a + ii[row+1];
3223           sum1 = b[row];
3224           sum2 = b[row+1];
3225           for (n = 0; n<sz-1; n+=2) {
3226             i1    = idx[0];
3227             i2    = idx[1];
3228             idx  += 2;
3229             tmp0  = x[i1];
3230             tmp1  = x[i2];
3231             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3232             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3233           }
3234           if (n == sz-1) {
3235             tmp0  = x[*idx++];
3236             sum1 -= v1[0] * tmp0;
3237             sum2 -= v2[0] * tmp0;
3238             v1++; v2++;
3239           }
3240           t[row]   = sum1;
3241           t[row+1] = sum2;
3242           sz      = ii[row+1] - diag[row] - 2;
3243           idx     = a->j + diag[row] + 2;
3244           v1 += 2;
3245           v2 += 2;
3246           for (n = 0; n<sz-1; n+=2) {
3247             i1    = idx[0];
3248             i2    = idx[1];
3249             idx  += 2;
3250             tmp0  = x[i1];
3251             tmp1  = x[i2];
3252             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3253             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3254           }
3255           if (n == sz-1) {
3256             tmp0  = x[*idx];
3257             sum1 -= v1[0] * tmp0;
3258             sum2 -= v2[0] * tmp0;
3259           }
3260           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3261           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3262           break;
3263         case 3:
3264           v2   = a->a + ii[row+1];
3265           v3   = a->a + ii[row+2];
3266           sum1 = b[row];
3267           sum2 = b[row+1];
3268           sum3 = b[row+2];
3269           for (n = 0; n<sz-1; n+=2) {
3270             i1    = idx[0];
3271             i2    = idx[1];
3272             idx  += 2;
3273             tmp0  = x[i1];
3274             tmp1  = x[i2];
3275             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3276             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3277             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3278           }
3279           if (n == sz-1) {
3280             tmp0  = x[*idx++];
3281             sum1 -= v1[0] * tmp0;
3282             sum2 -= v2[0] * tmp0;
3283             sum3 -= v3[0] * tmp0;
3284             v1++; v2++; v3++;
3285           }
3286           t[row]   = sum1;
3287           t[row+1] = sum2;
3288           t[row+2] = sum3;
3289           sz      = ii[row+1] - diag[row] - 3;
3290           idx     = a->j + diag[row] + 3;
3291           v1 += 3;
3292           v2 += 3;
3293           v3 += 3;
3294           for (n = 0; n<sz-1; n+=2) {
3295             i1    = idx[0];
3296             i2    = idx[1];
3297             idx  += 2;
3298             tmp0  = x[i1];
3299             tmp1  = x[i2];
3300             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3301             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3302             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3303           }
3304           if (n == sz-1) {
3305             tmp0  = x[*idx];
3306             sum1 -= v1[0] * tmp0;
3307             sum2 -= v2[0] * tmp0;
3308             sum3 -= v3[0] * tmp0;
3309           }
3310           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3311           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3312           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3313           break;
3314         case 4:
3315           v2   = a->a + ii[row+1];
3316           v3   = a->a + ii[row+2];
3317           v4   = a->a + ii[row+3];
3318           sum1 = b[row];
3319           sum2 = b[row+1];
3320           sum3 = b[row+2];
3321           sum4 = b[row+3];
3322           for (n = 0; n<sz-1; n+=2) {
3323             i1    = idx[0];
3324             i2    = idx[1];
3325             idx  += 2;
3326             tmp0  = x[i1];
3327             tmp1  = x[i2];
3328             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3329             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3330             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3331             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3332           }
3333           if (n == sz-1) {
3334             tmp0  = x[*idx++];
3335             sum1 -= v1[0] * tmp0;
3336             sum2 -= v2[0] * tmp0;
3337             sum3 -= v3[0] * tmp0;
3338             sum4 -= v4[0] * tmp0;
3339             v1++; v2++; v3++; v4++;
3340           }
3341           t[row]   = sum1;
3342           t[row+1] = sum2;
3343           t[row+2] = sum3;
3344           t[row+3] = sum4;
3345           sz      = ii[row+1] - diag[row] - 4;
3346           idx     = a->j + diag[row] + 4;
3347           v1 += 4;
3348           v2 += 4;
3349           v3 += 4;
3350           v4 += 4;
3351           for (n = 0; n<sz-1; n+=2) {
3352             i1    = idx[0];
3353             i2    = idx[1];
3354             idx  += 2;
3355             tmp0  = x[i1];
3356             tmp1  = x[i2];
3357             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3358             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3359             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3360             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3361           }
3362           if (n == sz-1) {
3363             tmp0  = x[*idx];
3364             sum1 -= v1[0] * tmp0;
3365             sum2 -= v2[0] * tmp0;
3366             sum3 -= v3[0] * tmp0;
3367             sum4 -= v4[0] * tmp0;
3368           }
3369           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3370           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3371           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3372           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3373           break;
3374         case 5:
3375           v2   = a->a + ii[row+1];
3376           v3   = a->a + ii[row+2];
3377           v4   = a->a + ii[row+3];
3378           v5   = a->a + ii[row+4];
3379           sum1 = b[row];
3380           sum2 = b[row+1];
3381           sum3 = b[row+2];
3382           sum4 = b[row+3];
3383           sum5 = b[row+4];
3384           for (n = 0; n<sz-1; n+=2) {
3385             i1    = idx[0];
3386             i2    = idx[1];
3387             idx  += 2;
3388             tmp0  = x[i1];
3389             tmp1  = x[i2];
3390             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3391             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3392             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3393             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3394             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3395           }
3396           if (n == sz-1) {
3397             tmp0  = x[*idx++];
3398             sum1 -= v1[0] * tmp0;
3399             sum2 -= v2[0] * tmp0;
3400             sum3 -= v3[0] * tmp0;
3401             sum4 -= v4[0] * tmp0;
3402             sum5 -= v5[0] * tmp0;
3403             v1++; v2++; v3++; v4++; v5++;
3404           }
3405           t[row]   = sum1;
3406           t[row+1] = sum2;
3407           t[row+2] = sum3;
3408           t[row+3] = sum4;
3409           t[row+4] = sum5;
3410           sz      = ii[row+1] - diag[row] - 5;
3411           idx     = a->j + diag[row] + 5;
3412           v1 += 5;
3413           v2 += 5;
3414           v3 += 5;
3415           v4 += 5;
3416           v5 += 5;
3417           for (n = 0; n<sz-1; n+=2) {
3418             i1    = idx[0];
3419             i2    = idx[1];
3420             idx  += 2;
3421             tmp0  = x[i1];
3422             tmp1  = x[i2];
3423             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3424             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3425             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3426             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3427             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3428           }
3429           if (n == sz-1) {
3430             tmp0  = x[*idx];
3431             sum1 -= v1[0] * tmp0;
3432             sum2 -= v2[0] * tmp0;
3433             sum3 -= v3[0] * tmp0;
3434             sum4 -= v4[0] * tmp0;
3435             sum5 -= v5[0] * tmp0;
3436           }
3437           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3438           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3439           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3440           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3441           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3442           break;
3443         default:
3444           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3445         }
3446       }
3447       xb   = t;
3448       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3449     } else xb = b;
3450 
3451     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3452 
3453       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3454       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3455         ibdiag -= sizes[i]*sizes[i];
3456 
3457         /* set RHS */
3458         if (xb == b) {
3459           /* whole (old way) */
3460           sz      = ii[row+1] - ii[row];
3461           idx     = a->j + ii[row];
3462           switch (sizes[i]) {
3463           case 5:
3464             v5      = a->a + ii[row-4];
3465           case 4: /* fall through */
3466             v4      = a->a + ii[row-3];
3467           case 3:
3468             v3      = a->a + ii[row-2];
3469           case 2:
3470             v2      = a->a + ii[row-1];
3471           case 1:
3472             v1      = a->a + ii[row];
3473             break;
3474           default:
3475             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3476           }
3477         } else {
3478           /* upper, no diag */
3479           sz      = ii[row+1] - diag[row] - 1;
3480           idx     = a->j + diag[row] + 1;
3481           switch (sizes[i]) {
3482           case 5:
3483             v5      = a->a + diag[row-4] + 5;
3484           case 4: /* fall through */
3485             v4      = a->a + diag[row-3] + 4;
3486           case 3:
3487             v3      = a->a + diag[row-2] + 3;
3488           case 2:
3489             v2      = a->a + diag[row-1] + 2;
3490           case 1:
3491             v1      = a->a + diag[row] + 1;
3492           }
3493         }
3494         /* set sum */
3495         switch (sizes[i]) {
3496         case 5:
3497           sum5 = xb[row-4];
3498         case 4: /* fall through */
3499           sum4 = xb[row-3];
3500         case 3:
3501           sum3 = xb[row-2];
3502         case 2:
3503           sum2 = xb[row-1];
3504         case 1:
3505           /* note that sum1 is associated with the last row */
3506           sum1 = xb[row];
3507         }
3508         /* do sums */
3509         for (n = 0; n<sz-1; n+=2) {
3510             i1    = idx[0];
3511             i2    = idx[1];
3512             idx  += 2;
3513             tmp0  = x[i1];
3514             tmp1  = x[i2];
3515             switch (sizes[i]) {
3516             case 5:
3517               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3518             case 4: /* fall through */
3519               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3520             case 3:
3521               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3522             case 2:
3523               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3524             case 1:
3525               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3526             }
3527         }
3528         /* ragged edge */
3529         if (n == sz-1) {
3530           tmp0  = x[*idx];
3531           switch (sizes[i]) {
3532           case 5:
3533             sum5 -= *v5*tmp0;
3534           case 4: /* fall through */
3535             sum4 -= *v4*tmp0;
3536           case 3:
3537             sum3 -= *v3*tmp0;
3538           case 2:
3539             sum2 -= *v2*tmp0;
3540           case 1:
3541             sum1 -= *v1*tmp0;
3542           }
3543         }
3544         /* update */
3545         if (xb == b) {
3546           /* whole (old way) w/ diag */
3547           switch (sizes[i]) {
3548           case 5:
3549             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3550             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3551             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3552             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3553             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3554             break;
3555           case 4:
3556             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3557             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3558             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3559             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3560             break;
3561           case 3:
3562             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3563             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3564             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3565             break;
3566           case 2:
3567             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3568             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3569             break;
3570           case 1:
3571             x[row--] += sum1*(*ibdiag);
3572             break;
3573           }
3574         } else {
3575           /* no diag so set =  */
3576           switch (sizes[i]) {
3577           case 5:
3578             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3579             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3580             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3581             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3582             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3583             break;
3584           case 4:
3585             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3586             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3587             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3588             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3589             break;
3590           case 3:
3591             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3592             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3593             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3594             break;
3595           case 2:
3596             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3597             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3598             break;
3599           case 1:
3600             x[row--] = sum1*(*ibdiag);
3601             break;
3602           }
3603         }
3604       }
3605       if (xb == b) {
3606         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3607       } else {
3608         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3609       }
3610     }
3611   }
3612   if (flag & SOR_EISENSTAT) {
3613     /*
3614           Apply  (U + D)^-1  where D is now the block diagonal
3615     */
3616     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3617     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3618       ibdiag -= sizes[i]*sizes[i];
3619       sz      = ii[row+1] - diag[row] - 1;
3620       v1      = a->a + diag[row] + 1;
3621       idx     = a->j + diag[row] + 1;
3622       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3623       switch (sizes[i]) {
3624       case 1:
3625 
3626         sum1 = b[row];
3627         for (n = 0; n<sz-1; n+=2) {
3628           i1    = idx[0];
3629           i2    = idx[1];
3630           idx  += 2;
3631           tmp0  = x[i1];
3632           tmp1  = x[i2];
3633           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3634         }
3635 
3636         if (n == sz-1) {
3637           tmp0  = x[*idx];
3638           sum1 -= *v1*tmp0;
3639         }
3640         x[row] = sum1*(*ibdiag);row--;
3641         break;
3642 
3643       case 2:
3644 
3645         sum1 = b[row];
3646         sum2 = b[row-1];
3647         /* note that sum1 is associated with the second of the two rows */
3648         v2 = a->a + diag[row-1] + 2;
3649         for (n = 0; n<sz-1; n+=2) {
3650           i1    = idx[0];
3651           i2    = idx[1];
3652           idx  += 2;
3653           tmp0  = x[i1];
3654           tmp1  = x[i2];
3655           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3656           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3657         }
3658 
3659         if (n == sz-1) {
3660           tmp0  = x[*idx];
3661           sum1 -= *v1*tmp0;
3662           sum2 -= *v2*tmp0;
3663         }
3664         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3665         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3666         row     -= 2;
3667         break;
3668       case 3:
3669 
3670         sum1 = b[row];
3671         sum2 = b[row-1];
3672         sum3 = b[row-2];
3673         v2   = a->a + diag[row-1] + 2;
3674         v3   = a->a + diag[row-2] + 3;
3675         for (n = 0; n<sz-1; n+=2) {
3676           i1    = idx[0];
3677           i2    = idx[1];
3678           idx  += 2;
3679           tmp0  = x[i1];
3680           tmp1  = x[i2];
3681           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3682           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3683           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3684         }
3685 
3686         if (n == sz-1) {
3687           tmp0  = x[*idx];
3688           sum1 -= *v1*tmp0;
3689           sum2 -= *v2*tmp0;
3690           sum3 -= *v3*tmp0;
3691         }
3692         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3693         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3694         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3695         row     -= 3;
3696         break;
3697       case 4:
3698 
3699         sum1 = b[row];
3700         sum2 = b[row-1];
3701         sum3 = b[row-2];
3702         sum4 = b[row-3];
3703         v2   = a->a + diag[row-1] + 2;
3704         v3   = a->a + diag[row-2] + 3;
3705         v4   = a->a + diag[row-3] + 4;
3706         for (n = 0; n<sz-1; n+=2) {
3707           i1    = idx[0];
3708           i2    = idx[1];
3709           idx  += 2;
3710           tmp0  = x[i1];
3711           tmp1  = x[i2];
3712           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3713           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3714           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3715           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3716         }
3717 
3718         if (n == sz-1) {
3719           tmp0  = x[*idx];
3720           sum1 -= *v1*tmp0;
3721           sum2 -= *v2*tmp0;
3722           sum3 -= *v3*tmp0;
3723           sum4 -= *v4*tmp0;
3724         }
3725         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3726         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3727         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3728         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3729         row     -= 4;
3730         break;
3731       case 5:
3732 
3733         sum1 = b[row];
3734         sum2 = b[row-1];
3735         sum3 = b[row-2];
3736         sum4 = b[row-3];
3737         sum5 = b[row-4];
3738         v2   = a->a + diag[row-1] + 2;
3739         v3   = a->a + diag[row-2] + 3;
3740         v4   = a->a + diag[row-3] + 4;
3741         v5   = a->a + diag[row-4] + 5;
3742         for (n = 0; n<sz-1; n+=2) {
3743           i1    = idx[0];
3744           i2    = idx[1];
3745           idx  += 2;
3746           tmp0  = x[i1];
3747           tmp1  = x[i2];
3748           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3749           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3750           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3751           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3752           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3753         }
3754 
3755         if (n == sz-1) {
3756           tmp0  = x[*idx];
3757           sum1 -= *v1*tmp0;
3758           sum2 -= *v2*tmp0;
3759           sum3 -= *v3*tmp0;
3760           sum4 -= *v4*tmp0;
3761           sum5 -= *v5*tmp0;
3762         }
3763         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3764         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3765         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3766         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3767         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3768         row     -= 5;
3769         break;
3770       default:
3771         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3772       }
3773     }
3774     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3775 
3776     /*
3777            t = b - D x    where D is the block diagonal
3778     */
3779     cnt = 0;
3780     for (i=0, row=0; i<m; i++) {
3781       switch (sizes[i]) {
3782       case 1:
3783         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3784         break;
3785       case 2:
3786         x1       = x[row]; x2 = x[row+1];
3787         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3788         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3789         t[row]   = b[row] - tmp1;
3790         t[row+1] = b[row+1] - tmp2; row += 2;
3791         cnt     += 4;
3792         break;
3793       case 3:
3794         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3795         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3796         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3797         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3798         t[row]   = b[row] - tmp1;
3799         t[row+1] = b[row+1] - tmp2;
3800         t[row+2] = b[row+2] - tmp3; row += 3;
3801         cnt     += 9;
3802         break;
3803       case 4:
3804         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3805         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3806         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3807         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3808         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3809         t[row]   = b[row] - tmp1;
3810         t[row+1] = b[row+1] - tmp2;
3811         t[row+2] = b[row+2] - tmp3;
3812         t[row+3] = b[row+3] - tmp4; row += 4;
3813         cnt     += 16;
3814         break;
3815       case 5:
3816         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3817         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3818         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3819         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3820         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3821         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3822         t[row]   = b[row] - tmp1;
3823         t[row+1] = b[row+1] - tmp2;
3824         t[row+2] = b[row+2] - tmp3;
3825         t[row+3] = b[row+3] - tmp4;
3826         t[row+4] = b[row+4] - tmp5;row += 5;
3827         cnt     += 25;
3828         break;
3829       default:
3830         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3831       }
3832     }
3833     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3834 
3835 
3836 
3837     /*
3838           Apply (L + D)^-1 where D is the block diagonal
3839     */
3840     for (i=0, row=0; i<m; i++) {
3841       sz  = diag[row] - ii[row];
3842       v1  = a->a + ii[row];
3843       idx = a->j + ii[row];
3844       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3845       switch (sizes[i]) {
3846       case 1:
3847 
3848         sum1 = t[row];
3849         for (n = 0; n<sz-1; n+=2) {
3850           i1    = idx[0];
3851           i2    = idx[1];
3852           idx  += 2;
3853           tmp0  = t[i1];
3854           tmp1  = t[i2];
3855           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3856         }
3857 
3858         if (n == sz-1) {
3859           tmp0  = t[*idx];
3860           sum1 -= *v1 * tmp0;
3861         }
3862         x[row] += t[row] = sum1*(*ibdiag++); row++;
3863         break;
3864       case 2:
3865         v2   = a->a + ii[row+1];
3866         sum1 = t[row];
3867         sum2 = t[row+1];
3868         for (n = 0; n<sz-1; n+=2) {
3869           i1    = idx[0];
3870           i2    = idx[1];
3871           idx  += 2;
3872           tmp0  = t[i1];
3873           tmp1  = t[i2];
3874           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3875           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3876         }
3877 
3878         if (n == sz-1) {
3879           tmp0  = t[*idx];
3880           sum1 -= v1[0] * tmp0;
3881           sum2 -= v2[0] * tmp0;
3882         }
3883         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3884         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3885         ibdiag   += 4; row += 2;
3886         break;
3887       case 3:
3888         v2   = a->a + ii[row+1];
3889         v3   = a->a + ii[row+2];
3890         sum1 = t[row];
3891         sum2 = t[row+1];
3892         sum3 = t[row+2];
3893         for (n = 0; n<sz-1; n+=2) {
3894           i1    = idx[0];
3895           i2    = idx[1];
3896           idx  += 2;
3897           tmp0  = t[i1];
3898           tmp1  = t[i2];
3899           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3900           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3901           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3902         }
3903 
3904         if (n == sz-1) {
3905           tmp0  = t[*idx];
3906           sum1 -= v1[0] * tmp0;
3907           sum2 -= v2[0] * tmp0;
3908           sum3 -= v3[0] * tmp0;
3909         }
3910         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3911         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3912         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3913         ibdiag   += 9; row += 3;
3914         break;
3915       case 4:
3916         v2   = a->a + ii[row+1];
3917         v3   = a->a + ii[row+2];
3918         v4   = a->a + ii[row+3];
3919         sum1 = t[row];
3920         sum2 = t[row+1];
3921         sum3 = t[row+2];
3922         sum4 = t[row+3];
3923         for (n = 0; n<sz-1; n+=2) {
3924           i1    = idx[0];
3925           i2    = idx[1];
3926           idx  += 2;
3927           tmp0  = t[i1];
3928           tmp1  = t[i2];
3929           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3930           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3931           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3932           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3933         }
3934 
3935         if (n == sz-1) {
3936           tmp0  = t[*idx];
3937           sum1 -= v1[0] * tmp0;
3938           sum2 -= v2[0] * tmp0;
3939           sum3 -= v3[0] * tmp0;
3940           sum4 -= v4[0] * tmp0;
3941         }
3942         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3943         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3944         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3945         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3946         ibdiag   += 16; row += 4;
3947         break;
3948       case 5:
3949         v2   = a->a + ii[row+1];
3950         v3   = a->a + ii[row+2];
3951         v4   = a->a + ii[row+3];
3952         v5   = a->a + ii[row+4];
3953         sum1 = t[row];
3954         sum2 = t[row+1];
3955         sum3 = t[row+2];
3956         sum4 = t[row+3];
3957         sum5 = t[row+4];
3958         for (n = 0; n<sz-1; n+=2) {
3959           i1    = idx[0];
3960           i2    = idx[1];
3961           idx  += 2;
3962           tmp0  = t[i1];
3963           tmp1  = t[i2];
3964           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3965           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3966           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3967           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3968           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3969         }
3970 
3971         if (n == sz-1) {
3972           tmp0  = t[*idx];
3973           sum1 -= v1[0] * tmp0;
3974           sum2 -= v2[0] * tmp0;
3975           sum3 -= v3[0] * tmp0;
3976           sum4 -= v4[0] * tmp0;
3977           sum5 -= v5[0] * tmp0;
3978         }
3979         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3980         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3981         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3982         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3983         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3984         ibdiag   += 25; row += 5;
3985         break;
3986       default:
3987         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3988       }
3989     }
3990     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3991   }
3992   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3993   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3994   PetscFunctionReturn(0);
3995 }
3996 
3997 #undef __FUNCT__
3998 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3999 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4000 {
4001   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4002   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4003   const MatScalar   *bdiag = a->inode.bdiag;
4004   const PetscScalar *b;
4005   PetscErrorCode    ierr;
4006   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4007   const PetscInt    *sizes = a->inode.size;
4008 
4009   PetscFunctionBegin;
4010   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4011   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4012   cnt  = 0;
4013   for (i=0, row=0; i<m; i++) {
4014     switch (sizes[i]) {
4015     case 1:
4016       x[row] = b[row]*bdiag[cnt++];row++;
4017       break;
4018     case 2:
4019       x1       = b[row]; x2 = b[row+1];
4020       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4021       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4022       x[row++] = tmp1;
4023       x[row++] = tmp2;
4024       cnt     += 4;
4025       break;
4026     case 3:
4027       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4028       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4029       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4030       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4031       x[row++] = tmp1;
4032       x[row++] = tmp2;
4033       x[row++] = tmp3;
4034       cnt     += 9;
4035       break;
4036     case 4:
4037       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4038       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4039       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4040       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4041       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4042       x[row++] = tmp1;
4043       x[row++] = tmp2;
4044       x[row++] = tmp3;
4045       x[row++] = tmp4;
4046       cnt     += 16;
4047       break;
4048     case 5:
4049       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4050       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4051       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4052       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4053       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4054       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4055       x[row++] = tmp1;
4056       x[row++] = tmp2;
4057       x[row++] = tmp3;
4058       x[row++] = tmp4;
4059       x[row++] = tmp5;
4060       cnt     += 25;
4061       break;
4062     default:
4063       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4064     }
4065   }
4066   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4067   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4068   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4069   PetscFunctionReturn(0);
4070 }
4071 
4072 /*
4073     samestructure indicates that the matrix has not changed its nonzero structure so we
4074     do not need to recompute the inodes
4075 */
4076 #undef __FUNCT__
4077 #define __FUNCT__ "Mat_CheckInode"
4078 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure)
4079 {
4080   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4081   PetscErrorCode ierr;
4082   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4083   PetscBool      flag;
4084   const PetscInt *idx,*idy,*ii;
4085 
4086   PetscFunctionBegin;
4087   if (!a->inode.use) PetscFunctionReturn(0);
4088   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
4089 
4090 
4091   m = A->rmap->n;
4092   if (a->inode.size) ns = a->inode.size;
4093   else {
4094     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4095   }
4096 
4097   i          = 0;
4098   node_count = 0;
4099   idx        = a->j;
4100   ii         = a->i;
4101   while (i < m) {                /* For each row */
4102     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4103     /* Limits the number of elements in a node to 'a->inode.limit' */
4104     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4105       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4106       if (nzy != nzx) break;
4107       idy += nzx;              /* Same nonzero pattern */
4108       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4109       if (!flag) break;
4110     }
4111     ns[node_count++] = blk_size;
4112     idx             += blk_size*nzx;
4113     i                = j;
4114   }
4115   /* If not enough inodes found,, do not use inode version of the routines */
4116   if (!m || node_count > .8*m) {
4117     ierr = PetscFree(ns);CHKERRQ(ierr);
4118 
4119     a->inode.node_count       = 0;
4120     a->inode.size             = NULL;
4121     a->inode.use              = PETSC_FALSE;
4122     A->ops->mult              = MatMult_SeqAIJ;
4123     A->ops->sor               = MatSOR_SeqAIJ;
4124     A->ops->multadd           = MatMultAdd_SeqAIJ;
4125     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4126     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4127     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4128     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4129     A->ops->coloringpatch     = 0;
4130     A->ops->multdiagonalblock = 0;
4131 
4132     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4133   } else {
4134     if (!A->factortype) {
4135       A->ops->mult              = MatMult_SeqAIJ_Inode;
4136       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4137       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4138       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4139       if (A->rmap->n == A->cmap->n) {
4140         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4141         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4142         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4143         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4144         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4145       }
4146     } else {
4147       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4148     }
4149     a->inode.node_count = node_count;
4150     a->inode.size       = ns;
4151     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4152   }
4153   a->inode.checked = PETSC_TRUE;
4154   PetscFunctionReturn(0);
4155 }
4156 
4157 #undef __FUNCT__
4158 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4159 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4160 {
4161   Mat            B =*C;
4162   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4163   PetscErrorCode ierr;
4164   PetscInt       m=A->rmap->n;
4165 
4166   PetscFunctionBegin;
4167   c->inode.use       = a->inode.use;
4168   c->inode.limit     = a->inode.limit;
4169   c->inode.max_limit = a->inode.max_limit;
4170   if (a->inode.size) {
4171     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4172     c->inode.node_count = a->inode.node_count;
4173     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4174     /* note the table of functions below should match that in Mat_CheckInode() */
4175     if (!B->factortype) {
4176       B->ops->mult              = MatMult_SeqAIJ_Inode;
4177       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4178       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4179       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4180       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4181       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4182       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4183       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4184       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4185     } else {
4186       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4187     }
4188   } else {
4189     c->inode.size       = 0;
4190     c->inode.node_count = 0;
4191   }
4192   c->inode.ibdiagvalid = PETSC_FALSE;
4193   c->inode.ibdiag      = 0;
4194   c->inode.bdiag       = 0;
4195   PetscFunctionReturn(0);
4196 }
4197 
4198 #undef __FUNCT__
4199 #define __FUNCT__ "MatGetRow_FactoredLU"
4200 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4201 {
4202   PetscInt       k;
4203   const PetscInt *vi;
4204 
4205   PetscFunctionBegin;
4206   vi = aj + ai[row];
4207   for (k=0; k<nzl; k++) cols[k] = vi[k];
4208   vi        = aj + adiag[row];
4209   cols[nzl] = vi[0];
4210   vi        = aj + adiag[row+1]+1;
4211   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4212   PetscFunctionReturn(0);
4213 }
4214 /*
4215    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4216    Modified from Mat_CheckInode().
4217 
4218    Input Parameters:
4219 .  Mat A - ILU or LU matrix factor
4220 
4221 */
4222 #undef __FUNCT__
4223 #define __FUNCT__ "Mat_CheckInode_FactorLU"
4224 PetscErrorCode Mat_CheckInode_FactorLU(Mat A)
4225 {
4226   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4227   PetscErrorCode ierr;
4228   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4229   PetscInt       *cols1,*cols2,*ns;
4230   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4231   PetscBool      flag;
4232 
4233   PetscFunctionBegin;
4234   if (!a->inode.use)    PetscFunctionReturn(0);
4235   if (a->inode.checked) PetscFunctionReturn(0);
4236 
4237   m = A->rmap->n;
4238   if (a->inode.size) ns = a->inode.size;
4239   else {
4240     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4241   }
4242 
4243   i          = 0;
4244   node_count = 0;
4245   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4246   while (i < m) {                /* For each row */
4247     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4248     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4249     nzx  = nzl1 + nzu1 + 1;
4250     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4251 
4252     /* Limits the number of elements in a node to 'a->inode.limit' */
4253     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4254       nzl2 = ai[j+1] - ai[j];
4255       nzu2 = adiag[j] - adiag[j+1] - 1;
4256       nzy  = nzl2 + nzu2 + 1;
4257       if (nzy != nzx) break;
4258       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4259       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4260       if (!flag) break;
4261     }
4262     ns[node_count++] = blk_size;
4263     i                = j;
4264   }
4265   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4266   /* If not enough inodes found,, do not use inode version of the routines */
4267   if (!m || node_count > .8*m) {
4268     ierr = PetscFree(ns);CHKERRQ(ierr);
4269 
4270     a->inode.node_count = 0;
4271     a->inode.size       = NULL;
4272     a->inode.use        = PETSC_FALSE;
4273 
4274     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4275   } else {
4276     A->ops->mult              = 0;
4277     A->ops->sor               = 0;
4278     A->ops->multadd           = 0;
4279     A->ops->getrowij          = 0;
4280     A->ops->restorerowij      = 0;
4281     A->ops->getcolumnij       = 0;
4282     A->ops->restorecolumnij   = 0;
4283     A->ops->coloringpatch     = 0;
4284     A->ops->multdiagonalblock = 0;
4285     a->inode.node_count       = node_count;
4286     a->inode.size             = ns;
4287 
4288     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4289   }
4290   a->inode.checked = PETSC_TRUE;
4291   PetscFunctionReturn(0);
4292 }
4293 
4294 #undef __FUNCT__
4295 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4296 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4297 {
4298   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4299 
4300   PetscFunctionBegin;
4301   a->inode.ibdiagvalid = PETSC_FALSE;
4302   PetscFunctionReturn(0);
4303 }
4304 
4305 /*
4306      This is really ugly. if inodes are used this replaces the
4307   permutations with ones that correspond to rows/cols of the matrix
4308   rather then inode blocks
4309 */
4310 #undef __FUNCT__
4311 #define __FUNCT__ "MatInodeAdjustForInodes"
4312 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4313 {
4314   PetscErrorCode ierr;
4315 
4316   PetscFunctionBegin;
4317   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4318   PetscFunctionReturn(0);
4319 }
4320 
4321 #undef __FUNCT__
4322 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4323 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4324 {
4325   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4326   PetscErrorCode ierr;
4327   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4328   const PetscInt *ridx,*cidx;
4329   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4330   PetscInt       nslim_col,*ns_col;
4331   IS             ris = *rperm,cis = *cperm;
4332 
4333   PetscFunctionBegin;
4334   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4335   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4336 
4337   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4338   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4339   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4340 
4341   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4342   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4343 
4344   /* Form the inode structure for the rows of permuted matric using inv perm*/
4345   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4346 
4347   /* Construct the permutations for rows*/
4348   for (i=0,row = 0; i<nslim_row; ++i) {
4349     indx      = ridx[i];
4350     start_val = tns[indx];
4351     end_val   = tns[indx + 1];
4352     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4353   }
4354 
4355   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4356   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4357 
4358   /* Construct permutations for columns */
4359   for (i=0,col=0; i<nslim_col; ++i) {
4360     indx      = cidx[i];
4361     start_val = tns[indx];
4362     end_val   = tns[indx + 1];
4363     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4364   }
4365 
4366   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4367   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4368   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4369   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4370 
4371   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4372   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4373 
4374   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4375   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4376   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4377   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4378   ierr = PetscFree(tns);CHKERRQ(ierr);
4379   PetscFunctionReturn(0);
4380 }
4381 
4382 #undef __FUNCT__
4383 #define __FUNCT__ "MatInodeGetInodeSizes"
4384 /*@C
4385    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4386 
4387    Not Collective
4388 
4389    Input Parameter:
4390 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4391 
4392    Output Parameter:
4393 +  node_count - no of inodes present in the matrix.
4394 .  sizes      - an array of size node_count,with sizes of each inode.
4395 -  limit      - the max size used to generate the inodes.
4396 
4397    Level: advanced
4398 
4399    Notes: This routine returns some internal storage information
4400    of the matrix, it is intended to be used by advanced users.
4401    It should be called after the matrix is assembled.
4402    The contents of the sizes[] array should not be changed.
4403    NULL may be passed for information not requested.
4404 
4405 .keywords: matrix, seqaij, get, inode
4406 
4407 .seealso: MatGetInfo()
4408 @*/
4409 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4410 {
4411   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4412 
4413   PetscFunctionBegin;
4414   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4415   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4416   if (f) {
4417     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4418   }
4419   PetscFunctionReturn(0);
4420 }
4421 
4422 #undef __FUNCT__
4423 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4424 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4425 {
4426   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4427 
4428   PetscFunctionBegin;
4429   if (node_count) *node_count = a->inode.node_count;
4430   if (sizes)      *sizes      = a->inode.size;
4431   if (limit)      *limit      = a->inode.limit;
4432   PetscFunctionReturn(0);
4433 }
4434