xref: /petsc/src/mat/impls/aij/seq/inode.c (revision e6e75211d226c622f451867f53ce5d558649ff4f)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1) {
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     col  = *j++ + ishift;
101     i2   = tvc[col];
102     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103       ia[i1+1]++;
104       ia[i2+1]++;
105       i2++;                     /* Start col of next node */
106       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107       i2 = tvc[col];
108     }
109     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110   }
111 
112   /* shift ia[i] to point to next row */
113   for (i1=1; i1<nslim_row+1; i1++) {
114     row        = ia[i1-1];
115     ia[i1]    += row;
116     work[i1-1] = row - oshift;
117   }
118 
119   /* allocate space for column pointers */
120   nz   = ia[nslim_row] + (!ishift);
121   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
122   *jja = ja;
123 
124   /* loop over lower triangular part putting into ja */
125   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126     j    = aj + ai[row] + ishift;
127     jmax = aj + ai[row+1] + ishift;
128     col  = *j++ + ishift;
129     i2   = tvc[col];
130     while (i2<i1 && j<jmax) {
131       ja[work[i2]++] = i1 + oshift;
132       ja[work[i1]++] = i2 + oshift;
133       ++i2;
134       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135       i2 = tvc[col];
136     }
137     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138 
139   }
140   ierr = PetscFree(work);CHKERRQ(ierr);
141   ierr = PetscFree(tns);CHKERRQ(ierr);
142   ierr = PetscFree(tvc);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*
147       This builds nonsymmetric version of nonzero structure,
148 */
149 #undef __FUNCT__
150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152 {
153   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
154   PetscErrorCode ierr;
155   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156   PetscInt       *tns,*tvc,nsz,i1,i2;
157   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
158 
159   PetscFunctionBegin;
160   nslim_row = a->inode.node_count;
161   n         = A->cmap->n;
162 
163   /* Create The column_inode for this matrix */
164   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
165 
166   /* allocate space for reformated column_inode structure */
167   ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr);
168   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
169   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
170 
171   for (i1=0,col=0; i1<nslim_col; ++i1) {
172     nsz = ns_col[i1];
173     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174   }
175   /* allocate space for row pointers */
176   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
177   *iia = ia;
178   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
179   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
180 
181   /* determine the number of columns in each row */
182   ia[0] = oshift;
183   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184     j   = aj + ai[row] + ishift;
185     col = *j++ + ishift;
186     i2  = tvc[col];
187     nz  = ai[row+1] - ai[row];
188     while (nz-- > 0) {           /* off-diagonal elemets */
189       ia[i1+1]++;
190       i2++;                     /* Start col of next node */
191       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192       if (nz > 0) i2 = tvc[col];
193     }
194   }
195 
196   /* shift ia[i] to point to next row */
197   for (i1=1; i1<nslim_row+1; i1++) {
198     row        = ia[i1-1];
199     ia[i1]    += row;
200     work[i1-1] = row - oshift;
201   }
202 
203   /* allocate space for column pointers */
204   nz   = ia[nslim_row] + (!ishift);
205   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
206   *jja = ja;
207 
208   /* loop over matrix putting into ja */
209   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210     j   = aj + ai[row] + ishift;
211     col = *j++ + ishift;
212     i2  = tvc[col];
213     nz  = ai[row+1] - ai[row];
214     while (nz-- > 0) {
215       ja[work[i1]++] = i2 + oshift;
216       ++i2;
217       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218       if (nz > 0) i2 = tvc[col];
219     }
220   }
221   ierr = PetscFree(ns_col);CHKERRQ(ierr);
222   ierr = PetscFree(work);CHKERRQ(ierr);
223   ierr = PetscFree(tns);CHKERRQ(ierr);
224   ierr = PetscFree(tvc);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231 {
232   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   *n = a->inode.node_count;
237   if (!ia) PetscFunctionReturn(0);
238   if (!blockcompressed) {
239     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
240   } else if (symmetric) {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   } else {
243     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   if (!ia) PetscFunctionReturn(0);
256 
257   if (!blockcompressed) {
258     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
259   } else {
260     ierr = PetscFree(*ia);CHKERRQ(ierr);
261     ierr = PetscFree(*ja);CHKERRQ(ierr);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /* ----------------------------------------------------------- */
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271 {
272   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
273   PetscErrorCode ierr;
274   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276 
277   PetscFunctionBegin;
278   nslim_row = a->inode.node_count;
279   n         = A->cmap->n;
280 
281   /* Create The column_inode for this matrix */
282   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
283 
284   /* allocate space for reformated column_inode structure */
285   ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr);
286   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
287   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
288 
289   for (i1=0,col=0; i1<nslim_col; ++i1) {
290     nsz = ns_col[i1];
291     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292   }
293   /* allocate space for column pointers */
294   ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
295   *iia = ia;
296   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
297   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
298 
299   /* determine the number of columns in each row */
300   ia[0] = oshift;
301   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302     j   = aj + ai[row] + ishift;
303     col = *j++ + ishift;
304     i2  = tvc[col];
305     nz  = ai[row+1] - ai[row];
306     while (nz-- > 0) {           /* off-diagonal elemets */
307       /* ia[i1+1]++; */
308       ia[i2+1]++;
309       i2++;
310       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311       if (nz > 0) i2 = tvc[col];
312     }
313   }
314 
315   /* shift ia[i] to point to next col */
316   for (i1=1; i1<nslim_col+1; i1++) {
317     col        = ia[i1-1];
318     ia[i1]    += col;
319     work[i1-1] = col - oshift;
320   }
321 
322   /* allocate space for column pointers */
323   nz   = ia[nslim_col] + (!ishift);
324   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
325   *jja = ja;
326 
327   /* loop over matrix putting into ja */
328   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329     j   = aj + ai[row] + ishift;
330     col = *j++ + ishift;
331     i2  = tvc[col];
332     nz  = ai[row+1] - ai[row];
333     while (nz-- > 0) {
334       /* ja[work[i1]++] = i2 + oshift; */
335       ja[work[i2]++] = i1 + oshift;
336       i2++;
337       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338       if (nz > 0) i2 = tvc[col];
339     }
340   }
341   ierr = PetscFree(ns_col);CHKERRQ(ierr);
342   ierr = PetscFree(work);CHKERRQ(ierr);
343   ierr = PetscFree(tns);CHKERRQ(ierr);
344   ierr = PetscFree(tvc);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
356   if (!ia) PetscFunctionReturn(0);
357 
358   if (!blockcompressed) {
359     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
360   } else if (symmetric) {
361     /* Since the indices are symmetric it does'nt matter */
362     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   } else {
364     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
365   }
366   PetscFunctionReturn(0);
367 }
368 
369 #undef __FUNCT__
370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   if (!ia) PetscFunctionReturn(0);
377   if (!blockcompressed) {
378     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
379   } else {
380     ierr = PetscFree(*ia);CHKERRQ(ierr);
381     ierr = PetscFree(*ja);CHKERRQ(ierr);
382   }
383   PetscFunctionReturn(0);
384 }
385 
386 /* ----------------------------------------------------------- */
387 
388 #undef __FUNCT__
389 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391 {
392   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394   PetscScalar       *y;
395   const PetscScalar *x;
396   const MatScalar   *v1,*v2,*v3,*v4,*v5;
397   PetscErrorCode    ierr;
398   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399   const PetscInt    *idx,*ns,*ii;
400 
401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403 #endif
404 
405   PetscFunctionBegin;
406   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407   node_max = a->inode.node_count;
408   ns       = a->inode.size;     /* Node Size array */
409   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
411   idx      = a->j;
412   v1       = a->a;
413   ii       = a->i;
414 
415   for (i = 0,row = 0; i< node_max; ++i) {
416     nsz         = ns[i];
417     n           = ii[1] - ii[0];
418     nonzerorow += (n>0)*nsz;
419     ii         += nsz;
420     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422     sz = n;                     /* No of non zeros in this row */
423                                 /* Switch on the size of Node */
424     switch (nsz) {               /* Each loop in 'case' is unrolled */
425     case 1:
426       sum1 = 0.;
427 
428       for (n = 0; n< sz-1; n+=2) {
429         i1    = idx[0];         /* The instructions are ordered to */
430         i2    = idx[1];         /* make the compiler's job easy */
431         idx  += 2;
432         tmp0  = x[i1];
433         tmp1  = x[i2];
434         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435       }
436 
437       if (n == sz-1) {          /* Take care of the last nonzero  */
438         tmp0  = x[*idx++];
439         sum1 += *v1++ *tmp0;
440       }
441       y[row++]=sum1;
442       break;
443     case 2:
444       sum1 = 0.;
445       sum2 = 0.;
446       v2   = v1 + n;
447 
448       for (n = 0; n< sz-1; n+=2) {
449         i1    = idx[0];
450         i2    = idx[1];
451         idx  += 2;
452         tmp0  = x[i1];
453         tmp1  = x[i2];
454         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456       }
457       if (n == sz-1) {
458         tmp0  = x[*idx++];
459         sum1 += *v1++ * tmp0;
460         sum2 += *v2++ * tmp0;
461       }
462       y[row++]=sum1;
463       y[row++]=sum2;
464       v1      =v2;              /* Since the next block to be processed starts there*/
465       idx    +=sz;
466       break;
467     case 3:
468       sum1 = 0.;
469       sum2 = 0.;
470       sum3 = 0.;
471       v2   = v1 + n;
472       v3   = v2 + n;
473 
474       for (n = 0; n< sz-1; n+=2) {
475         i1    = idx[0];
476         i2    = idx[1];
477         idx  += 2;
478         tmp0  = x[i1];
479         tmp1  = x[i2];
480         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483       }
484       if (n == sz-1) {
485         tmp0  = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489       }
490       y[row++]=sum1;
491       y[row++]=sum2;
492       y[row++]=sum3;
493       v1      =v3;              /* Since the next block to be processed starts there*/
494       idx    +=2*sz;
495       break;
496     case 4:
497       sum1 = 0.;
498       sum2 = 0.;
499       sum3 = 0.;
500       sum4 = 0.;
501       v2   = v1 + n;
502       v3   = v2 + n;
503       v4   = v3 + n;
504 
505       for (n = 0; n< sz-1; n+=2) {
506         i1    = idx[0];
507         i2    = idx[1];
508         idx  += 2;
509         tmp0  = x[i1];
510         tmp1  = x[i2];
511         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515       }
516       if (n == sz-1) {
517         tmp0  = x[*idx++];
518         sum1 += *v1++ * tmp0;
519         sum2 += *v2++ * tmp0;
520         sum3 += *v3++ * tmp0;
521         sum4 += *v4++ * tmp0;
522       }
523       y[row++]=sum1;
524       y[row++]=sum2;
525       y[row++]=sum3;
526       y[row++]=sum4;
527       v1      =v4;              /* Since the next block to be processed starts there*/
528       idx    +=3*sz;
529       break;
530     case 5:
531       sum1 = 0.;
532       sum2 = 0.;
533       sum3 = 0.;
534       sum4 = 0.;
535       sum5 = 0.;
536       v2   = v1 + n;
537       v3   = v2 + n;
538       v4   = v3 + n;
539       v5   = v4 + n;
540 
541       for (n = 0; n<sz-1; n+=2) {
542         i1    = idx[0];
543         i2    = idx[1];
544         idx  += 2;
545         tmp0  = x[i1];
546         tmp1  = x[i2];
547         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552       }
553       if (n == sz-1) {
554         tmp0  = x[*idx++];
555         sum1 += *v1++ * tmp0;
556         sum2 += *v2++ * tmp0;
557         sum3 += *v3++ * tmp0;
558         sum4 += *v4++ * tmp0;
559         sum5 += *v5++ * tmp0;
560       }
561       y[row++]=sum1;
562       y[row++]=sum2;
563       y[row++]=sum3;
564       y[row++]=sum4;
565       y[row++]=sum5;
566       v1      =v5;       /* Since the next block to be processed starts there */
567       idx    +=4*sz;
568       break;
569     default:
570       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571     }
572   }
573   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
575   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 /* ----------------------------------------------------------- */
579 /* Almost same code as the MatMult_SeqAIJ_Inode() */
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583 {
584   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586   const MatScalar   *v1,*v2,*v3,*v4,*v5;
587   const PetscScalar *x;
588   PetscScalar       *y,*z,*zt;
589   PetscErrorCode    ierr;
590   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
591   const PetscInt    *idx,*ns,*ii;
592 
593   PetscFunctionBegin;
594   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595   node_max = a->inode.node_count;
596   ns       = a->inode.size;     /* Node Size array */
597 
598   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
600   if (zz != yy) {
601     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
602   } else {
603     z = y;
604   }
605   zt = z;
606 
607   idx = a->j;
608   v1  = a->a;
609   ii  = a->i;
610 
611   for (i = 0,row = 0; i< node_max; ++i) {
612     nsz = ns[i];
613     n   = ii[1] - ii[0];
614     ii += nsz;
615     sz  = n;                    /* No of non zeros in this row */
616                                 /* Switch on the size of Node */
617     switch (nsz) {               /* Each loop in 'case' is unrolled */
618     case 1:
619       sum1 = *zt++;
620 
621       for (n = 0; n< sz-1; n+=2) {
622         i1    = idx[0];         /* The instructions are ordered to */
623         i2    = idx[1];         /* make the compiler's job easy */
624         idx  += 2;
625         tmp0  = x[i1];
626         tmp1  = x[i2];
627         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
628       }
629 
630       if (n   == sz-1) {          /* Take care of the last nonzero  */
631         tmp0  = x[*idx++];
632         sum1 += *v1++ * tmp0;
633       }
634       y[row++]=sum1;
635       break;
636     case 2:
637       sum1 = *zt++;
638       sum2 = *zt++;
639       v2   = v1 + n;
640 
641       for (n = 0; n< sz-1; n+=2) {
642         i1    = idx[0];
643         i2    = idx[1];
644         idx  += 2;
645         tmp0  = x[i1];
646         tmp1  = x[i2];
647         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649       }
650       if (n   == sz-1) {
651         tmp0  = x[*idx++];
652         sum1 += *v1++ * tmp0;
653         sum2 += *v2++ * tmp0;
654       }
655       y[row++]=sum1;
656       y[row++]=sum2;
657       v1      =v2;              /* Since the next block to be processed starts there*/
658       idx    +=sz;
659       break;
660     case 3:
661       sum1 = *zt++;
662       sum2 = *zt++;
663       sum3 = *zt++;
664       v2   = v1 + n;
665       v3   = v2 + n;
666 
667       for (n = 0; n< sz-1; n+=2) {
668         i1    = idx[0];
669         i2    = idx[1];
670         idx  += 2;
671         tmp0  = x[i1];
672         tmp1  = x[i2];
673         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
674         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
675         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
676       }
677       if (n == sz-1) {
678         tmp0  = x[*idx++];
679         sum1 += *v1++ * tmp0;
680         sum2 += *v2++ * tmp0;
681         sum3 += *v3++ * tmp0;
682       }
683       y[row++]=sum1;
684       y[row++]=sum2;
685       y[row++]=sum3;
686       v1      =v3;              /* Since the next block to be processed starts there*/
687       idx    +=2*sz;
688       break;
689     case 4:
690       sum1 = *zt++;
691       sum2 = *zt++;
692       sum3 = *zt++;
693       sum4 = *zt++;
694       v2   = v1 + n;
695       v3   = v2 + n;
696       v4   = v3 + n;
697 
698       for (n = 0; n< sz-1; n+=2) {
699         i1    = idx[0];
700         i2    = idx[1];
701         idx  += 2;
702         tmp0  = x[i1];
703         tmp1  = x[i2];
704         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
705         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
706         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
707         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
708       }
709       if (n == sz-1) {
710         tmp0  = x[*idx++];
711         sum1 += *v1++ * tmp0;
712         sum2 += *v2++ * tmp0;
713         sum3 += *v3++ * tmp0;
714         sum4 += *v4++ * tmp0;
715       }
716       y[row++]=sum1;
717       y[row++]=sum2;
718       y[row++]=sum3;
719       y[row++]=sum4;
720       v1      =v4;              /* Since the next block to be processed starts there*/
721       idx    +=3*sz;
722       break;
723     case 5:
724       sum1 = *zt++;
725       sum2 = *zt++;
726       sum3 = *zt++;
727       sum4 = *zt++;
728       sum5 = *zt++;
729       v2   = v1 + n;
730       v3   = v2 + n;
731       v4   = v3 + n;
732       v5   = v4 + n;
733 
734       for (n = 0; n<sz-1; n+=2) {
735         i1    = idx[0];
736         i2    = idx[1];
737         idx  += 2;
738         tmp0  = x[i1];
739         tmp1  = x[i2];
740         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
741         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
742         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
743         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
744         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
745       }
746       if (n   == sz-1) {
747         tmp0  = x[*idx++];
748         sum1 += *v1++ * tmp0;
749         sum2 += *v2++ * tmp0;
750         sum3 += *v3++ * tmp0;
751         sum4 += *v4++ * tmp0;
752         sum5 += *v5++ * tmp0;
753       }
754       y[row++]=sum1;
755       y[row++]=sum2;
756       y[row++]=sum3;
757       y[row++]=sum4;
758       y[row++]=sum5;
759       v1      =v5;       /* Since the next block to be processed starts there */
760       idx    +=4*sz;
761       break;
762     default:
763       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
764     }
765   }
766   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
767   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
768   if (zz != yy) {
769     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
770   }
771   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
772   PetscFunctionReturn(0);
773 }
774 
775 /* ----------------------------------------------------------- */
776 #undef __FUNCT__
777 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
778 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
779 {
780   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
781   IS                iscol = a->col,isrow = a->row;
782   PetscErrorCode    ierr;
783   const PetscInt    *r,*c,*rout,*cout;
784   PetscInt          i,j,n = A->rmap->n,nz;
785   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
786   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
787   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790   const PetscScalar *b;
791 
792   PetscFunctionBegin;
793   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
794   node_max = a->inode.node_count;
795   ns       = a->inode.size;     /* Node Size array */
796 
797   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   tmp  = a->solve_work;
800 
801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
803 
804   /* forward solve the lower triangular */
805   tmps = tmp;
806   aa   = a_a;
807   aj   = a_j;
808   ad   = a->diag;
809 
810   for (i = 0,row = 0; i< node_max; ++i) {
811     nsz = ns[i];
812     aii = ai[row];
813     v1  = aa + aii;
814     vi  = aj + aii;
815     nz  = ad[row]- aii;
816     if (i < node_max-1) {
817       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
818       * but our indexing to determine it's size could. */
819       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
820       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
821       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
822       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
823     }
824 
825     switch (nsz) {               /* Each loop in 'case' is unrolled */
826     case 1:
827       sum1 = b[*r++];
828       for (j=0; j<nz-1; j+=2) {
829         i0    = vi[0];
830         i1    = vi[1];
831         vi   +=2;
832         tmp0  = tmps[i0];
833         tmp1  = tmps[i1];
834         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
835       }
836       if (j == nz-1) {
837         tmp0  = tmps[*vi++];
838         sum1 -= *v1++ *tmp0;
839       }
840       tmp[row++]=sum1;
841       break;
842     case 2:
843       sum1 = b[*r++];
844       sum2 = b[*r++];
845       v2   = aa + ai[row+1];
846 
847       for (j=0; j<nz-1; j+=2) {
848         i0    = vi[0];
849         i1    = vi[1];
850         vi   +=2;
851         tmp0  = tmps[i0];
852         tmp1  = tmps[i1];
853         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
854         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
855       }
856       if (j == nz-1) {
857         tmp0  = tmps[*vi++];
858         sum1 -= *v1++ *tmp0;
859         sum2 -= *v2++ *tmp0;
860       }
861       sum2     -= *v2++ *sum1;
862       tmp[row++]=sum1;
863       tmp[row++]=sum2;
864       break;
865     case 3:
866       sum1 = b[*r++];
867       sum2 = b[*r++];
868       sum3 = b[*r++];
869       v2   = aa + ai[row+1];
870       v3   = aa + ai[row+2];
871 
872       for (j=0; j<nz-1; j+=2) {
873         i0    = vi[0];
874         i1    = vi[1];
875         vi   +=2;
876         tmp0  = tmps[i0];
877         tmp1  = tmps[i1];
878         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
879         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
880         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
881       }
882       if (j == nz-1) {
883         tmp0  = tmps[*vi++];
884         sum1 -= *v1++ *tmp0;
885         sum2 -= *v2++ *tmp0;
886         sum3 -= *v3++ *tmp0;
887       }
888       sum2 -= *v2++ * sum1;
889       sum3 -= *v3++ * sum1;
890       sum3 -= *v3++ * sum2;
891 
892       tmp[row++]=sum1;
893       tmp[row++]=sum2;
894       tmp[row++]=sum3;
895       break;
896 
897     case 4:
898       sum1 = b[*r++];
899       sum2 = b[*r++];
900       sum3 = b[*r++];
901       sum4 = b[*r++];
902       v2   = aa + ai[row+1];
903       v3   = aa + ai[row+2];
904       v4   = aa + ai[row+3];
905 
906       for (j=0; j<nz-1; j+=2) {
907         i0    = vi[0];
908         i1    = vi[1];
909         vi   +=2;
910         tmp0  = tmps[i0];
911         tmp1  = tmps[i1];
912         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916       }
917       if (j == nz-1) {
918         tmp0  = tmps[*vi++];
919         sum1 -= *v1++ *tmp0;
920         sum2 -= *v2++ *tmp0;
921         sum3 -= *v3++ *tmp0;
922         sum4 -= *v4++ *tmp0;
923       }
924       sum2 -= *v2++ * sum1;
925       sum3 -= *v3++ * sum1;
926       sum4 -= *v4++ * sum1;
927       sum3 -= *v3++ * sum2;
928       sum4 -= *v4++ * sum2;
929       sum4 -= *v4++ * sum3;
930 
931       tmp[row++]=sum1;
932       tmp[row++]=sum2;
933       tmp[row++]=sum3;
934       tmp[row++]=sum4;
935       break;
936     case 5:
937       sum1 = b[*r++];
938       sum2 = b[*r++];
939       sum3 = b[*r++];
940       sum4 = b[*r++];
941       sum5 = b[*r++];
942       v2   = aa + ai[row+1];
943       v3   = aa + ai[row+2];
944       v4   = aa + ai[row+3];
945       v5   = aa + ai[row+4];
946 
947       for (j=0; j<nz-1; j+=2) {
948         i0    = vi[0];
949         i1    = vi[1];
950         vi   +=2;
951         tmp0  = tmps[i0];
952         tmp1  = tmps[i1];
953         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958       }
959       if (j == nz-1) {
960         tmp0  = tmps[*vi++];
961         sum1 -= *v1++ *tmp0;
962         sum2 -= *v2++ *tmp0;
963         sum3 -= *v3++ *tmp0;
964         sum4 -= *v4++ *tmp0;
965         sum5 -= *v5++ *tmp0;
966       }
967 
968       sum2 -= *v2++ * sum1;
969       sum3 -= *v3++ * sum1;
970       sum4 -= *v4++ * sum1;
971       sum5 -= *v5++ * sum1;
972       sum3 -= *v3++ * sum2;
973       sum4 -= *v4++ * sum2;
974       sum5 -= *v5++ * sum2;
975       sum4 -= *v4++ * sum3;
976       sum5 -= *v5++ * sum3;
977       sum5 -= *v5++ * sum4;
978 
979       tmp[row++]=sum1;
980       tmp[row++]=sum2;
981       tmp[row++]=sum3;
982       tmp[row++]=sum4;
983       tmp[row++]=sum5;
984       break;
985     default:
986       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987     }
988   }
989   /* backward solve the upper triangular */
990   for (i=node_max -1,row = n-1; i>=0; i--) {
991     nsz = ns[i];
992     aii = ai[row+1] -1;
993     v1  = aa + aii;
994     vi  = aj + aii;
995     nz  = aii- ad[row];
996     switch (nsz) {               /* Each loop in 'case' is unrolled */
997     case 1:
998       sum1 = tmp[row];
999 
1000       for (j=nz; j>1; j-=2) {
1001         vi   -=2;
1002         i0    = vi[2];
1003         i1    = vi[1];
1004         tmp0  = tmps[i0];
1005         tmp1  = tmps[i1];
1006         v1   -= 2;
1007         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008       }
1009       if (j==1) {
1010         tmp0  = tmps[*vi--];
1011         sum1 -= *v1-- * tmp0;
1012       }
1013       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014       break;
1015     case 2:
1016       sum1 = tmp[row];
1017       sum2 = tmp[row -1];
1018       v2   = aa + ai[row]-1;
1019       for (j=nz; j>1; j-=2) {
1020         vi   -=2;
1021         i0    = vi[2];
1022         i1    = vi[1];
1023         tmp0  = tmps[i0];
1024         tmp1  = tmps[i1];
1025         v1   -= 2;
1026         v2   -= 2;
1027         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029       }
1030       if (j==1) {
1031         tmp0  = tmps[*vi--];
1032         sum1 -= *v1-- * tmp0;
1033         sum2 -= *v2-- * tmp0;
1034       }
1035 
1036       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037       sum2   -= *v2-- * tmp0;
1038       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039       break;
1040     case 3:
1041       sum1 = tmp[row];
1042       sum2 = tmp[row -1];
1043       sum3 = tmp[row -2];
1044       v2   = aa + ai[row]-1;
1045       v3   = aa + ai[row -1]-1;
1046       for (j=nz; j>1; j-=2) {
1047         vi   -=2;
1048         i0    = vi[2];
1049         i1    = vi[1];
1050         tmp0  = tmps[i0];
1051         tmp1  = tmps[i1];
1052         v1   -= 2;
1053         v2   -= 2;
1054         v3   -= 2;
1055         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058       }
1059       if (j==1) {
1060         tmp0  = tmps[*vi--];
1061         sum1 -= *v1-- * tmp0;
1062         sum2 -= *v2-- * tmp0;
1063         sum3 -= *v3-- * tmp0;
1064       }
1065       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066       sum2   -= *v2-- * tmp0;
1067       sum3   -= *v3-- * tmp0;
1068       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069       sum3   -= *v3-- * tmp0;
1070       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1071 
1072       break;
1073     case 4:
1074       sum1 = tmp[row];
1075       sum2 = tmp[row -1];
1076       sum3 = tmp[row -2];
1077       sum4 = tmp[row -3];
1078       v2   = aa + ai[row]-1;
1079       v3   = aa + ai[row -1]-1;
1080       v4   = aa + ai[row -2]-1;
1081 
1082       for (j=nz; j>1; j-=2) {
1083         vi   -=2;
1084         i0    = vi[2];
1085         i1    = vi[1];
1086         tmp0  = tmps[i0];
1087         tmp1  = tmps[i1];
1088         v1   -= 2;
1089         v2   -= 2;
1090         v3   -= 2;
1091         v4   -= 2;
1092         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096       }
1097       if (j==1) {
1098         tmp0  = tmps[*vi--];
1099         sum1 -= *v1-- * tmp0;
1100         sum2 -= *v2-- * tmp0;
1101         sum3 -= *v3-- * tmp0;
1102         sum4 -= *v4-- * tmp0;
1103       }
1104 
1105       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106       sum2   -= *v2-- * tmp0;
1107       sum3   -= *v3-- * tmp0;
1108       sum4   -= *v4-- * tmp0;
1109       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110       sum3   -= *v3-- * tmp0;
1111       sum4   -= *v4-- * tmp0;
1112       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113       sum4   -= *v4-- * tmp0;
1114       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115       break;
1116     case 5:
1117       sum1 = tmp[row];
1118       sum2 = tmp[row -1];
1119       sum3 = tmp[row -2];
1120       sum4 = tmp[row -3];
1121       sum5 = tmp[row -4];
1122       v2   = aa + ai[row]-1;
1123       v3   = aa + ai[row -1]-1;
1124       v4   = aa + ai[row -2]-1;
1125       v5   = aa + ai[row -3]-1;
1126       for (j=nz; j>1; j-=2) {
1127         vi   -= 2;
1128         i0    = vi[2];
1129         i1    = vi[1];
1130         tmp0  = tmps[i0];
1131         tmp1  = tmps[i1];
1132         v1   -= 2;
1133         v2   -= 2;
1134         v3   -= 2;
1135         v4   -= 2;
1136         v5   -= 2;
1137         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142       }
1143       if (j==1) {
1144         tmp0  = tmps[*vi--];
1145         sum1 -= *v1-- * tmp0;
1146         sum2 -= *v2-- * tmp0;
1147         sum3 -= *v3-- * tmp0;
1148         sum4 -= *v4-- * tmp0;
1149         sum5 -= *v5-- * tmp0;
1150       }
1151 
1152       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153       sum2   -= *v2-- * tmp0;
1154       sum3   -= *v3-- * tmp0;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158       sum3   -= *v3-- * tmp0;
1159       sum4   -= *v4-- * tmp0;
1160       sum5   -= *v5-- * tmp0;
1161       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162       sum4   -= *v4-- * tmp0;
1163       sum5   -= *v5-- * tmp0;
1164       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165       sum5   -= *v5-- * tmp0;
1166       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167       break;
1168     default:
1169       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170     }
1171   }
1172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1174   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1176   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #undef __FUNCT__
1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183 {
1184   Mat             C     =B;
1185   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1186   IS              isrow = b->row,isicol = b->icol;
1187   PetscErrorCode  ierr;
1188   const PetscInt  *r,*ic,*ics;
1189   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190   PetscInt        i,j,k,nz,nzL,row,*pj;
1191   const PetscInt  *ajtmp,*bjtmp;
1192   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194   FactorShiftCtx  sctx;
1195   const PetscInt  *ddiag;
1196   PetscReal       rs;
1197   MatScalar       d;
1198   PetscInt        inod,nodesz,node_max,col;
1199   const PetscInt  *ns;
1200   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1201 
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1216       if (rs>sctx.shift_top) sctx.shift_top = rs;
1217     }
1218     sctx.shift_top *= 1.1;
1219     sctx.nshift_max = 5;
1220     sctx.shift_lo   = 0.;
1221     sctx.shift_hi   = 1.;
1222   }
1223 
1224   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1225   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1226 
1227   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1228   ics   = ic;
1229 
1230   node_max = a->inode.node_count;
1231   ns       = a->inode.size;
1232   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1233 
1234   /* If max inode size > 4, split it into two inodes.*/
1235   /* also map the inode sizes according to the ordering */
1236   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1237   for (i=0,j=0; i<node_max; ++i,++j) {
1238     if (ns[i] > 4) {
1239       tmp_vec1[j] = 4;
1240       ++j;
1241       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1242     } else {
1243       tmp_vec1[j] = ns[i];
1244     }
1245   }
1246   /* Use the correct node_max */
1247   node_max = j;
1248 
1249   /* Now reorder the inode info based on mat re-ordering info */
1250   /* First create a row -> inode_size_array_index map */
1251   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1252   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1253   for (i=0,row=0; i<node_max; i++) {
1254     nodesz = tmp_vec1[i];
1255     for (j=0; j<nodesz; j++,row++) {
1256       nsmap[row] = i;
1257     }
1258   }
1259   /* Using nsmap, create a reordered ns structure */
1260   for (i=0,j=0; i< node_max; i++) {
1261     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1262     tmp_vec2[i] = nodesz;
1263     j          += nodesz;
1264   }
1265   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1266   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1267 
1268   /* Now use the correct ns */
1269   ns = tmp_vec2;
1270 
1271   do {
1272     sctx.newshift = PETSC_FALSE;
1273     /* Now loop over each block-row, and do the factorization */
1274     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1275       nodesz = ns[inod];
1276 
1277       switch (nodesz) {
1278       case 1:
1279         /*----------*/
1280         /* zero rtmp1 */
1281         /* L part */
1282         nz    = bi[i+1] - bi[i];
1283         bjtmp = bj + bi[i];
1284         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1285 
1286         /* U part */
1287         nz    = bdiag[i]-bdiag[i+1];
1288         bjtmp = bj + bdiag[i+1]+1;
1289         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1290 
1291         /* load in initial (unfactored row) */
1292         nz    = ai[r[i]+1] - ai[r[i]];
1293         ajtmp = aj + ai[r[i]];
1294         v     = aa + ai[r[i]];
1295         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1296 
1297         /* ZeropivotApply() */
1298         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1299 
1300         /* elimination */
1301         bjtmp = bj + bi[i];
1302         row   = *bjtmp++;
1303         nzL   = bi[i+1] - bi[i];
1304         for (k=0; k < nzL; k++) {
1305           pc = rtmp1 + row;
1306           if (*pc != 0.0) {
1307             pv   = b->a + bdiag[row];
1308             mul1 = *pc * (*pv);
1309             *pc  = mul1;
1310             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1311             pv   = b->a + bdiag[row+1]+1;
1312             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1313             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1314             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1315           }
1316           row = *bjtmp++;
1317         }
1318 
1319         /* finished row so stick it into b->a */
1320         rs = 0.0;
1321         /* L part */
1322         pv = b->a + bi[i];
1323         pj = b->j + bi[i];
1324         nz = bi[i+1] - bi[i];
1325         for (j=0; j<nz; j++) {
1326           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327         }
1328 
1329         /* U part */
1330         pv = b->a + bdiag[i+1]+1;
1331         pj = b->j + bdiag[i+1]+1;
1332         nz = bdiag[i] - bdiag[i+1]-1;
1333         for (j=0; j<nz; j++) {
1334           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1335         }
1336 
1337         /* Check zero pivot */
1338         sctx.rs = rs;
1339         sctx.pv = rtmp1[i];
1340         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1341         if (sctx.newshift) break;
1342 
1343         /* Mark diagonal and invert diagonal for simplier triangular solves */
1344         pv  = b->a + bdiag[i];
1345         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1346         break;
1347 
1348       case 2:
1349         /*----------*/
1350         /* zero rtmp1 and rtmp2 */
1351         /* L part */
1352         nz    = bi[i+1] - bi[i];
1353         bjtmp = bj + bi[i];
1354         for  (j=0; j<nz; j++) {
1355           col        = bjtmp[j];
1356           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357         }
1358 
1359         /* U part */
1360         nz    = bdiag[i]-bdiag[i+1];
1361         bjtmp = bj + bdiag[i+1]+1;
1362         for  (j=0; j<nz; j++) {
1363           col        = bjtmp[j];
1364           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1365         }
1366 
1367         /* load in initial (unfactored row) */
1368         nz    = ai[r[i]+1] - ai[r[i]];
1369         ajtmp = aj + ai[r[i]];
1370         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1371         for (j=0; j<nz; j++) {
1372           col        = ics[ajtmp[j]];
1373           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1374         }
1375         /* ZeropivotApply(): shift the diagonal of the matrix  */
1376         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1377 
1378         /* elimination */
1379         bjtmp = bj + bi[i];
1380         row   = *bjtmp++; /* pivot row */
1381         nzL   = bi[i+1] - bi[i];
1382         for (k=0; k < nzL; k++) {
1383           pc1 = rtmp1 + row;
1384           pc2 = rtmp2 + row;
1385           if (*pc1 != 0.0 || *pc2 != 0.0) {
1386             pv   = b->a + bdiag[row];
1387             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1388             *pc1 = mul1;       *pc2 = mul2;
1389 
1390             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1391             pv = b->a + bdiag[row+1]+1;
1392             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1393             for (j=0; j<nz; j++) {
1394               col         = pj[j];
1395               rtmp1[col] -= mul1 * pv[j];
1396               rtmp2[col] -= mul2 * pv[j];
1397             }
1398             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1399           }
1400           row = *bjtmp++;
1401         }
1402 
1403         /* finished row i; check zero pivot, then stick row i into b->a */
1404         rs = 0.0;
1405         /* L part */
1406         pc1 = b->a + bi[i];
1407         pj  = b->j + bi[i];
1408         nz  = bi[i+1] - bi[i];
1409         for (j=0; j<nz; j++) {
1410           col    = pj[j];
1411           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412         }
1413         /* U part */
1414         pc1 = b->a + bdiag[i+1]+1;
1415         pj  = b->j + bdiag[i+1]+1;
1416         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1417         for (j=0; j<nz; j++) {
1418           col    = pj[j];
1419           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1420         }
1421 
1422         sctx.rs = rs;
1423         sctx.pv = rtmp1[i];
1424         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1425         if (sctx.newshift) break;
1426         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1427         *pc1 = 1.0/sctx.pv;
1428 
1429         /* Now take care of diagonal 2x2 block. */
1430         pc2 = rtmp2 + i;
1431         if (*pc2 != 0.0) {
1432           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1433           *pc2 = mul1;          /* insert L entry */
1434           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1435           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1436           for (j=0; j<nz; j++) {
1437             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1438           }
1439           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1440         }
1441 
1442         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1443         rs = 0.0;
1444         /* L part */
1445         pc2 = b->a + bi[i+1];
1446         pj  = b->j + bi[i+1];
1447         nz  = bi[i+2] - bi[i+1];
1448         for (j=0; j<nz; j++) {
1449           col    = pj[j];
1450           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451         }
1452         /* U part */
1453         pc2 = b->a + bdiag[i+2]+1;
1454         pj  = b->j + bdiag[i+2]+1;
1455         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1456         for (j=0; j<nz; j++) {
1457           col    = pj[j];
1458           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1459         }
1460 
1461         sctx.rs = rs;
1462         sctx.pv = rtmp2[i+1];
1463         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1464         if (sctx.newshift) break;
1465         pc2  = b->a + bdiag[i+1];
1466         *pc2 = 1.0/sctx.pv;
1467         break;
1468 
1469       case 3:
1470         /*----------*/
1471         /* zero rtmp */
1472         /* L part */
1473         nz    = bi[i+1] - bi[i];
1474         bjtmp = bj + bi[i];
1475         for  (j=0; j<nz; j++) {
1476           col        = bjtmp[j];
1477           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478         }
1479 
1480         /* U part */
1481         nz    = bdiag[i]-bdiag[i+1];
1482         bjtmp = bj + bdiag[i+1]+1;
1483         for  (j=0; j<nz; j++) {
1484           col        = bjtmp[j];
1485           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1486         }
1487 
1488         /* load in initial (unfactored row) */
1489         nz    = ai[r[i]+1] - ai[r[i]];
1490         ajtmp = aj + ai[r[i]];
1491         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1492         for (j=0; j<nz; j++) {
1493           col        = ics[ajtmp[j]];
1494           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1495         }
1496         /* ZeropivotApply(): shift the diagonal of the matrix  */
1497         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1498 
1499         /* elimination */
1500         bjtmp = bj + bi[i];
1501         row   = *bjtmp++; /* pivot row */
1502         nzL   = bi[i+1] - bi[i];
1503         for (k=0; k < nzL; k++) {
1504           pc1 = rtmp1 + row;
1505           pc2 = rtmp2 + row;
1506           pc3 = rtmp3 + row;
1507           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1508             pv   = b->a + bdiag[row];
1509             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1510             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1511 
1512             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1513             pv = b->a + bdiag[row+1]+1;
1514             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1515             for (j=0; j<nz; j++) {
1516               col         = pj[j];
1517               rtmp1[col] -= mul1 * pv[j];
1518               rtmp2[col] -= mul2 * pv[j];
1519               rtmp3[col] -= mul3 * pv[j];
1520             }
1521             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1522           }
1523           row = *bjtmp++;
1524         }
1525 
1526         /* finished row i; check zero pivot, then stick row i into b->a */
1527         rs = 0.0;
1528         /* L part */
1529         pc1 = b->a + bi[i];
1530         pj  = b->j + bi[i];
1531         nz  = bi[i+1] - bi[i];
1532         for (j=0; j<nz; j++) {
1533           col    = pj[j];
1534           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535         }
1536         /* U part */
1537         pc1 = b->a + bdiag[i+1]+1;
1538         pj  = b->j + bdiag[i+1]+1;
1539         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1540         for (j=0; j<nz; j++) {
1541           col    = pj[j];
1542           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1543         }
1544 
1545         sctx.rs = rs;
1546         sctx.pv = rtmp1[i];
1547         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1548         if (sctx.newshift) break;
1549         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1550         *pc1 = 1.0/sctx.pv;
1551 
1552         /* Now take care of 1st column of diagonal 3x3 block. */
1553         pc2 = rtmp2 + i;
1554         pc3 = rtmp3 + i;
1555         if (*pc2 != 0.0 || *pc3 != 0.0) {
1556           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1557           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1558           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1559           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1560           for (j=0; j<nz; j++) {
1561             col         = pj[j];
1562             rtmp2[col] -= mul2 * rtmp1[col];
1563             rtmp3[col] -= mul3 * rtmp1[col];
1564           }
1565           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1566         }
1567 
1568         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1569         rs = 0.0;
1570         /* L part */
1571         pc2 = b->a + bi[i+1];
1572         pj  = b->j + bi[i+1];
1573         nz  = bi[i+2] - bi[i+1];
1574         for (j=0; j<nz; j++) {
1575           col    = pj[j];
1576           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577         }
1578         /* U part */
1579         pc2 = b->a + bdiag[i+2]+1;
1580         pj  = b->j + bdiag[i+2]+1;
1581         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1582         for (j=0; j<nz; j++) {
1583           col    = pj[j];
1584           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1585         }
1586 
1587         sctx.rs = rs;
1588         sctx.pv = rtmp2[i+1];
1589         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1590         if (sctx.newshift) break;
1591         pc2  = b->a + bdiag[i+1];
1592         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1593 
1594         /* Now take care of 2nd column of diagonal 3x3 block. */
1595         pc3 = rtmp3 + i+1;
1596         if (*pc3 != 0.0) {
1597           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1598           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1599           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1600           for (j=0; j<nz; j++) {
1601             col         = pj[j];
1602             rtmp3[col] -= mul3 * rtmp2[col];
1603           }
1604           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1605         }
1606 
1607         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1608         rs = 0.0;
1609         /* L part */
1610         pc3 = b->a + bi[i+2];
1611         pj  = b->j + bi[i+2];
1612         nz  = bi[i+3] - bi[i+2];
1613         for (j=0; j<nz; j++) {
1614           col    = pj[j];
1615           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616         }
1617         /* U part */
1618         pc3 = b->a + bdiag[i+3]+1;
1619         pj  = b->j + bdiag[i+3]+1;
1620         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1621         for (j=0; j<nz; j++) {
1622           col    = pj[j];
1623           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1624         }
1625 
1626         sctx.rs = rs;
1627         sctx.pv = rtmp3[i+2];
1628         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1629         if (sctx.newshift) break;
1630         pc3  = b->a + bdiag[i+2];
1631         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1632         break;
1633       case 4:
1634         /*----------*/
1635         /* zero rtmp */
1636         /* L part */
1637         nz    = bi[i+1] - bi[i];
1638         bjtmp = bj + bi[i];
1639         for  (j=0; j<nz; j++) {
1640           col        = bjtmp[j];
1641           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1642         }
1643 
1644         /* U part */
1645         nz    = bdiag[i]-bdiag[i+1];
1646         bjtmp = bj + bdiag[i+1]+1;
1647         for  (j=0; j<nz; j++) {
1648           col        = bjtmp[j];
1649           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1650         }
1651 
1652         /* load in initial (unfactored row) */
1653         nz    = ai[r[i]+1] - ai[r[i]];
1654         ajtmp = aj + ai[r[i]];
1655         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1656         for (j=0; j<nz; j++) {
1657           col        = ics[ajtmp[j]];
1658           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1659         }
1660         /* ZeropivotApply(): shift the diagonal of the matrix  */
1661         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1662 
1663         /* elimination */
1664         bjtmp = bj + bi[i];
1665         row   = *bjtmp++; /* pivot row */
1666         nzL   = bi[i+1] - bi[i];
1667         for (k=0; k < nzL; k++) {
1668           pc1 = rtmp1 + row;
1669           pc2 = rtmp2 + row;
1670           pc3 = rtmp3 + row;
1671           pc4 = rtmp4 + row;
1672           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1673             pv   = b->a + bdiag[row];
1674             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1675             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1676 
1677             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1678             pv = b->a + bdiag[row+1]+1;
1679             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1680             for (j=0; j<nz; j++) {
1681               col         = pj[j];
1682               rtmp1[col] -= mul1 * pv[j];
1683               rtmp2[col] -= mul2 * pv[j];
1684               rtmp3[col] -= mul3 * pv[j];
1685               rtmp4[col] -= mul4 * pv[j];
1686             }
1687             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1688           }
1689           row = *bjtmp++;
1690         }
1691 
1692         /* finished row i; check zero pivot, then stick row i into b->a */
1693         rs = 0.0;
1694         /* L part */
1695         pc1 = b->a + bi[i];
1696         pj  = b->j + bi[i];
1697         nz  = bi[i+1] - bi[i];
1698         for (j=0; j<nz; j++) {
1699           col    = pj[j];
1700           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701         }
1702         /* U part */
1703         pc1 = b->a + bdiag[i+1]+1;
1704         pj  = b->j + bdiag[i+1]+1;
1705         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1706         for (j=0; j<nz; j++) {
1707           col    = pj[j];
1708           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1709         }
1710 
1711         sctx.rs = rs;
1712         sctx.pv = rtmp1[i];
1713         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1714         if (sctx.newshift) break;
1715         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1716         *pc1 = 1.0/sctx.pv;
1717 
1718         /* Now take care of 1st column of diagonal 4x4 block. */
1719         pc2 = rtmp2 + i;
1720         pc3 = rtmp3 + i;
1721         pc4 = rtmp4 + i;
1722         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1723           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1724           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1725           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1726           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1727           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1728           for (j=0; j<nz; j++) {
1729             col         = pj[j];
1730             rtmp2[col] -= mul2 * rtmp1[col];
1731             rtmp3[col] -= mul3 * rtmp1[col];
1732             rtmp4[col] -= mul4 * rtmp1[col];
1733           }
1734           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1735         }
1736 
1737         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1738         rs = 0.0;
1739         /* L part */
1740         pc2 = b->a + bi[i+1];
1741         pj  = b->j + bi[i+1];
1742         nz  = bi[i+2] - bi[i+1];
1743         for (j=0; j<nz; j++) {
1744           col    = pj[j];
1745           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746         }
1747         /* U part */
1748         pc2 = b->a + bdiag[i+2]+1;
1749         pj  = b->j + bdiag[i+2]+1;
1750         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1751         for (j=0; j<nz; j++) {
1752           col    = pj[j];
1753           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1754         }
1755 
1756         sctx.rs = rs;
1757         sctx.pv = rtmp2[i+1];
1758         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1759         if (sctx.newshift) break;
1760         pc2  = b->a + bdiag[i+1];
1761         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1762 
1763         /* Now take care of 2nd column of diagonal 4x4 block. */
1764         pc3 = rtmp3 + i+1;
1765         pc4 = rtmp4 + i+1;
1766         if (*pc3 != 0.0 || *pc4 != 0.0) {
1767           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1768           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1769           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1770           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1771           for (j=0; j<nz; j++) {
1772             col         = pj[j];
1773             rtmp3[col] -= mul3 * rtmp2[col];
1774             rtmp4[col] -= mul4 * rtmp2[col];
1775           }
1776           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1777         }
1778 
1779         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1780         rs = 0.0;
1781         /* L part */
1782         pc3 = b->a + bi[i+2];
1783         pj  = b->j + bi[i+2];
1784         nz  = bi[i+3] - bi[i+2];
1785         for (j=0; j<nz; j++) {
1786           col    = pj[j];
1787           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788         }
1789         /* U part */
1790         pc3 = b->a + bdiag[i+3]+1;
1791         pj  = b->j + bdiag[i+3]+1;
1792         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1793         for (j=0; j<nz; j++) {
1794           col    = pj[j];
1795           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1796         }
1797 
1798         sctx.rs = rs;
1799         sctx.pv = rtmp3[i+2];
1800         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1801         if (sctx.newshift) break;
1802         pc3  = b->a + bdiag[i+2];
1803         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1804 
1805         /* Now take care of 3rd column of diagonal 4x4 block. */
1806         pc4 = rtmp4 + i+2;
1807         if (*pc4 != 0.0) {
1808           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1809           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1810           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1811           for (j=0; j<nz; j++) {
1812             col         = pj[j];
1813             rtmp4[col] -= mul4 * rtmp3[col];
1814           }
1815           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1816         }
1817 
1818         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1819         rs = 0.0;
1820         /* L part */
1821         pc4 = b->a + bi[i+3];
1822         pj  = b->j + bi[i+3];
1823         nz  = bi[i+4] - bi[i+3];
1824         for (j=0; j<nz; j++) {
1825           col    = pj[j];
1826           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827         }
1828         /* U part */
1829         pc4 = b->a + bdiag[i+4]+1;
1830         pj  = b->j + bdiag[i+4]+1;
1831         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1832         for (j=0; j<nz; j++) {
1833           col    = pj[j];
1834           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1835         }
1836 
1837         sctx.rs = rs;
1838         sctx.pv = rtmp4[i+3];
1839         ierr    = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1840         if (sctx.newshift) break;
1841         pc4  = b->a + bdiag[i+3];
1842         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1843         break;
1844 
1845       default:
1846         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1847       }
1848       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1849       i += nodesz;                 /* Update the row */
1850     }
1851 
1852     /* MatPivotRefine() */
1853     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1854       /*
1855        * if no shift in this attempt & shifting & started shifting & can refine,
1856        * then try lower shift
1857        */
1858       sctx.shift_hi       = sctx.shift_fraction;
1859       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1860       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1861       sctx.newshift       = PETSC_TRUE;
1862       sctx.nshift++;
1863     }
1864   } while (sctx.newshift);
1865 
1866   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1867   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1868   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1869   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1870 
1871   if (b->inode.size) {
1872     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1873   } else {
1874     C->ops->solve           = MatSolve_SeqAIJ;
1875   }
1876   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1877   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1878   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1879   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1880   C->assembled              = PETSC_TRUE;
1881   C->preallocated           = PETSC_TRUE;
1882 
1883   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1884 
1885   /* MatShiftView(A,info,&sctx) */
1886   if (sctx.nshift) {
1887     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1888       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1889     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1890       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1891     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1892       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1893     }
1894   }
1895   PetscFunctionReturn(0);
1896 }
1897 
1898 #undef __FUNCT__
1899 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1900 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1901 {
1902   Mat             C     = B;
1903   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1904   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1905   PetscErrorCode  ierr;
1906   const PetscInt  *r,*ic,*c,*ics;
1907   PetscInt        n   = A->rmap->n,*bi = b->i;
1908   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1909   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1910   PetscInt        *ai = a->i,*aj = a->j;
1911   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1912   PetscScalar     mul1,mul2,mul3,tmp;
1913   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1914   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1915   PetscReal       rs=0.0;
1916   FactorShiftCtx  sctx;
1917 
1918   PetscFunctionBegin;
1919   sctx.shift_top      = 0;
1920   sctx.nshift_max     = 0;
1921   sctx.shift_lo       = 0;
1922   sctx.shift_hi       = 0;
1923   sctx.shift_fraction = 0;
1924 
1925   /* if both shift schemes are chosen by user, only use info->shiftpd */
1926   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1927     sctx.shift_top = 0;
1928     for (i=0; i<n; i++) {
1929       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1930       rs    = 0.0;
1931       ajtmp = aj + ai[i];
1932       rtmp1 = aa + ai[i];
1933       nz    = ai[i+1] - ai[i];
1934       for (j=0; j<nz; j++) {
1935         if (*ajtmp != i) {
1936           rs += PetscAbsScalar(*rtmp1++);
1937         } else {
1938           rs -= PetscRealPart(*rtmp1++);
1939         }
1940         ajtmp++;
1941       }
1942       if (rs>sctx.shift_top) sctx.shift_top = rs;
1943     }
1944     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1945     sctx.shift_top *= 1.1;
1946     sctx.nshift_max = 5;
1947     sctx.shift_lo   = 0.;
1948     sctx.shift_hi   = 1.;
1949   }
1950   sctx.shift_amount = 0;
1951   sctx.nshift       = 0;
1952 
1953   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1954   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1955   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1956   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1957   ics    = ic;
1958 
1959   node_max = a->inode.node_count;
1960   ns       = a->inode.size;
1961   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1962 
1963   /* If max inode size > 3, split it into two inodes.*/
1964   /* also map the inode sizes according to the ordering */
1965   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1966   for (i=0,j=0; i<node_max; ++i,++j) {
1967     if (ns[i]>3) {
1968       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1969       ++j;
1970       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1971     } else {
1972       tmp_vec1[j] = ns[i];
1973     }
1974   }
1975   /* Use the correct node_max */
1976   node_max = j;
1977 
1978   /* Now reorder the inode info based on mat re-ordering info */
1979   /* First create a row -> inode_size_array_index map */
1980   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1981   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1982   for (i=0,row=0; i<node_max; i++) {
1983     nodesz = tmp_vec1[i];
1984     for (j=0; j<nodesz; j++,row++) {
1985       nsmap[row] = i;
1986     }
1987   }
1988   /* Using nsmap, create a reordered ns structure */
1989   for (i=0,j=0; i< node_max; i++) {
1990     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1991     tmp_vec2[i] = nodesz;
1992     j          += nodesz;
1993   }
1994   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1995   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1996   /* Now use the correct ns */
1997   ns = tmp_vec2;
1998 
1999   do {
2000     sctx.newshift = PETSC_FALSE;
2001     /* Now loop over each block-row, and do the factorization */
2002     for (i=0,row=0; i<node_max; i++) {
2003       nodesz = ns[i];
2004       nz     = bi[row+1] - bi[row];
2005       bjtmp  = bj + bi[row];
2006 
2007       switch (nodesz) {
2008       case 1:
2009         for  (j=0; j<nz; j++) {
2010           idx         = bjtmp[j];
2011           rtmp11[idx] = 0.0;
2012         }
2013 
2014         /* load in initial (unfactored row) */
2015         idx    = r[row];
2016         nz_tmp = ai[idx+1] - ai[idx];
2017         ajtmp  = aj + ai[idx];
2018         v1     = aa + ai[idx];
2019 
2020         for (j=0; j<nz_tmp; j++) {
2021           idx         = ics[ajtmp[j]];
2022           rtmp11[idx] = v1[j];
2023         }
2024         rtmp11[ics[r[row]]] += sctx.shift_amount;
2025 
2026         prow = *bjtmp++;
2027         while (prow < row) {
2028           pc1 = rtmp11 + prow;
2029           if (*pc1 != 0.0) {
2030             pv     = ba + bd[prow];
2031             pj     = nbj + bd[prow];
2032             mul1   = *pc1 * *pv++;
2033             *pc1   = mul1;
2034             nz_tmp = bi[prow+1] - bd[prow] - 1;
2035             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2036             for (j=0; j<nz_tmp; j++) {
2037               tmp          = pv[j];
2038               idx          = pj[j];
2039               rtmp11[idx] -= mul1 * tmp;
2040             }
2041           }
2042           prow = *bjtmp++;
2043         }
2044         pj  = bj + bi[row];
2045         pc1 = ba + bi[row];
2046 
2047         sctx.pv     = rtmp11[row];
2048         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2049         rs          = 0.0;
2050         for (j=0; j<nz; j++) {
2051           idx    = pj[j];
2052           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2053           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2054         }
2055         sctx.rs = rs;
2056         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2057         if (sctx.newshift) goto endofwhile;
2058         break;
2059 
2060       case 2:
2061         for (j=0; j<nz; j++) {
2062           idx         = bjtmp[j];
2063           rtmp11[idx] = 0.0;
2064           rtmp22[idx] = 0.0;
2065         }
2066 
2067         /* load in initial (unfactored row) */
2068         idx    = r[row];
2069         nz_tmp = ai[idx+1] - ai[idx];
2070         ajtmp  = aj + ai[idx];
2071         v1     = aa + ai[idx];
2072         v2     = aa + ai[idx+1];
2073         for (j=0; j<nz_tmp; j++) {
2074           idx         = ics[ajtmp[j]];
2075           rtmp11[idx] = v1[j];
2076           rtmp22[idx] = v2[j];
2077         }
2078         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2079         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2080 
2081         prow = *bjtmp++;
2082         while (prow < row) {
2083           pc1 = rtmp11 + prow;
2084           pc2 = rtmp22 + prow;
2085           if (*pc1 != 0.0 || *pc2 != 0.0) {
2086             pv   = ba + bd[prow];
2087             pj   = nbj + bd[prow];
2088             mul1 = *pc1 * *pv;
2089             mul2 = *pc2 * *pv;
2090             ++pv;
2091             *pc1 = mul1;
2092             *pc2 = mul2;
2093 
2094             nz_tmp = bi[prow+1] - bd[prow] - 1;
2095             for (j=0; j<nz_tmp; j++) {
2096               tmp          = pv[j];
2097               idx          = pj[j];
2098               rtmp11[idx] -= mul1 * tmp;
2099               rtmp22[idx] -= mul2 * tmp;
2100             }
2101             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2102           }
2103           prow = *bjtmp++;
2104         }
2105 
2106         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2107         pc1 = rtmp11 + prow;
2108         pc2 = rtmp22 + prow;
2109 
2110         sctx.pv = *pc1;
2111         pj      = bj + bi[prow];
2112         rs      = 0.0;
2113         for (j=0; j<nz; j++) {
2114           idx = pj[j];
2115           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2116         }
2117         sctx.rs = rs;
2118         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2119         if (sctx.newshift) goto endofwhile;
2120 
2121         if (*pc2 != 0.0) {
2122           pj     = nbj + bd[prow];
2123           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2124           *pc2   = mul2;
2125           nz_tmp = bi[prow+1] - bd[prow] - 1;
2126           for (j=0; j<nz_tmp; j++) {
2127             idx          = pj[j];
2128             tmp          = rtmp11[idx];
2129             rtmp22[idx] -= mul2 * tmp;
2130           }
2131           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2132         }
2133 
2134         pj  = bj + bi[row];
2135         pc1 = ba + bi[row];
2136         pc2 = ba + bi[row+1];
2137 
2138         sctx.pv       = rtmp22[row+1];
2139         rs            = 0.0;
2140         rtmp11[row]   = 1.0/rtmp11[row];
2141         rtmp22[row+1] = 1.0/rtmp22[row+1];
2142         /* copy row entries from dense representation to sparse */
2143         for (j=0; j<nz; j++) {
2144           idx    = pj[j];
2145           pc1[j] = rtmp11[idx];
2146           pc2[j] = rtmp22[idx];
2147           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2148         }
2149         sctx.rs = rs;
2150         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2151         if (sctx.newshift) goto endofwhile;
2152         break;
2153 
2154       case 3:
2155         for  (j=0; j<nz; j++) {
2156           idx         = bjtmp[j];
2157           rtmp11[idx] = 0.0;
2158           rtmp22[idx] = 0.0;
2159           rtmp33[idx] = 0.0;
2160         }
2161         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2162         idx    = r[row];
2163         nz_tmp = ai[idx+1] - ai[idx];
2164         ajtmp  = aj + ai[idx];
2165         v1     = aa + ai[idx];
2166         v2     = aa + ai[idx+1];
2167         v3     = aa + ai[idx+2];
2168         for (j=0; j<nz_tmp; j++) {
2169           idx         = ics[ajtmp[j]];
2170           rtmp11[idx] = v1[j];
2171           rtmp22[idx] = v2[j];
2172           rtmp33[idx] = v3[j];
2173         }
2174         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2175         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2176         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2177 
2178         /* loop over all pivot row blocks above this row block */
2179         prow = *bjtmp++;
2180         while (prow < row) {
2181           pc1 = rtmp11 + prow;
2182           pc2 = rtmp22 + prow;
2183           pc3 = rtmp33 + prow;
2184           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2185             pv   = ba  + bd[prow];
2186             pj   = nbj + bd[prow];
2187             mul1 = *pc1 * *pv;
2188             mul2 = *pc2 * *pv;
2189             mul3 = *pc3 * *pv;
2190             ++pv;
2191             *pc1 = mul1;
2192             *pc2 = mul2;
2193             *pc3 = mul3;
2194 
2195             nz_tmp = bi[prow+1] - bd[prow] - 1;
2196             /* update this row based on pivot row */
2197             for (j=0; j<nz_tmp; j++) {
2198               tmp          = pv[j];
2199               idx          = pj[j];
2200               rtmp11[idx] -= mul1 * tmp;
2201               rtmp22[idx] -= mul2 * tmp;
2202               rtmp33[idx] -= mul3 * tmp;
2203             }
2204             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2205           }
2206           prow = *bjtmp++;
2207         }
2208 
2209         /* Now take care of diagonal 3x3 block in this set of rows */
2210         /* note: prow = row here */
2211         pc1 = rtmp11 + prow;
2212         pc2 = rtmp22 + prow;
2213         pc3 = rtmp33 + prow;
2214 
2215         sctx.pv = *pc1;
2216         pj      = bj + bi[prow];
2217         rs      = 0.0;
2218         for (j=0; j<nz; j++) {
2219           idx = pj[j];
2220           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2221         }
2222         sctx.rs = rs;
2223         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2224         if (sctx.newshift) goto endofwhile;
2225 
2226         if (*pc2 != 0.0 || *pc3 != 0.0) {
2227           mul2   = (*pc2)/(*pc1);
2228           mul3   = (*pc3)/(*pc1);
2229           *pc2   = mul2;
2230           *pc3   = mul3;
2231           nz_tmp = bi[prow+1] - bd[prow] - 1;
2232           pj     = nbj + bd[prow];
2233           for (j=0; j<nz_tmp; j++) {
2234             idx          = pj[j];
2235             tmp          = rtmp11[idx];
2236             rtmp22[idx] -= mul2 * tmp;
2237             rtmp33[idx] -= mul3 * tmp;
2238           }
2239           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2240         }
2241         ++prow;
2242 
2243         pc2     = rtmp22 + prow;
2244         pc3     = rtmp33 + prow;
2245         sctx.pv = *pc2;
2246         pj      = bj + bi[prow];
2247         rs      = 0.0;
2248         for (j=0; j<nz; j++) {
2249           idx = pj[j];
2250           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2251         }
2252         sctx.rs = rs;
2253         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2254         if (sctx.newshift) goto endofwhile;
2255 
2256         if (*pc3 != 0.0) {
2257           mul3   = (*pc3)/(*pc2);
2258           *pc3   = mul3;
2259           pj     = nbj + bd[prow];
2260           nz_tmp = bi[prow+1] - bd[prow] - 1;
2261           for (j=0; j<nz_tmp; j++) {
2262             idx          = pj[j];
2263             tmp          = rtmp22[idx];
2264             rtmp33[idx] -= mul3 * tmp;
2265           }
2266           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2267         }
2268 
2269         pj  = bj + bi[row];
2270         pc1 = ba + bi[row];
2271         pc2 = ba + bi[row+1];
2272         pc3 = ba + bi[row+2];
2273 
2274         sctx.pv       = rtmp33[row+2];
2275         rs            = 0.0;
2276         rtmp11[row]   = 1.0/rtmp11[row];
2277         rtmp22[row+1] = 1.0/rtmp22[row+1];
2278         rtmp33[row+2] = 1.0/rtmp33[row+2];
2279         /* copy row entries from dense representation to sparse */
2280         for (j=0; j<nz; j++) {
2281           idx    = pj[j];
2282           pc1[j] = rtmp11[idx];
2283           pc2[j] = rtmp22[idx];
2284           pc3[j] = rtmp33[idx];
2285           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2286         }
2287 
2288         sctx.rs = rs;
2289         ierr    = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2290         if (sctx.newshift) goto endofwhile;
2291         break;
2292 
2293       default:
2294         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2295       }
2296       row += nodesz;                 /* Update the row */
2297     }
2298 endofwhile:;
2299   } while (sctx.newshift);
2300   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2301   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2302   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2303   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2304   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2305 
2306   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2307   /* do not set solve add, since MatSolve_Inode + Add is faster */
2308   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2309   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2310   C->assembled              = PETSC_TRUE;
2311   C->preallocated           = PETSC_TRUE;
2312   if (sctx.nshift) {
2313     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2314       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2315     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2316       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2317     }
2318   }
2319   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2320   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2321   PetscFunctionReturn(0);
2322 }
2323 
2324 
2325 /* ----------------------------------------------------------- */
2326 #undef __FUNCT__
2327 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2328 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2329 {
2330   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2331   IS                iscol = a->col,isrow = a->row;
2332   PetscErrorCode    ierr;
2333   const PetscInt    *r,*c,*rout,*cout;
2334   PetscInt          i,j,n = A->rmap->n;
2335   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2336   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2337   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2338   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2339   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2340   const PetscScalar *b;
2341 
2342   PetscFunctionBegin;
2343   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2344   node_max = a->inode.node_count;
2345   ns       = a->inode.size;     /* Node Size array */
2346 
2347   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2348   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2349   tmp  = a->solve_work;
2350 
2351   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2352   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2353 
2354   /* forward solve the lower triangular */
2355   tmps = tmp;
2356   aa   = a_a;
2357   aj   = a_j;
2358   ad   = a->diag;
2359 
2360   for (i = 0,row = 0; i< node_max; ++i) {
2361     nsz = ns[i];
2362     aii = ai[row];
2363     v1  = aa + aii;
2364     vi  = aj + aii;
2365     nz  = ai[row+1]- ai[row];
2366 
2367     if (i < node_max-1) {
2368       /* Prefetch the indices for the next block */
2369       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2370       /* Prefetch the data for the next block */
2371       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2372     }
2373 
2374     switch (nsz) {               /* Each loop in 'case' is unrolled */
2375     case 1:
2376       sum1 = b[r[row]];
2377       for (j=0; j<nz-1; j+=2) {
2378         i0    = vi[j];
2379         i1    = vi[j+1];
2380         tmp0  = tmps[i0];
2381         tmp1  = tmps[i1];
2382         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2383       }
2384       if (j == nz-1) {
2385         tmp0  = tmps[vi[j]];
2386         sum1 -= v1[j]*tmp0;
2387       }
2388       tmp[row++]=sum1;
2389       break;
2390     case 2:
2391       sum1 = b[r[row]];
2392       sum2 = b[r[row+1]];
2393       v2   = aa + ai[row+1];
2394 
2395       for (j=0; j<nz-1; j+=2) {
2396         i0    = vi[j];
2397         i1    = vi[j+1];
2398         tmp0  = tmps[i0];
2399         tmp1  = tmps[i1];
2400         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2401         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2402       }
2403       if (j == nz-1) {
2404         tmp0  = tmps[vi[j]];
2405         sum1 -= v1[j] *tmp0;
2406         sum2 -= v2[j] *tmp0;
2407       }
2408       sum2     -= v2[nz] * sum1;
2409       tmp[row++]=sum1;
2410       tmp[row++]=sum2;
2411       break;
2412     case 3:
2413       sum1 = b[r[row]];
2414       sum2 = b[r[row+1]];
2415       sum3 = b[r[row+2]];
2416       v2   = aa + ai[row+1];
2417       v3   = aa + ai[row+2];
2418 
2419       for (j=0; j<nz-1; j+=2) {
2420         i0    = vi[j];
2421         i1    = vi[j+1];
2422         tmp0  = tmps[i0];
2423         tmp1  = tmps[i1];
2424         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2425         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2426         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2427       }
2428       if (j == nz-1) {
2429         tmp0  = tmps[vi[j]];
2430         sum1 -= v1[j] *tmp0;
2431         sum2 -= v2[j] *tmp0;
2432         sum3 -= v3[j] *tmp0;
2433       }
2434       sum2     -= v2[nz] * sum1;
2435       sum3     -= v3[nz] * sum1;
2436       sum3     -= v3[nz+1] * sum2;
2437       tmp[row++]=sum1;
2438       tmp[row++]=sum2;
2439       tmp[row++]=sum3;
2440       break;
2441 
2442     case 4:
2443       sum1 = b[r[row]];
2444       sum2 = b[r[row+1]];
2445       sum3 = b[r[row+2]];
2446       sum4 = b[r[row+3]];
2447       v2   = aa + ai[row+1];
2448       v3   = aa + ai[row+2];
2449       v4   = aa + ai[row+3];
2450 
2451       for (j=0; j<nz-1; j+=2) {
2452         i0    = vi[j];
2453         i1    = vi[j+1];
2454         tmp0  = tmps[i0];
2455         tmp1  = tmps[i1];
2456         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2457         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2458         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2459         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2460       }
2461       if (j == nz-1) {
2462         tmp0  = tmps[vi[j]];
2463         sum1 -= v1[j] *tmp0;
2464         sum2 -= v2[j] *tmp0;
2465         sum3 -= v3[j] *tmp0;
2466         sum4 -= v4[j] *tmp0;
2467       }
2468       sum2 -= v2[nz] * sum1;
2469       sum3 -= v3[nz] * sum1;
2470       sum4 -= v4[nz] * sum1;
2471       sum3 -= v3[nz+1] * sum2;
2472       sum4 -= v4[nz+1] * sum2;
2473       sum4 -= v4[nz+2] * sum3;
2474 
2475       tmp[row++]=sum1;
2476       tmp[row++]=sum2;
2477       tmp[row++]=sum3;
2478       tmp[row++]=sum4;
2479       break;
2480     case 5:
2481       sum1 = b[r[row]];
2482       sum2 = b[r[row+1]];
2483       sum3 = b[r[row+2]];
2484       sum4 = b[r[row+3]];
2485       sum5 = b[r[row+4]];
2486       v2   = aa + ai[row+1];
2487       v3   = aa + ai[row+2];
2488       v4   = aa + ai[row+3];
2489       v5   = aa + ai[row+4];
2490 
2491       for (j=0; j<nz-1; j+=2) {
2492         i0    = vi[j];
2493         i1    = vi[j+1];
2494         tmp0  = tmps[i0];
2495         tmp1  = tmps[i1];
2496         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2497         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2498         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2499         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2500         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2501       }
2502       if (j == nz-1) {
2503         tmp0  = tmps[vi[j]];
2504         sum1 -= v1[j] *tmp0;
2505         sum2 -= v2[j] *tmp0;
2506         sum3 -= v3[j] *tmp0;
2507         sum4 -= v4[j] *tmp0;
2508         sum5 -= v5[j] *tmp0;
2509       }
2510 
2511       sum2 -= v2[nz] * sum1;
2512       sum3 -= v3[nz] * sum1;
2513       sum4 -= v4[nz] * sum1;
2514       sum5 -= v5[nz] * sum1;
2515       sum3 -= v3[nz+1] * sum2;
2516       sum4 -= v4[nz+1] * sum2;
2517       sum5 -= v5[nz+1] * sum2;
2518       sum4 -= v4[nz+2] * sum3;
2519       sum5 -= v5[nz+2] * sum3;
2520       sum5 -= v5[nz+3] * sum4;
2521 
2522       tmp[row++]=sum1;
2523       tmp[row++]=sum2;
2524       tmp[row++]=sum3;
2525       tmp[row++]=sum4;
2526       tmp[row++]=sum5;
2527       break;
2528     default:
2529       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2530     }
2531   }
2532   /* backward solve the upper triangular */
2533   for (i=node_max -1,row = n-1; i>=0; i--) {
2534     nsz = ns[i];
2535     aii = ad[row+1] + 1;
2536     v1  = aa + aii;
2537     vi  = aj + aii;
2538     nz  = ad[row]- ad[row+1] - 1;
2539 
2540     if (i > 0) {
2541       /* Prefetch the indices for the next block */
2542       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2543       /* Prefetch the data for the next block */
2544       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2545     }
2546 
2547     switch (nsz) {               /* Each loop in 'case' is unrolled */
2548     case 1:
2549       sum1 = tmp[row];
2550 
2551       for (j=0; j<nz-1; j+=2) {
2552         i0    = vi[j];
2553         i1    = vi[j+1];
2554         tmp0  = tmps[i0];
2555         tmp1  = tmps[i1];
2556         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2557       }
2558       if (j == nz-1) {
2559         tmp0  = tmps[vi[j]];
2560         sum1 -= v1[j]*tmp0;
2561       }
2562       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2563       break;
2564     case 2:
2565       sum1 = tmp[row];
2566       sum2 = tmp[row-1];
2567       v2   = aa + ad[row] + 1;
2568       for (j=0; j<nz-1; j+=2) {
2569         i0    = vi[j];
2570         i1    = vi[j+1];
2571         tmp0  = tmps[i0];
2572         tmp1  = tmps[i1];
2573         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2574         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2575       }
2576       if (j == nz-1) {
2577         tmp0  = tmps[vi[j]];
2578         sum1 -= v1[j]* tmp0;
2579         sum2 -= v2[j+1]* tmp0;
2580       }
2581 
2582       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2583       sum2     -= v2[0] * tmp0;
2584       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2585       break;
2586     case 3:
2587       sum1 = tmp[row];
2588       sum2 = tmp[row -1];
2589       sum3 = tmp[row -2];
2590       v2   = aa + ad[row] + 1;
2591       v3   = aa + ad[row -1] + 1;
2592       for (j=0; j<nz-1; j+=2) {
2593         i0    = vi[j];
2594         i1    = vi[j+1];
2595         tmp0  = tmps[i0];
2596         tmp1  = tmps[i1];
2597         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2598         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2599         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2600       }
2601       if (j== nz-1) {
2602         tmp0  = tmps[vi[j]];
2603         sum1 -= v1[j] * tmp0;
2604         sum2 -= v2[j+1] * tmp0;
2605         sum3 -= v3[j+2] * tmp0;
2606       }
2607       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2608       sum2     -= v2[0]* tmp0;
2609       sum3     -= v3[1] * tmp0;
2610       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2611       sum3     -= v3[0]* tmp0;
2612       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2613 
2614       break;
2615     case 4:
2616       sum1 = tmp[row];
2617       sum2 = tmp[row -1];
2618       sum3 = tmp[row -2];
2619       sum4 = tmp[row -3];
2620       v2   = aa + ad[row]+1;
2621       v3   = aa + ad[row -1]+1;
2622       v4   = aa + ad[row -2]+1;
2623 
2624       for (j=0; j<nz-1; j+=2) {
2625         i0    = vi[j];
2626         i1    = vi[j+1];
2627         tmp0  = tmps[i0];
2628         tmp1  = tmps[i1];
2629         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2630         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2631         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2632         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2633       }
2634       if (j== nz-1) {
2635         tmp0  = tmps[vi[j]];
2636         sum1 -= v1[j] * tmp0;
2637         sum2 -= v2[j+1] * tmp0;
2638         sum3 -= v3[j+2] * tmp0;
2639         sum4 -= v4[j+3] * tmp0;
2640       }
2641 
2642       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2643       sum2     -= v2[0] * tmp0;
2644       sum3     -= v3[1] * tmp0;
2645       sum4     -= v4[2] * tmp0;
2646       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2647       sum3     -= v3[0] * tmp0;
2648       sum4     -= v4[1] * tmp0;
2649       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2650       sum4     -= v4[0] * tmp0;
2651       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2652       break;
2653     case 5:
2654       sum1 = tmp[row];
2655       sum2 = tmp[row -1];
2656       sum3 = tmp[row -2];
2657       sum4 = tmp[row -3];
2658       sum5 = tmp[row -4];
2659       v2   = aa + ad[row]+1;
2660       v3   = aa + ad[row -1]+1;
2661       v4   = aa + ad[row -2]+1;
2662       v5   = aa + ad[row -3]+1;
2663       for (j=0; j<nz-1; j+=2) {
2664         i0    = vi[j];
2665         i1    = vi[j+1];
2666         tmp0  = tmps[i0];
2667         tmp1  = tmps[i1];
2668         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2669         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2670         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2671         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2672         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2673       }
2674       if (j==nz-1) {
2675         tmp0  = tmps[vi[j]];
2676         sum1 -= v1[j] * tmp0;
2677         sum2 -= v2[j+1] * tmp0;
2678         sum3 -= v3[j+2] * tmp0;
2679         sum4 -= v4[j+3] * tmp0;
2680         sum5 -= v5[j+4] * tmp0;
2681       }
2682 
2683       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2684       sum2     -= v2[0] * tmp0;
2685       sum3     -= v3[1] * tmp0;
2686       sum4     -= v4[2] * tmp0;
2687       sum5     -= v5[3] * tmp0;
2688       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2689       sum3     -= v3[0] * tmp0;
2690       sum4     -= v4[1] * tmp0;
2691       sum5     -= v5[2] * tmp0;
2692       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2693       sum4     -= v4[0] * tmp0;
2694       sum5     -= v5[1] * tmp0;
2695       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2696       sum5     -= v5[0] * tmp0;
2697       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2698       break;
2699     default:
2700       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2701     }
2702   }
2703   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2704   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2705   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2706   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2707   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2708   PetscFunctionReturn(0);
2709 }
2710 
2711 
2712 /*
2713      Makes a longer coloring[] array and calls the usual code with that
2714 */
2715 #undef __FUNCT__
2716 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2717 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2718 {
2719   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2720   PetscErrorCode  ierr;
2721   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2722   PetscInt        *colorused,i;
2723   ISColoringValue *newcolor;
2724 
2725   PetscFunctionBegin;
2726   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2727   /* loop over inodes, marking a color for each column*/
2728   row = 0;
2729   for (i=0; i<m; i++) {
2730     for (j=0; j<ns[i]; j++) {
2731       newcolor[row++] = coloring[i] + j*ncolors;
2732     }
2733   }
2734 
2735   /* eliminate unneeded colors */
2736   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2737   for (i=0; i<n; i++) {
2738     colorused[newcolor[i]] = 1;
2739   }
2740 
2741   for (i=1; i<5*ncolors; i++) {
2742     colorused[i] += colorused[i-1];
2743   }
2744   ncolors = colorused[5*ncolors-1];
2745   for (i=0; i<n; i++) {
2746     newcolor[i] = colorused[newcolor[i]]-1;
2747   }
2748   ierr = PetscFree(colorused);CHKERRQ(ierr);
2749   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2750   ierr = PetscFree(coloring);CHKERRQ(ierr);
2751   PetscFunctionReturn(0);
2752 }
2753 
2754 #include <petsc/private/kernels/blockinvert.h>
2755 
2756 #undef __FUNCT__
2757 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2758 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2759 {
2760   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2761   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2762   MatScalar         *ibdiag,*bdiag,work[25],*t;
2763   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2764   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2765   const PetscScalar *xb, *b;
2766   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2767   PetscErrorCode    ierr;
2768   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2769   PetscInt          sz,k,ipvt[5];
2770   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2771 
2772   PetscFunctionBegin;
2773   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2774   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2775   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2776 
2777   if (!a->inode.ibdiagvalid) {
2778     if (!a->inode.ibdiag) {
2779       /* calculate space needed for diagonal blocks */
2780       for (i=0; i<m; i++) {
2781         cnt += sizes[i]*sizes[i];
2782       }
2783       a->inode.bdiagsize = cnt;
2784 
2785       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2786     }
2787 
2788     /* copy over the diagonal blocks and invert them */
2789     ibdiag = a->inode.ibdiag;
2790     bdiag  = a->inode.bdiag;
2791     cnt    = 0;
2792     for (i=0, row = 0; i<m; i++) {
2793       for (j=0; j<sizes[i]; j++) {
2794         for (k=0; k<sizes[i]; k++) {
2795           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2796         }
2797       }
2798       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2799 
2800       switch (sizes[i]) {
2801       case 1:
2802         /* Create matrix data structure */
2803         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2804         ibdiag[cnt] = 1.0/ibdiag[cnt];
2805         break;
2806       case 2:
2807         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2808         break;
2809       case 3:
2810         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2811         break;
2812       case 4:
2813         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2814         break;
2815       case 5:
2816         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2817         break;
2818       default:
2819         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2820       }
2821       cnt += sizes[i]*sizes[i];
2822       row += sizes[i];
2823     }
2824     a->inode.ibdiagvalid = PETSC_TRUE;
2825   }
2826   ibdiag = a->inode.ibdiag;
2827   bdiag  = a->inode.bdiag;
2828   t      = a->inode.ssor_work;
2829 
2830   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2831   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2832   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2833   if (flag & SOR_ZERO_INITIAL_GUESS) {
2834     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2835 
2836       for (i=0, row=0; i<m; i++) {
2837         sz  = diag[row] - ii[row];
2838         v1  = a->a + ii[row];
2839         idx = a->j + ii[row];
2840 
2841         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2842         switch (sizes[i]) {
2843         case 1:
2844 
2845           sum1 = b[row];
2846           for (n = 0; n<sz-1; n+=2) {
2847             i1    = idx[0];
2848             i2    = idx[1];
2849             idx  += 2;
2850             tmp0  = x[i1];
2851             tmp1  = x[i2];
2852             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2853           }
2854 
2855           if (n == sz-1) {
2856             tmp0  = x[*idx];
2857             sum1 -= *v1 * tmp0;
2858           }
2859           t[row]   = sum1;
2860           x[row++] = sum1*(*ibdiag++);
2861           break;
2862         case 2:
2863           v2   = a->a + ii[row+1];
2864           sum1 = b[row];
2865           sum2 = b[row+1];
2866           for (n = 0; n<sz-1; n+=2) {
2867             i1    = idx[0];
2868             i2    = idx[1];
2869             idx  += 2;
2870             tmp0  = x[i1];
2871             tmp1  = x[i2];
2872             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2873             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2874           }
2875 
2876           if (n == sz-1) {
2877             tmp0  = x[*idx];
2878             sum1 -= v1[0] * tmp0;
2879             sum2 -= v2[0] * tmp0;
2880           }
2881           t[row]   = sum1;
2882           t[row+1] = sum2;
2883           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2884           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2885           ibdiag  += 4;
2886           break;
2887         case 3:
2888           v2   = a->a + ii[row+1];
2889           v3   = a->a + ii[row+2];
2890           sum1 = b[row];
2891           sum2 = b[row+1];
2892           sum3 = b[row+2];
2893           for (n = 0; n<sz-1; n+=2) {
2894             i1    = idx[0];
2895             i2    = idx[1];
2896             idx  += 2;
2897             tmp0  = x[i1];
2898             tmp1  = x[i2];
2899             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2900             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2901             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2902           }
2903 
2904           if (n == sz-1) {
2905             tmp0  = x[*idx];
2906             sum1 -= v1[0] * tmp0;
2907             sum2 -= v2[0] * tmp0;
2908             sum3 -= v3[0] * tmp0;
2909           }
2910           t[row]   = sum1;
2911           t[row+1] = sum2;
2912           t[row+2] = sum3;
2913           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2914           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2915           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2916           ibdiag  += 9;
2917           break;
2918         case 4:
2919           v2   = a->a + ii[row+1];
2920           v3   = a->a + ii[row+2];
2921           v4   = a->a + ii[row+3];
2922           sum1 = b[row];
2923           sum2 = b[row+1];
2924           sum3 = b[row+2];
2925           sum4 = b[row+3];
2926           for (n = 0; n<sz-1; n+=2) {
2927             i1    = idx[0];
2928             i2    = idx[1];
2929             idx  += 2;
2930             tmp0  = x[i1];
2931             tmp1  = x[i2];
2932             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2933             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2934             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2935             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2936           }
2937 
2938           if (n == sz-1) {
2939             tmp0  = x[*idx];
2940             sum1 -= v1[0] * tmp0;
2941             sum2 -= v2[0] * tmp0;
2942             sum3 -= v3[0] * tmp0;
2943             sum4 -= v4[0] * tmp0;
2944           }
2945           t[row]   = sum1;
2946           t[row+1] = sum2;
2947           t[row+2] = sum3;
2948           t[row+3] = sum4;
2949           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2950           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2951           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2952           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2953           ibdiag  += 16;
2954           break;
2955         case 5:
2956           v2   = a->a + ii[row+1];
2957           v3   = a->a + ii[row+2];
2958           v4   = a->a + ii[row+3];
2959           v5   = a->a + ii[row+4];
2960           sum1 = b[row];
2961           sum2 = b[row+1];
2962           sum3 = b[row+2];
2963           sum4 = b[row+3];
2964           sum5 = b[row+4];
2965           for (n = 0; n<sz-1; n+=2) {
2966             i1    = idx[0];
2967             i2    = idx[1];
2968             idx  += 2;
2969             tmp0  = x[i1];
2970             tmp1  = x[i2];
2971             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2972             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2973             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2974             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2975             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2976           }
2977 
2978           if (n == sz-1) {
2979             tmp0  = x[*idx];
2980             sum1 -= v1[0] * tmp0;
2981             sum2 -= v2[0] * tmp0;
2982             sum3 -= v3[0] * tmp0;
2983             sum4 -= v4[0] * tmp0;
2984             sum5 -= v5[0] * tmp0;
2985           }
2986           t[row]   = sum1;
2987           t[row+1] = sum2;
2988           t[row+2] = sum3;
2989           t[row+3] = sum4;
2990           t[row+4] = sum5;
2991           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2992           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2993           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2994           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2995           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2996           ibdiag  += 25;
2997           break;
2998         default:
2999           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3000         }
3001       }
3002 
3003       xb   = t;
3004       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3005     } else xb = b;
3006     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3007 
3008       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3009       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3010         ibdiag -= sizes[i]*sizes[i];
3011         sz      = ii[row+1] - diag[row] - 1;
3012         v1      = a->a + diag[row] + 1;
3013         idx     = a->j + diag[row] + 1;
3014 
3015         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3016         switch (sizes[i]) {
3017         case 1:
3018 
3019           sum1 = xb[row];
3020           for (n = 0; n<sz-1; n+=2) {
3021             i1    = idx[0];
3022             i2    = idx[1];
3023             idx  += 2;
3024             tmp0  = x[i1];
3025             tmp1  = x[i2];
3026             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3027           }
3028 
3029           if (n == sz-1) {
3030             tmp0  = x[*idx];
3031             sum1 -= *v1*tmp0;
3032           }
3033           x[row--] = sum1*(*ibdiag);
3034           break;
3035 
3036         case 2:
3037 
3038           sum1 = xb[row];
3039           sum2 = xb[row-1];
3040           /* note that sum1 is associated with the second of the two rows */
3041           v2 = a->a + diag[row-1] + 2;
3042           for (n = 0; n<sz-1; n+=2) {
3043             i1    = idx[0];
3044             i2    = idx[1];
3045             idx  += 2;
3046             tmp0  = x[i1];
3047             tmp1  = x[i2];
3048             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3049             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3050           }
3051 
3052           if (n == sz-1) {
3053             tmp0  = x[*idx];
3054             sum1 -= *v1*tmp0;
3055             sum2 -= *v2*tmp0;
3056           }
3057           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3058           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3059           break;
3060         case 3:
3061 
3062           sum1 = xb[row];
3063           sum2 = xb[row-1];
3064           sum3 = xb[row-2];
3065           v2   = a->a + diag[row-1] + 2;
3066           v3   = a->a + diag[row-2] + 3;
3067           for (n = 0; n<sz-1; n+=2) {
3068             i1    = idx[0];
3069             i2    = idx[1];
3070             idx  += 2;
3071             tmp0  = x[i1];
3072             tmp1  = x[i2];
3073             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3074             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3075             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3076           }
3077 
3078           if (n == sz-1) {
3079             tmp0  = x[*idx];
3080             sum1 -= *v1*tmp0;
3081             sum2 -= *v2*tmp0;
3082             sum3 -= *v3*tmp0;
3083           }
3084           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3085           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3086           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3087           break;
3088         case 4:
3089 
3090           sum1 = xb[row];
3091           sum2 = xb[row-1];
3092           sum3 = xb[row-2];
3093           sum4 = xb[row-3];
3094           v2   = a->a + diag[row-1] + 2;
3095           v3   = a->a + diag[row-2] + 3;
3096           v4   = a->a + diag[row-3] + 4;
3097           for (n = 0; n<sz-1; n+=2) {
3098             i1    = idx[0];
3099             i2    = idx[1];
3100             idx  += 2;
3101             tmp0  = x[i1];
3102             tmp1  = x[i2];
3103             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3104             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3105             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3106             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3107           }
3108 
3109           if (n == sz-1) {
3110             tmp0  = x[*idx];
3111             sum1 -= *v1*tmp0;
3112             sum2 -= *v2*tmp0;
3113             sum3 -= *v3*tmp0;
3114             sum4 -= *v4*tmp0;
3115           }
3116           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3117           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3118           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3119           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3120           break;
3121         case 5:
3122 
3123           sum1 = xb[row];
3124           sum2 = xb[row-1];
3125           sum3 = xb[row-2];
3126           sum4 = xb[row-3];
3127           sum5 = xb[row-4];
3128           v2   = a->a + diag[row-1] + 2;
3129           v3   = a->a + diag[row-2] + 3;
3130           v4   = a->a + diag[row-3] + 4;
3131           v5   = a->a + diag[row-4] + 5;
3132           for (n = 0; n<sz-1; n+=2) {
3133             i1    = idx[0];
3134             i2    = idx[1];
3135             idx  += 2;
3136             tmp0  = x[i1];
3137             tmp1  = x[i2];
3138             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3139             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3140             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3141             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3142             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3143           }
3144 
3145           if (n == sz-1) {
3146             tmp0  = x[*idx];
3147             sum1 -= *v1*tmp0;
3148             sum2 -= *v2*tmp0;
3149             sum3 -= *v3*tmp0;
3150             sum4 -= *v4*tmp0;
3151             sum5 -= *v5*tmp0;
3152           }
3153           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3154           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3155           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3156           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3157           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3158           break;
3159         default:
3160           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3161         }
3162       }
3163 
3164       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3165     }
3166     its--;
3167   }
3168   while (its--) {
3169 
3170     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3171       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3172            i<m;
3173            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3174 
3175         sz  = diag[row] - ii[row];
3176         v1  = a->a + ii[row];
3177         idx = a->j + ii[row];
3178         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3179         switch (sizes[i]) {
3180         case 1:
3181           sum1 = b[row];
3182           for (n = 0; n<sz-1; n+=2) {
3183             i1    = idx[0];
3184             i2    = idx[1];
3185             idx  += 2;
3186             tmp0  = x[i1];
3187             tmp1  = x[i2];
3188             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3189           }
3190           if (n == sz-1) {
3191             tmp0  = x[*idx++];
3192             sum1 -= *v1 * tmp0;
3193             v1++;
3194           }
3195           t[row]   = sum1;
3196           sz      = ii[row+1] - diag[row] - 1;
3197           idx     = a->j + diag[row] + 1;
3198           v1 += 1;
3199           for (n = 0; n<sz-1; n+=2) {
3200             i1    = idx[0];
3201             i2    = idx[1];
3202             idx  += 2;
3203             tmp0  = x[i1];
3204             tmp1  = x[i2];
3205             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3206           }
3207           if (n == sz-1) {
3208             tmp0  = x[*idx++];
3209             sum1 -= *v1 * tmp0;
3210           }
3211           /* in MatSOR_SeqAIJ this line would be
3212            *
3213            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3214            *
3215            * but omega == 1, so this becomes
3216            *
3217            * x[row] = sum1*(*ibdiag++);
3218            *
3219            */
3220           x[row] = sum1*(*ibdiag);
3221           break;
3222         case 2:
3223           v2   = a->a + ii[row+1];
3224           sum1 = b[row];
3225           sum2 = b[row+1];
3226           for (n = 0; n<sz-1; n+=2) {
3227             i1    = idx[0];
3228             i2    = idx[1];
3229             idx  += 2;
3230             tmp0  = x[i1];
3231             tmp1  = x[i2];
3232             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3233             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3234           }
3235           if (n == sz-1) {
3236             tmp0  = x[*idx++];
3237             sum1 -= v1[0] * tmp0;
3238             sum2 -= v2[0] * tmp0;
3239             v1++; v2++;
3240           }
3241           t[row]   = sum1;
3242           t[row+1] = sum2;
3243           sz      = ii[row+1] - diag[row] - 2;
3244           idx     = a->j + diag[row] + 2;
3245           v1 += 2;
3246           v2 += 2;
3247           for (n = 0; n<sz-1; n+=2) {
3248             i1    = idx[0];
3249             i2    = idx[1];
3250             idx  += 2;
3251             tmp0  = x[i1];
3252             tmp1  = x[i2];
3253             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3254             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3255           }
3256           if (n == sz-1) {
3257             tmp0  = x[*idx];
3258             sum1 -= v1[0] * tmp0;
3259             sum2 -= v2[0] * tmp0;
3260           }
3261           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3262           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3263           break;
3264         case 3:
3265           v2   = a->a + ii[row+1];
3266           v3   = a->a + ii[row+2];
3267           sum1 = b[row];
3268           sum2 = b[row+1];
3269           sum3 = b[row+2];
3270           for (n = 0; n<sz-1; n+=2) {
3271             i1    = idx[0];
3272             i2    = idx[1];
3273             idx  += 2;
3274             tmp0  = x[i1];
3275             tmp1  = x[i2];
3276             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3277             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3278             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3279           }
3280           if (n == sz-1) {
3281             tmp0  = x[*idx++];
3282             sum1 -= v1[0] * tmp0;
3283             sum2 -= v2[0] * tmp0;
3284             sum3 -= v3[0] * tmp0;
3285             v1++; v2++; v3++;
3286           }
3287           t[row]   = sum1;
3288           t[row+1] = sum2;
3289           t[row+2] = sum3;
3290           sz      = ii[row+1] - diag[row] - 3;
3291           idx     = a->j + diag[row] + 3;
3292           v1 += 3;
3293           v2 += 3;
3294           v3 += 3;
3295           for (n = 0; n<sz-1; n+=2) {
3296             i1    = idx[0];
3297             i2    = idx[1];
3298             idx  += 2;
3299             tmp0  = x[i1];
3300             tmp1  = x[i2];
3301             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3302             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3303             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3304           }
3305           if (n == sz-1) {
3306             tmp0  = x[*idx];
3307             sum1 -= v1[0] * tmp0;
3308             sum2 -= v2[0] * tmp0;
3309             sum3 -= v3[0] * tmp0;
3310           }
3311           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3312           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3313           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3314           break;
3315         case 4:
3316           v2   = a->a + ii[row+1];
3317           v3   = a->a + ii[row+2];
3318           v4   = a->a + ii[row+3];
3319           sum1 = b[row];
3320           sum2 = b[row+1];
3321           sum3 = b[row+2];
3322           sum4 = b[row+3];
3323           for (n = 0; n<sz-1; n+=2) {
3324             i1    = idx[0];
3325             i2    = idx[1];
3326             idx  += 2;
3327             tmp0  = x[i1];
3328             tmp1  = x[i2];
3329             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3330             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3331             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3332             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3333           }
3334           if (n == sz-1) {
3335             tmp0  = x[*idx++];
3336             sum1 -= v1[0] * tmp0;
3337             sum2 -= v2[0] * tmp0;
3338             sum3 -= v3[0] * tmp0;
3339             sum4 -= v4[0] * tmp0;
3340             v1++; v2++; v3++; v4++;
3341           }
3342           t[row]   = sum1;
3343           t[row+1] = sum2;
3344           t[row+2] = sum3;
3345           t[row+3] = sum4;
3346           sz      = ii[row+1] - diag[row] - 4;
3347           idx     = a->j + diag[row] + 4;
3348           v1 += 4;
3349           v2 += 4;
3350           v3 += 4;
3351           v4 += 4;
3352           for (n = 0; n<sz-1; n+=2) {
3353             i1    = idx[0];
3354             i2    = idx[1];
3355             idx  += 2;
3356             tmp0  = x[i1];
3357             tmp1  = x[i2];
3358             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3359             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3360             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3361             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3362           }
3363           if (n == sz-1) {
3364             tmp0  = x[*idx];
3365             sum1 -= v1[0] * tmp0;
3366             sum2 -= v2[0] * tmp0;
3367             sum3 -= v3[0] * tmp0;
3368             sum4 -= v4[0] * tmp0;
3369           }
3370           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3371           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3372           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3373           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3374           break;
3375         case 5:
3376           v2   = a->a + ii[row+1];
3377           v3   = a->a + ii[row+2];
3378           v4   = a->a + ii[row+3];
3379           v5   = a->a + ii[row+4];
3380           sum1 = b[row];
3381           sum2 = b[row+1];
3382           sum3 = b[row+2];
3383           sum4 = b[row+3];
3384           sum5 = b[row+4];
3385           for (n = 0; n<sz-1; n+=2) {
3386             i1    = idx[0];
3387             i2    = idx[1];
3388             idx  += 2;
3389             tmp0  = x[i1];
3390             tmp1  = x[i2];
3391             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3392             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3393             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3394             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3395             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3396           }
3397           if (n == sz-1) {
3398             tmp0  = x[*idx++];
3399             sum1 -= v1[0] * tmp0;
3400             sum2 -= v2[0] * tmp0;
3401             sum3 -= v3[0] * tmp0;
3402             sum4 -= v4[0] * tmp0;
3403             sum5 -= v5[0] * tmp0;
3404             v1++; v2++; v3++; v4++; v5++;
3405           }
3406           t[row]   = sum1;
3407           t[row+1] = sum2;
3408           t[row+2] = sum3;
3409           t[row+3] = sum4;
3410           t[row+4] = sum5;
3411           sz      = ii[row+1] - diag[row] - 5;
3412           idx     = a->j + diag[row] + 5;
3413           v1 += 5;
3414           v2 += 5;
3415           v3 += 5;
3416           v4 += 5;
3417           v5 += 5;
3418           for (n = 0; n<sz-1; n+=2) {
3419             i1    = idx[0];
3420             i2    = idx[1];
3421             idx  += 2;
3422             tmp0  = x[i1];
3423             tmp1  = x[i2];
3424             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3425             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3426             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3427             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3428             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3429           }
3430           if (n == sz-1) {
3431             tmp0  = x[*idx];
3432             sum1 -= v1[0] * tmp0;
3433             sum2 -= v2[0] * tmp0;
3434             sum3 -= v3[0] * tmp0;
3435             sum4 -= v4[0] * tmp0;
3436             sum5 -= v5[0] * tmp0;
3437           }
3438           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3439           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3440           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3441           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3442           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3443           break;
3444         default:
3445           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3446         }
3447       }
3448       xb   = t;
3449       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3450     } else xb = b;
3451 
3452     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3453 
3454       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3455       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3456         ibdiag -= sizes[i]*sizes[i];
3457 
3458         /* set RHS */
3459         if (xb == b) {
3460           /* whole (old way) */
3461           sz      = ii[row+1] - ii[row];
3462           idx     = a->j + ii[row];
3463           switch (sizes[i]) {
3464           case 5:
3465             v5      = a->a + ii[row-4];
3466           case 4: /* fall through */
3467             v4      = a->a + ii[row-3];
3468           case 3:
3469             v3      = a->a + ii[row-2];
3470           case 2:
3471             v2      = a->a + ii[row-1];
3472           case 1:
3473             v1      = a->a + ii[row];
3474             break;
3475           default:
3476             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3477           }
3478         } else {
3479           /* upper, no diag */
3480           sz      = ii[row+1] - diag[row] - 1;
3481           idx     = a->j + diag[row] + 1;
3482           switch (sizes[i]) {
3483           case 5:
3484             v5      = a->a + diag[row-4] + 5;
3485           case 4: /* fall through */
3486             v4      = a->a + diag[row-3] + 4;
3487           case 3:
3488             v3      = a->a + diag[row-2] + 3;
3489           case 2:
3490             v2      = a->a + diag[row-1] + 2;
3491           case 1:
3492             v1      = a->a + diag[row] + 1;
3493           }
3494         }
3495         /* set sum */
3496         switch (sizes[i]) {
3497         case 5:
3498           sum5 = xb[row-4];
3499         case 4: /* fall through */
3500           sum4 = xb[row-3];
3501         case 3:
3502           sum3 = xb[row-2];
3503         case 2:
3504           sum2 = xb[row-1];
3505         case 1:
3506           /* note that sum1 is associated with the last row */
3507           sum1 = xb[row];
3508         }
3509         /* do sums */
3510         for (n = 0; n<sz-1; n+=2) {
3511             i1    = idx[0];
3512             i2    = idx[1];
3513             idx  += 2;
3514             tmp0  = x[i1];
3515             tmp1  = x[i2];
3516             switch (sizes[i]) {
3517             case 5:
3518               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3519             case 4: /* fall through */
3520               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3521             case 3:
3522               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3523             case 2:
3524               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3525             case 1:
3526               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3527             }
3528         }
3529         /* ragged edge */
3530         if (n == sz-1) {
3531           tmp0  = x[*idx];
3532           switch (sizes[i]) {
3533           case 5:
3534             sum5 -= *v5*tmp0;
3535           case 4: /* fall through */
3536             sum4 -= *v4*tmp0;
3537           case 3:
3538             sum3 -= *v3*tmp0;
3539           case 2:
3540             sum2 -= *v2*tmp0;
3541           case 1:
3542             sum1 -= *v1*tmp0;
3543           }
3544         }
3545         /* update */
3546         if (xb == b) {
3547           /* whole (old way) w/ diag */
3548           switch (sizes[i]) {
3549           case 5:
3550             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3551             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3552             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3553             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3554             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3555             break;
3556           case 4:
3557             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3558             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3559             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3560             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3561             break;
3562           case 3:
3563             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3564             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3565             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3566             break;
3567           case 2:
3568             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3569             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3570             break;
3571           case 1:
3572             x[row--] += sum1*(*ibdiag);
3573             break;
3574           }
3575         } else {
3576           /* no diag so set =  */
3577           switch (sizes[i]) {
3578           case 5:
3579             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3580             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3581             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3582             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3583             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3584             break;
3585           case 4:
3586             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3587             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3588             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3589             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3590             break;
3591           case 3:
3592             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3593             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3594             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3595             break;
3596           case 2:
3597             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3598             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3599             break;
3600           case 1:
3601             x[row--] = sum1*(*ibdiag);
3602             break;
3603           }
3604         }
3605       }
3606       if (xb == b) {
3607         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3608       } else {
3609         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3610       }
3611     }
3612   }
3613   if (flag & SOR_EISENSTAT) {
3614     /*
3615           Apply  (U + D)^-1  where D is now the block diagonal
3616     */
3617     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3618     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3619       ibdiag -= sizes[i]*sizes[i];
3620       sz      = ii[row+1] - diag[row] - 1;
3621       v1      = a->a + diag[row] + 1;
3622       idx     = a->j + diag[row] + 1;
3623       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3624       switch (sizes[i]) {
3625       case 1:
3626 
3627         sum1 = b[row];
3628         for (n = 0; n<sz-1; n+=2) {
3629           i1    = idx[0];
3630           i2    = idx[1];
3631           idx  += 2;
3632           tmp0  = x[i1];
3633           tmp1  = x[i2];
3634           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3635         }
3636 
3637         if (n == sz-1) {
3638           tmp0  = x[*idx];
3639           sum1 -= *v1*tmp0;
3640         }
3641         x[row] = sum1*(*ibdiag);row--;
3642         break;
3643 
3644       case 2:
3645 
3646         sum1 = b[row];
3647         sum2 = b[row-1];
3648         /* note that sum1 is associated with the second of the two rows */
3649         v2 = a->a + diag[row-1] + 2;
3650         for (n = 0; n<sz-1; n+=2) {
3651           i1    = idx[0];
3652           i2    = idx[1];
3653           idx  += 2;
3654           tmp0  = x[i1];
3655           tmp1  = x[i2];
3656           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3657           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3658         }
3659 
3660         if (n == sz-1) {
3661           tmp0  = x[*idx];
3662           sum1 -= *v1*tmp0;
3663           sum2 -= *v2*tmp0;
3664         }
3665         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3666         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3667         row     -= 2;
3668         break;
3669       case 3:
3670 
3671         sum1 = b[row];
3672         sum2 = b[row-1];
3673         sum3 = b[row-2];
3674         v2   = a->a + diag[row-1] + 2;
3675         v3   = a->a + diag[row-2] + 3;
3676         for (n = 0; n<sz-1; n+=2) {
3677           i1    = idx[0];
3678           i2    = idx[1];
3679           idx  += 2;
3680           tmp0  = x[i1];
3681           tmp1  = x[i2];
3682           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3683           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3684           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3685         }
3686 
3687         if (n == sz-1) {
3688           tmp0  = x[*idx];
3689           sum1 -= *v1*tmp0;
3690           sum2 -= *v2*tmp0;
3691           sum3 -= *v3*tmp0;
3692         }
3693         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3694         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3695         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3696         row     -= 3;
3697         break;
3698       case 4:
3699 
3700         sum1 = b[row];
3701         sum2 = b[row-1];
3702         sum3 = b[row-2];
3703         sum4 = b[row-3];
3704         v2   = a->a + diag[row-1] + 2;
3705         v3   = a->a + diag[row-2] + 3;
3706         v4   = a->a + diag[row-3] + 4;
3707         for (n = 0; n<sz-1; n+=2) {
3708           i1    = idx[0];
3709           i2    = idx[1];
3710           idx  += 2;
3711           tmp0  = x[i1];
3712           tmp1  = x[i2];
3713           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3714           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3715           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3716           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3717         }
3718 
3719         if (n == sz-1) {
3720           tmp0  = x[*idx];
3721           sum1 -= *v1*tmp0;
3722           sum2 -= *v2*tmp0;
3723           sum3 -= *v3*tmp0;
3724           sum4 -= *v4*tmp0;
3725         }
3726         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3727         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3728         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3729         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3730         row     -= 4;
3731         break;
3732       case 5:
3733 
3734         sum1 = b[row];
3735         sum2 = b[row-1];
3736         sum3 = b[row-2];
3737         sum4 = b[row-3];
3738         sum5 = b[row-4];
3739         v2   = a->a + diag[row-1] + 2;
3740         v3   = a->a + diag[row-2] + 3;
3741         v4   = a->a + diag[row-3] + 4;
3742         v5   = a->a + diag[row-4] + 5;
3743         for (n = 0; n<sz-1; n+=2) {
3744           i1    = idx[0];
3745           i2    = idx[1];
3746           idx  += 2;
3747           tmp0  = x[i1];
3748           tmp1  = x[i2];
3749           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3750           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3751           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3752           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3753           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3754         }
3755 
3756         if (n == sz-1) {
3757           tmp0  = x[*idx];
3758           sum1 -= *v1*tmp0;
3759           sum2 -= *v2*tmp0;
3760           sum3 -= *v3*tmp0;
3761           sum4 -= *v4*tmp0;
3762           sum5 -= *v5*tmp0;
3763         }
3764         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3765         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3766         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3767         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3768         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3769         row     -= 5;
3770         break;
3771       default:
3772         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3773       }
3774     }
3775     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3776 
3777     /*
3778            t = b - D x    where D is the block diagonal
3779     */
3780     cnt = 0;
3781     for (i=0, row=0; i<m; i++) {
3782       switch (sizes[i]) {
3783       case 1:
3784         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3785         break;
3786       case 2:
3787         x1       = x[row]; x2 = x[row+1];
3788         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3789         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3790         t[row]   = b[row] - tmp1;
3791         t[row+1] = b[row+1] - tmp2; row += 2;
3792         cnt     += 4;
3793         break;
3794       case 3:
3795         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3796         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3797         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3798         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3799         t[row]   = b[row] - tmp1;
3800         t[row+1] = b[row+1] - tmp2;
3801         t[row+2] = b[row+2] - tmp3; row += 3;
3802         cnt     += 9;
3803         break;
3804       case 4:
3805         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3806         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3807         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3808         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3809         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3810         t[row]   = b[row] - tmp1;
3811         t[row+1] = b[row+1] - tmp2;
3812         t[row+2] = b[row+2] - tmp3;
3813         t[row+3] = b[row+3] - tmp4; row += 4;
3814         cnt     += 16;
3815         break;
3816       case 5:
3817         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3818         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3819         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3820         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3821         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3822         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3823         t[row]   = b[row] - tmp1;
3824         t[row+1] = b[row+1] - tmp2;
3825         t[row+2] = b[row+2] - tmp3;
3826         t[row+3] = b[row+3] - tmp4;
3827         t[row+4] = b[row+4] - tmp5;row += 5;
3828         cnt     += 25;
3829         break;
3830       default:
3831         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3832       }
3833     }
3834     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3835 
3836 
3837 
3838     /*
3839           Apply (L + D)^-1 where D is the block diagonal
3840     */
3841     for (i=0, row=0; i<m; i++) {
3842       sz  = diag[row] - ii[row];
3843       v1  = a->a + ii[row];
3844       idx = a->j + ii[row];
3845       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3846       switch (sizes[i]) {
3847       case 1:
3848 
3849         sum1 = t[row];
3850         for (n = 0; n<sz-1; n+=2) {
3851           i1    = idx[0];
3852           i2    = idx[1];
3853           idx  += 2;
3854           tmp0  = t[i1];
3855           tmp1  = t[i2];
3856           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3857         }
3858 
3859         if (n == sz-1) {
3860           tmp0  = t[*idx];
3861           sum1 -= *v1 * tmp0;
3862         }
3863         x[row] += t[row] = sum1*(*ibdiag++); row++;
3864         break;
3865       case 2:
3866         v2   = a->a + ii[row+1];
3867         sum1 = t[row];
3868         sum2 = t[row+1];
3869         for (n = 0; n<sz-1; n+=2) {
3870           i1    = idx[0];
3871           i2    = idx[1];
3872           idx  += 2;
3873           tmp0  = t[i1];
3874           tmp1  = t[i2];
3875           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3876           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3877         }
3878 
3879         if (n == sz-1) {
3880           tmp0  = t[*idx];
3881           sum1 -= v1[0] * tmp0;
3882           sum2 -= v2[0] * tmp0;
3883         }
3884         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3885         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3886         ibdiag   += 4; row += 2;
3887         break;
3888       case 3:
3889         v2   = a->a + ii[row+1];
3890         v3   = a->a + ii[row+2];
3891         sum1 = t[row];
3892         sum2 = t[row+1];
3893         sum3 = t[row+2];
3894         for (n = 0; n<sz-1; n+=2) {
3895           i1    = idx[0];
3896           i2    = idx[1];
3897           idx  += 2;
3898           tmp0  = t[i1];
3899           tmp1  = t[i2];
3900           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3901           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3902           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3903         }
3904 
3905         if (n == sz-1) {
3906           tmp0  = t[*idx];
3907           sum1 -= v1[0] * tmp0;
3908           sum2 -= v2[0] * tmp0;
3909           sum3 -= v3[0] * tmp0;
3910         }
3911         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3912         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3913         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3914         ibdiag   += 9; row += 3;
3915         break;
3916       case 4:
3917         v2   = a->a + ii[row+1];
3918         v3   = a->a + ii[row+2];
3919         v4   = a->a + ii[row+3];
3920         sum1 = t[row];
3921         sum2 = t[row+1];
3922         sum3 = t[row+2];
3923         sum4 = t[row+3];
3924         for (n = 0; n<sz-1; n+=2) {
3925           i1    = idx[0];
3926           i2    = idx[1];
3927           idx  += 2;
3928           tmp0  = t[i1];
3929           tmp1  = t[i2];
3930           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3931           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3932           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3933           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3934         }
3935 
3936         if (n == sz-1) {
3937           tmp0  = t[*idx];
3938           sum1 -= v1[0] * tmp0;
3939           sum2 -= v2[0] * tmp0;
3940           sum3 -= v3[0] * tmp0;
3941           sum4 -= v4[0] * tmp0;
3942         }
3943         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3944         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3945         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3946         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3947         ibdiag   += 16; row += 4;
3948         break;
3949       case 5:
3950         v2   = a->a + ii[row+1];
3951         v3   = a->a + ii[row+2];
3952         v4   = a->a + ii[row+3];
3953         v5   = a->a + ii[row+4];
3954         sum1 = t[row];
3955         sum2 = t[row+1];
3956         sum3 = t[row+2];
3957         sum4 = t[row+3];
3958         sum5 = t[row+4];
3959         for (n = 0; n<sz-1; n+=2) {
3960           i1    = idx[0];
3961           i2    = idx[1];
3962           idx  += 2;
3963           tmp0  = t[i1];
3964           tmp1  = t[i2];
3965           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3966           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3967           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3968           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3969           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3970         }
3971 
3972         if (n == sz-1) {
3973           tmp0  = t[*idx];
3974           sum1 -= v1[0] * tmp0;
3975           sum2 -= v2[0] * tmp0;
3976           sum3 -= v3[0] * tmp0;
3977           sum4 -= v4[0] * tmp0;
3978           sum5 -= v5[0] * tmp0;
3979         }
3980         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3981         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3982         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3983         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3984         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3985         ibdiag   += 25; row += 5;
3986         break;
3987       default:
3988         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3989       }
3990     }
3991     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3992   }
3993   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3994   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3995   PetscFunctionReturn(0);
3996 }
3997 
3998 #undef __FUNCT__
3999 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
4000 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4001 {
4002   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4003   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4004   const MatScalar   *bdiag = a->inode.bdiag;
4005   const PetscScalar *b;
4006   PetscErrorCode    ierr;
4007   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4008   const PetscInt    *sizes = a->inode.size;
4009 
4010   PetscFunctionBegin;
4011   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4012   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4013   cnt  = 0;
4014   for (i=0, row=0; i<m; i++) {
4015     switch (sizes[i]) {
4016     case 1:
4017       x[row] = b[row]*bdiag[cnt++];row++;
4018       break;
4019     case 2:
4020       x1       = b[row]; x2 = b[row+1];
4021       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4022       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4023       x[row++] = tmp1;
4024       x[row++] = tmp2;
4025       cnt     += 4;
4026       break;
4027     case 3:
4028       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4029       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4030       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4031       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4032       x[row++] = tmp1;
4033       x[row++] = tmp2;
4034       x[row++] = tmp3;
4035       cnt     += 9;
4036       break;
4037     case 4:
4038       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4039       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4040       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4041       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4042       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4043       x[row++] = tmp1;
4044       x[row++] = tmp2;
4045       x[row++] = tmp3;
4046       x[row++] = tmp4;
4047       cnt     += 16;
4048       break;
4049     case 5:
4050       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4051       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4052       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4053       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4054       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4055       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4056       x[row++] = tmp1;
4057       x[row++] = tmp2;
4058       x[row++] = tmp3;
4059       x[row++] = tmp4;
4060       x[row++] = tmp5;
4061       cnt     += 25;
4062       break;
4063     default:
4064       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4065     }
4066   }
4067   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4068   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4069   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4070   PetscFunctionReturn(0);
4071 }
4072 
4073 /*
4074     samestructure indicates that the matrix has not changed its nonzero structure so we
4075     do not need to recompute the inodes
4076 */
4077 #undef __FUNCT__
4078 #define __FUNCT__ "MatSeqAIJCheckInode"
4079 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4080 {
4081   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4082   PetscErrorCode ierr;
4083   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4084   PetscBool      flag;
4085   const PetscInt *idx,*idy,*ii;
4086 
4087   PetscFunctionBegin;
4088   if (!a->inode.use) PetscFunctionReturn(0);
4089   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4090 
4091   m = A->rmap->n;
4092   if (a->inode.size) ns = a->inode.size;
4093   else {
4094     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4095   }
4096 
4097   i          = 0;
4098   node_count = 0;
4099   idx        = a->j;
4100   ii         = a->i;
4101   while (i < m) {                /* For each row */
4102     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4103     /* Limits the number of elements in a node to 'a->inode.limit' */
4104     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4105       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4106       if (nzy != nzx) break;
4107       idy += nzx;              /* Same nonzero pattern */
4108       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4109       if (!flag) break;
4110     }
4111     ns[node_count++] = blk_size;
4112     idx             += blk_size*nzx;
4113     i                = j;
4114   }
4115   /* If not enough inodes found,, do not use inode version of the routines */
4116   if (!m || node_count > .8*m) {
4117     ierr = PetscFree(ns);CHKERRQ(ierr);
4118 
4119     a->inode.node_count       = 0;
4120     a->inode.size             = NULL;
4121     a->inode.use              = PETSC_FALSE;
4122     A->ops->mult              = MatMult_SeqAIJ;
4123     A->ops->sor               = MatSOR_SeqAIJ;
4124     A->ops->multadd           = MatMultAdd_SeqAIJ;
4125     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4126     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4127     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4128     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4129     A->ops->coloringpatch     = 0;
4130     A->ops->multdiagonalblock = 0;
4131 
4132     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4133   } else {
4134     if (!A->factortype) {
4135       A->ops->mult              = MatMult_SeqAIJ_Inode;
4136       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4137       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4138       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4139       if (A->rmap->n == A->cmap->n) {
4140         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4141         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4142         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4143         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4144         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4145       }
4146     } else {
4147       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4148     }
4149     a->inode.node_count = node_count;
4150     a->inode.size       = ns;
4151     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4152   }
4153   a->inode.checked          = PETSC_TRUE;
4154   a->inode.mat_nonzerostate = A->nonzerostate;
4155   PetscFunctionReturn(0);
4156 }
4157 
4158 #undef __FUNCT__
4159 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4160 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4161 {
4162   Mat            B =*C;
4163   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4164   PetscErrorCode ierr;
4165   PetscInt       m=A->rmap->n;
4166 
4167   PetscFunctionBegin;
4168   c->inode.use       = a->inode.use;
4169   c->inode.limit     = a->inode.limit;
4170   c->inode.max_limit = a->inode.max_limit;
4171   if (a->inode.size) {
4172     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4173     c->inode.node_count = a->inode.node_count;
4174     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4175     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4176     if (!B->factortype) {
4177       B->ops->mult              = MatMult_SeqAIJ_Inode;
4178       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4179       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4180       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4181       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4182       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4183       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4184       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4185       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4186     } else {
4187       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4188     }
4189   } else {
4190     c->inode.size       = 0;
4191     c->inode.node_count = 0;
4192   }
4193   c->inode.ibdiagvalid = PETSC_FALSE;
4194   c->inode.ibdiag      = 0;
4195   c->inode.bdiag       = 0;
4196   PetscFunctionReturn(0);
4197 }
4198 
4199 #undef __FUNCT__
4200 #define __FUNCT__ "MatGetRow_FactoredLU"
4201 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4202 {
4203   PetscInt       k;
4204   const PetscInt *vi;
4205 
4206   PetscFunctionBegin;
4207   vi = aj + ai[row];
4208   for (k=0; k<nzl; k++) cols[k] = vi[k];
4209   vi        = aj + adiag[row];
4210   cols[nzl] = vi[0];
4211   vi        = aj + adiag[row+1]+1;
4212   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4213   PetscFunctionReturn(0);
4214 }
4215 /*
4216    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4217    Modified from MatSeqAIJCheckInode().
4218 
4219    Input Parameters:
4220 .  Mat A - ILU or LU matrix factor
4221 
4222 */
4223 #undef __FUNCT__
4224 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU"
4225 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4226 {
4227   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4228   PetscErrorCode ierr;
4229   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4230   PetscInt       *cols1,*cols2,*ns;
4231   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4232   PetscBool      flag;
4233 
4234   PetscFunctionBegin;
4235   if (!a->inode.use)    PetscFunctionReturn(0);
4236   if (a->inode.checked) PetscFunctionReturn(0);
4237 
4238   m = A->rmap->n;
4239   if (a->inode.size) ns = a->inode.size;
4240   else {
4241     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4242   }
4243 
4244   i          = 0;
4245   node_count = 0;
4246   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4247   while (i < m) {                /* For each row */
4248     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4249     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4250     nzx  = nzl1 + nzu1 + 1;
4251     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4252 
4253     /* Limits the number of elements in a node to 'a->inode.limit' */
4254     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4255       nzl2 = ai[j+1] - ai[j];
4256       nzu2 = adiag[j] - adiag[j+1] - 1;
4257       nzy  = nzl2 + nzu2 + 1;
4258       if (nzy != nzx) break;
4259       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4260       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4261       if (!flag) break;
4262     }
4263     ns[node_count++] = blk_size;
4264     i                = j;
4265   }
4266   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4267   /* If not enough inodes found,, do not use inode version of the routines */
4268   if (!m || node_count > .8*m) {
4269     ierr = PetscFree(ns);CHKERRQ(ierr);
4270 
4271     a->inode.node_count = 0;
4272     a->inode.size       = NULL;
4273     a->inode.use        = PETSC_FALSE;
4274 
4275     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4276   } else {
4277     A->ops->mult              = 0;
4278     A->ops->sor               = 0;
4279     A->ops->multadd           = 0;
4280     A->ops->getrowij          = 0;
4281     A->ops->restorerowij      = 0;
4282     A->ops->getcolumnij       = 0;
4283     A->ops->restorecolumnij   = 0;
4284     A->ops->coloringpatch     = 0;
4285     A->ops->multdiagonalblock = 0;
4286     a->inode.node_count       = node_count;
4287     a->inode.size             = ns;
4288 
4289     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4290   }
4291   a->inode.checked = PETSC_TRUE;
4292   PetscFunctionReturn(0);
4293 }
4294 
4295 #undef __FUNCT__
4296 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4297 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4298 {
4299   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4300 
4301   PetscFunctionBegin;
4302   a->inode.ibdiagvalid = PETSC_FALSE;
4303   PetscFunctionReturn(0);
4304 }
4305 
4306 /*
4307      This is really ugly. if inodes are used this replaces the
4308   permutations with ones that correspond to rows/cols of the matrix
4309   rather then inode blocks
4310 */
4311 #undef __FUNCT__
4312 #define __FUNCT__ "MatInodeAdjustForInodes"
4313 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4314 {
4315   PetscErrorCode ierr;
4316 
4317   PetscFunctionBegin;
4318   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4319   PetscFunctionReturn(0);
4320 }
4321 
4322 #undef __FUNCT__
4323 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4324 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4325 {
4326   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4327   PetscErrorCode ierr;
4328   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4329   const PetscInt *ridx,*cidx;
4330   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4331   PetscInt       nslim_col,*ns_col;
4332   IS             ris = *rperm,cis = *cperm;
4333 
4334   PetscFunctionBegin;
4335   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4336   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4337 
4338   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4339   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4340   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4341 
4342   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4343   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4344 
4345   /* Form the inode structure for the rows of permuted matric using inv perm*/
4346   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4347 
4348   /* Construct the permutations for rows*/
4349   for (i=0,row = 0; i<nslim_row; ++i) {
4350     indx      = ridx[i];
4351     start_val = tns[indx];
4352     end_val   = tns[indx + 1];
4353     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4354   }
4355 
4356   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4357   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4358 
4359   /* Construct permutations for columns */
4360   for (i=0,col=0; i<nslim_col; ++i) {
4361     indx      = cidx[i];
4362     start_val = tns[indx];
4363     end_val   = tns[indx + 1];
4364     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4365   }
4366 
4367   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4368   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4369   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4370   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4371 
4372   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4373   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4374 
4375   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4376   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4377   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4378   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4379   ierr = PetscFree(tns);CHKERRQ(ierr);
4380   PetscFunctionReturn(0);
4381 }
4382 
4383 #undef __FUNCT__
4384 #define __FUNCT__ "MatInodeGetInodeSizes"
4385 /*@C
4386    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4387 
4388    Not Collective
4389 
4390    Input Parameter:
4391 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4392 
4393    Output Parameter:
4394 +  node_count - no of inodes present in the matrix.
4395 .  sizes      - an array of size node_count,with sizes of each inode.
4396 -  limit      - the max size used to generate the inodes.
4397 
4398    Level: advanced
4399 
4400    Notes: This routine returns some internal storage information
4401    of the matrix, it is intended to be used by advanced users.
4402    It should be called after the matrix is assembled.
4403    The contents of the sizes[] array should not be changed.
4404    NULL may be passed for information not requested.
4405 
4406 .keywords: matrix, seqaij, get, inode
4407 
4408 .seealso: MatGetInfo()
4409 @*/
4410 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4411 {
4412   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4413 
4414   PetscFunctionBegin;
4415   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4416   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4417   if (f) {
4418     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4419   }
4420   PetscFunctionReturn(0);
4421 }
4422 
4423 #undef __FUNCT__
4424 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4425 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4426 {
4427   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4428 
4429   PetscFunctionBegin;
4430   if (node_count) *node_count = a->inode.node_count;
4431   if (sizes)      *sizes      = a->inode.size;
4432   if (limit)      *limit      = a->inode.limit;
4433   PetscFunctionReturn(0);
4434 }
4435