xref: /petsc/src/mat/impls/aij/seq/inode.c (revision dcca6d9d80ebd869fe6029bd05a3aa9faafef49e)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1) {
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     col  = *j++ + ishift;
101     i2   = tvc[col];
102     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103       ia[i1+1]++;
104       ia[i2+1]++;
105       i2++;                     /* Start col of next node */
106       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107       i2 = tvc[col];
108     }
109     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110   }
111 
112   /* shift ia[i] to point to next row */
113   for (i1=1; i1<nslim_row+1; i1++) {
114     row        = ia[i1-1];
115     ia[i1]    += row;
116     work[i1-1] = row - oshift;
117   }
118 
119   /* allocate space for column pointers */
120   nz   = ia[nslim_row] + (!ishift);
121   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
122   *jja = ja;
123 
124   /* loop over lower triangular part putting into ja */
125   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126     j    = aj + ai[row] + ishift;
127     jmax = aj + ai[row+1] + ishift;
128     col  = *j++ + ishift;
129     i2   = tvc[col];
130     while (i2<i1 && j<jmax) {
131       ja[work[i2]++] = i1 + oshift;
132       ja[work[i1]++] = i2 + oshift;
133       ++i2;
134       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135       i2 = tvc[col];
136     }
137     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138 
139   }
140   ierr = PetscFree(work);CHKERRQ(ierr);
141   ierr = PetscFree(tns);CHKERRQ(ierr);
142   ierr = PetscFree(tvc);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*
147       This builds nonsymmetric version of nonzero structure,
148 */
149 #undef __FUNCT__
150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152 {
153   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
154   PetscErrorCode ierr;
155   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156   PetscInt       *tns,*tvc,nsz,i1,i2;
157   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
158 
159   PetscFunctionBegin;
160   nslim_row = a->inode.node_count;
161   n         = A->cmap->n;
162 
163   /* Create The column_inode for this matrix */
164   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
165 
166   /* allocate space for reformated column_inode structure */
167   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
168   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
169   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
170 
171   for (i1=0,col=0; i1<nslim_col; ++i1) {
172     nsz = ns_col[i1];
173     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174   }
175   /* allocate space for row pointers */
176   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
177   *iia = ia;
178   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
180 
181   /* determine the number of columns in each row */
182   ia[0] = oshift;
183   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184     j   = aj + ai[row] + ishift;
185     col = *j++ + ishift;
186     i2  = tvc[col];
187     nz  = ai[row+1] - ai[row];
188     while (nz-- > 0) {           /* off-diagonal elemets */
189       ia[i1+1]++;
190       i2++;                     /* Start col of next node */
191       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192       if (nz > 0) i2 = tvc[col];
193     }
194   }
195 
196   /* shift ia[i] to point to next row */
197   for (i1=1; i1<nslim_row+1; i1++) {
198     row        = ia[i1-1];
199     ia[i1]    += row;
200     work[i1-1] = row - oshift;
201   }
202 
203   /* allocate space for column pointers */
204   nz   = ia[nslim_row] + (!ishift);
205   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
206   *jja = ja;
207 
208   /* loop over matrix putting into ja */
209   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210     j   = aj + ai[row] + ishift;
211     col = *j++ + ishift;
212     i2  = tvc[col];
213     nz  = ai[row+1] - ai[row];
214     while (nz-- > 0) {
215       ja[work[i1]++] = i2 + oshift;
216       ++i2;
217       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218       if (nz > 0) i2 = tvc[col];
219     }
220   }
221   ierr = PetscFree(ns_col);CHKERRQ(ierr);
222   ierr = PetscFree(work);CHKERRQ(ierr);
223   ierr = PetscFree(tns);CHKERRQ(ierr);
224   ierr = PetscFree(tvc);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231 {
232   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   *n = a->inode.node_count;
237   if (!ia) PetscFunctionReturn(0);
238   if (!blockcompressed) {
239     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
240   } else if (symmetric) {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   } else {
243     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   if (!ia) PetscFunctionReturn(0);
256 
257   if (!blockcompressed) {
258     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
259   } else {
260     ierr = PetscFree(*ia);CHKERRQ(ierr);
261     ierr = PetscFree(*ja);CHKERRQ(ierr);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /* ----------------------------------------------------------- */
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271 {
272   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
273   PetscErrorCode ierr;
274   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276 
277   PetscFunctionBegin;
278   nslim_row = a->inode.node_count;
279   n         = A->cmap->n;
280 
281   /* Create The column_inode for this matrix */
282   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
283 
284   /* allocate space for reformated column_inode structure */
285   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
286   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
287   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
288 
289   for (i1=0,col=0; i1<nslim_col; ++i1) {
290     nsz = ns_col[i1];
291     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292   }
293   /* allocate space for column pointers */
294   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
295   *iia = ia;
296   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
297   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
298 
299   /* determine the number of columns in each row */
300   ia[0] = oshift;
301   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302     j   = aj + ai[row] + ishift;
303     col = *j++ + ishift;
304     i2  = tvc[col];
305     nz  = ai[row+1] - ai[row];
306     while (nz-- > 0) {           /* off-diagonal elemets */
307       /* ia[i1+1]++; */
308       ia[i2+1]++;
309       i2++;
310       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311       if (nz > 0) i2 = tvc[col];
312     }
313   }
314 
315   /* shift ia[i] to point to next col */
316   for (i1=1; i1<nslim_col+1; i1++) {
317     col        = ia[i1-1];
318     ia[i1]    += col;
319     work[i1-1] = col - oshift;
320   }
321 
322   /* allocate space for column pointers */
323   nz   = ia[nslim_col] + (!ishift);
324   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
325   *jja = ja;
326 
327   /* loop over matrix putting into ja */
328   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329     j   = aj + ai[row] + ishift;
330     col = *j++ + ishift;
331     i2  = tvc[col];
332     nz  = ai[row+1] - ai[row];
333     while (nz-- > 0) {
334       /* ja[work[i1]++] = i2 + oshift; */
335       ja[work[i2]++] = i1 + oshift;
336       i2++;
337       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338       if (nz > 0) i2 = tvc[col];
339     }
340   }
341   ierr = PetscFree(ns_col);CHKERRQ(ierr);
342   ierr = PetscFree(work);CHKERRQ(ierr);
343   ierr = PetscFree(tns);CHKERRQ(ierr);
344   ierr = PetscFree(tvc);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
356   if (!ia) PetscFunctionReturn(0);
357 
358   if (!blockcompressed) {
359     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
360   } else if (symmetric) {
361     /* Since the indices are symmetric it does'nt matter */
362     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   } else {
364     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
365   }
366   PetscFunctionReturn(0);
367 }
368 
369 #undef __FUNCT__
370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   if (!ia) PetscFunctionReturn(0);
377   if (!blockcompressed) {
378     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
379   } else {
380     ierr = PetscFree(*ia);CHKERRQ(ierr);
381     ierr = PetscFree(*ja);CHKERRQ(ierr);
382   }
383   PetscFunctionReturn(0);
384 }
385 
386 /* ----------------------------------------------------------- */
387 
388 #undef __FUNCT__
389 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391 {
392   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394   PetscScalar       *y;
395   const PetscScalar *x;
396   const MatScalar   *v1,*v2,*v3,*v4,*v5;
397   PetscErrorCode    ierr;
398   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399   const PetscInt    *idx,*ns,*ii;
400 
401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403 #endif
404 
405   PetscFunctionBegin;
406   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407   node_max = a->inode.node_count;
408   ns       = a->inode.size;     /* Node Size array */
409   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
411   idx      = a->j;
412   v1       = a->a;
413   ii       = a->i;
414 
415   for (i = 0,row = 0; i< node_max; ++i) {
416     nsz         = ns[i];
417     n           = ii[1] - ii[0];
418     nonzerorow += (n>0)*nsz;
419     ii         += nsz;
420     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422     sz = n;                     /* No of non zeros in this row */
423                                 /* Switch on the size of Node */
424     switch (nsz) {               /* Each loop in 'case' is unrolled */
425     case 1:
426       sum1 = 0.;
427 
428       for (n = 0; n< sz-1; n+=2) {
429         i1    = idx[0];         /* The instructions are ordered to */
430         i2    = idx[1];         /* make the compiler's job easy */
431         idx  += 2;
432         tmp0  = x[i1];
433         tmp1  = x[i2];
434         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435       }
436 
437       if (n == sz-1) {          /* Take care of the last nonzero  */
438         tmp0  = x[*idx++];
439         sum1 += *v1++ *tmp0;
440       }
441       y[row++]=sum1;
442       break;
443     case 2:
444       sum1 = 0.;
445       sum2 = 0.;
446       v2   = v1 + n;
447 
448       for (n = 0; n< sz-1; n+=2) {
449         i1    = idx[0];
450         i2    = idx[1];
451         idx  += 2;
452         tmp0  = x[i1];
453         tmp1  = x[i2];
454         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456       }
457       if (n == sz-1) {
458         tmp0  = x[*idx++];
459         sum1 += *v1++ * tmp0;
460         sum2 += *v2++ * tmp0;
461       }
462       y[row++]=sum1;
463       y[row++]=sum2;
464       v1      =v2;              /* Since the next block to be processed starts there*/
465       idx    +=sz;
466       break;
467     case 3:
468       sum1 = 0.;
469       sum2 = 0.;
470       sum3 = 0.;
471       v2   = v1 + n;
472       v3   = v2 + n;
473 
474       for (n = 0; n< sz-1; n+=2) {
475         i1    = idx[0];
476         i2    = idx[1];
477         idx  += 2;
478         tmp0  = x[i1];
479         tmp1  = x[i2];
480         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483       }
484       if (n == sz-1) {
485         tmp0  = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489       }
490       y[row++]=sum1;
491       y[row++]=sum2;
492       y[row++]=sum3;
493       v1      =v3;              /* Since the next block to be processed starts there*/
494       idx    +=2*sz;
495       break;
496     case 4:
497       sum1 = 0.;
498       sum2 = 0.;
499       sum3 = 0.;
500       sum4 = 0.;
501       v2   = v1 + n;
502       v3   = v2 + n;
503       v4   = v3 + n;
504 
505       for (n = 0; n< sz-1; n+=2) {
506         i1    = idx[0];
507         i2    = idx[1];
508         idx  += 2;
509         tmp0  = x[i1];
510         tmp1  = x[i2];
511         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515       }
516       if (n == sz-1) {
517         tmp0  = x[*idx++];
518         sum1 += *v1++ * tmp0;
519         sum2 += *v2++ * tmp0;
520         sum3 += *v3++ * tmp0;
521         sum4 += *v4++ * tmp0;
522       }
523       y[row++]=sum1;
524       y[row++]=sum2;
525       y[row++]=sum3;
526       y[row++]=sum4;
527       v1      =v4;              /* Since the next block to be processed starts there*/
528       idx    +=3*sz;
529       break;
530     case 5:
531       sum1 = 0.;
532       sum2 = 0.;
533       sum3 = 0.;
534       sum4 = 0.;
535       sum5 = 0.;
536       v2   = v1 + n;
537       v3   = v2 + n;
538       v4   = v3 + n;
539       v5   = v4 + n;
540 
541       for (n = 0; n<sz-1; n+=2) {
542         i1    = idx[0];
543         i2    = idx[1];
544         idx  += 2;
545         tmp0  = x[i1];
546         tmp1  = x[i2];
547         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552       }
553       if (n == sz-1) {
554         tmp0  = x[*idx++];
555         sum1 += *v1++ * tmp0;
556         sum2 += *v2++ * tmp0;
557         sum3 += *v3++ * tmp0;
558         sum4 += *v4++ * tmp0;
559         sum5 += *v5++ * tmp0;
560       }
561       y[row++]=sum1;
562       y[row++]=sum2;
563       y[row++]=sum3;
564       y[row++]=sum4;
565       y[row++]=sum5;
566       v1      =v5;       /* Since the next block to be processed starts there */
567       idx    +=4*sz;
568       break;
569     default:
570       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571     }
572   }
573   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
575   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 /* ----------------------------------------------------------- */
579 /* Almost same code as the MatMult_SeqAIJ_Inode() */
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583 {
584   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586   const MatScalar   *v1,*v2,*v3,*v4,*v5;
587   const PetscScalar *x;
588   PetscScalar       *y,*z,*zt;
589   PetscErrorCode    ierr;
590   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
591   const PetscInt    *idx,*ns,*ii;
592 
593   PetscFunctionBegin;
594   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595   node_max = a->inode.node_count;
596   ns       = a->inode.size;     /* Node Size array */
597 
598   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
600   if (zz != yy) {
601     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
602   } else {
603     z = y;
604   }
605   zt = z;
606 
607   idx = a->j;
608   v1  = a->a;
609   ii  = a->i;
610 
611   for (i = 0,row = 0; i< node_max; ++i) {
612     nsz = ns[i];
613     n   = ii[1] - ii[0];
614     ii += nsz;
615     sz  = n;                    /* No of non zeros in this row */
616                                 /* Switch on the size of Node */
617     switch (nsz) {               /* Each loop in 'case' is unrolled */
618     case 1:
619       sum1 = *zt++;
620 
621       for (n = 0; n< sz-1; n+=2) {
622         i1    = idx[0];         /* The instructions are ordered to */
623         i2    = idx[1];         /* make the compiler's job easy */
624         idx  += 2;
625         tmp0  = x[i1];
626         tmp1  = x[i2];
627         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
628       }
629 
630       if (n   == sz-1) {          /* Take care of the last nonzero  */
631         tmp0  = x[*idx++];
632         sum1 += *v1++ * tmp0;
633       }
634       y[row++]=sum1;
635       break;
636     case 2:
637       sum1 = *zt++;
638       sum2 = *zt++;
639       v2   = v1 + n;
640 
641       for (n = 0; n< sz-1; n+=2) {
642         i1    = idx[0];
643         i2    = idx[1];
644         idx  += 2;
645         tmp0  = x[i1];
646         tmp1  = x[i2];
647         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649       }
650       if (n   == sz-1) {
651         tmp0  = x[*idx++];
652         sum1 += *v1++ * tmp0;
653         sum2 += *v2++ * tmp0;
654       }
655       y[row++]=sum1;
656       y[row++]=sum2;
657       v1      =v2;              /* Since the next block to be processed starts there*/
658       idx    +=sz;
659       break;
660     case 3:
661       sum1 = *zt++;
662       sum2 = *zt++;
663       sum3 = *zt++;
664       v2   = v1 + n;
665       v3   = v2 + n;
666 
667       for (n = 0; n< sz-1; n+=2) {
668         i1    = idx[0];
669         i2    = idx[1];
670         idx  += 2;
671         tmp0  = x[i1];
672         tmp1  = x[i2];
673         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
674         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
675         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
676       }
677       if (n == sz-1) {
678         tmp0  = x[*idx++];
679         sum1 += *v1++ * tmp0;
680         sum2 += *v2++ * tmp0;
681         sum3 += *v3++ * tmp0;
682       }
683       y[row++]=sum1;
684       y[row++]=sum2;
685       y[row++]=sum3;
686       v1      =v3;              /* Since the next block to be processed starts there*/
687       idx    +=2*sz;
688       break;
689     case 4:
690       sum1 = *zt++;
691       sum2 = *zt++;
692       sum3 = *zt++;
693       sum4 = *zt++;
694       v2   = v1 + n;
695       v3   = v2 + n;
696       v4   = v3 + n;
697 
698       for (n = 0; n< sz-1; n+=2) {
699         i1    = idx[0];
700         i2    = idx[1];
701         idx  += 2;
702         tmp0  = x[i1];
703         tmp1  = x[i2];
704         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
705         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
706         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
707         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
708       }
709       if (n == sz-1) {
710         tmp0  = x[*idx++];
711         sum1 += *v1++ * tmp0;
712         sum2 += *v2++ * tmp0;
713         sum3 += *v3++ * tmp0;
714         sum4 += *v4++ * tmp0;
715       }
716       y[row++]=sum1;
717       y[row++]=sum2;
718       y[row++]=sum3;
719       y[row++]=sum4;
720       v1      =v4;              /* Since the next block to be processed starts there*/
721       idx    +=3*sz;
722       break;
723     case 5:
724       sum1 = *zt++;
725       sum2 = *zt++;
726       sum3 = *zt++;
727       sum4 = *zt++;
728       sum5 = *zt++;
729       v2   = v1 + n;
730       v3   = v2 + n;
731       v4   = v3 + n;
732       v5   = v4 + n;
733 
734       for (n = 0; n<sz-1; n+=2) {
735         i1    = idx[0];
736         i2    = idx[1];
737         idx  += 2;
738         tmp0  = x[i1];
739         tmp1  = x[i2];
740         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
741         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
742         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
743         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
744         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
745       }
746       if (n   == sz-1) {
747         tmp0  = x[*idx++];
748         sum1 += *v1++ * tmp0;
749         sum2 += *v2++ * tmp0;
750         sum3 += *v3++ * tmp0;
751         sum4 += *v4++ * tmp0;
752         sum5 += *v5++ * tmp0;
753       }
754       y[row++]=sum1;
755       y[row++]=sum2;
756       y[row++]=sum3;
757       y[row++]=sum4;
758       y[row++]=sum5;
759       v1      =v5;       /* Since the next block to be processed starts there */
760       idx    +=4*sz;
761       break;
762     default:
763       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
764     }
765   }
766   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
767   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
768   if (zz != yy) {
769     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
770   }
771   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
772   PetscFunctionReturn(0);
773 }
774 
775 /* ----------------------------------------------------------- */
776 #undef __FUNCT__
777 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
778 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
779 {
780   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
781   IS                iscol = a->col,isrow = a->row;
782   PetscErrorCode    ierr;
783   const PetscInt    *r,*c,*rout,*cout;
784   PetscInt          i,j,n = A->rmap->n,nz;
785   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
786   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
787   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
788   PetscScalar       sum1,sum2,sum3,sum4,sum5;
789   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
790   const PetscScalar *b;
791 
792   PetscFunctionBegin;
793   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
794   node_max = a->inode.node_count;
795   ns       = a->inode.size;     /* Node Size array */
796 
797   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
798   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
799   tmp  = a->solve_work;
800 
801   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
802   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
803 
804   /* forward solve the lower triangular */
805   tmps = tmp;
806   aa   = a_a;
807   aj   = a_j;
808   ad   = a->diag;
809 
810   for (i = 0,row = 0; i< node_max; ++i) {
811     nsz = ns[i];
812     aii = ai[row];
813     v1  = aa + aii;
814     vi  = aj + aii;
815     nz  = ad[row]- aii;
816     if (i < node_max-1) {
817       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
818       * but our indexing to determine it's size could. */
819       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
820       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
821       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
822       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
823     }
824 
825     switch (nsz) {               /* Each loop in 'case' is unrolled */
826     case 1:
827       sum1 = b[*r++];
828       for (j=0; j<nz-1; j+=2) {
829         i0    = vi[0];
830         i1    = vi[1];
831         vi   +=2;
832         tmp0  = tmps[i0];
833         tmp1  = tmps[i1];
834         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
835       }
836       if (j == nz-1) {
837         tmp0  = tmps[*vi++];
838         sum1 -= *v1++ *tmp0;
839       }
840       tmp[row++]=sum1;
841       break;
842     case 2:
843       sum1 = b[*r++];
844       sum2 = b[*r++];
845       v2   = aa + ai[row+1];
846 
847       for (j=0; j<nz-1; j+=2) {
848         i0    = vi[0];
849         i1    = vi[1];
850         vi   +=2;
851         tmp0  = tmps[i0];
852         tmp1  = tmps[i1];
853         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
854         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
855       }
856       if (j == nz-1) {
857         tmp0  = tmps[*vi++];
858         sum1 -= *v1++ *tmp0;
859         sum2 -= *v2++ *tmp0;
860       }
861       sum2     -= *v2++ *sum1;
862       tmp[row++]=sum1;
863       tmp[row++]=sum2;
864       break;
865     case 3:
866       sum1 = b[*r++];
867       sum2 = b[*r++];
868       sum3 = b[*r++];
869       v2   = aa + ai[row+1];
870       v3   = aa + ai[row+2];
871 
872       for (j=0; j<nz-1; j+=2) {
873         i0    = vi[0];
874         i1    = vi[1];
875         vi   +=2;
876         tmp0  = tmps[i0];
877         tmp1  = tmps[i1];
878         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
879         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
880         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
881       }
882       if (j == nz-1) {
883         tmp0  = tmps[*vi++];
884         sum1 -= *v1++ *tmp0;
885         sum2 -= *v2++ *tmp0;
886         sum3 -= *v3++ *tmp0;
887       }
888       sum2 -= *v2++ * sum1;
889       sum3 -= *v3++ * sum1;
890       sum3 -= *v3++ * sum2;
891 
892       tmp[row++]=sum1;
893       tmp[row++]=sum2;
894       tmp[row++]=sum3;
895       break;
896 
897     case 4:
898       sum1 = b[*r++];
899       sum2 = b[*r++];
900       sum3 = b[*r++];
901       sum4 = b[*r++];
902       v2   = aa + ai[row+1];
903       v3   = aa + ai[row+2];
904       v4   = aa + ai[row+3];
905 
906       for (j=0; j<nz-1; j+=2) {
907         i0    = vi[0];
908         i1    = vi[1];
909         vi   +=2;
910         tmp0  = tmps[i0];
911         tmp1  = tmps[i1];
912         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
913         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
914         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
915         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
916       }
917       if (j == nz-1) {
918         tmp0  = tmps[*vi++];
919         sum1 -= *v1++ *tmp0;
920         sum2 -= *v2++ *tmp0;
921         sum3 -= *v3++ *tmp0;
922         sum4 -= *v4++ *tmp0;
923       }
924       sum2 -= *v2++ * sum1;
925       sum3 -= *v3++ * sum1;
926       sum4 -= *v4++ * sum1;
927       sum3 -= *v3++ * sum2;
928       sum4 -= *v4++ * sum2;
929       sum4 -= *v4++ * sum3;
930 
931       tmp[row++]=sum1;
932       tmp[row++]=sum2;
933       tmp[row++]=sum3;
934       tmp[row++]=sum4;
935       break;
936     case 5:
937       sum1 = b[*r++];
938       sum2 = b[*r++];
939       sum3 = b[*r++];
940       sum4 = b[*r++];
941       sum5 = b[*r++];
942       v2   = aa + ai[row+1];
943       v3   = aa + ai[row+2];
944       v4   = aa + ai[row+3];
945       v5   = aa + ai[row+4];
946 
947       for (j=0; j<nz-1; j+=2) {
948         i0    = vi[0];
949         i1    = vi[1];
950         vi   +=2;
951         tmp0  = tmps[i0];
952         tmp1  = tmps[i1];
953         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
954         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
955         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
956         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
957         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
958       }
959       if (j == nz-1) {
960         tmp0  = tmps[*vi++];
961         sum1 -= *v1++ *tmp0;
962         sum2 -= *v2++ *tmp0;
963         sum3 -= *v3++ *tmp0;
964         sum4 -= *v4++ *tmp0;
965         sum5 -= *v5++ *tmp0;
966       }
967 
968       sum2 -= *v2++ * sum1;
969       sum3 -= *v3++ * sum1;
970       sum4 -= *v4++ * sum1;
971       sum5 -= *v5++ * sum1;
972       sum3 -= *v3++ * sum2;
973       sum4 -= *v4++ * sum2;
974       sum5 -= *v5++ * sum2;
975       sum4 -= *v4++ * sum3;
976       sum5 -= *v5++ * sum3;
977       sum5 -= *v5++ * sum4;
978 
979       tmp[row++]=sum1;
980       tmp[row++]=sum2;
981       tmp[row++]=sum3;
982       tmp[row++]=sum4;
983       tmp[row++]=sum5;
984       break;
985     default:
986       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
987     }
988   }
989   /* backward solve the upper triangular */
990   for (i=node_max -1,row = n-1; i>=0; i--) {
991     nsz = ns[i];
992     aii = ai[row+1] -1;
993     v1  = aa + aii;
994     vi  = aj + aii;
995     nz  = aii- ad[row];
996     switch (nsz) {               /* Each loop in 'case' is unrolled */
997     case 1:
998       sum1 = tmp[row];
999 
1000       for (j=nz; j>1; j-=2) {
1001         vi   -=2;
1002         i0    = vi[2];
1003         i1    = vi[1];
1004         tmp0  = tmps[i0];
1005         tmp1  = tmps[i1];
1006         v1   -= 2;
1007         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1008       }
1009       if (j==1) {
1010         tmp0  = tmps[*vi--];
1011         sum1 -= *v1-- * tmp0;
1012       }
1013       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1014       break;
1015     case 2:
1016       sum1 = tmp[row];
1017       sum2 = tmp[row -1];
1018       v2   = aa + ai[row]-1;
1019       for (j=nz; j>1; j-=2) {
1020         vi   -=2;
1021         i0    = vi[2];
1022         i1    = vi[1];
1023         tmp0  = tmps[i0];
1024         tmp1  = tmps[i1];
1025         v1   -= 2;
1026         v2   -= 2;
1027         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1028         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1029       }
1030       if (j==1) {
1031         tmp0  = tmps[*vi--];
1032         sum1 -= *v1-- * tmp0;
1033         sum2 -= *v2-- * tmp0;
1034       }
1035 
1036       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1037       sum2   -= *v2-- * tmp0;
1038       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1039       break;
1040     case 3:
1041       sum1 = tmp[row];
1042       sum2 = tmp[row -1];
1043       sum3 = tmp[row -2];
1044       v2   = aa + ai[row]-1;
1045       v3   = aa + ai[row -1]-1;
1046       for (j=nz; j>1; j-=2) {
1047         vi   -=2;
1048         i0    = vi[2];
1049         i1    = vi[1];
1050         tmp0  = tmps[i0];
1051         tmp1  = tmps[i1];
1052         v1   -= 2;
1053         v2   -= 2;
1054         v3   -= 2;
1055         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1056         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1057         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1058       }
1059       if (j==1) {
1060         tmp0  = tmps[*vi--];
1061         sum1 -= *v1-- * tmp0;
1062         sum2 -= *v2-- * tmp0;
1063         sum3 -= *v3-- * tmp0;
1064       }
1065       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1066       sum2   -= *v2-- * tmp0;
1067       sum3   -= *v3-- * tmp0;
1068       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1069       sum3   -= *v3-- * tmp0;
1070       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1071 
1072       break;
1073     case 4:
1074       sum1 = tmp[row];
1075       sum2 = tmp[row -1];
1076       sum3 = tmp[row -2];
1077       sum4 = tmp[row -3];
1078       v2   = aa + ai[row]-1;
1079       v3   = aa + ai[row -1]-1;
1080       v4   = aa + ai[row -2]-1;
1081 
1082       for (j=nz; j>1; j-=2) {
1083         vi   -=2;
1084         i0    = vi[2];
1085         i1    = vi[1];
1086         tmp0  = tmps[i0];
1087         tmp1  = tmps[i1];
1088         v1   -= 2;
1089         v2   -= 2;
1090         v3   -= 2;
1091         v4   -= 2;
1092         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1093         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1094         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1095         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1096       }
1097       if (j==1) {
1098         tmp0  = tmps[*vi--];
1099         sum1 -= *v1-- * tmp0;
1100         sum2 -= *v2-- * tmp0;
1101         sum3 -= *v3-- * tmp0;
1102         sum4 -= *v4-- * tmp0;
1103       }
1104 
1105       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1106       sum2   -= *v2-- * tmp0;
1107       sum3   -= *v3-- * tmp0;
1108       sum4   -= *v4-- * tmp0;
1109       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1110       sum3   -= *v3-- * tmp0;
1111       sum4   -= *v4-- * tmp0;
1112       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1113       sum4   -= *v4-- * tmp0;
1114       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1115       break;
1116     case 5:
1117       sum1 = tmp[row];
1118       sum2 = tmp[row -1];
1119       sum3 = tmp[row -2];
1120       sum4 = tmp[row -3];
1121       sum5 = tmp[row -4];
1122       v2   = aa + ai[row]-1;
1123       v3   = aa + ai[row -1]-1;
1124       v4   = aa + ai[row -2]-1;
1125       v5   = aa + ai[row -3]-1;
1126       for (j=nz; j>1; j-=2) {
1127         vi   -= 2;
1128         i0    = vi[2];
1129         i1    = vi[1];
1130         tmp0  = tmps[i0];
1131         tmp1  = tmps[i1];
1132         v1   -= 2;
1133         v2   -= 2;
1134         v3   -= 2;
1135         v4   -= 2;
1136         v5   -= 2;
1137         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1138         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1139         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1140         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1141         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1142       }
1143       if (j==1) {
1144         tmp0  = tmps[*vi--];
1145         sum1 -= *v1-- * tmp0;
1146         sum2 -= *v2-- * tmp0;
1147         sum3 -= *v3-- * tmp0;
1148         sum4 -= *v4-- * tmp0;
1149         sum5 -= *v5-- * tmp0;
1150       }
1151 
1152       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1153       sum2   -= *v2-- * tmp0;
1154       sum3   -= *v3-- * tmp0;
1155       sum4   -= *v4-- * tmp0;
1156       sum5   -= *v5-- * tmp0;
1157       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1158       sum3   -= *v3-- * tmp0;
1159       sum4   -= *v4-- * tmp0;
1160       sum5   -= *v5-- * tmp0;
1161       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1162       sum4   -= *v4-- * tmp0;
1163       sum5   -= *v5-- * tmp0;
1164       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1165       sum5   -= *v5-- * tmp0;
1166       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1167       break;
1168     default:
1169       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1170     }
1171   }
1172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1174   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1176   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 #undef __FUNCT__
1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1183 {
1184   Mat             C     =B;
1185   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1186   IS              isrow = b->row,isicol = b->icol;
1187   PetscErrorCode  ierr;
1188   const PetscInt  *r,*ic,*ics;
1189   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1190   PetscInt        i,j,k,nz,nzL,row,*pj;
1191   const PetscInt  *ajtmp,*bjtmp;
1192   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1193   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1194   FactorShiftCtx  sctx;
1195   const PetscInt  *ddiag;
1196   PetscReal       rs;
1197   MatScalar       d;
1198   PetscInt        inod,nodesz,node_max,col;
1199   const PetscInt  *ns;
1200   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1201 
1202   PetscFunctionBegin;
1203   /* MatPivotSetUp(): initialize shift context sctx */
1204   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1205 
1206   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1207     ddiag          = a->diag;
1208     sctx.shift_top = info->zeropivot;
1209     for (i=0; i<n; i++) {
1210       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1211       d  = (aa)[ddiag[i]];
1212       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1213       v  = aa+ai[i];
1214       nz = ai[i+1] - ai[i];
1215       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1216       if (rs>sctx.shift_top) sctx.shift_top = rs;
1217     }
1218     sctx.shift_top *= 1.1;
1219     sctx.nshift_max = 5;
1220     sctx.shift_lo   = 0.;
1221     sctx.shift_hi   = 1.;
1222   }
1223 
1224   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1225   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1226 
1227   ierr  = PetscMalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1228   ierr  = PetscMemzero(rtmp1,n*sizeof(PetscScalar));CHKERRQ(ierr);
1229   ierr  = PetscMemzero(rtmp2,n*sizeof(PetscScalar));CHKERRQ(ierr);
1230   ierr  = PetscMemzero(rtmp3,n*sizeof(PetscScalar));CHKERRQ(ierr);
1231   ierr  = PetscMemzero(rtmp4,n*sizeof(PetscScalar));CHKERRQ(ierr);
1232   ics   = ic;
1233 
1234   node_max = a->inode.node_count;
1235   ns       = a->inode.size;
1236   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1237 
1238   /* If max inode size > 4, split it into two inodes.*/
1239   /* also map the inode sizes according to the ordering */
1240   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1241   for (i=0,j=0; i<node_max; ++i,++j) {
1242     if (ns[i] > 4) {
1243       tmp_vec1[j] = 4;
1244       ++j;
1245       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1246     } else {
1247       tmp_vec1[j] = ns[i];
1248     }
1249   }
1250   /* Use the correct node_max */
1251   node_max = j;
1252 
1253   /* Now reorder the inode info based on mat re-ordering info */
1254   /* First create a row -> inode_size_array_index map */
1255   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1256   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1257   for (i=0,row=0; i<node_max; i++) {
1258     nodesz = tmp_vec1[i];
1259     for (j=0; j<nodesz; j++,row++) {
1260       nsmap[row] = i;
1261     }
1262   }
1263   /* Using nsmap, create a reordered ns structure */
1264   for (i=0,j=0; i< node_max; i++) {
1265     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1266     tmp_vec2[i] = nodesz;
1267     j          += nodesz;
1268   }
1269   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1270   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1271 
1272   /* Now use the correct ns */
1273   ns = tmp_vec2;
1274 
1275   do {
1276     sctx.newshift = PETSC_FALSE;
1277     /* Now loop over each block-row, and do the factorization */
1278     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1279       nodesz = ns[inod];
1280 
1281       switch (nodesz) {
1282       case 1:
1283         /*----------*/
1284         /* zero rtmp1 */
1285         /* L part */
1286         nz    = bi[i+1] - bi[i];
1287         bjtmp = bj + bi[i];
1288         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1289 
1290         /* U part */
1291         nz    = bdiag[i]-bdiag[i+1];
1292         bjtmp = bj + bdiag[i+1]+1;
1293         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1294 
1295         /* load in initial (unfactored row) */
1296         nz    = ai[r[i]+1] - ai[r[i]];
1297         ajtmp = aj + ai[r[i]];
1298         v     = aa + ai[r[i]];
1299         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1300 
1301         /* ZeropivotApply() */
1302         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1303 
1304         /* elimination */
1305         bjtmp = bj + bi[i];
1306         row   = *bjtmp++;
1307         nzL   = bi[i+1] - bi[i];
1308         for (k=0; k < nzL; k++) {
1309           pc = rtmp1 + row;
1310           if (*pc != 0.0) {
1311             pv   = b->a + bdiag[row];
1312             mul1 = *pc * (*pv);
1313             *pc  = mul1;
1314             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1315             pv   = b->a + bdiag[row+1]+1;
1316             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1317             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1318             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1319           }
1320           row = *bjtmp++;
1321         }
1322 
1323         /* finished row so stick it into b->a */
1324         rs = 0.0;
1325         /* L part */
1326         pv = b->a + bi[i];
1327         pj = b->j + bi[i];
1328         nz = bi[i+1] - bi[i];
1329         for (j=0; j<nz; j++) {
1330           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1331         }
1332 
1333         /* U part */
1334         pv = b->a + bdiag[i+1]+1;
1335         pj = b->j + bdiag[i+1]+1;
1336         nz = bdiag[i] - bdiag[i+1]-1;
1337         for (j=0; j<nz; j++) {
1338           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1339         }
1340 
1341         /* Check zero pivot */
1342         sctx.rs = rs;
1343         sctx.pv = rtmp1[i];
1344         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1345         if (sctx.newshift) break;
1346 
1347         /* Mark diagonal and invert diagonal for simplier triangular solves */
1348         pv  = b->a + bdiag[i];
1349         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1350         break;
1351 
1352       case 2:
1353         /*----------*/
1354         /* zero rtmp1 and rtmp2 */
1355         /* L part */
1356         nz    = bi[i+1] - bi[i];
1357         bjtmp = bj + bi[i];
1358         for  (j=0; j<nz; j++) {
1359           col        = bjtmp[j];
1360           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1361         }
1362 
1363         /* U part */
1364         nz    = bdiag[i]-bdiag[i+1];
1365         bjtmp = bj + bdiag[i+1]+1;
1366         for  (j=0; j<nz; j++) {
1367           col        = bjtmp[j];
1368           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1369         }
1370 
1371         /* load in initial (unfactored row) */
1372         nz    = ai[r[i]+1] - ai[r[i]];
1373         ajtmp = aj + ai[r[i]];
1374         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1375         for (j=0; j<nz; j++) {
1376           col        = ics[ajtmp[j]];
1377           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1378         }
1379         /* ZeropivotApply(): shift the diagonal of the matrix  */
1380         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1381 
1382         /* elimination */
1383         bjtmp = bj + bi[i];
1384         row   = *bjtmp++; /* pivot row */
1385         nzL   = bi[i+1] - bi[i];
1386         for (k=0; k < nzL; k++) {
1387           pc1 = rtmp1 + row;
1388           pc2 = rtmp2 + row;
1389           if (*pc1 != 0.0 || *pc2 != 0.0) {
1390             pv   = b->a + bdiag[row];
1391             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1392             *pc1 = mul1;       *pc2 = mul2;
1393 
1394             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1395             pv = b->a + bdiag[row+1]+1;
1396             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1397             for (j=0; j<nz; j++) {
1398               col         = pj[j];
1399               rtmp1[col] -= mul1 * pv[j];
1400               rtmp2[col] -= mul2 * pv[j];
1401             }
1402             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1403           }
1404           row = *bjtmp++;
1405         }
1406 
1407         /* finished row i; check zero pivot, then stick row i into b->a */
1408         rs = 0.0;
1409         /* L part */
1410         pc1 = b->a + bi[i];
1411         pj  = b->j + bi[i];
1412         nz  = bi[i+1] - bi[i];
1413         for (j=0; j<nz; j++) {
1414           col    = pj[j];
1415           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1416         }
1417         /* U part */
1418         pc1 = b->a + bdiag[i+1]+1;
1419         pj  = b->j + bdiag[i+1]+1;
1420         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1421         for (j=0; j<nz; j++) {
1422           col    = pj[j];
1423           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1424         }
1425 
1426         sctx.rs = rs;
1427         sctx.pv = rtmp1[i];
1428         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1429         if (sctx.newshift) break;
1430         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1431         *pc1 = 1.0/sctx.pv;
1432 
1433         /* Now take care of diagonal 2x2 block. */
1434         pc2 = rtmp2 + i;
1435         if (*pc2 != 0.0) {
1436           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1437           *pc2 = mul1;          /* insert L entry */
1438           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1439           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1440           for (j=0; j<nz; j++) {
1441             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1442           }
1443           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1444         }
1445 
1446         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1447         rs = 0.0;
1448         /* L part */
1449         pc2 = b->a + bi[i+1];
1450         pj  = b->j + bi[i+1];
1451         nz  = bi[i+2] - bi[i+1];
1452         for (j=0; j<nz; j++) {
1453           col    = pj[j];
1454           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1455         }
1456         /* U part */
1457         pc2 = b->a + bdiag[i+2]+1;
1458         pj  = b->j + bdiag[i+2]+1;
1459         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1460         for (j=0; j<nz; j++) {
1461           col    = pj[j];
1462           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1463         }
1464 
1465         sctx.rs = rs;
1466         sctx.pv = rtmp2[i+1];
1467         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1468         if (sctx.newshift) break;
1469         pc2  = b->a + bdiag[i+1];
1470         *pc2 = 1.0/sctx.pv;
1471         break;
1472 
1473       case 3:
1474         /*----------*/
1475         /* zero rtmp */
1476         /* L part */
1477         nz    = bi[i+1] - bi[i];
1478         bjtmp = bj + bi[i];
1479         for  (j=0; j<nz; j++) {
1480           col        = bjtmp[j];
1481           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1482         }
1483 
1484         /* U part */
1485         nz    = bdiag[i]-bdiag[i+1];
1486         bjtmp = bj + bdiag[i+1]+1;
1487         for  (j=0; j<nz; j++) {
1488           col        = bjtmp[j];
1489           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1490         }
1491 
1492         /* load in initial (unfactored row) */
1493         nz    = ai[r[i]+1] - ai[r[i]];
1494         ajtmp = aj + ai[r[i]];
1495         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1496         for (j=0; j<nz; j++) {
1497           col        = ics[ajtmp[j]];
1498           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1499         }
1500         /* ZeropivotApply(): shift the diagonal of the matrix  */
1501         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1502 
1503         /* elimination */
1504         bjtmp = bj + bi[i];
1505         row   = *bjtmp++; /* pivot row */
1506         nzL   = bi[i+1] - bi[i];
1507         for (k=0; k < nzL; k++) {
1508           pc1 = rtmp1 + row;
1509           pc2 = rtmp2 + row;
1510           pc3 = rtmp3 + row;
1511           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1512             pv   = b->a + bdiag[row];
1513             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1514             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1515 
1516             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1517             pv = b->a + bdiag[row+1]+1;
1518             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1519             for (j=0; j<nz; j++) {
1520               col         = pj[j];
1521               rtmp1[col] -= mul1 * pv[j];
1522               rtmp2[col] -= mul2 * pv[j];
1523               rtmp3[col] -= mul3 * pv[j];
1524             }
1525             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1526           }
1527           row = *bjtmp++;
1528         }
1529 
1530         /* finished row i; check zero pivot, then stick row i into b->a */
1531         rs = 0.0;
1532         /* L part */
1533         pc1 = b->a + bi[i];
1534         pj  = b->j + bi[i];
1535         nz  = bi[i+1] - bi[i];
1536         for (j=0; j<nz; j++) {
1537           col    = pj[j];
1538           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1539         }
1540         /* U part */
1541         pc1 = b->a + bdiag[i+1]+1;
1542         pj  = b->j + bdiag[i+1]+1;
1543         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1544         for (j=0; j<nz; j++) {
1545           col    = pj[j];
1546           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1547         }
1548 
1549         sctx.rs = rs;
1550         sctx.pv = rtmp1[i];
1551         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1552         if (sctx.newshift) break;
1553         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1554         *pc1 = 1.0/sctx.pv;
1555 
1556         /* Now take care of 1st column of diagonal 3x3 block. */
1557         pc2 = rtmp2 + i;
1558         pc3 = rtmp3 + i;
1559         if (*pc2 != 0.0 || *pc3 != 0.0) {
1560           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1561           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1562           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1563           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1564           for (j=0; j<nz; j++) {
1565             col         = pj[j];
1566             rtmp2[col] -= mul2 * rtmp1[col];
1567             rtmp3[col] -= mul3 * rtmp1[col];
1568           }
1569           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1570         }
1571 
1572         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1573         rs = 0.0;
1574         /* L part */
1575         pc2 = b->a + bi[i+1];
1576         pj  = b->j + bi[i+1];
1577         nz  = bi[i+2] - bi[i+1];
1578         for (j=0; j<nz; j++) {
1579           col    = pj[j];
1580           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1581         }
1582         /* U part */
1583         pc2 = b->a + bdiag[i+2]+1;
1584         pj  = b->j + bdiag[i+2]+1;
1585         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1586         for (j=0; j<nz; j++) {
1587           col    = pj[j];
1588           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1589         }
1590 
1591         sctx.rs = rs;
1592         sctx.pv = rtmp2[i+1];
1593         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1594         if (sctx.newshift) break;
1595         pc2  = b->a + bdiag[i+1];
1596         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1597 
1598         /* Now take care of 2nd column of diagonal 3x3 block. */
1599         pc3 = rtmp3 + i+1;
1600         if (*pc3 != 0.0) {
1601           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1602           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1603           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1604           for (j=0; j<nz; j++) {
1605             col         = pj[j];
1606             rtmp3[col] -= mul3 * rtmp2[col];
1607           }
1608           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1609         }
1610 
1611         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1612         rs = 0.0;
1613         /* L part */
1614         pc3 = b->a + bi[i+2];
1615         pj  = b->j + bi[i+2];
1616         nz  = bi[i+3] - bi[i+2];
1617         for (j=0; j<nz; j++) {
1618           col    = pj[j];
1619           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1620         }
1621         /* U part */
1622         pc3 = b->a + bdiag[i+3]+1;
1623         pj  = b->j + bdiag[i+3]+1;
1624         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1625         for (j=0; j<nz; j++) {
1626           col    = pj[j];
1627           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1628         }
1629 
1630         sctx.rs = rs;
1631         sctx.pv = rtmp3[i+2];
1632         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1633         if (sctx.newshift) break;
1634         pc3  = b->a + bdiag[i+2];
1635         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1636         break;
1637       case 4:
1638         /*----------*/
1639         /* zero rtmp */
1640         /* L part */
1641         nz    = bi[i+1] - bi[i];
1642         bjtmp = bj + bi[i];
1643         for  (j=0; j<nz; j++) {
1644           col        = bjtmp[j];
1645           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1646         }
1647 
1648         /* U part */
1649         nz    = bdiag[i]-bdiag[i+1];
1650         bjtmp = bj + bdiag[i+1]+1;
1651         for  (j=0; j<nz; j++) {
1652           col        = bjtmp[j];
1653           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1654         }
1655 
1656         /* load in initial (unfactored row) */
1657         nz    = ai[r[i]+1] - ai[r[i]];
1658         ajtmp = aj + ai[r[i]];
1659         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1660         for (j=0; j<nz; j++) {
1661           col        = ics[ajtmp[j]];
1662           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1663         }
1664         /* ZeropivotApply(): shift the diagonal of the matrix  */
1665         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1666 
1667         /* elimination */
1668         bjtmp = bj + bi[i];
1669         row   = *bjtmp++; /* pivot row */
1670         nzL   = bi[i+1] - bi[i];
1671         for (k=0; k < nzL; k++) {
1672           pc1 = rtmp1 + row;
1673           pc2 = rtmp2 + row;
1674           pc3 = rtmp3 + row;
1675           pc4 = rtmp4 + row;
1676           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1677             pv   = b->a + bdiag[row];
1678             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1679             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1680 
1681             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1682             pv = b->a + bdiag[row+1]+1;
1683             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1684             for (j=0; j<nz; j++) {
1685               col         = pj[j];
1686               rtmp1[col] -= mul1 * pv[j];
1687               rtmp2[col] -= mul2 * pv[j];
1688               rtmp3[col] -= mul3 * pv[j];
1689               rtmp4[col] -= mul4 * pv[j];
1690             }
1691             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1692           }
1693           row = *bjtmp++;
1694         }
1695 
1696         /* finished row i; check zero pivot, then stick row i into b->a */
1697         rs = 0.0;
1698         /* L part */
1699         pc1 = b->a + bi[i];
1700         pj  = b->j + bi[i];
1701         nz  = bi[i+1] - bi[i];
1702         for (j=0; j<nz; j++) {
1703           col    = pj[j];
1704           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1705         }
1706         /* U part */
1707         pc1 = b->a + bdiag[i+1]+1;
1708         pj  = b->j + bdiag[i+1]+1;
1709         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1710         for (j=0; j<nz; j++) {
1711           col    = pj[j];
1712           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1713         }
1714 
1715         sctx.rs = rs;
1716         sctx.pv = rtmp1[i];
1717         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1718         if (sctx.newshift) break;
1719         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1720         *pc1 = 1.0/sctx.pv;
1721 
1722         /* Now take care of 1st column of diagonal 4x4 block. */
1723         pc2 = rtmp2 + i;
1724         pc3 = rtmp3 + i;
1725         pc4 = rtmp4 + i;
1726         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1727           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1728           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1729           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1730           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1731           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1732           for (j=0; j<nz; j++) {
1733             col         = pj[j];
1734             rtmp2[col] -= mul2 * rtmp1[col];
1735             rtmp3[col] -= mul3 * rtmp1[col];
1736             rtmp4[col] -= mul4 * rtmp1[col];
1737           }
1738           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1739         }
1740 
1741         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1742         rs = 0.0;
1743         /* L part */
1744         pc2 = b->a + bi[i+1];
1745         pj  = b->j + bi[i+1];
1746         nz  = bi[i+2] - bi[i+1];
1747         for (j=0; j<nz; j++) {
1748           col    = pj[j];
1749           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1750         }
1751         /* U part */
1752         pc2 = b->a + bdiag[i+2]+1;
1753         pj  = b->j + bdiag[i+2]+1;
1754         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1755         for (j=0; j<nz; j++) {
1756           col    = pj[j];
1757           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1758         }
1759 
1760         sctx.rs = rs;
1761         sctx.pv = rtmp2[i+1];
1762         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1763         if (sctx.newshift) break;
1764         pc2  = b->a + bdiag[i+1];
1765         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1766 
1767         /* Now take care of 2nd column of diagonal 4x4 block. */
1768         pc3 = rtmp3 + i+1;
1769         pc4 = rtmp4 + i+1;
1770         if (*pc3 != 0.0 || *pc4 != 0.0) {
1771           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1772           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1773           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1774           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1775           for (j=0; j<nz; j++) {
1776             col         = pj[j];
1777             rtmp3[col] -= mul3 * rtmp2[col];
1778             rtmp4[col] -= mul4 * rtmp2[col];
1779           }
1780           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1781         }
1782 
1783         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1784         rs = 0.0;
1785         /* L part */
1786         pc3 = b->a + bi[i+2];
1787         pj  = b->j + bi[i+2];
1788         nz  = bi[i+3] - bi[i+2];
1789         for (j=0; j<nz; j++) {
1790           col    = pj[j];
1791           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1792         }
1793         /* U part */
1794         pc3 = b->a + bdiag[i+3]+1;
1795         pj  = b->j + bdiag[i+3]+1;
1796         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1797         for (j=0; j<nz; j++) {
1798           col    = pj[j];
1799           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1800         }
1801 
1802         sctx.rs = rs;
1803         sctx.pv = rtmp3[i+2];
1804         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1805         if (sctx.newshift) break;
1806         pc3  = b->a + bdiag[i+2];
1807         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1808 
1809         /* Now take care of 3rd column of diagonal 4x4 block. */
1810         pc4 = rtmp4 + i+2;
1811         if (*pc4 != 0.0) {
1812           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1813           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1814           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1815           for (j=0; j<nz; j++) {
1816             col         = pj[j];
1817             rtmp4[col] -= mul4 * rtmp3[col];
1818           }
1819           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1820         }
1821 
1822         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1823         rs = 0.0;
1824         /* L part */
1825         pc4 = b->a + bi[i+3];
1826         pj  = b->j + bi[i+3];
1827         nz  = bi[i+4] - bi[i+3];
1828         for (j=0; j<nz; j++) {
1829           col    = pj[j];
1830           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1831         }
1832         /* U part */
1833         pc4 = b->a + bdiag[i+4]+1;
1834         pj  = b->j + bdiag[i+4]+1;
1835         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1836         for (j=0; j<nz; j++) {
1837           col    = pj[j];
1838           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1839         }
1840 
1841         sctx.rs = rs;
1842         sctx.pv = rtmp4[i+3];
1843         ierr    = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1844         if (sctx.newshift) break;
1845         pc4  = b->a + bdiag[i+3];
1846         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1847         break;
1848 
1849       default:
1850         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1851       }
1852       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1853       i += nodesz;                 /* Update the row */
1854     }
1855 
1856     /* MatPivotRefine() */
1857     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1858       /*
1859        * if no shift in this attempt & shifting & started shifting & can refine,
1860        * then try lower shift
1861        */
1862       sctx.shift_hi       = sctx.shift_fraction;
1863       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1864       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1865       sctx.newshift       = PETSC_TRUE;
1866       sctx.nshift++;
1867     }
1868   } while (sctx.newshift);
1869 
1870   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1871   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1872   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1873   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1874 
1875   if (b->inode.size) {
1876     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1877   } else {
1878     C->ops->solve           = MatSolve_SeqAIJ;
1879   }
1880   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1881   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1882   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1883   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1884   C->assembled              = PETSC_TRUE;
1885   C->preallocated           = PETSC_TRUE;
1886 
1887   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1888 
1889   /* MatShiftView(A,info,&sctx) */
1890   if (sctx.nshift) {
1891     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1892       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1893     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1894       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1895     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1896       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1897     }
1898   }
1899   PetscFunctionReturn(0);
1900 }
1901 
1902 #undef __FUNCT__
1903 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1904 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1905 {
1906   Mat             C     = B;
1907   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1908   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1909   PetscErrorCode  ierr;
1910   const PetscInt  *r,*ic,*c,*ics;
1911   PetscInt        n   = A->rmap->n,*bi = b->i;
1912   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1913   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1914   PetscInt        *ai = a->i,*aj = a->j;
1915   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1916   PetscScalar     mul1,mul2,mul3,tmp;
1917   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1918   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1919   PetscReal       rs=0.0;
1920   FactorShiftCtx  sctx;
1921 
1922   PetscFunctionBegin;
1923   sctx.shift_top      = 0;
1924   sctx.nshift_max     = 0;
1925   sctx.shift_lo       = 0;
1926   sctx.shift_hi       = 0;
1927   sctx.shift_fraction = 0;
1928 
1929   /* if both shift schemes are chosen by user, only use info->shiftpd */
1930   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1931     sctx.shift_top = 0;
1932     for (i=0; i<n; i++) {
1933       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1934       rs    = 0.0;
1935       ajtmp = aj + ai[i];
1936       rtmp1 = aa + ai[i];
1937       nz    = ai[i+1] - ai[i];
1938       for (j=0; j<nz; j++) {
1939         if (*ajtmp != i) {
1940           rs += PetscAbsScalar(*rtmp1++);
1941         } else {
1942           rs -= PetscRealPart(*rtmp1++);
1943         }
1944         ajtmp++;
1945       }
1946       if (rs>sctx.shift_top) sctx.shift_top = rs;
1947     }
1948     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1949     sctx.shift_top *= 1.1;
1950     sctx.nshift_max = 5;
1951     sctx.shift_lo   = 0.;
1952     sctx.shift_hi   = 1.;
1953   }
1954   sctx.shift_amount = 0;
1955   sctx.nshift       = 0;
1956 
1957   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1958   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1959   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1960   ierr   = PetscMalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1961   ierr   = PetscMemzero(rtmp11,n*sizeof(PetscScalar));CHKERRQ(ierr);
1962   ierr   = PetscMemzero(rtmp22,n*sizeof(PetscScalar));CHKERRQ(ierr);
1963   ierr   = PetscMemzero(rtmp33,n*sizeof(PetscScalar));CHKERRQ(ierr);
1964   ics    = ic;
1965 
1966   node_max = a->inode.node_count;
1967   ns       = a->inode.size;
1968   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1969 
1970   /* If max inode size > 3, split it into two inodes.*/
1971   /* also map the inode sizes according to the ordering */
1972   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1973   for (i=0,j=0; i<node_max; ++i,++j) {
1974     if (ns[i]>3) {
1975       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1976       ++j;
1977       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1978     } else {
1979       tmp_vec1[j] = ns[i];
1980     }
1981   }
1982   /* Use the correct node_max */
1983   node_max = j;
1984 
1985   /* Now reorder the inode info based on mat re-ordering info */
1986   /* First create a row -> inode_size_array_index map */
1987   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1988   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1989   for (i=0,row=0; i<node_max; i++) {
1990     nodesz = tmp_vec1[i];
1991     for (j=0; j<nodesz; j++,row++) {
1992       nsmap[row] = i;
1993     }
1994   }
1995   /* Using nsmap, create a reordered ns structure */
1996   for (i=0,j=0; i< node_max; i++) {
1997     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1998     tmp_vec2[i] = nodesz;
1999     j          += nodesz;
2000   }
2001   ierr = PetscFree(nsmap);CHKERRQ(ierr);
2002   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
2003   /* Now use the correct ns */
2004   ns = tmp_vec2;
2005 
2006   do {
2007     sctx.newshift = PETSC_FALSE;
2008     /* Now loop over each block-row, and do the factorization */
2009     for (i=0,row=0; i<node_max; i++) {
2010       nodesz = ns[i];
2011       nz     = bi[row+1] - bi[row];
2012       bjtmp  = bj + bi[row];
2013 
2014       switch (nodesz) {
2015       case 1:
2016         for  (j=0; j<nz; j++) {
2017           idx         = bjtmp[j];
2018           rtmp11[idx] = 0.0;
2019         }
2020 
2021         /* load in initial (unfactored row) */
2022         idx    = r[row];
2023         nz_tmp = ai[idx+1] - ai[idx];
2024         ajtmp  = aj + ai[idx];
2025         v1     = aa + ai[idx];
2026 
2027         for (j=0; j<nz_tmp; j++) {
2028           idx         = ics[ajtmp[j]];
2029           rtmp11[idx] = v1[j];
2030         }
2031         rtmp11[ics[r[row]]] += sctx.shift_amount;
2032 
2033         prow = *bjtmp++;
2034         while (prow < row) {
2035           pc1 = rtmp11 + prow;
2036           if (*pc1 != 0.0) {
2037             pv     = ba + bd[prow];
2038             pj     = nbj + bd[prow];
2039             mul1   = *pc1 * *pv++;
2040             *pc1   = mul1;
2041             nz_tmp = bi[prow+1] - bd[prow] - 1;
2042             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2043             for (j=0; j<nz_tmp; j++) {
2044               tmp          = pv[j];
2045               idx          = pj[j];
2046               rtmp11[idx] -= mul1 * tmp;
2047             }
2048           }
2049           prow = *bjtmp++;
2050         }
2051         pj  = bj + bi[row];
2052         pc1 = ba + bi[row];
2053 
2054         sctx.pv     = rtmp11[row];
2055         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2056         rs          = 0.0;
2057         for (j=0; j<nz; j++) {
2058           idx    = pj[j];
2059           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2060           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2061         }
2062         sctx.rs = rs;
2063         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2064         if (sctx.newshift) goto endofwhile;
2065         break;
2066 
2067       case 2:
2068         for (j=0; j<nz; j++) {
2069           idx         = bjtmp[j];
2070           rtmp11[idx] = 0.0;
2071           rtmp22[idx] = 0.0;
2072         }
2073 
2074         /* load in initial (unfactored row) */
2075         idx    = r[row];
2076         nz_tmp = ai[idx+1] - ai[idx];
2077         ajtmp  = aj + ai[idx];
2078         v1     = aa + ai[idx];
2079         v2     = aa + ai[idx+1];
2080         for (j=0; j<nz_tmp; j++) {
2081           idx         = ics[ajtmp[j]];
2082           rtmp11[idx] = v1[j];
2083           rtmp22[idx] = v2[j];
2084         }
2085         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2086         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2087 
2088         prow = *bjtmp++;
2089         while (prow < row) {
2090           pc1 = rtmp11 + prow;
2091           pc2 = rtmp22 + prow;
2092           if (*pc1 != 0.0 || *pc2 != 0.0) {
2093             pv   = ba + bd[prow];
2094             pj   = nbj + bd[prow];
2095             mul1 = *pc1 * *pv;
2096             mul2 = *pc2 * *pv;
2097             ++pv;
2098             *pc1 = mul1;
2099             *pc2 = mul2;
2100 
2101             nz_tmp = bi[prow+1] - bd[prow] - 1;
2102             for (j=0; j<nz_tmp; j++) {
2103               tmp          = pv[j];
2104               idx          = pj[j];
2105               rtmp11[idx] -= mul1 * tmp;
2106               rtmp22[idx] -= mul2 * tmp;
2107             }
2108             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2109           }
2110           prow = *bjtmp++;
2111         }
2112 
2113         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2114         pc1 = rtmp11 + prow;
2115         pc2 = rtmp22 + prow;
2116 
2117         sctx.pv = *pc1;
2118         pj      = bj + bi[prow];
2119         rs      = 0.0;
2120         for (j=0; j<nz; j++) {
2121           idx = pj[j];
2122           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2123         }
2124         sctx.rs = rs;
2125         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2126         if (sctx.newshift) goto endofwhile;
2127 
2128         if (*pc2 != 0.0) {
2129           pj     = nbj + bd[prow];
2130           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2131           *pc2   = mul2;
2132           nz_tmp = bi[prow+1] - bd[prow] - 1;
2133           for (j=0; j<nz_tmp; j++) {
2134             idx          = pj[j];
2135             tmp          = rtmp11[idx];
2136             rtmp22[idx] -= mul2 * tmp;
2137           }
2138           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2139         }
2140 
2141         pj  = bj + bi[row];
2142         pc1 = ba + bi[row];
2143         pc2 = ba + bi[row+1];
2144 
2145         sctx.pv       = rtmp22[row+1];
2146         rs            = 0.0;
2147         rtmp11[row]   = 1.0/rtmp11[row];
2148         rtmp22[row+1] = 1.0/rtmp22[row+1];
2149         /* copy row entries from dense representation to sparse */
2150         for (j=0; j<nz; j++) {
2151           idx    = pj[j];
2152           pc1[j] = rtmp11[idx];
2153           pc2[j] = rtmp22[idx];
2154           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2155         }
2156         sctx.rs = rs;
2157         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2158         if (sctx.newshift) goto endofwhile;
2159         break;
2160 
2161       case 3:
2162         for  (j=0; j<nz; j++) {
2163           idx         = bjtmp[j];
2164           rtmp11[idx] = 0.0;
2165           rtmp22[idx] = 0.0;
2166           rtmp33[idx] = 0.0;
2167         }
2168         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2169         idx    = r[row];
2170         nz_tmp = ai[idx+1] - ai[idx];
2171         ajtmp  = aj + ai[idx];
2172         v1     = aa + ai[idx];
2173         v2     = aa + ai[idx+1];
2174         v3     = aa + ai[idx+2];
2175         for (j=0; j<nz_tmp; j++) {
2176           idx         = ics[ajtmp[j]];
2177           rtmp11[idx] = v1[j];
2178           rtmp22[idx] = v2[j];
2179           rtmp33[idx] = v3[j];
2180         }
2181         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2182         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2183         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2184 
2185         /* loop over all pivot row blocks above this row block */
2186         prow = *bjtmp++;
2187         while (prow < row) {
2188           pc1 = rtmp11 + prow;
2189           pc2 = rtmp22 + prow;
2190           pc3 = rtmp33 + prow;
2191           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2192             pv   = ba  + bd[prow];
2193             pj   = nbj + bd[prow];
2194             mul1 = *pc1 * *pv;
2195             mul2 = *pc2 * *pv;
2196             mul3 = *pc3 * *pv;
2197             ++pv;
2198             *pc1 = mul1;
2199             *pc2 = mul2;
2200             *pc3 = mul3;
2201 
2202             nz_tmp = bi[prow+1] - bd[prow] - 1;
2203             /* update this row based on pivot row */
2204             for (j=0; j<nz_tmp; j++) {
2205               tmp          = pv[j];
2206               idx          = pj[j];
2207               rtmp11[idx] -= mul1 * tmp;
2208               rtmp22[idx] -= mul2 * tmp;
2209               rtmp33[idx] -= mul3 * tmp;
2210             }
2211             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2212           }
2213           prow = *bjtmp++;
2214         }
2215 
2216         /* Now take care of diagonal 3x3 block in this set of rows */
2217         /* note: prow = row here */
2218         pc1 = rtmp11 + prow;
2219         pc2 = rtmp22 + prow;
2220         pc3 = rtmp33 + prow;
2221 
2222         sctx.pv = *pc1;
2223         pj      = bj + bi[prow];
2224         rs      = 0.0;
2225         for (j=0; j<nz; j++) {
2226           idx = pj[j];
2227           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2228         }
2229         sctx.rs = rs;
2230         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2231         if (sctx.newshift) goto endofwhile;
2232 
2233         if (*pc2 != 0.0 || *pc3 != 0.0) {
2234           mul2   = (*pc2)/(*pc1);
2235           mul3   = (*pc3)/(*pc1);
2236           *pc2   = mul2;
2237           *pc3   = mul3;
2238           nz_tmp = bi[prow+1] - bd[prow] - 1;
2239           pj     = nbj + bd[prow];
2240           for (j=0; j<nz_tmp; j++) {
2241             idx          = pj[j];
2242             tmp          = rtmp11[idx];
2243             rtmp22[idx] -= mul2 * tmp;
2244             rtmp33[idx] -= mul3 * tmp;
2245           }
2246           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2247         }
2248         ++prow;
2249 
2250         pc2     = rtmp22 + prow;
2251         pc3     = rtmp33 + prow;
2252         sctx.pv = *pc2;
2253         pj      = bj + bi[prow];
2254         rs      = 0.0;
2255         for (j=0; j<nz; j++) {
2256           idx = pj[j];
2257           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2258         }
2259         sctx.rs = rs;
2260         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2261         if (sctx.newshift) goto endofwhile;
2262 
2263         if (*pc3 != 0.0) {
2264           mul3   = (*pc3)/(*pc2);
2265           *pc3   = mul3;
2266           pj     = nbj + bd[prow];
2267           nz_tmp = bi[prow+1] - bd[prow] - 1;
2268           for (j=0; j<nz_tmp; j++) {
2269             idx          = pj[j];
2270             tmp          = rtmp22[idx];
2271             rtmp33[idx] -= mul3 * tmp;
2272           }
2273           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2274         }
2275 
2276         pj  = bj + bi[row];
2277         pc1 = ba + bi[row];
2278         pc2 = ba + bi[row+1];
2279         pc3 = ba + bi[row+2];
2280 
2281         sctx.pv       = rtmp33[row+2];
2282         rs            = 0.0;
2283         rtmp11[row]   = 1.0/rtmp11[row];
2284         rtmp22[row+1] = 1.0/rtmp22[row+1];
2285         rtmp33[row+2] = 1.0/rtmp33[row+2];
2286         /* copy row entries from dense representation to sparse */
2287         for (j=0; j<nz; j++) {
2288           idx    = pj[j];
2289           pc1[j] = rtmp11[idx];
2290           pc2[j] = rtmp22[idx];
2291           pc3[j] = rtmp33[idx];
2292           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2293         }
2294 
2295         sctx.rs = rs;
2296         ierr    = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2297         if (sctx.newshift) goto endofwhile;
2298         break;
2299 
2300       default:
2301         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2302       }
2303       row += nodesz;                 /* Update the row */
2304     }
2305 endofwhile:;
2306   } while (sctx.newshift);
2307   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2308   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2309   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2310   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2311   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2312 
2313   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2314   /* do not set solve add, since MatSolve_Inode + Add is faster */
2315   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2316   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2317   C->assembled              = PETSC_TRUE;
2318   C->preallocated           = PETSC_TRUE;
2319   if (sctx.nshift) {
2320     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2321       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2322     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2323       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2324     }
2325   }
2326   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2327   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2328   PetscFunctionReturn(0);
2329 }
2330 
2331 
2332 /* ----------------------------------------------------------- */
2333 #undef __FUNCT__
2334 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2335 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2336 {
2337   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2338   IS                iscol = a->col,isrow = a->row;
2339   PetscErrorCode    ierr;
2340   const PetscInt    *r,*c,*rout,*cout;
2341   PetscInt          i,j,n = A->rmap->n;
2342   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2343   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2344   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2345   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2346   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2347   const PetscScalar *b;
2348 
2349   PetscFunctionBegin;
2350   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2351   node_max = a->inode.node_count;
2352   ns       = a->inode.size;     /* Node Size array */
2353 
2354   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2355   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2356   tmp  = a->solve_work;
2357 
2358   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2359   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2360 
2361   /* forward solve the lower triangular */
2362   tmps = tmp;
2363   aa   = a_a;
2364   aj   = a_j;
2365   ad   = a->diag;
2366 
2367   for (i = 0,row = 0; i< node_max; ++i) {
2368     nsz = ns[i];
2369     aii = ai[row];
2370     v1  = aa + aii;
2371     vi  = aj + aii;
2372     nz  = ai[row+1]- ai[row];
2373 
2374     if (i < node_max-1) {
2375       /* Prefetch the indices for the next block */
2376       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2377       /* Prefetch the data for the next block */
2378       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2379     }
2380 
2381     switch (nsz) {               /* Each loop in 'case' is unrolled */
2382     case 1:
2383       sum1 = b[r[row]];
2384       for (j=0; j<nz-1; j+=2) {
2385         i0    = vi[j];
2386         i1    = vi[j+1];
2387         tmp0  = tmps[i0];
2388         tmp1  = tmps[i1];
2389         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2390       }
2391       if (j == nz-1) {
2392         tmp0  = tmps[vi[j]];
2393         sum1 -= v1[j]*tmp0;
2394       }
2395       tmp[row++]=sum1;
2396       break;
2397     case 2:
2398       sum1 = b[r[row]];
2399       sum2 = b[r[row+1]];
2400       v2   = aa + ai[row+1];
2401 
2402       for (j=0; j<nz-1; j+=2) {
2403         i0    = vi[j];
2404         i1    = vi[j+1];
2405         tmp0  = tmps[i0];
2406         tmp1  = tmps[i1];
2407         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2408         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2409       }
2410       if (j == nz-1) {
2411         tmp0  = tmps[vi[j]];
2412         sum1 -= v1[j] *tmp0;
2413         sum2 -= v2[j] *tmp0;
2414       }
2415       sum2     -= v2[nz] * sum1;
2416       tmp[row++]=sum1;
2417       tmp[row++]=sum2;
2418       break;
2419     case 3:
2420       sum1 = b[r[row]];
2421       sum2 = b[r[row+1]];
2422       sum3 = b[r[row+2]];
2423       v2   = aa + ai[row+1];
2424       v3   = aa + ai[row+2];
2425 
2426       for (j=0; j<nz-1; j+=2) {
2427         i0    = vi[j];
2428         i1    = vi[j+1];
2429         tmp0  = tmps[i0];
2430         tmp1  = tmps[i1];
2431         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2432         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2433         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2434       }
2435       if (j == nz-1) {
2436         tmp0  = tmps[vi[j]];
2437         sum1 -= v1[j] *tmp0;
2438         sum2 -= v2[j] *tmp0;
2439         sum3 -= v3[j] *tmp0;
2440       }
2441       sum2     -= v2[nz] * sum1;
2442       sum3     -= v3[nz] * sum1;
2443       sum3     -= v3[nz+1] * sum2;
2444       tmp[row++]=sum1;
2445       tmp[row++]=sum2;
2446       tmp[row++]=sum3;
2447       break;
2448 
2449     case 4:
2450       sum1 = b[r[row]];
2451       sum2 = b[r[row+1]];
2452       sum3 = b[r[row+2]];
2453       sum4 = b[r[row+3]];
2454       v2   = aa + ai[row+1];
2455       v3   = aa + ai[row+2];
2456       v4   = aa + ai[row+3];
2457 
2458       for (j=0; j<nz-1; j+=2) {
2459         i0    = vi[j];
2460         i1    = vi[j+1];
2461         tmp0  = tmps[i0];
2462         tmp1  = tmps[i1];
2463         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2464         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2465         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2466         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2467       }
2468       if (j == nz-1) {
2469         tmp0  = tmps[vi[j]];
2470         sum1 -= v1[j] *tmp0;
2471         sum2 -= v2[j] *tmp0;
2472         sum3 -= v3[j] *tmp0;
2473         sum4 -= v4[j] *tmp0;
2474       }
2475       sum2 -= v2[nz] * sum1;
2476       sum3 -= v3[nz] * sum1;
2477       sum4 -= v4[nz] * sum1;
2478       sum3 -= v3[nz+1] * sum2;
2479       sum4 -= v4[nz+1] * sum2;
2480       sum4 -= v4[nz+2] * sum3;
2481 
2482       tmp[row++]=sum1;
2483       tmp[row++]=sum2;
2484       tmp[row++]=sum3;
2485       tmp[row++]=sum4;
2486       break;
2487     case 5:
2488       sum1 = b[r[row]];
2489       sum2 = b[r[row+1]];
2490       sum3 = b[r[row+2]];
2491       sum4 = b[r[row+3]];
2492       sum5 = b[r[row+4]];
2493       v2   = aa + ai[row+1];
2494       v3   = aa + ai[row+2];
2495       v4   = aa + ai[row+3];
2496       v5   = aa + ai[row+4];
2497 
2498       for (j=0; j<nz-1; j+=2) {
2499         i0    = vi[j];
2500         i1    = vi[j+1];
2501         tmp0  = tmps[i0];
2502         tmp1  = tmps[i1];
2503         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2504         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2505         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2506         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2507         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2508       }
2509       if (j == nz-1) {
2510         tmp0  = tmps[vi[j]];
2511         sum1 -= v1[j] *tmp0;
2512         sum2 -= v2[j] *tmp0;
2513         sum3 -= v3[j] *tmp0;
2514         sum4 -= v4[j] *tmp0;
2515         sum5 -= v5[j] *tmp0;
2516       }
2517 
2518       sum2 -= v2[nz] * sum1;
2519       sum3 -= v3[nz] * sum1;
2520       sum4 -= v4[nz] * sum1;
2521       sum5 -= v5[nz] * sum1;
2522       sum3 -= v3[nz+1] * sum2;
2523       sum4 -= v4[nz+1] * sum2;
2524       sum5 -= v5[nz+1] * sum2;
2525       sum4 -= v4[nz+2] * sum3;
2526       sum5 -= v5[nz+2] * sum3;
2527       sum5 -= v5[nz+3] * sum4;
2528 
2529       tmp[row++]=sum1;
2530       tmp[row++]=sum2;
2531       tmp[row++]=sum3;
2532       tmp[row++]=sum4;
2533       tmp[row++]=sum5;
2534       break;
2535     default:
2536       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2537     }
2538   }
2539   /* backward solve the upper triangular */
2540   for (i=node_max -1,row = n-1; i>=0; i--) {
2541     nsz = ns[i];
2542     aii = ad[row+1] + 1;
2543     v1  = aa + aii;
2544     vi  = aj + aii;
2545     nz  = ad[row]- ad[row+1] - 1;
2546 
2547     if (i > 0) {
2548       /* Prefetch the indices for the next block */
2549       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2550       /* Prefetch the data for the next block */
2551       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2552     }
2553 
2554     switch (nsz) {               /* Each loop in 'case' is unrolled */
2555     case 1:
2556       sum1 = tmp[row];
2557 
2558       for (j=0; j<nz-1; j+=2) {
2559         i0    = vi[j];
2560         i1    = vi[j+1];
2561         tmp0  = tmps[i0];
2562         tmp1  = tmps[i1];
2563         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2564       }
2565       if (j == nz-1) {
2566         tmp0  = tmps[vi[j]];
2567         sum1 -= v1[j]*tmp0;
2568       }
2569       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2570       break;
2571     case 2:
2572       sum1 = tmp[row];
2573       sum2 = tmp[row-1];
2574       v2   = aa + ad[row] + 1;
2575       for (j=0; j<nz-1; j+=2) {
2576         i0    = vi[j];
2577         i1    = vi[j+1];
2578         tmp0  = tmps[i0];
2579         tmp1  = tmps[i1];
2580         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2581         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2582       }
2583       if (j == nz-1) {
2584         tmp0  = tmps[vi[j]];
2585         sum1 -= v1[j]* tmp0;
2586         sum2 -= v2[j+1]* tmp0;
2587       }
2588 
2589       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2590       sum2     -= v2[0] * tmp0;
2591       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2592       break;
2593     case 3:
2594       sum1 = tmp[row];
2595       sum2 = tmp[row -1];
2596       sum3 = tmp[row -2];
2597       v2   = aa + ad[row] + 1;
2598       v3   = aa + ad[row -1] + 1;
2599       for (j=0; j<nz-1; j+=2) {
2600         i0    = vi[j];
2601         i1    = vi[j+1];
2602         tmp0  = tmps[i0];
2603         tmp1  = tmps[i1];
2604         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2605         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2606         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2607       }
2608       if (j== nz-1) {
2609         tmp0  = tmps[vi[j]];
2610         sum1 -= v1[j] * tmp0;
2611         sum2 -= v2[j+1] * tmp0;
2612         sum3 -= v3[j+2] * tmp0;
2613       }
2614       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2615       sum2     -= v2[0]* tmp0;
2616       sum3     -= v3[1] * tmp0;
2617       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2618       sum3     -= v3[0]* tmp0;
2619       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2620 
2621       break;
2622     case 4:
2623       sum1 = tmp[row];
2624       sum2 = tmp[row -1];
2625       sum3 = tmp[row -2];
2626       sum4 = tmp[row -3];
2627       v2   = aa + ad[row]+1;
2628       v3   = aa + ad[row -1]+1;
2629       v4   = aa + ad[row -2]+1;
2630 
2631       for (j=0; j<nz-1; j+=2) {
2632         i0    = vi[j];
2633         i1    = vi[j+1];
2634         tmp0  = tmps[i0];
2635         tmp1  = tmps[i1];
2636         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2637         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2638         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2639         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2640       }
2641       if (j== nz-1) {
2642         tmp0  = tmps[vi[j]];
2643         sum1 -= v1[j] * tmp0;
2644         sum2 -= v2[j+1] * tmp0;
2645         sum3 -= v3[j+2] * tmp0;
2646         sum4 -= v4[j+3] * tmp0;
2647       }
2648 
2649       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2650       sum2     -= v2[0] * tmp0;
2651       sum3     -= v3[1] * tmp0;
2652       sum4     -= v4[2] * tmp0;
2653       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2654       sum3     -= v3[0] * tmp0;
2655       sum4     -= v4[1] * tmp0;
2656       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2657       sum4     -= v4[0] * tmp0;
2658       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2659       break;
2660     case 5:
2661       sum1 = tmp[row];
2662       sum2 = tmp[row -1];
2663       sum3 = tmp[row -2];
2664       sum4 = tmp[row -3];
2665       sum5 = tmp[row -4];
2666       v2   = aa + ad[row]+1;
2667       v3   = aa + ad[row -1]+1;
2668       v4   = aa + ad[row -2]+1;
2669       v5   = aa + ad[row -3]+1;
2670       for (j=0; j<nz-1; j+=2) {
2671         i0    = vi[j];
2672         i1    = vi[j+1];
2673         tmp0  = tmps[i0];
2674         tmp1  = tmps[i1];
2675         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2676         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2677         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2678         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2679         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2680       }
2681       if (j==nz-1) {
2682         tmp0  = tmps[vi[j]];
2683         sum1 -= v1[j] * tmp0;
2684         sum2 -= v2[j+1] * tmp0;
2685         sum3 -= v3[j+2] * tmp0;
2686         sum4 -= v4[j+3] * tmp0;
2687         sum5 -= v5[j+4] * tmp0;
2688       }
2689 
2690       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2691       sum2     -= v2[0] * tmp0;
2692       sum3     -= v3[1] * tmp0;
2693       sum4     -= v4[2] * tmp0;
2694       sum5     -= v5[3] * tmp0;
2695       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2696       sum3     -= v3[0] * tmp0;
2697       sum4     -= v4[1] * tmp0;
2698       sum5     -= v5[2] * tmp0;
2699       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2700       sum4     -= v4[0] * tmp0;
2701       sum5     -= v5[1] * tmp0;
2702       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2703       sum5     -= v5[0] * tmp0;
2704       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2705       break;
2706     default:
2707       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2708     }
2709   }
2710   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2711   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2712   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2713   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2714   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2715   PetscFunctionReturn(0);
2716 }
2717 
2718 
2719 /*
2720      Makes a longer coloring[] array and calls the usual code with that
2721 */
2722 #undef __FUNCT__
2723 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2724 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2725 {
2726   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2727   PetscErrorCode  ierr;
2728   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2729   PetscInt        *colorused,i;
2730   ISColoringValue *newcolor;
2731 
2732   PetscFunctionBegin;
2733   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2734   /* loop over inodes, marking a color for each column*/
2735   row = 0;
2736   for (i=0; i<m; i++) {
2737     for (j=0; j<ns[i]; j++) {
2738       newcolor[row++] = coloring[i] + j*ncolors;
2739     }
2740   }
2741 
2742   /* eliminate unneeded colors */
2743   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2744   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2745   for (i=0; i<n; i++) {
2746     colorused[newcolor[i]] = 1;
2747   }
2748 
2749   for (i=1; i<5*ncolors; i++) {
2750     colorused[i] += colorused[i-1];
2751   }
2752   ncolors = colorused[5*ncolors-1];
2753   for (i=0; i<n; i++) {
2754     newcolor[i] = colorused[newcolor[i]]-1;
2755   }
2756   ierr = PetscFree(colorused);CHKERRQ(ierr);
2757   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2758   ierr = PetscFree(coloring);CHKERRQ(ierr);
2759   PetscFunctionReturn(0);
2760 }
2761 
2762 #include <petsc-private/kernels/blockinvert.h>
2763 
2764 #undef __FUNCT__
2765 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2766 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2767 {
2768   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2769   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2770   MatScalar         *ibdiag,*bdiag,work[25],*t;
2771   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2772   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2773   const PetscScalar *xb, *b;
2774   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2775   PetscErrorCode    ierr;
2776   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2777   PetscInt          sz,k,ipvt[5];
2778   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2779 
2780   PetscFunctionBegin;
2781   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2782   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2783 
2784   if (!a->inode.ibdiagvalid) {
2785     if (!a->inode.ibdiag) {
2786       /* calculate space needed for diagonal blocks */
2787       for (i=0; i<m; i++) {
2788         cnt += sizes[i]*sizes[i];
2789       }
2790       a->inode.bdiagsize = cnt;
2791 
2792       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2793     }
2794 
2795     /* copy over the diagonal blocks and invert them */
2796     ibdiag = a->inode.ibdiag;
2797     bdiag  = a->inode.bdiag;
2798     cnt    = 0;
2799     for (i=0, row = 0; i<m; i++) {
2800       for (j=0; j<sizes[i]; j++) {
2801         for (k=0; k<sizes[i]; k++) {
2802           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2803         }
2804       }
2805       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2806 
2807       switch (sizes[i]) {
2808       case 1:
2809         /* Create matrix data structure */
2810         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2811         ibdiag[cnt] = 1.0/ibdiag[cnt];
2812         break;
2813       case 2:
2814         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2815         break;
2816       case 3:
2817         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2818         break;
2819       case 4:
2820         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2821         break;
2822       case 5:
2823         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2824         break;
2825       default:
2826         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2827       }
2828       cnt += sizes[i]*sizes[i];
2829       row += sizes[i];
2830     }
2831     a->inode.ibdiagvalid = PETSC_TRUE;
2832   }
2833   ibdiag = a->inode.ibdiag;
2834   bdiag  = a->inode.bdiag;
2835   t      = a->inode.ssor_work;
2836 
2837   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2838   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2839   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2840   if (flag & SOR_ZERO_INITIAL_GUESS) {
2841     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2842 
2843       for (i=0, row=0; i<m; i++) {
2844         sz  = diag[row] - ii[row];
2845         v1  = a->a + ii[row];
2846         idx = a->j + ii[row];
2847 
2848         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2849         switch (sizes[i]) {
2850         case 1:
2851 
2852           sum1 = b[row];
2853           for (n = 0; n<sz-1; n+=2) {
2854             i1    = idx[0];
2855             i2    = idx[1];
2856             idx  += 2;
2857             tmp0  = x[i1];
2858             tmp1  = x[i2];
2859             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2860           }
2861 
2862           if (n == sz-1) {
2863             tmp0  = x[*idx];
2864             sum1 -= *v1 * tmp0;
2865           }
2866           t[row]   = sum1;
2867           x[row++] = sum1*(*ibdiag++);
2868           break;
2869         case 2:
2870           v2   = a->a + ii[row+1];
2871           sum1 = b[row];
2872           sum2 = b[row+1];
2873           for (n = 0; n<sz-1; n+=2) {
2874             i1    = idx[0];
2875             i2    = idx[1];
2876             idx  += 2;
2877             tmp0  = x[i1];
2878             tmp1  = x[i2];
2879             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2880             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2881           }
2882 
2883           if (n == sz-1) {
2884             tmp0  = x[*idx];
2885             sum1 -= v1[0] * tmp0;
2886             sum2 -= v2[0] * tmp0;
2887           }
2888           t[row]   = sum1;
2889           t[row+1] = sum2;
2890           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2891           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2892           ibdiag  += 4;
2893           break;
2894         case 3:
2895           v2   = a->a + ii[row+1];
2896           v3   = a->a + ii[row+2];
2897           sum1 = b[row];
2898           sum2 = b[row+1];
2899           sum3 = b[row+2];
2900           for (n = 0; n<sz-1; n+=2) {
2901             i1    = idx[0];
2902             i2    = idx[1];
2903             idx  += 2;
2904             tmp0  = x[i1];
2905             tmp1  = x[i2];
2906             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2907             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2908             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2909           }
2910 
2911           if (n == sz-1) {
2912             tmp0  = x[*idx];
2913             sum1 -= v1[0] * tmp0;
2914             sum2 -= v2[0] * tmp0;
2915             sum3 -= v3[0] * tmp0;
2916           }
2917           t[row]   = sum1;
2918           t[row+1] = sum2;
2919           t[row+2] = sum3;
2920           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2921           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2922           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2923           ibdiag  += 9;
2924           break;
2925         case 4:
2926           v2   = a->a + ii[row+1];
2927           v3   = a->a + ii[row+2];
2928           v4   = a->a + ii[row+3];
2929           sum1 = b[row];
2930           sum2 = b[row+1];
2931           sum3 = b[row+2];
2932           sum4 = b[row+3];
2933           for (n = 0; n<sz-1; n+=2) {
2934             i1    = idx[0];
2935             i2    = idx[1];
2936             idx  += 2;
2937             tmp0  = x[i1];
2938             tmp1  = x[i2];
2939             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2940             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2941             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2942             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2943           }
2944 
2945           if (n == sz-1) {
2946             tmp0  = x[*idx];
2947             sum1 -= v1[0] * tmp0;
2948             sum2 -= v2[0] * tmp0;
2949             sum3 -= v3[0] * tmp0;
2950             sum4 -= v4[0] * tmp0;
2951           }
2952           t[row]   = sum1;
2953           t[row+1] = sum2;
2954           t[row+2] = sum3;
2955           t[row+3] = sum4;
2956           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2957           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2958           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2959           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2960           ibdiag  += 16;
2961           break;
2962         case 5:
2963           v2   = a->a + ii[row+1];
2964           v3   = a->a + ii[row+2];
2965           v4   = a->a + ii[row+3];
2966           v5   = a->a + ii[row+4];
2967           sum1 = b[row];
2968           sum2 = b[row+1];
2969           sum3 = b[row+2];
2970           sum4 = b[row+3];
2971           sum5 = b[row+4];
2972           for (n = 0; n<sz-1; n+=2) {
2973             i1    = idx[0];
2974             i2    = idx[1];
2975             idx  += 2;
2976             tmp0  = x[i1];
2977             tmp1  = x[i2];
2978             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2979             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2980             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2981             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2982             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2983           }
2984 
2985           if (n == sz-1) {
2986             tmp0  = x[*idx];
2987             sum1 -= v1[0] * tmp0;
2988             sum2 -= v2[0] * tmp0;
2989             sum3 -= v3[0] * tmp0;
2990             sum4 -= v4[0] * tmp0;
2991             sum5 -= v5[0] * tmp0;
2992           }
2993           t[row]   = sum1;
2994           t[row+1] = sum2;
2995           t[row+2] = sum3;
2996           t[row+3] = sum4;
2997           t[row+4] = sum5;
2998           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2999           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3000           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3001           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3002           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3003           ibdiag  += 25;
3004           break;
3005         default:
3006           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3007         }
3008       }
3009 
3010       xb   = t;
3011       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3012     } else xb = b;
3013     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3014 
3015       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3016       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3017         ibdiag -= sizes[i]*sizes[i];
3018         sz      = ii[row+1] - diag[row] - 1;
3019         v1      = a->a + diag[row] + 1;
3020         idx     = a->j + diag[row] + 1;
3021 
3022         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3023         switch (sizes[i]) {
3024         case 1:
3025 
3026           sum1 = xb[row];
3027           for (n = 0; n<sz-1; n+=2) {
3028             i1    = idx[0];
3029             i2    = idx[1];
3030             idx  += 2;
3031             tmp0  = x[i1];
3032             tmp1  = x[i2];
3033             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3034           }
3035 
3036           if (n == sz-1) {
3037             tmp0  = x[*idx];
3038             sum1 -= *v1*tmp0;
3039           }
3040           x[row--] = sum1*(*ibdiag);
3041           break;
3042 
3043         case 2:
3044 
3045           sum1 = xb[row];
3046           sum2 = xb[row-1];
3047           /* note that sum1 is associated with the second of the two rows */
3048           v2 = a->a + diag[row-1] + 2;
3049           for (n = 0; n<sz-1; n+=2) {
3050             i1    = idx[0];
3051             i2    = idx[1];
3052             idx  += 2;
3053             tmp0  = x[i1];
3054             tmp1  = x[i2];
3055             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3056             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3057           }
3058 
3059           if (n == sz-1) {
3060             tmp0  = x[*idx];
3061             sum1 -= *v1*tmp0;
3062             sum2 -= *v2*tmp0;
3063           }
3064           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3065           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3066           break;
3067         case 3:
3068 
3069           sum1 = xb[row];
3070           sum2 = xb[row-1];
3071           sum3 = xb[row-2];
3072           v2   = a->a + diag[row-1] + 2;
3073           v3   = a->a + diag[row-2] + 3;
3074           for (n = 0; n<sz-1; n+=2) {
3075             i1    = idx[0];
3076             i2    = idx[1];
3077             idx  += 2;
3078             tmp0  = x[i1];
3079             tmp1  = x[i2];
3080             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3081             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3082             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3083           }
3084 
3085           if (n == sz-1) {
3086             tmp0  = x[*idx];
3087             sum1 -= *v1*tmp0;
3088             sum2 -= *v2*tmp0;
3089             sum3 -= *v3*tmp0;
3090           }
3091           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3092           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3093           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3094           break;
3095         case 4:
3096 
3097           sum1 = xb[row];
3098           sum2 = xb[row-1];
3099           sum3 = xb[row-2];
3100           sum4 = xb[row-3];
3101           v2   = a->a + diag[row-1] + 2;
3102           v3   = a->a + diag[row-2] + 3;
3103           v4   = a->a + diag[row-3] + 4;
3104           for (n = 0; n<sz-1; n+=2) {
3105             i1    = idx[0];
3106             i2    = idx[1];
3107             idx  += 2;
3108             tmp0  = x[i1];
3109             tmp1  = x[i2];
3110             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3111             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3112             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3113             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3114           }
3115 
3116           if (n == sz-1) {
3117             tmp0  = x[*idx];
3118             sum1 -= *v1*tmp0;
3119             sum2 -= *v2*tmp0;
3120             sum3 -= *v3*tmp0;
3121             sum4 -= *v4*tmp0;
3122           }
3123           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3124           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3125           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3126           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3127           break;
3128         case 5:
3129 
3130           sum1 = xb[row];
3131           sum2 = xb[row-1];
3132           sum3 = xb[row-2];
3133           sum4 = xb[row-3];
3134           sum5 = xb[row-4];
3135           v2   = a->a + diag[row-1] + 2;
3136           v3   = a->a + diag[row-2] + 3;
3137           v4   = a->a + diag[row-3] + 4;
3138           v5   = a->a + diag[row-4] + 5;
3139           for (n = 0; n<sz-1; n+=2) {
3140             i1    = idx[0];
3141             i2    = idx[1];
3142             idx  += 2;
3143             tmp0  = x[i1];
3144             tmp1  = x[i2];
3145             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3146             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3147             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3148             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3149             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3150           }
3151 
3152           if (n == sz-1) {
3153             tmp0  = x[*idx];
3154             sum1 -= *v1*tmp0;
3155             sum2 -= *v2*tmp0;
3156             sum3 -= *v3*tmp0;
3157             sum4 -= *v4*tmp0;
3158             sum5 -= *v5*tmp0;
3159           }
3160           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3161           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3162           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3163           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3164           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3165           break;
3166         default:
3167           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3168         }
3169       }
3170 
3171       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3172     }
3173     its--;
3174   }
3175   while (its--) {
3176 
3177     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3178       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3179            i<m;
3180            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3181 
3182         sz  = diag[row] - ii[row];
3183         v1  = a->a + ii[row];
3184         idx = a->j + ii[row];
3185         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3186         switch (sizes[i]) {
3187         case 1:
3188           sum1 = b[row];
3189           for (n = 0; n<sz-1; n+=2) {
3190             i1    = idx[0];
3191             i2    = idx[1];
3192             idx  += 2;
3193             tmp0  = x[i1];
3194             tmp1  = x[i2];
3195             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3196           }
3197           if (n == sz-1) {
3198             tmp0  = x[*idx++];
3199             sum1 -= *v1 * tmp0;
3200             v1++;
3201           }
3202           t[row]   = sum1;
3203           sz      = ii[row+1] - diag[row] - 1;
3204           idx     = a->j + diag[row] + 1;
3205           v1 += 1;
3206           for (n = 0; n<sz-1; n+=2) {
3207             i1    = idx[0];
3208             i2    = idx[1];
3209             idx  += 2;
3210             tmp0  = x[i1];
3211             tmp1  = x[i2];
3212             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3213           }
3214           if (n == sz-1) {
3215             tmp0  = x[*idx++];
3216             sum1 -= *v1 * tmp0;
3217           }
3218           /* in MatSOR_SeqAIJ this line would be
3219            *
3220            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3221            *
3222            * but omega == 1, so this becomes
3223            *
3224            * x[row] = sum1*(*ibdiag++);
3225            *
3226            */
3227           x[row] = sum1*(*ibdiag);
3228           break;
3229         case 2:
3230           v2   = a->a + ii[row+1];
3231           sum1 = b[row];
3232           sum2 = b[row+1];
3233           for (n = 0; n<sz-1; n+=2) {
3234             i1    = idx[0];
3235             i2    = idx[1];
3236             idx  += 2;
3237             tmp0  = x[i1];
3238             tmp1  = x[i2];
3239             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3240             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3241           }
3242           if (n == sz-1) {
3243             tmp0  = x[*idx++];
3244             sum1 -= v1[0] * tmp0;
3245             sum2 -= v2[0] * tmp0;
3246             v1++; v2++;
3247           }
3248           t[row]   = sum1;
3249           t[row+1] = sum2;
3250           sz      = ii[row+1] - diag[row] - 2;
3251           idx     = a->j + diag[row] + 2;
3252           v1 += 2;
3253           v2 += 2;
3254           for (n = 0; n<sz-1; n+=2) {
3255             i1    = idx[0];
3256             i2    = idx[1];
3257             idx  += 2;
3258             tmp0  = x[i1];
3259             tmp1  = x[i2];
3260             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3261             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3262           }
3263           if (n == sz-1) {
3264             tmp0  = x[*idx];
3265             sum1 -= v1[0] * tmp0;
3266             sum2 -= v2[0] * tmp0;
3267           }
3268           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3269           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3270           break;
3271         case 3:
3272           v2   = a->a + ii[row+1];
3273           v3   = a->a + ii[row+2];
3274           sum1 = b[row];
3275           sum2 = b[row+1];
3276           sum3 = b[row+2];
3277           for (n = 0; n<sz-1; n+=2) {
3278             i1    = idx[0];
3279             i2    = idx[1];
3280             idx  += 2;
3281             tmp0  = x[i1];
3282             tmp1  = x[i2];
3283             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3284             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3285             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3286           }
3287           if (n == sz-1) {
3288             tmp0  = x[*idx++];
3289             sum1 -= v1[0] * tmp0;
3290             sum2 -= v2[0] * tmp0;
3291             sum3 -= v3[0] * tmp0;
3292             v1++; v2++; v3++;
3293           }
3294           t[row]   = sum1;
3295           t[row+1] = sum2;
3296           t[row+2] = sum3;
3297           sz      = ii[row+1] - diag[row] - 3;
3298           idx     = a->j + diag[row] + 3;
3299           v1 += 3;
3300           v2 += 3;
3301           v3 += 3;
3302           for (n = 0; n<sz-1; n+=2) {
3303             i1    = idx[0];
3304             i2    = idx[1];
3305             idx  += 2;
3306             tmp0  = x[i1];
3307             tmp1  = x[i2];
3308             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3309             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3310             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3311           }
3312           if (n == sz-1) {
3313             tmp0  = x[*idx];
3314             sum1 -= v1[0] * tmp0;
3315             sum2 -= v2[0] * tmp0;
3316             sum3 -= v3[0] * tmp0;
3317           }
3318           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3319           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3320           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3321           break;
3322         case 4:
3323           v2   = a->a + ii[row+1];
3324           v3   = a->a + ii[row+2];
3325           v4   = a->a + ii[row+3];
3326           sum1 = b[row];
3327           sum2 = b[row+1];
3328           sum3 = b[row+2];
3329           sum4 = b[row+3];
3330           for (n = 0; n<sz-1; n+=2) {
3331             i1    = idx[0];
3332             i2    = idx[1];
3333             idx  += 2;
3334             tmp0  = x[i1];
3335             tmp1  = x[i2];
3336             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3337             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3338             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3339             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3340           }
3341           if (n == sz-1) {
3342             tmp0  = x[*idx++];
3343             sum1 -= v1[0] * tmp0;
3344             sum2 -= v2[0] * tmp0;
3345             sum3 -= v3[0] * tmp0;
3346             sum4 -= v4[0] * tmp0;
3347             v1++; v2++; v3++; v4++;
3348           }
3349           t[row]   = sum1;
3350           t[row+1] = sum2;
3351           t[row+2] = sum3;
3352           t[row+3] = sum4;
3353           sz      = ii[row+1] - diag[row] - 4;
3354           idx     = a->j + diag[row] + 4;
3355           v1 += 4;
3356           v2 += 4;
3357           v3 += 4;
3358           v4 += 4;
3359           for (n = 0; n<sz-1; n+=2) {
3360             i1    = idx[0];
3361             i2    = idx[1];
3362             idx  += 2;
3363             tmp0  = x[i1];
3364             tmp1  = x[i2];
3365             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3366             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3367             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3368             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3369           }
3370           if (n == sz-1) {
3371             tmp0  = x[*idx];
3372             sum1 -= v1[0] * tmp0;
3373             sum2 -= v2[0] * tmp0;
3374             sum3 -= v3[0] * tmp0;
3375             sum4 -= v4[0] * tmp0;
3376           }
3377           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3378           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3379           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3380           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3381           break;
3382         case 5:
3383           v2   = a->a + ii[row+1];
3384           v3   = a->a + ii[row+2];
3385           v4   = a->a + ii[row+3];
3386           v5   = a->a + ii[row+4];
3387           sum1 = b[row];
3388           sum2 = b[row+1];
3389           sum3 = b[row+2];
3390           sum4 = b[row+3];
3391           sum5 = b[row+4];
3392           for (n = 0; n<sz-1; n+=2) {
3393             i1    = idx[0];
3394             i2    = idx[1];
3395             idx  += 2;
3396             tmp0  = x[i1];
3397             tmp1  = x[i2];
3398             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3399             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3400             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3401             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3402             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3403           }
3404           if (n == sz-1) {
3405             tmp0  = x[*idx++];
3406             sum1 -= v1[0] * tmp0;
3407             sum2 -= v2[0] * tmp0;
3408             sum3 -= v3[0] * tmp0;
3409             sum4 -= v4[0] * tmp0;
3410             sum5 -= v5[0] * tmp0;
3411             v1++; v2++; v3++; v4++; v5++;
3412           }
3413           t[row]   = sum1;
3414           t[row+1] = sum2;
3415           t[row+2] = sum3;
3416           t[row+3] = sum4;
3417           t[row+4] = sum5;
3418           sz      = ii[row+1] - diag[row] - 5;
3419           idx     = a->j + diag[row] + 5;
3420           v1 += 5;
3421           v2 += 5;
3422           v3 += 5;
3423           v4 += 5;
3424           v5 += 5;
3425           for (n = 0; n<sz-1; n+=2) {
3426             i1    = idx[0];
3427             i2    = idx[1];
3428             idx  += 2;
3429             tmp0  = x[i1];
3430             tmp1  = x[i2];
3431             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3432             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3433             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3434             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3435             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3436           }
3437           if (n == sz-1) {
3438             tmp0  = x[*idx];
3439             sum1 -= v1[0] * tmp0;
3440             sum2 -= v2[0] * tmp0;
3441             sum3 -= v3[0] * tmp0;
3442             sum4 -= v4[0] * tmp0;
3443             sum5 -= v5[0] * tmp0;
3444           }
3445           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3446           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3447           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3448           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3449           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3450           break;
3451         default:
3452           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3453         }
3454       }
3455       xb   = t;
3456       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3457     } else xb = b;
3458 
3459     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3460 
3461       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3462       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3463         ibdiag -= sizes[i]*sizes[i];
3464 
3465         /* set RHS */
3466         if (xb == b) {
3467           /* whole (old way) */
3468           sz      = ii[row+1] - ii[row];
3469           idx     = a->j + ii[row];
3470           switch (sizes[i]) {
3471           case 5:
3472             v5      = a->a + ii[row-4];
3473           case 4: /* fall through */
3474             v4      = a->a + ii[row-3];
3475           case 3:
3476             v3      = a->a + ii[row-2];
3477           case 2:
3478             v2      = a->a + ii[row-1];
3479           case 1:
3480             v1      = a->a + ii[row];
3481             break;
3482           default:
3483             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3484           }
3485         } else {
3486           /* upper, no diag */
3487           sz      = ii[row+1] - diag[row] - 1;
3488           idx     = a->j + diag[row] + 1;
3489           switch (sizes[i]) {
3490           case 5:
3491             v5      = a->a + diag[row-4] + 5;
3492           case 4: /* fall through */
3493             v4      = a->a + diag[row-3] + 4;
3494           case 3:
3495             v3      = a->a + diag[row-2] + 3;
3496           case 2:
3497             v2      = a->a + diag[row-1] + 2;
3498           case 1:
3499             v1      = a->a + diag[row] + 1;
3500           }
3501         }
3502         /* set sum */
3503         switch (sizes[i]) {
3504         case 5:
3505           sum5 = xb[row-4];
3506         case 4: /* fall through */
3507           sum4 = xb[row-3];
3508         case 3:
3509           sum3 = xb[row-2];
3510         case 2:
3511           sum2 = xb[row-1];
3512         case 1:
3513           /* note that sum1 is associated with the last row */
3514           sum1 = xb[row];
3515         }
3516         /* do sums */
3517         for (n = 0; n<sz-1; n+=2) {
3518             i1    = idx[0];
3519             i2    = idx[1];
3520             idx  += 2;
3521             tmp0  = x[i1];
3522             tmp1  = x[i2];
3523             switch (sizes[i]) {
3524             case 5:
3525               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3526             case 4: /* fall through */
3527               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3528             case 3:
3529               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3530             case 2:
3531               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3532             case 1:
3533               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3534             }
3535         }
3536         /* ragged edge */
3537         if (n == sz-1) {
3538           tmp0  = x[*idx];
3539           switch (sizes[i]) {
3540           case 5:
3541             sum5 -= *v5*tmp0;
3542           case 4: /* fall through */
3543             sum4 -= *v4*tmp0;
3544           case 3:
3545             sum3 -= *v3*tmp0;
3546           case 2:
3547             sum2 -= *v2*tmp0;
3548           case 1:
3549             sum1 -= *v1*tmp0;
3550           }
3551         }
3552         /* update */
3553         if (xb == b) {
3554           /* whole (old way) w/ diag */
3555           switch (sizes[i]) {
3556           case 5:
3557             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3558             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3559             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3560             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3561             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3562             break;
3563           case 4:
3564             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3565             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3566             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3567             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3568             break;
3569           case 3:
3570             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3571             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3572             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3573             break;
3574           case 2:
3575             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3576             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3577             break;
3578           case 1:
3579             x[row--] += sum1*(*ibdiag);
3580             break;
3581           }
3582         } else {
3583           /* no diag so set =  */
3584           switch (sizes[i]) {
3585           case 5:
3586             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3587             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3588             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3589             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3590             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3591             break;
3592           case 4:
3593             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3594             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3595             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3596             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3597             break;
3598           case 3:
3599             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3600             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3601             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3602             break;
3603           case 2:
3604             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3605             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3606             break;
3607           case 1:
3608             x[row--] = sum1*(*ibdiag);
3609             break;
3610           }
3611         }
3612       }
3613       if (xb == b) {
3614         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3615       } else {
3616         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3617       }
3618     }
3619   }
3620   if (flag & SOR_EISENSTAT) {
3621     /*
3622           Apply  (U + D)^-1  where D is now the block diagonal
3623     */
3624     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3625     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3626       ibdiag -= sizes[i]*sizes[i];
3627       sz      = ii[row+1] - diag[row] - 1;
3628       v1      = a->a + diag[row] + 1;
3629       idx     = a->j + diag[row] + 1;
3630       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3631       switch (sizes[i]) {
3632       case 1:
3633 
3634         sum1 = b[row];
3635         for (n = 0; n<sz-1; n+=2) {
3636           i1    = idx[0];
3637           i2    = idx[1];
3638           idx  += 2;
3639           tmp0  = x[i1];
3640           tmp1  = x[i2];
3641           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3642         }
3643 
3644         if (n == sz-1) {
3645           tmp0  = x[*idx];
3646           sum1 -= *v1*tmp0;
3647         }
3648         x[row] = sum1*(*ibdiag);row--;
3649         break;
3650 
3651       case 2:
3652 
3653         sum1 = b[row];
3654         sum2 = b[row-1];
3655         /* note that sum1 is associated with the second of the two rows */
3656         v2 = a->a + diag[row-1] + 2;
3657         for (n = 0; n<sz-1; n+=2) {
3658           i1    = idx[0];
3659           i2    = idx[1];
3660           idx  += 2;
3661           tmp0  = x[i1];
3662           tmp1  = x[i2];
3663           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3664           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3665         }
3666 
3667         if (n == sz-1) {
3668           tmp0  = x[*idx];
3669           sum1 -= *v1*tmp0;
3670           sum2 -= *v2*tmp0;
3671         }
3672         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3673         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3674         row     -= 2;
3675         break;
3676       case 3:
3677 
3678         sum1 = b[row];
3679         sum2 = b[row-1];
3680         sum3 = b[row-2];
3681         v2   = a->a + diag[row-1] + 2;
3682         v3   = a->a + diag[row-2] + 3;
3683         for (n = 0; n<sz-1; n+=2) {
3684           i1    = idx[0];
3685           i2    = idx[1];
3686           idx  += 2;
3687           tmp0  = x[i1];
3688           tmp1  = x[i2];
3689           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3690           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3691           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3692         }
3693 
3694         if (n == sz-1) {
3695           tmp0  = x[*idx];
3696           sum1 -= *v1*tmp0;
3697           sum2 -= *v2*tmp0;
3698           sum3 -= *v3*tmp0;
3699         }
3700         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3701         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3702         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3703         row     -= 3;
3704         break;
3705       case 4:
3706 
3707         sum1 = b[row];
3708         sum2 = b[row-1];
3709         sum3 = b[row-2];
3710         sum4 = b[row-3];
3711         v2   = a->a + diag[row-1] + 2;
3712         v3   = a->a + diag[row-2] + 3;
3713         v4   = a->a + diag[row-3] + 4;
3714         for (n = 0; n<sz-1; n+=2) {
3715           i1    = idx[0];
3716           i2    = idx[1];
3717           idx  += 2;
3718           tmp0  = x[i1];
3719           tmp1  = x[i2];
3720           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3721           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3722           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3723           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3724         }
3725 
3726         if (n == sz-1) {
3727           tmp0  = x[*idx];
3728           sum1 -= *v1*tmp0;
3729           sum2 -= *v2*tmp0;
3730           sum3 -= *v3*tmp0;
3731           sum4 -= *v4*tmp0;
3732         }
3733         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3734         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3735         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3736         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3737         row     -= 4;
3738         break;
3739       case 5:
3740 
3741         sum1 = b[row];
3742         sum2 = b[row-1];
3743         sum3 = b[row-2];
3744         sum4 = b[row-3];
3745         sum5 = b[row-4];
3746         v2   = a->a + diag[row-1] + 2;
3747         v3   = a->a + diag[row-2] + 3;
3748         v4   = a->a + diag[row-3] + 4;
3749         v5   = a->a + diag[row-4] + 5;
3750         for (n = 0; n<sz-1; n+=2) {
3751           i1    = idx[0];
3752           i2    = idx[1];
3753           idx  += 2;
3754           tmp0  = x[i1];
3755           tmp1  = x[i2];
3756           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3757           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3758           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3759           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3760           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3761         }
3762 
3763         if (n == sz-1) {
3764           tmp0  = x[*idx];
3765           sum1 -= *v1*tmp0;
3766           sum2 -= *v2*tmp0;
3767           sum3 -= *v3*tmp0;
3768           sum4 -= *v4*tmp0;
3769           sum5 -= *v5*tmp0;
3770         }
3771         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3772         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3773         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3774         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3775         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3776         row     -= 5;
3777         break;
3778       default:
3779         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3780       }
3781     }
3782     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3783 
3784     /*
3785            t = b - D x    where D is the block diagonal
3786     */
3787     cnt = 0;
3788     for (i=0, row=0; i<m; i++) {
3789       switch (sizes[i]) {
3790       case 1:
3791         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3792         break;
3793       case 2:
3794         x1       = x[row]; x2 = x[row+1];
3795         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3796         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3797         t[row]   = b[row] - tmp1;
3798         t[row+1] = b[row+1] - tmp2; row += 2;
3799         cnt     += 4;
3800         break;
3801       case 3:
3802         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3803         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3804         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3805         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3806         t[row]   = b[row] - tmp1;
3807         t[row+1] = b[row+1] - tmp2;
3808         t[row+2] = b[row+2] - tmp3; row += 3;
3809         cnt     += 9;
3810         break;
3811       case 4:
3812         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3813         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3814         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3815         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3816         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3817         t[row]   = b[row] - tmp1;
3818         t[row+1] = b[row+1] - tmp2;
3819         t[row+2] = b[row+2] - tmp3;
3820         t[row+3] = b[row+3] - tmp4; row += 4;
3821         cnt     += 16;
3822         break;
3823       case 5:
3824         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3825         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3826         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3827         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3828         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3829         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3830         t[row]   = b[row] - tmp1;
3831         t[row+1] = b[row+1] - tmp2;
3832         t[row+2] = b[row+2] - tmp3;
3833         t[row+3] = b[row+3] - tmp4;
3834         t[row+4] = b[row+4] - tmp5;row += 5;
3835         cnt     += 25;
3836         break;
3837       default:
3838         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3839       }
3840     }
3841     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3842 
3843 
3844 
3845     /*
3846           Apply (L + D)^-1 where D is the block diagonal
3847     */
3848     for (i=0, row=0; i<m; i++) {
3849       sz  = diag[row] - ii[row];
3850       v1  = a->a + ii[row];
3851       idx = a->j + ii[row];
3852       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3853       switch (sizes[i]) {
3854       case 1:
3855 
3856         sum1 = t[row];
3857         for (n = 0; n<sz-1; n+=2) {
3858           i1    = idx[0];
3859           i2    = idx[1];
3860           idx  += 2;
3861           tmp0  = t[i1];
3862           tmp1  = t[i2];
3863           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3864         }
3865 
3866         if (n == sz-1) {
3867           tmp0  = t[*idx];
3868           sum1 -= *v1 * tmp0;
3869         }
3870         x[row] += t[row] = sum1*(*ibdiag++); row++;
3871         break;
3872       case 2:
3873         v2   = a->a + ii[row+1];
3874         sum1 = t[row];
3875         sum2 = t[row+1];
3876         for (n = 0; n<sz-1; n+=2) {
3877           i1    = idx[0];
3878           i2    = idx[1];
3879           idx  += 2;
3880           tmp0  = t[i1];
3881           tmp1  = t[i2];
3882           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3883           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3884         }
3885 
3886         if (n == sz-1) {
3887           tmp0  = t[*idx];
3888           sum1 -= v1[0] * tmp0;
3889           sum2 -= v2[0] * tmp0;
3890         }
3891         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3892         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3893         ibdiag   += 4; row += 2;
3894         break;
3895       case 3:
3896         v2   = a->a + ii[row+1];
3897         v3   = a->a + ii[row+2];
3898         sum1 = t[row];
3899         sum2 = t[row+1];
3900         sum3 = t[row+2];
3901         for (n = 0; n<sz-1; n+=2) {
3902           i1    = idx[0];
3903           i2    = idx[1];
3904           idx  += 2;
3905           tmp0  = t[i1];
3906           tmp1  = t[i2];
3907           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3908           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3909           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3910         }
3911 
3912         if (n == sz-1) {
3913           tmp0  = t[*idx];
3914           sum1 -= v1[0] * tmp0;
3915           sum2 -= v2[0] * tmp0;
3916           sum3 -= v3[0] * tmp0;
3917         }
3918         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3919         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3920         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3921         ibdiag   += 9; row += 3;
3922         break;
3923       case 4:
3924         v2   = a->a + ii[row+1];
3925         v3   = a->a + ii[row+2];
3926         v4   = a->a + ii[row+3];
3927         sum1 = t[row];
3928         sum2 = t[row+1];
3929         sum3 = t[row+2];
3930         sum4 = t[row+3];
3931         for (n = 0; n<sz-1; n+=2) {
3932           i1    = idx[0];
3933           i2    = idx[1];
3934           idx  += 2;
3935           tmp0  = t[i1];
3936           tmp1  = t[i2];
3937           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3938           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3939           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3940           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3941         }
3942 
3943         if (n == sz-1) {
3944           tmp0  = t[*idx];
3945           sum1 -= v1[0] * tmp0;
3946           sum2 -= v2[0] * tmp0;
3947           sum3 -= v3[0] * tmp0;
3948           sum4 -= v4[0] * tmp0;
3949         }
3950         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3951         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3952         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3953         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3954         ibdiag   += 16; row += 4;
3955         break;
3956       case 5:
3957         v2   = a->a + ii[row+1];
3958         v3   = a->a + ii[row+2];
3959         v4   = a->a + ii[row+3];
3960         v5   = a->a + ii[row+4];
3961         sum1 = t[row];
3962         sum2 = t[row+1];
3963         sum3 = t[row+2];
3964         sum4 = t[row+3];
3965         sum5 = t[row+4];
3966         for (n = 0; n<sz-1; n+=2) {
3967           i1    = idx[0];
3968           i2    = idx[1];
3969           idx  += 2;
3970           tmp0  = t[i1];
3971           tmp1  = t[i2];
3972           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3973           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3974           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3975           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3976           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3977         }
3978 
3979         if (n == sz-1) {
3980           tmp0  = t[*idx];
3981           sum1 -= v1[0] * tmp0;
3982           sum2 -= v2[0] * tmp0;
3983           sum3 -= v3[0] * tmp0;
3984           sum4 -= v4[0] * tmp0;
3985           sum5 -= v5[0] * tmp0;
3986         }
3987         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3988         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3989         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3990         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3991         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3992         ibdiag   += 25; row += 5;
3993         break;
3994       default:
3995         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3996       }
3997     }
3998     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3999   }
4000   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4001   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4002   PetscFunctionReturn(0);
4003 }
4004 
4005 #undef __FUNCT__
4006 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
4007 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
4008 {
4009   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4010   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4011   const MatScalar   *bdiag = a->inode.bdiag;
4012   const PetscScalar *b;
4013   PetscErrorCode    ierr;
4014   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4015   const PetscInt    *sizes = a->inode.size;
4016 
4017   PetscFunctionBegin;
4018   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4019   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4020   cnt  = 0;
4021   for (i=0, row=0; i<m; i++) {
4022     switch (sizes[i]) {
4023     case 1:
4024       x[row] = b[row]*bdiag[cnt++];row++;
4025       break;
4026     case 2:
4027       x1       = b[row]; x2 = b[row+1];
4028       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4029       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4030       x[row++] = tmp1;
4031       x[row++] = tmp2;
4032       cnt     += 4;
4033       break;
4034     case 3:
4035       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4036       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4037       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4038       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4039       x[row++] = tmp1;
4040       x[row++] = tmp2;
4041       x[row++] = tmp3;
4042       cnt     += 9;
4043       break;
4044     case 4:
4045       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4046       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4047       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4048       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4049       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4050       x[row++] = tmp1;
4051       x[row++] = tmp2;
4052       x[row++] = tmp3;
4053       x[row++] = tmp4;
4054       cnt     += 16;
4055       break;
4056     case 5:
4057       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4058       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4059       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4060       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4061       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4062       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4063       x[row++] = tmp1;
4064       x[row++] = tmp2;
4065       x[row++] = tmp3;
4066       x[row++] = tmp4;
4067       x[row++] = tmp5;
4068       cnt     += 25;
4069       break;
4070     default:
4071       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4072     }
4073   }
4074   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4075   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4076   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4077   PetscFunctionReturn(0);
4078 }
4079 
4080 /*
4081     samestructure indicates that the matrix has not changed its nonzero structure so we
4082     do not need to recompute the inodes
4083 */
4084 #undef __FUNCT__
4085 #define __FUNCT__ "Mat_CheckInode"
4086 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure)
4087 {
4088   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4089   PetscErrorCode ierr;
4090   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4091   PetscBool      flag;
4092   const PetscInt *idx,*idy,*ii;
4093 
4094   PetscFunctionBegin;
4095   if (!a->inode.use) PetscFunctionReturn(0);
4096   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
4097 
4098 
4099   m = A->rmap->n;
4100   if (a->inode.size) ns = a->inode.size;
4101   else {
4102     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
4103   }
4104 
4105   i          = 0;
4106   node_count = 0;
4107   idx        = a->j;
4108   ii         = a->i;
4109   while (i < m) {                /* For each row */
4110     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4111     /* Limits the number of elements in a node to 'a->inode.limit' */
4112     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4113       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4114       if (nzy != nzx) break;
4115       idy += nzx;              /* Same nonzero pattern */
4116       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4117       if (!flag) break;
4118     }
4119     ns[node_count++] = blk_size;
4120     idx             += blk_size*nzx;
4121     i                = j;
4122   }
4123   /* If not enough inodes found,, do not use inode version of the routines */
4124   if (!m || node_count > .8*m) {
4125     ierr = PetscFree(ns);CHKERRQ(ierr);
4126 
4127     a->inode.node_count       = 0;
4128     a->inode.size             = NULL;
4129     a->inode.use              = PETSC_FALSE;
4130     A->ops->mult              = MatMult_SeqAIJ;
4131     A->ops->sor               = MatSOR_SeqAIJ;
4132     A->ops->multadd           = MatMultAdd_SeqAIJ;
4133     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4134     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4135     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4136     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4137     A->ops->coloringpatch     = 0;
4138     A->ops->multdiagonalblock = 0;
4139 
4140     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4141   } else {
4142     if (!A->factortype) {
4143       A->ops->mult              = MatMult_SeqAIJ_Inode;
4144       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4145       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4146       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4147       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4148       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4149       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4150       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4151       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4152     } else {
4153       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4154     }
4155     a->inode.node_count = node_count;
4156     a->inode.size       = ns;
4157     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4158   }
4159   a->inode.checked = PETSC_TRUE;
4160   PetscFunctionReturn(0);
4161 }
4162 
4163 #undef __FUNCT__
4164 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4165 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4166 {
4167   Mat            B =*C;
4168   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4169   PetscErrorCode ierr;
4170   PetscInt       m=A->rmap->n;
4171 
4172   PetscFunctionBegin;
4173   c->inode.use       = a->inode.use;
4174   c->inode.limit     = a->inode.limit;
4175   c->inode.max_limit = a->inode.max_limit;
4176   if (a->inode.size) {
4177     ierr                = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr);
4178     c->inode.node_count = a->inode.node_count;
4179     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4180     /* note the table of functions below should match that in Mat_CheckInode() */
4181     if (!B->factortype) {
4182       B->ops->mult              = MatMult_SeqAIJ_Inode;
4183       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4184       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4185       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4186       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4187       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4188       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4189       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4190       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4191     } else {
4192       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4193     }
4194   } else {
4195     c->inode.size       = 0;
4196     c->inode.node_count = 0;
4197   }
4198   c->inode.ibdiagvalid = PETSC_FALSE;
4199   c->inode.ibdiag      = 0;
4200   c->inode.bdiag       = 0;
4201   PetscFunctionReturn(0);
4202 }
4203 
4204 #undef __FUNCT__
4205 #define __FUNCT__ "MatGetRow_FactoredLU"
4206 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4207 {
4208   PetscInt       k;
4209   const PetscInt *vi;
4210 
4211   PetscFunctionBegin;
4212   vi = aj + ai[row];
4213   for (k=0; k<nzl; k++) cols[k] = vi[k];
4214   vi        = aj + adiag[row];
4215   cols[nzl] = vi[0];
4216   vi        = aj + adiag[row+1]+1;
4217   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4218   PetscFunctionReturn(0);
4219 }
4220 /*
4221    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4222    Modified from Mat_CheckInode().
4223 
4224    Input Parameters:
4225 .  Mat A - ILU or LU matrix factor
4226 
4227 */
4228 #undef __FUNCT__
4229 #define __FUNCT__ "Mat_CheckInode_FactorLU"
4230 PetscErrorCode Mat_CheckInode_FactorLU(Mat A)
4231 {
4232   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4233   PetscErrorCode ierr;
4234   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4235   PetscInt       *cols1,*cols2,*ns;
4236   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4237   PetscBool      flag;
4238 
4239   PetscFunctionBegin;
4240   if (!a->inode.use)    PetscFunctionReturn(0);
4241   if (a->inode.checked) PetscFunctionReturn(0);
4242 
4243   m = A->rmap->n;
4244   if (a->inode.size) ns = a->inode.size;
4245   else {
4246     ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);
4247   }
4248 
4249   i          = 0;
4250   node_count = 0;
4251   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4252   while (i < m) {                /* For each row */
4253     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4254     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4255     nzx  = nzl1 + nzu1 + 1;
4256     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4257 
4258     /* Limits the number of elements in a node to 'a->inode.limit' */
4259     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4260       nzl2 = ai[j+1] - ai[j];
4261       nzu2 = adiag[j] - adiag[j+1] - 1;
4262       nzy  = nzl2 + nzu2 + 1;
4263       if (nzy != nzx) break;
4264       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4265       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4266       if (!flag) break;
4267     }
4268     ns[node_count++] = blk_size;
4269     i                = j;
4270   }
4271   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4272   /* If not enough inodes found,, do not use inode version of the routines */
4273   if (!m || node_count > .8*m) {
4274     ierr = PetscFree(ns);CHKERRQ(ierr);
4275 
4276     a->inode.node_count = 0;
4277     a->inode.size       = NULL;
4278     a->inode.use        = PETSC_FALSE;
4279 
4280     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4281   } else {
4282     A->ops->mult              = 0;
4283     A->ops->sor               = 0;
4284     A->ops->multadd           = 0;
4285     A->ops->getrowij          = 0;
4286     A->ops->restorerowij      = 0;
4287     A->ops->getcolumnij       = 0;
4288     A->ops->restorecolumnij   = 0;
4289     A->ops->coloringpatch     = 0;
4290     A->ops->multdiagonalblock = 0;
4291     a->inode.node_count       = node_count;
4292     a->inode.size             = ns;
4293 
4294     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4295   }
4296   a->inode.checked = PETSC_TRUE;
4297   PetscFunctionReturn(0);
4298 }
4299 
4300 #undef __FUNCT__
4301 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4302 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4303 {
4304   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4305 
4306   PetscFunctionBegin;
4307   a->inode.ibdiagvalid = PETSC_FALSE;
4308   PetscFunctionReturn(0);
4309 }
4310 
4311 /*
4312      This is really ugly. if inodes are used this replaces the
4313   permutations with ones that correspond to rows/cols of the matrix
4314   rather then inode blocks
4315 */
4316 #undef __FUNCT__
4317 #define __FUNCT__ "MatInodeAdjustForInodes"
4318 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4319 {
4320   PetscErrorCode ierr;
4321 
4322   PetscFunctionBegin;
4323   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4324   PetscFunctionReturn(0);
4325 }
4326 
4327 #undef __FUNCT__
4328 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4329 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4330 {
4331   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4332   PetscErrorCode ierr;
4333   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4334   const PetscInt *ridx,*cidx;
4335   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4336   PetscInt       nslim_col,*ns_col;
4337   IS             ris = *rperm,cis = *cperm;
4338 
4339   PetscFunctionBegin;
4340   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4341   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4342 
4343   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4344   ierr = PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
4345   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4346 
4347   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4348   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4349 
4350   /* Form the inode structure for the rows of permuted matric using inv perm*/
4351   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4352 
4353   /* Construct the permutations for rows*/
4354   for (i=0,row = 0; i<nslim_row; ++i) {
4355     indx      = ridx[i];
4356     start_val = tns[indx];
4357     end_val   = tns[indx + 1];
4358     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4359   }
4360 
4361   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4362   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4363 
4364   /* Construct permutations for columns */
4365   for (i=0,col=0; i<nslim_col; ++i) {
4366     indx      = cidx[i];
4367     start_val = tns[indx];
4368     end_val   = tns[indx + 1];
4369     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4370   }
4371 
4372   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4373   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4374   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4375   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4376 
4377   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4378   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4379 
4380   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4381   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4382   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4383   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4384   ierr = PetscFree(tns);CHKERRQ(ierr);
4385   PetscFunctionReturn(0);
4386 }
4387 
4388 #undef __FUNCT__
4389 #define __FUNCT__ "MatInodeGetInodeSizes"
4390 /*@C
4391    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4392 
4393    Not Collective
4394 
4395    Input Parameter:
4396 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4397 
4398    Output Parameter:
4399 +  node_count - no of inodes present in the matrix.
4400 .  sizes      - an array of size node_count,with sizes of each inode.
4401 -  limit      - the max size used to generate the inodes.
4402 
4403    Level: advanced
4404 
4405    Notes: This routine returns some internal storage information
4406    of the matrix, it is intended to be used by advanced users.
4407    It should be called after the matrix is assembled.
4408    The contents of the sizes[] array should not be changed.
4409    NULL may be passed for information not requested.
4410 
4411 .keywords: matrix, seqaij, get, inode
4412 
4413 .seealso: MatGetInfo()
4414 @*/
4415 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4416 {
4417   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4418 
4419   PetscFunctionBegin;
4420   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4421   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4422   if (f) {
4423     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4424   }
4425   PetscFunctionReturn(0);
4426 }
4427 
4428 #undef __FUNCT__
4429 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4430 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4431 {
4432   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4433 
4434   PetscFunctionBegin;
4435   if (node_count) *node_count = a->inode.node_count;
4436   if (sizes)      *sizes      = a->inode.size;
4437   if (limit)      *limit      = a->inode.limit;
4438   PetscFunctionReturn(0);
4439 }
4440