xref: /petsc/src/mat/impls/aij/seq/inode.c (revision b6bfaf9a0a178e626c17e7b8aafd5e3bedcd9a6d)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
11 {
12   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
24     for (; count+1 < n; count++,i++);
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n)  {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
67 
68   PetscFunctionBegin;
69   nslim_row = a->inode.node_count;
70   m         = A->rmap->n;
71   n         = A->cmap->n;
72   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
73 
74   /* Use the row_inode as column_inode */
75   nslim_col = nslim_row;
76   ns_col    = ns_row;
77 
78   /* allocate space for reformated inode structure */
79   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
80   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
81   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
82 
83   for (i1=0,col=0; i1<nslim_col; ++i1) {
84     nsz = ns_col[i1];
85     for (i2=0; i2<nsz; ++i2,++col)
86       tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     col  = *j++ + ishift;
101     i2   = tvc[col];
102     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103       ia[i1+1]++;
104       ia[i2+1]++;
105       i2++;                     /* Start col of next node */
106       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107       i2 = tvc[col];
108     }
109     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110   }
111 
112   /* shift ia[i] to point to next row */
113   for (i1=1; i1<nslim_row+1; i1++) {
114     row        = ia[i1-1];
115     ia[i1]    += row;
116     work[i1-1] = row - oshift;
117   }
118 
119   /* allocate space for column pointers */
120   nz   = ia[nslim_row] + (!ishift);
121   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
122   *jja = ja;
123 
124  /* loop over lower triangular part putting into ja */
125   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126     j    = aj + ai[row] + ishift;
127     jmax = aj + ai[row+1] + ishift;
128     col  = *j++ + ishift;
129     i2   = tvc[col];
130     while (i2<i1 && j<jmax) {
131       ja[work[i2]++] = i1 + oshift;
132       ja[work[i1]++] = i2 + oshift;
133       ++i2;
134       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135       i2 = tvc[col];
136     }
137     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138 
139   }
140   ierr = PetscFree(work);CHKERRQ(ierr);
141   ierr = PetscFree(tns);CHKERRQ(ierr);
142   ierr = PetscFree(tvc);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*
147       This builds nonsymmetric version of nonzero structure,
148 */
149 #undef __FUNCT__
150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152 {
153   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
154   PetscErrorCode ierr;
155   PetscInt       *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
157 
158   PetscFunctionBegin;
159   nslim_row = a->inode.node_count;
160   n         = A->cmap->n;
161 
162   /* Create The column_inode for this matrix */
163   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
164 
165   /* allocate space for reformated column_inode structure */
166   ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
167   ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
168   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
169 
170   for (i1=0,col=0; i1<nslim_col; ++i1) {
171     nsz = ns_col[i1];
172     for (i2=0; i2<nsz; ++i2,++col)
173       tvc[col] = i1;
174   }
175   /* allocate space for row pointers */
176   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
177   *iia = ia;
178   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
179   ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
180 
181   /* determine the number of columns in each row */
182   ia[0] = oshift;
183   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184     j   = aj + ai[row] + ishift;
185     col = *j++ + ishift;
186     i2  = tvc[col];
187     nz  = ai[row+1] - ai[row];
188     while (nz-- > 0) {           /* off-diagonal elemets */
189       ia[i1+1]++;
190       i2++;                     /* Start col of next node */
191       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
192       if (nz > 0) i2 = tvc[col];
193     }
194   }
195 
196   /* shift ia[i] to point to next row */
197   for (i1=1; i1<nslim_row+1; i1++) {
198     row        = ia[i1-1];
199     ia[i1]    += row;
200     work[i1-1] = row - oshift;
201   }
202 
203   /* allocate space for column pointers */
204   nz   = ia[nslim_row] + (!ishift);
205   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
206   *jja = ja;
207 
208  /* loop over matrix putting into ja */
209   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210     j   = aj + ai[row] + ishift;
211     col = *j++ + ishift;
212     i2  = tvc[col];
213     nz  = ai[row+1] - ai[row];
214     while (nz-- > 0) {
215       ja[work[i1]++] = i2 + oshift;
216       ++i2;
217       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
218       if (nz > 0) i2 = tvc[col];
219     }
220   }
221   ierr = PetscFree(ns_col);CHKERRQ(ierr);
222   ierr = PetscFree(work);CHKERRQ(ierr);
223   ierr = PetscFree(tns);CHKERRQ(ierr);
224   ierr = PetscFree(tvc);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231 {
232   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   *n     = a->inode.node_count;
237   if (!ia) PetscFunctionReturn(0);
238   if (!blockcompressed) {
239     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
240   } else if (symmetric) {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   } else {
243     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   if (!ia) PetscFunctionReturn(0);
256 
257   if (!blockcompressed) {
258     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
259   } else {
260     ierr = PetscFree(*ia);CHKERRQ(ierr);
261     ierr = PetscFree(*ja);CHKERRQ(ierr);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /* ----------------------------------------------------------- */
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271 {
272   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
273   PetscErrorCode ierr;
274   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276 
277   PetscFunctionBegin;
278   nslim_row = a->inode.node_count;
279   n         = A->cmap->n;
280 
281   /* Create The column_inode for this matrix */
282   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
283 
284   /* allocate space for reformated column_inode structure */
285   ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
286   ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr);
287   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
288 
289   for (i1=0,col=0; i1<nslim_col; ++i1) {
290     nsz = ns_col[i1];
291     for (i2=0; i2<nsz; ++i2,++col)
292       tvc[col] = i1;
293   }
294   /* allocate space for column pointers */
295   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr);
296   *iia = ia;
297   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
298   ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr);
299 
300   /* determine the number of columns in each row */
301   ia[0] = oshift;
302   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
303     j   = aj + ai[row] + ishift;
304     col = *j++ + ishift;
305     i2  = tvc[col];
306     nz  = ai[row+1] - ai[row];
307     while (nz-- > 0) {           /* off-diagonal elemets */
308       /* ia[i1+1]++; */
309       ia[i2+1]++;
310       i2++;
311       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
312       if (nz > 0) i2 = tvc[col];
313     }
314   }
315 
316   /* shift ia[i] to point to next col */
317   for (i1=1; i1<nslim_col+1; i1++) {
318     col        = ia[i1-1];
319     ia[i1]    += col;
320     work[i1-1] = col - oshift;
321   }
322 
323   /* allocate space for column pointers */
324   nz   = ia[nslim_col] + (!ishift);
325   ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr);
326   *jja = ja;
327 
328  /* loop over matrix putting into ja */
329   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
330     j   = aj + ai[row] + ishift;
331     col = *j++ + ishift;
332     i2  = tvc[col];
333     nz  = ai[row+1] - ai[row];
334     while (nz-- > 0) {
335       /* ja[work[i1]++] = i2 + oshift; */
336       ja[work[i2]++] = i1 + oshift;
337       i2++;
338       while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
339       if (nz > 0) i2 = tvc[col];
340     }
341   }
342   ierr = PetscFree(ns_col);CHKERRQ(ierr);
343   ierr = PetscFree(work);CHKERRQ(ierr);
344   ierr = PetscFree(tns);CHKERRQ(ierr);
345   ierr = PetscFree(tvc);CHKERRQ(ierr);
346   PetscFunctionReturn(0);
347 }
348 
349 #undef __FUNCT__
350 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
351 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
352 {
353   PetscErrorCode ierr;
354 
355   PetscFunctionBegin;
356   ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr);
357   if (!ia) PetscFunctionReturn(0);
358 
359   if (!blockcompressed) {
360     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
361   } else if (symmetric) {
362     /* Since the indices are symmetric it does'nt matter */
363     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
364   } else {
365     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
366   }
367   PetscFunctionReturn(0);
368 }
369 
370 #undef __FUNCT__
371 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
372 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool  symmetric,PetscBool  blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
373 {
374   PetscErrorCode ierr;
375 
376   PetscFunctionBegin;
377   if (!ia) PetscFunctionReturn(0);
378   if (!blockcompressed) {
379     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
380   } else {
381     ierr = PetscFree(*ia);CHKERRQ(ierr);
382     ierr = PetscFree(*ja);CHKERRQ(ierr);
383   }
384   PetscFunctionReturn(0);
385 }
386 
387 /* ----------------------------------------------------------- */
388 
389 #undef __FUNCT__
390 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
391 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
392 {
393   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
394   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
395   PetscScalar       *y;
396   const PetscScalar *x;
397   const MatScalar   *v1,*v2,*v3,*v4,*v5;
398   PetscErrorCode    ierr;
399   PetscInt          *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0;
400 
401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403 #endif
404 
405   PetscFunctionBegin;
406   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407   node_max = a->inode.node_count;
408   ns       = a->inode.size;     /* Node Size array */
409   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
411   idx  = a->j;
412   v1   = a->a;
413   ii   = a->i;
414 
415   for (i = 0,row = 0; i< node_max; ++i) {
416     nsz  = ns[i];
417     n    = ii[1] - ii[0];
418     nonzerorow += (n>0)*nsz;
419     ii  += nsz;
420     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422     sz   = n;                   /* No of non zeros in this row */
423                                 /* Switch on the size of Node */
424     switch (nsz) {               /* Each loop in 'case' is unrolled */
425     case 1 :
426       sum1  = 0.;
427 
428       for (n = 0; n< sz-1; n+=2) {
429         i1   = idx[0];          /* The instructions are ordered to */
430         i2   = idx[1];          /* make the compiler's job easy */
431         idx += 2;
432         tmp0 = x[i1];
433         tmp1 = x[i2];
434         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435        }
436 
437       if (n == sz-1) {          /* Take care of the last nonzero  */
438         tmp0  = x[*idx++];
439         sum1 += *v1++ * tmp0;
440       }
441       y[row++]=sum1;
442       break;
443     case 2:
444       sum1  = 0.;
445       sum2  = 0.;
446       v2    = v1 + n;
447 
448       for (n = 0; n< sz-1; n+=2) {
449         i1   = idx[0];
450         i2   = idx[1];
451         idx += 2;
452         tmp0 = x[i1];
453         tmp1 = x[i2];
454         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456       }
457       if (n == sz-1) {
458         tmp0  = x[*idx++];
459         sum1 += *v1++ * tmp0;
460         sum2 += *v2++ * tmp0;
461       }
462       y[row++]=sum1;
463       y[row++]=sum2;
464       v1      =v2;              /* Since the next block to be processed starts there*/
465       idx    +=sz;
466       break;
467     case 3:
468       sum1  = 0.;
469       sum2  = 0.;
470       sum3  = 0.;
471       v2    = v1 + n;
472       v3    = v2 + n;
473 
474       for (n = 0; n< sz-1; n+=2) {
475         i1   = idx[0];
476         i2   = idx[1];
477         idx += 2;
478         tmp0 = x[i1];
479         tmp1 = x[i2];
480         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483       }
484       if (n == sz-1) {
485         tmp0  = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489       }
490       y[row++]=sum1;
491       y[row++]=sum2;
492       y[row++]=sum3;
493       v1       =v3;             /* Since the next block to be processed starts there*/
494       idx     +=2*sz;
495       break;
496     case 4:
497       sum1  = 0.;
498       sum2  = 0.;
499       sum3  = 0.;
500       sum4  = 0.;
501       v2    = v1 + n;
502       v3    = v2 + n;
503       v4    = v3 + n;
504 
505       for (n = 0; n< sz-1; n+=2) {
506         i1   = idx[0];
507         i2   = idx[1];
508         idx += 2;
509         tmp0 = x[i1];
510         tmp1 = x[i2];
511         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515       }
516       if (n == sz-1) {
517         tmp0  = x[*idx++];
518         sum1 += *v1++ * tmp0;
519         sum2 += *v2++ * tmp0;
520         sum3 += *v3++ * tmp0;
521         sum4 += *v4++ * tmp0;
522       }
523       y[row++]=sum1;
524       y[row++]=sum2;
525       y[row++]=sum3;
526       y[row++]=sum4;
527       v1      =v4;              /* Since the next block to be processed starts there*/
528       idx    +=3*sz;
529       break;
530     case 5:
531       sum1  = 0.;
532       sum2  = 0.;
533       sum3  = 0.;
534       sum4  = 0.;
535       sum5  = 0.;
536       v2    = v1 + n;
537       v3    = v2 + n;
538       v4    = v3 + n;
539       v5    = v4 + n;
540 
541       for (n = 0; n<sz-1; n+=2) {
542         i1   = idx[0];
543         i2   = idx[1];
544         idx += 2;
545         tmp0 = x[i1];
546         tmp1 = x[i2];
547         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552       }
553       if (n == sz-1) {
554         tmp0  = x[*idx++];
555         sum1 += *v1++ * tmp0;
556         sum2 += *v2++ * tmp0;
557         sum3 += *v3++ * tmp0;
558         sum4 += *v4++ * tmp0;
559         sum5 += *v5++ * tmp0;
560       }
561       y[row++]=sum1;
562       y[row++]=sum2;
563       y[row++]=sum3;
564       y[row++]=sum4;
565       y[row++]=sum5;
566       v1      =v5;       /* Since the next block to be processed starts there */
567       idx    +=4*sz;
568       break;
569     default :
570       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571     }
572   }
573   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
575   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 /* ----------------------------------------------------------- */
579 /* Almost same code as the MatMult_SeqAIJ_Inode() */
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583 {
584   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data;
585   PetscScalar    sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586   MatScalar      *v1,*v2,*v3,*v4,*v5;
587   PetscScalar    *x,*y,*z,*zt;
588   PetscErrorCode ierr;
589   PetscInt       *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
590 
591   PetscFunctionBegin;
592   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
593   node_max = a->inode.node_count;
594   ns       = a->inode.size;     /* Node Size array */
595   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
596   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
597   if (zz != yy) {
598     ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
599   } else {
600     z = y;
601   }
602   zt = z;
603 
604   idx  = a->j;
605   v1   = a->a;
606   ii   = a->i;
607 
608   for (i = 0,row = 0; i< node_max; ++i) {
609     nsz  = ns[i];
610     n    = ii[1] - ii[0];
611     ii  += nsz;
612     sz   = n;                   /* No of non zeros in this row */
613                                 /* Switch on the size of Node */
614     switch (nsz) {               /* Each loop in 'case' is unrolled */
615     case 1 :
616       sum1  = *zt++;
617 
618       for (n = 0; n< sz-1; n+=2) {
619         i1   = idx[0];          /* The instructions are ordered to */
620         i2   = idx[1];          /* make the compiler's job easy */
621         idx += 2;
622         tmp0 = x[i1];
623         tmp1 = x[i2];
624         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
625        }
626 
627       if (n   == sz-1) {          /* Take care of the last nonzero  */
628         tmp0  = x[*idx++];
629         sum1 += *v1++ * tmp0;
630       }
631       y[row++]=sum1;
632       break;
633     case 2:
634       sum1  = *zt++;
635       sum2  = *zt++;
636       v2    = v1 + n;
637 
638       for (n = 0; n< sz-1; n+=2) {
639         i1   = idx[0];
640         i2   = idx[1];
641         idx += 2;
642         tmp0 = x[i1];
643         tmp1 = x[i2];
644         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
645         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
646       }
647       if (n   == sz-1) {
648         tmp0  = x[*idx++];
649         sum1 += *v1++ * tmp0;
650         sum2 += *v2++ * tmp0;
651       }
652       y[row++]=sum1;
653       y[row++]=sum2;
654       v1      =v2;              /* Since the next block to be processed starts there*/
655       idx    +=sz;
656       break;
657     case 3:
658       sum1  = *zt++;
659       sum2  = *zt++;
660       sum3  = *zt++;
661       v2    = v1 + n;
662       v3    = v2 + n;
663 
664       for (n = 0; n< sz-1; n+=2) {
665         i1   = idx[0];
666         i2   = idx[1];
667         idx += 2;
668         tmp0 = x[i1];
669         tmp1 = x[i2];
670         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
671         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
672         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
673       }
674       if (n == sz-1) {
675         tmp0  = x[*idx++];
676         sum1 += *v1++ * tmp0;
677         sum2 += *v2++ * tmp0;
678         sum3 += *v3++ * tmp0;
679       }
680       y[row++]=sum1;
681       y[row++]=sum2;
682       y[row++]=sum3;
683       v1       =v3;             /* Since the next block to be processed starts there*/
684       idx     +=2*sz;
685       break;
686     case 4:
687       sum1  = *zt++;
688       sum2  = *zt++;
689       sum3  = *zt++;
690       sum4  = *zt++;
691       v2    = v1 + n;
692       v3    = v2 + n;
693       v4    = v3 + n;
694 
695       for (n = 0; n< sz-1; n+=2) {
696         i1   = idx[0];
697         i2   = idx[1];
698         idx += 2;
699         tmp0 = x[i1];
700         tmp1 = x[i2];
701         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
702         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
703         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
704         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
705       }
706       if (n == sz-1) {
707         tmp0  = x[*idx++];
708         sum1 += *v1++ * tmp0;
709         sum2 += *v2++ * tmp0;
710         sum3 += *v3++ * tmp0;
711         sum4 += *v4++ * tmp0;
712       }
713       y[row++]=sum1;
714       y[row++]=sum2;
715       y[row++]=sum3;
716       y[row++]=sum4;
717       v1      =v4;              /* Since the next block to be processed starts there*/
718       idx    +=3*sz;
719       break;
720     case 5:
721       sum1  = *zt++;
722       sum2  = *zt++;
723       sum3  = *zt++;
724       sum4  = *zt++;
725       sum5  = *zt++;
726       v2    = v1 + n;
727       v3    = v2 + n;
728       v4    = v3 + n;
729       v5    = v4 + n;
730 
731       for (n = 0; n<sz-1; n+=2) {
732         i1   = idx[0];
733         i2   = idx[1];
734         idx += 2;
735         tmp0 = x[i1];
736         tmp1 = x[i2];
737         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
738         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
739         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
740         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
741         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
742       }
743       if (n   == sz-1) {
744         tmp0  = x[*idx++];
745         sum1 += *v1++ * tmp0;
746         sum2 += *v2++ * tmp0;
747         sum3 += *v3++ * tmp0;
748         sum4 += *v4++ * tmp0;
749         sum5 += *v5++ * tmp0;
750       }
751       y[row++]=sum1;
752       y[row++]=sum2;
753       y[row++]=sum3;
754       y[row++]=sum4;
755       y[row++]=sum5;
756       v1      =v5;       /* Since the next block to be processed starts there */
757       idx    +=4*sz;
758       break;
759     default :
760       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
761     }
762   }
763   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
764   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
765   if (zz != yy) {
766     ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
767   }
768   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
769   PetscFunctionReturn(0);
770 }
771 
772 /* ----------------------------------------------------------- */
773 #undef __FUNCT__
774 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
775 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
776 {
777   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
778   IS                iscol = a->col,isrow = a->row;
779   PetscErrorCode    ierr;
780   const PetscInt    *r,*c,*rout,*cout;
781   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
782   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
783   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
784   PetscScalar       sum1,sum2,sum3,sum4,sum5;
785   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
786   const PetscScalar *b;
787 
788   PetscFunctionBegin;
789   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
790   node_max = a->inode.node_count;
791   ns       = a->inode.size;     /* Node Size array */
792 
793   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
794   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
795   tmp  = a->solve_work;
796 
797   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
798   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
799 
800   /* forward solve the lower triangular */
801   tmps = tmp ;
802   aa   = a_a ;
803   aj   = a_j ;
804   ad   = a->diag;
805 
806   for (i = 0,row = 0; i< node_max; ++i) {
807     nsz = ns[i];
808     aii = ai[row];
809     v1  = aa + aii;
810     vi  = aj + aii;
811     nz  = ad[row]- aii;
812     if (i < node_max-1) {
813       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
814       * but our indexing to determine it's size could. */
815       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
816       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
817       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
818       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
819     }
820 
821     switch (nsz) {               /* Each loop in 'case' is unrolled */
822     case 1 :
823       sum1 = b[*r++];
824       for (j=0; j<nz-1; j+=2) {
825         i0   = vi[0];
826         i1   = vi[1];
827         vi  +=2;
828         tmp0 = tmps[i0];
829         tmp1 = tmps[i1];
830         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
831       }
832       if (j == nz-1) {
833         tmp0 = tmps[*vi++];
834         sum1 -= *v1++ *tmp0;
835       }
836       tmp[row ++]=sum1;
837       break;
838     case 2:
839       sum1 = b[*r++];
840       sum2 = b[*r++];
841       v2   = aa + ai[row+1];
842 
843       for (j=0; j<nz-1; j+=2) {
844         i0   = vi[0];
845         i1   = vi[1];
846         vi  +=2;
847         tmp0 = tmps[i0];
848         tmp1 = tmps[i1];
849         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
850         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
851       }
852       if (j == nz-1) {
853         tmp0 = tmps[*vi++];
854         sum1 -= *v1++ *tmp0;
855         sum2 -= *v2++ *tmp0;
856       }
857       sum2 -= *v2++ * sum1;
858       tmp[row ++]=sum1;
859       tmp[row ++]=sum2;
860       break;
861     case 3:
862       sum1 = b[*r++];
863       sum2 = b[*r++];
864       sum3 = b[*r++];
865       v2   = aa + ai[row+1];
866       v3   = aa + ai[row+2];
867 
868       for (j=0; j<nz-1; j+=2) {
869         i0   = vi[0];
870         i1   = vi[1];
871         vi  +=2;
872         tmp0 = tmps[i0];
873         tmp1 = tmps[i1];
874         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
875         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
876         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
877       }
878       if (j == nz-1) {
879         tmp0 = tmps[*vi++];
880         sum1 -= *v1++ *tmp0;
881         sum2 -= *v2++ *tmp0;
882         sum3 -= *v3++ *tmp0;
883       }
884       sum2 -= *v2++ * sum1;
885       sum3 -= *v3++ * sum1;
886       sum3 -= *v3++ * sum2;
887       tmp[row ++]=sum1;
888       tmp[row ++]=sum2;
889       tmp[row ++]=sum3;
890       break;
891 
892     case 4:
893       sum1 = b[*r++];
894       sum2 = b[*r++];
895       sum3 = b[*r++];
896       sum4 = b[*r++];
897       v2   = aa + ai[row+1];
898       v3   = aa + ai[row+2];
899       v4   = aa + ai[row+3];
900 
901       for (j=0; j<nz-1; j+=2) {
902         i0   = vi[0];
903         i1   = vi[1];
904         vi  +=2;
905         tmp0 = tmps[i0];
906         tmp1 = tmps[i1];
907         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
908         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
909         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
910         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
911       }
912       if (j == nz-1) {
913         tmp0 = tmps[*vi++];
914         sum1 -= *v1++ *tmp0;
915         sum2 -= *v2++ *tmp0;
916         sum3 -= *v3++ *tmp0;
917         sum4 -= *v4++ *tmp0;
918       }
919       sum2 -= *v2++ * sum1;
920       sum3 -= *v3++ * sum1;
921       sum4 -= *v4++ * sum1;
922       sum3 -= *v3++ * sum2;
923       sum4 -= *v4++ * sum2;
924       sum4 -= *v4++ * sum3;
925 
926       tmp[row ++]=sum1;
927       tmp[row ++]=sum2;
928       tmp[row ++]=sum3;
929       tmp[row ++]=sum4;
930       break;
931     case 5:
932       sum1 = b[*r++];
933       sum2 = b[*r++];
934       sum3 = b[*r++];
935       sum4 = b[*r++];
936       sum5 = b[*r++];
937       v2   = aa + ai[row+1];
938       v3   = aa + ai[row+2];
939       v4   = aa + ai[row+3];
940       v5   = aa + ai[row+4];
941 
942       for (j=0; j<nz-1; j+=2) {
943         i0   = vi[0];
944         i1   = vi[1];
945         vi  +=2;
946         tmp0 = tmps[i0];
947         tmp1 = tmps[i1];
948         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
949         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
950         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
951         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
952         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
953       }
954       if (j == nz-1) {
955         tmp0 = tmps[*vi++];
956         sum1 -= *v1++ *tmp0;
957         sum2 -= *v2++ *tmp0;
958         sum3 -= *v3++ *tmp0;
959         sum4 -= *v4++ *tmp0;
960         sum5 -= *v5++ *tmp0;
961       }
962 
963       sum2 -= *v2++ * sum1;
964       sum3 -= *v3++ * sum1;
965       sum4 -= *v4++ * sum1;
966       sum5 -= *v5++ * sum1;
967       sum3 -= *v3++ * sum2;
968       sum4 -= *v4++ * sum2;
969       sum5 -= *v5++ * sum2;
970       sum4 -= *v4++ * sum3;
971       sum5 -= *v5++ * sum3;
972       sum5 -= *v5++ * sum4;
973 
974       tmp[row ++]=sum1;
975       tmp[row ++]=sum2;
976       tmp[row ++]=sum3;
977       tmp[row ++]=sum4;
978       tmp[row ++]=sum5;
979       break;
980     default:
981       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
982     }
983   }
984   /* backward solve the upper triangular */
985   for (i=node_max -1 ,row = n-1 ; i>=0; i--) {
986     nsz = ns[i];
987     aii = ai[row+1] -1;
988     v1  = aa + aii;
989     vi  = aj + aii;
990     nz  = aii- ad[row];
991     switch (nsz) {               /* Each loop in 'case' is unrolled */
992     case 1 :
993       sum1 = tmp[row];
994 
995       for (j=nz ; j>1; j-=2) {
996         vi  -=2;
997         i0   = vi[2];
998         i1   = vi[1];
999         tmp0 = tmps[i0];
1000         tmp1 = tmps[i1];
1001         v1   -= 2;
1002         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1003       }
1004       if (j==1) {
1005         tmp0  = tmps[*vi--];
1006         sum1 -= *v1-- * tmp0;
1007       }
1008       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1009       break;
1010     case 2 :
1011       sum1 = tmp[row];
1012       sum2 = tmp[row -1];
1013       v2   = aa + ai[row]-1;
1014       for (j=nz ; j>1; j-=2) {
1015         vi  -=2;
1016         i0   = vi[2];
1017         i1   = vi[1];
1018         tmp0 = tmps[i0];
1019         tmp1 = tmps[i1];
1020         v1   -= 2;
1021         v2   -= 2;
1022         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1023         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1024       }
1025       if (j==1) {
1026         tmp0  = tmps[*vi--];
1027         sum1 -= *v1-- * tmp0;
1028         sum2 -= *v2-- * tmp0;
1029       }
1030 
1031       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1032       sum2   -= *v2-- * tmp0;
1033       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1034       break;
1035     case 3 :
1036       sum1 = tmp[row];
1037       sum2 = tmp[row -1];
1038       sum3 = tmp[row -2];
1039       v2   = aa + ai[row]-1;
1040       v3   = aa + ai[row -1]-1;
1041       for (j=nz ; j>1; j-=2) {
1042         vi  -=2;
1043         i0   = vi[2];
1044         i1   = vi[1];
1045         tmp0 = tmps[i0];
1046         tmp1 = tmps[i1];
1047         v1   -= 2;
1048         v2   -= 2;
1049         v3   -= 2;
1050         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1051         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1052         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1053       }
1054       if (j==1) {
1055         tmp0  = tmps[*vi--];
1056         sum1 -= *v1-- * tmp0;
1057         sum2 -= *v2-- * tmp0;
1058         sum3 -= *v3-- * tmp0;
1059       }
1060       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1061       sum2   -= *v2-- * tmp0;
1062       sum3   -= *v3-- * tmp0;
1063       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1064       sum3   -= *v3-- * tmp0;
1065       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1066 
1067       break;
1068     case 4 :
1069       sum1 = tmp[row];
1070       sum2 = tmp[row -1];
1071       sum3 = tmp[row -2];
1072       sum4 = tmp[row -3];
1073       v2   = aa + ai[row]-1;
1074       v3   = aa + ai[row -1]-1;
1075       v4   = aa + ai[row -2]-1;
1076 
1077       for (j=nz ; j>1; j-=2) {
1078         vi  -=2;
1079         i0   = vi[2];
1080         i1   = vi[1];
1081         tmp0 = tmps[i0];
1082         tmp1 = tmps[i1];
1083         v1  -= 2;
1084         v2  -= 2;
1085         v3  -= 2;
1086         v4  -= 2;
1087         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1088         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1089         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1090         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1091       }
1092       if (j==1) {
1093         tmp0  = tmps[*vi--];
1094         sum1 -= *v1-- * tmp0;
1095         sum2 -= *v2-- * tmp0;
1096         sum3 -= *v3-- * tmp0;
1097         sum4 -= *v4-- * tmp0;
1098       }
1099 
1100       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1101       sum2   -= *v2-- * tmp0;
1102       sum3   -= *v3-- * tmp0;
1103       sum4   -= *v4-- * tmp0;
1104       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1105       sum3   -= *v3-- * tmp0;
1106       sum4   -= *v4-- * tmp0;
1107       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1108       sum4   -= *v4-- * tmp0;
1109       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1110       break;
1111     case 5 :
1112       sum1 = tmp[row];
1113       sum2 = tmp[row -1];
1114       sum3 = tmp[row -2];
1115       sum4 = tmp[row -3];
1116       sum5 = tmp[row -4];
1117       v2   = aa + ai[row]-1;
1118       v3   = aa + ai[row -1]-1;
1119       v4   = aa + ai[row -2]-1;
1120       v5   = aa + ai[row -3]-1;
1121       for (j=nz ; j>1; j-=2) {
1122         vi  -= 2;
1123         i0   = vi[2];
1124         i1   = vi[1];
1125         tmp0 = tmps[i0];
1126         tmp1 = tmps[i1];
1127         v1   -= 2;
1128         v2   -= 2;
1129         v3   -= 2;
1130         v4   -= 2;
1131         v5   -= 2;
1132         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1133         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1134         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1135         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1136         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1137       }
1138       if (j==1) {
1139         tmp0  = tmps[*vi--];
1140         sum1 -= *v1-- * tmp0;
1141         sum2 -= *v2-- * tmp0;
1142         sum3 -= *v3-- * tmp0;
1143         sum4 -= *v4-- * tmp0;
1144         sum5 -= *v5-- * tmp0;
1145       }
1146 
1147       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1148       sum2   -= *v2-- * tmp0;
1149       sum3   -= *v3-- * tmp0;
1150       sum4   -= *v4-- * tmp0;
1151       sum5   -= *v5-- * tmp0;
1152       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1153       sum3   -= *v3-- * tmp0;
1154       sum4   -= *v4-- * tmp0;
1155       sum5   -= *v5-- * tmp0;
1156       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1157       sum4   -= *v4-- * tmp0;
1158       sum5   -= *v5-- * tmp0;
1159       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1160       sum5   -= *v5-- * tmp0;
1161       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1162       break;
1163     default:
1164       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1165     }
1166   }
1167   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1168   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1169   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1170   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1171   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1172   PetscFunctionReturn(0);
1173 }
1174 
1175 #undef __FUNCT__
1176 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1177 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1178 {
1179   Mat              C=B;
1180   Mat_SeqAIJ       *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data;
1181   IS               isrow = b->row,isicol = b->icol;
1182   PetscErrorCode   ierr;
1183   const PetscInt   *r,*ic,*ics;
1184   const PetscInt   n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1185   PetscInt         i,j,k,nz,nzL,row,*pj;
1186   const PetscInt   *ajtmp,*bjtmp;
1187   MatScalar        *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1188   const  MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1189   FactorShiftCtx   sctx;
1190   const PetscInt   *ddiag;
1191   PetscReal        rs;
1192   MatScalar        d;
1193   PetscInt         inod,nodesz,node_max,col;
1194   const PetscInt   *ns;
1195   PetscInt         *tmp_vec1,*tmp_vec2,*nsmap;
1196 
1197   PetscFunctionBegin;
1198   /* MatPivotSetUp(): initialize shift context sctx */
1199   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1200 
1201   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1202     ddiag          = a->diag;
1203     sctx.shift_top = info->zeropivot;
1204     for (i=0; i<n; i++) {
1205       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1206       d  = (aa)[ddiag[i]];
1207       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1208       v  = aa+ai[i];
1209       nz = ai[i+1] - ai[i];
1210       for (j=0; j<nz; j++)
1211         rs += PetscAbsScalar(v[j]);
1212       if (rs>sctx.shift_top) sctx.shift_top = rs;
1213     }
1214     sctx.shift_top   *= 1.1;
1215     sctx.nshift_max   = 5;
1216     sctx.shift_lo     = 0.;
1217     sctx.shift_hi     = 1.;
1218   }
1219 
1220   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1221   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1222 
1223   ierr  = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr);
1224   ierr  = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1225   rtmp2 = rtmp1 + n;
1226   rtmp3 = rtmp2 + n;
1227   rtmp4 = rtmp3 + n;
1228   ics   = ic;
1229 
1230   node_max = a->inode.node_count;
1231   ns       = a->inode.size;
1232   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1233 
1234   /* If max inode size > 4, split it into two inodes.*/
1235   /* also map the inode sizes according to the ordering */
1236   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1237   for (i=0,j=0; i<node_max; ++i,++j) {
1238     if (ns[i] > 4) {
1239       tmp_vec1[j] = 4;
1240       ++j;
1241       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1242     } else {
1243       tmp_vec1[j] = ns[i];
1244     }
1245   }
1246   /* Use the correct node_max */
1247   node_max = j;
1248 
1249   /* Now reorder the inode info based on mat re-ordering info */
1250   /* First create a row -> inode_size_array_index map */
1251   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1252   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1253   for (i=0,row=0; i<node_max; i++) {
1254     nodesz = tmp_vec1[i];
1255     for (j=0; j<nodesz; j++,row++) {
1256       nsmap[row] = i;
1257     }
1258   }
1259   /* Using nsmap, create a reordered ns structure */
1260   for (i=0,j=0; i< node_max; i++) {
1261     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1262     tmp_vec2[i]  = nodesz;
1263     j           += nodesz;
1264   }
1265   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1266   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1267 
1268   /* Now use the correct ns */
1269   ns = tmp_vec2;
1270 
1271   do {
1272     sctx.newshift = PETSC_FALSE;
1273     /* Now loop over each block-row, and do the factorization */
1274     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1275       nodesz = ns[inod];
1276 
1277       switch (nodesz) {
1278       case 1:
1279       /*----------*/
1280         /* zero rtmp1 */
1281         /* L part */
1282         nz    = bi[i+1] - bi[i];
1283         bjtmp = bj + bi[i];
1284         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1285 
1286         /* U part */
1287         nz = bdiag[i]-bdiag[i+1];
1288         bjtmp = bj + bdiag[i+1]+1;
1289         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1290 
1291         /* load in initial (unfactored row) */
1292         nz    = ai[r[i]+1] - ai[r[i]];
1293         ajtmp = aj + ai[r[i]];
1294         v     = aa + ai[r[i]];
1295         for (j=0; j<nz; j++) {
1296           rtmp1[ics[ajtmp[j]]] = v[j];
1297         }
1298         /* ZeropivotApply() */
1299         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1300 
1301         /* elimination */
1302         bjtmp = bj + bi[i];
1303         row   = *bjtmp++;
1304         nzL   = bi[i+1] - bi[i];
1305         for (k=0; k < nzL;k++) {
1306           pc = rtmp1 + row;
1307           if (*pc != 0.0) {
1308             pv   = b->a + bdiag[row];
1309             mul1 = *pc * (*pv);
1310             *pc  = mul1;
1311             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1312             pv = b->a + bdiag[row+1]+1;
1313             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1314             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1315             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1316           }
1317           row = *bjtmp++;
1318         }
1319 
1320         /* finished row so stick it into b->a */
1321         rs = 0.0;
1322         /* L part */
1323         pv = b->a + bi[i] ;
1324         pj = b->j + bi[i] ;
1325         nz = bi[i+1] - bi[i];
1326         for (j=0; j<nz; j++) {
1327           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1328         }
1329 
1330         /* U part */
1331         pv = b->a + bdiag[i+1]+1;
1332         pj = b->j + bdiag[i+1]+1;
1333         nz = bdiag[i] - bdiag[i+1]-1;
1334         for (j=0; j<nz; j++) {
1335           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1336         }
1337 
1338         /* Check zero pivot */
1339         sctx.rs = rs;
1340         sctx.pv = rtmp1[i];
1341         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1342         if (sctx.newshift) break;
1343 
1344         /* Mark diagonal and invert diagonal for simplier triangular solves */
1345         pv  = b->a + bdiag[i];
1346         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1347         break;
1348 
1349       case 2:
1350       /*----------*/
1351         /* zero rtmp1 and rtmp2 */
1352         /* L part */
1353         nz    = bi[i+1] - bi[i];
1354         bjtmp = bj + bi[i];
1355         for  (j=0; j<nz; j++) {
1356           col = bjtmp[j];
1357           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1358         }
1359 
1360         /* U part */
1361         nz = bdiag[i]-bdiag[i+1];
1362         bjtmp = bj + bdiag[i+1]+1;
1363         for  (j=0; j<nz; j++) {
1364           col = bjtmp[j];
1365           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1366         }
1367 
1368         /* load in initial (unfactored row) */
1369         nz    = ai[r[i]+1] - ai[r[i]];
1370         ajtmp = aj + ai[r[i]];
1371         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1372         for (j=0; j<nz; j++) {
1373           col = ics[ajtmp[j]];
1374           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1375         }
1376         /* ZeropivotApply(): shift the diagonal of the matrix  */
1377         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1378 
1379         /* elimination */
1380         bjtmp = bj + bi[i];
1381         row   = *bjtmp++; /* pivot row */
1382         nzL   = bi[i+1] - bi[i];
1383         for (k=0; k < nzL;k++) {
1384           pc1 = rtmp1 + row;
1385           pc2 = rtmp2 + row;
1386           if (*pc1 != 0.0 || *pc2 != 0.0) {
1387             pv   = b->a + bdiag[row];
1388             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1389             *pc1 = mul1;       *pc2 = mul2;
1390 
1391             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1392             pv = b->a + bdiag[row+1]+1;
1393             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1394             for (j=0; j<nz; j++) {
1395               col = pj[j];
1396               rtmp1[col] -= mul1 * pv[j];
1397               rtmp2[col] -= mul2 * pv[j];
1398             }
1399             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1400           }
1401           row = *bjtmp++;
1402         }
1403 
1404         /* finished row i; check zero pivot, then stick row i into b->a */
1405         rs  = 0.0;
1406         /* L part */
1407         pc1 = b->a + bi[i];
1408         pj  = b->j + bi[i] ;
1409         nz  = bi[i+1] - bi[i];
1410         for (j=0; j<nz; j++) {
1411           col = pj[j];
1412           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1413         }
1414         /* U part */
1415         pc1 = b->a + bdiag[i+1]+1;
1416         pj  = b->j + bdiag[i+1]+1;
1417         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1418         for (j=0; j<nz; j++) {
1419           col = pj[j];
1420           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1421         }
1422 
1423         sctx.rs  = rs;
1424         sctx.pv  = rtmp1[i];
1425         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1426         if (sctx.newshift) break;
1427         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1428         *pc1 = 1.0/sctx.pv;
1429 
1430         /* Now take care of diagonal 2x2 block. */
1431         pc2 = rtmp2 + i;
1432         if (*pc2 != 0.0) {
1433           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1434           *pc2 = mul1;          /* insert L entry */
1435           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1436           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1437           for (j=0; j<nz; j++) {
1438             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1439           }
1440           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1441         }
1442 
1443         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1444         rs = 0.0;
1445         /* L part */
1446         pc2 = b->a + bi[i+1];
1447         pj  = b->j + bi[i+1] ;
1448         nz  = bi[i+2] - bi[i+1];
1449         for (j=0; j<nz; j++) {
1450           col = pj[j];
1451           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1452         }
1453         /* U part */
1454         pc2 = b->a + bdiag[i+2]+1;
1455         pj  = b->j + bdiag[i+2]+1;
1456         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1457         for (j=0; j<nz; j++) {
1458           col = pj[j];
1459           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1460         }
1461 
1462         sctx.rs  = rs;
1463         sctx.pv  = rtmp2[i+1];
1464         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1465         if (sctx.newshift) break;
1466         pc2  = b->a + bdiag[i+1];
1467         *pc2 = 1.0/sctx.pv;
1468         break;
1469 
1470       case 3:
1471       /*----------*/
1472         /* zero rtmp */
1473         /* L part */
1474         nz    = bi[i+1] - bi[i];
1475         bjtmp = bj + bi[i];
1476         for  (j=0; j<nz; j++) {
1477           col = bjtmp[j];
1478           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1479         }
1480 
1481         /* U part */
1482         nz = bdiag[i]-bdiag[i+1];
1483         bjtmp = bj + bdiag[i+1]+1;
1484         for  (j=0; j<nz; j++) {
1485           col = bjtmp[j];
1486           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1487         }
1488 
1489         /* load in initial (unfactored row) */
1490         nz    = ai[r[i]+1] - ai[r[i]];
1491         ajtmp = aj + ai[r[i]];
1492         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1493         for (j=0; j<nz; j++) {
1494           col = ics[ajtmp[j]];
1495           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1496         }
1497         /* ZeropivotApply(): shift the diagonal of the matrix  */
1498         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1499 
1500         /* elimination */
1501         bjtmp = bj + bi[i];
1502         row   = *bjtmp++; /* pivot row */
1503         nzL   = bi[i+1] - bi[i];
1504         for (k=0; k < nzL;k++) {
1505           pc1 = rtmp1 + row;
1506           pc2 = rtmp2 + row;
1507           pc3 = rtmp3 + row;
1508           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1509             pv  = b->a + bdiag[row];
1510             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1511             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1512 
1513             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1514             pv = b->a + bdiag[row+1]+1;
1515             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1516             for (j=0; j<nz; j++) {
1517               col = pj[j];
1518               rtmp1[col] -= mul1 * pv[j];
1519               rtmp2[col] -= mul2 * pv[j];
1520               rtmp3[col] -= mul3 * pv[j];
1521             }
1522             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1523           }
1524           row = *bjtmp++;
1525         }
1526 
1527         /* finished row i; check zero pivot, then stick row i into b->a */
1528         rs  = 0.0;
1529         /* L part */
1530         pc1 = b->a + bi[i];
1531         pj  = b->j + bi[i] ;
1532         nz  = bi[i+1] - bi[i];
1533         for (j=0; j<nz; j++) {
1534           col = pj[j];
1535           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1536         }
1537         /* U part */
1538         pc1 = b->a + bdiag[i+1]+1;
1539         pj  = b->j + bdiag[i+1]+1;
1540         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1541         for (j=0; j<nz; j++) {
1542           col = pj[j];
1543           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1544         }
1545 
1546         sctx.rs  = rs;
1547         sctx.pv  = rtmp1[i];
1548         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1549         if (sctx.newshift) break;
1550         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1551         *pc1 = 1.0/sctx.pv;
1552 
1553         /* Now take care of 1st column of diagonal 3x3 block. */
1554         pc2 = rtmp2 + i;
1555         pc3 = rtmp3 + i;
1556         if (*pc2 != 0.0 || *pc3 != 0.0) {
1557           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1558           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1559           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1560           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1561           for (j=0; j<nz; j++) {
1562             col = pj[j];
1563             rtmp2[col] -= mul2 * rtmp1[col];
1564             rtmp3[col] -= mul3 * rtmp1[col];
1565           }
1566           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1567         }
1568 
1569         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1570         rs = 0.0;
1571         /* L part */
1572         pc2 = b->a + bi[i+1];
1573         pj  = b->j + bi[i+1] ;
1574         nz  = bi[i+2] - bi[i+1];
1575         for (j=0; j<nz; j++) {
1576           col = pj[j];
1577           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1578         }
1579         /* U part */
1580         pc2 = b->a + bdiag[i+2]+1;
1581         pj  = b->j + bdiag[i+2]+1;
1582         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1583         for (j=0; j<nz; j++) {
1584           col = pj[j];
1585           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1586         }
1587 
1588         sctx.rs  = rs;
1589         sctx.pv  = rtmp2[i+1];
1590         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1591         if (sctx.newshift) break;
1592         pc2  = b->a + bdiag[i+1];
1593         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1594 
1595         /* Now take care of 2nd column of diagonal 3x3 block. */
1596         pc3 = rtmp3 + i+1;
1597         if (*pc3 != 0.0) {
1598           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1599           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1600           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1601           for (j=0; j<nz; j++) {
1602             col = pj[j];
1603             rtmp3[col] -= mul3 * rtmp2[col];
1604           }
1605           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1606         }
1607 
1608         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1609         rs = 0.0;
1610         /* L part */
1611         pc3 = b->a + bi[i+2];
1612         pj  = b->j + bi[i+2] ;
1613         nz  = bi[i+3] - bi[i+2];
1614         for (j=0; j<nz; j++) {
1615           col = pj[j];
1616           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1617         }
1618         /* U part */
1619         pc3 = b->a + bdiag[i+3]+1;
1620         pj  = b->j + bdiag[i+3]+1;
1621         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1622         for (j=0; j<nz; j++) {
1623           col = pj[j];
1624           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1625         }
1626 
1627         sctx.rs  = rs;
1628         sctx.pv  = rtmp3[i+2];
1629         ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1630         if (sctx.newshift) break;
1631         pc3  = b->a + bdiag[i+2];
1632         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1633         break;
1634       case 4:
1635       /*----------*/
1636         /* zero rtmp */
1637         /* L part */
1638         nz    = bi[i+1] - bi[i];
1639         bjtmp = bj + bi[i];
1640         for  (j=0; j<nz; j++) {
1641           col = bjtmp[j];
1642           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1643         }
1644 
1645         /* U part */
1646         nz = bdiag[i]-bdiag[i+1];
1647         bjtmp = bj + bdiag[i+1]+1;
1648         for  (j=0; j<nz; j++) {
1649           col = bjtmp[j];
1650           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1651         }
1652 
1653         /* load in initial (unfactored row) */
1654         nz    = ai[r[i]+1] - ai[r[i]];
1655         ajtmp = aj + ai[r[i]];
1656         v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1657         for (j=0; j<nz; j++) {
1658           col = ics[ajtmp[j]];
1659           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1660         }
1661         /* ZeropivotApply(): shift the diagonal of the matrix  */
1662         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1663 
1664         /* elimination */
1665         bjtmp = bj + bi[i];
1666         row   = *bjtmp++; /* pivot row */
1667         nzL   = bi[i+1] - bi[i];
1668         for (k=0; k < nzL;k++) {
1669           pc1 = rtmp1 + row;
1670           pc2 = rtmp2 + row;
1671           pc3 = rtmp3 + row;
1672           pc4 = rtmp4 + row;
1673           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1674             pv  = b->a + bdiag[row];
1675             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1676             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1677 
1678             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1679             pv = b->a + bdiag[row+1]+1;
1680             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1681             for (j=0; j<nz; j++) {
1682               col = pj[j];
1683               rtmp1[col] -= mul1 * pv[j];
1684               rtmp2[col] -= mul2 * pv[j];
1685               rtmp3[col] -= mul3 * pv[j];
1686               rtmp4[col] -= mul4 * pv[j];
1687             }
1688             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1689           }
1690           row = *bjtmp++;
1691         }
1692 
1693         /* finished row i; check zero pivot, then stick row i into b->a */
1694         rs  = 0.0;
1695         /* L part */
1696         pc1 = b->a + bi[i];
1697         pj  = b->j + bi[i] ;
1698         nz  = bi[i+1] - bi[i];
1699         for (j=0; j<nz; j++) {
1700           col = pj[j];
1701           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1702         }
1703         /* U part */
1704         pc1 = b->a + bdiag[i+1]+1;
1705         pj  = b->j + bdiag[i+1]+1;
1706         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1707         for (j=0; j<nz; j++) {
1708           col = pj[j];
1709           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1710         }
1711 
1712         sctx.rs  = rs;
1713         sctx.pv  = rtmp1[i];
1714         ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1715         if (sctx.newshift) break;
1716         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1717         *pc1 = 1.0/sctx.pv;
1718 
1719         /* Now take care of 1st column of diagonal 4x4 block. */
1720         pc2 = rtmp2 + i;
1721         pc3 = rtmp3 + i;
1722         pc4 = rtmp4 + i;
1723         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1724           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1725           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1726           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1727           pj = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1728           nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1729           for (j=0; j<nz; j++) {
1730             col = pj[j];
1731             rtmp2[col] -= mul2 * rtmp1[col];
1732             rtmp3[col] -= mul3 * rtmp1[col];
1733             rtmp4[col] -= mul4 * rtmp1[col];
1734           }
1735           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1736         }
1737 
1738         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1739         rs = 0.0;
1740         /* L part */
1741         pc2 = b->a + bi[i+1];
1742         pj  = b->j + bi[i+1] ;
1743         nz  = bi[i+2] - bi[i+1];
1744         for (j=0; j<nz; j++) {
1745           col = pj[j];
1746           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1747         }
1748         /* U part */
1749         pc2 = b->a + bdiag[i+2]+1;
1750         pj  = b->j + bdiag[i+2]+1;
1751         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1752         for (j=0; j<nz; j++) {
1753           col = pj[j];
1754           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1755         }
1756 
1757         sctx.rs  = rs;
1758         sctx.pv  = rtmp2[i+1];
1759         ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1760         if (sctx.newshift) break;
1761         pc2  = b->a + bdiag[i+1];
1762         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1763 
1764         /* Now take care of 2nd column of diagonal 4x4 block. */
1765         pc3 = rtmp3 + i+1;
1766         pc4 = rtmp4 + i+1;
1767         if (*pc3 != 0.0 || *pc4 != 0.0) {
1768           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1769           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1770           pj = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1771           nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1772           for (j=0; j<nz; j++) {
1773             col = pj[j];
1774             rtmp3[col] -= mul3 * rtmp2[col];
1775             rtmp4[col] -= mul4 * rtmp2[col];
1776           }
1777           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1778         }
1779 
1780         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1781         rs = 0.0;
1782         /* L part */
1783         pc3 = b->a + bi[i+2];
1784         pj  = b->j + bi[i+2] ;
1785         nz  = bi[i+3] - bi[i+2];
1786         for (j=0; j<nz; j++) {
1787           col = pj[j];
1788           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1789         }
1790         /* U part */
1791         pc3 = b->a + bdiag[i+3]+1;
1792         pj  = b->j + bdiag[i+3]+1;
1793         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1794         for (j=0; j<nz; j++) {
1795           col = pj[j];
1796           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1797         }
1798 
1799         sctx.rs  = rs;
1800         sctx.pv  = rtmp3[i+2];
1801         ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1802         if (sctx.newshift) break;
1803         pc3  = b->a + bdiag[i+2];
1804         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1805 
1806         /* Now take care of 3rd column of diagonal 4x4 block. */
1807         pc4 = rtmp4 + i+2;
1808         if (*pc4 != 0.0) {
1809           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1810           pj = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1811           nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1812           for (j=0; j<nz; j++) {
1813             col = pj[j];
1814             rtmp4[col] -= mul4 * rtmp3[col];
1815           }
1816           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1817         }
1818 
1819         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1820         rs = 0.0;
1821         /* L part */
1822         pc4 = b->a + bi[i+3];
1823         pj  = b->j + bi[i+3] ;
1824         nz  = bi[i+4] - bi[i+3];
1825         for (j=0; j<nz; j++) {
1826           col = pj[j];
1827           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1828         }
1829         /* U part */
1830         pc4 = b->a + bdiag[i+4]+1;
1831         pj  = b->j + bdiag[i+4]+1;
1832         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1833         for (j=0; j<nz; j++) {
1834           col = pj[j];
1835           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1836         }
1837 
1838         sctx.rs  = rs;
1839         sctx.pv  = rtmp4[i+3];
1840         ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1841         if (sctx.newshift) break;
1842         pc4  = b->a + bdiag[i+3];
1843         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1844          break;
1845 
1846       default:
1847         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1848       }
1849       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1850       i += nodesz;                 /* Update the row */
1851     }
1852 
1853     /* MatPivotRefine() */
1854     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1855       /*
1856        * if no shift in this attempt & shifting & started shifting & can refine,
1857        * then try lower shift
1858        */
1859       sctx.shift_hi       = sctx.shift_fraction;
1860       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1861       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1862       sctx.newshift       = PETSC_TRUE;
1863       sctx.nshift++;
1864     }
1865   } while (sctx.newshift);
1866 
1867   ierr = PetscFree(rtmp1);CHKERRQ(ierr);
1868   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1869   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1870   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1871 
1872   C->ops->solve              = MatSolve_SeqAIJ;
1873   C->ops->solveadd           = MatSolveAdd_SeqAIJ;
1874   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ;
1875   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ;
1876   C->ops->matsolve           = MatMatSolve_SeqAIJ;
1877   C->assembled    = PETSC_TRUE;
1878   C->preallocated = PETSC_TRUE;
1879   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1880 
1881   /* MatShiftView(A,info,&sctx) */
1882   if (sctx.nshift) {
1883     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1884       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
1885     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1886       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1887     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1888       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr);
1889     }
1890   }
1891   ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr);
1892   PetscFunctionReturn(0);
1893 }
1894 
1895 #undef __FUNCT__
1896 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1897 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1898 {
1899   Mat               C = B;
1900   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1901   IS                iscol = b->col,isrow = b->row,isicol = b->icol;
1902   PetscErrorCode    ierr;
1903   const PetscInt    *r,*ic,*c,*ics;
1904   PetscInt          n = A->rmap->n,*bi = b->i;
1905   PetscInt          *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1906   PetscInt          i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1907   PetscInt          *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1908   PetscScalar       mul1,mul2,mul3,tmp;
1909   MatScalar         *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1910   const MatScalar   *v1,*v2,*v3,*aa = a->a,*rtmp1;
1911   PetscReal         rs=0.0;
1912   FactorShiftCtx    sctx;
1913 
1914   PetscFunctionBegin;
1915   sctx.shift_top      = 0;
1916   sctx.nshift_max     = 0;
1917   sctx.shift_lo       = 0;
1918   sctx.shift_hi       = 0;
1919   sctx.shift_fraction = 0;
1920 
1921   /* if both shift schemes are chosen by user, only use info->shiftpd */
1922   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1923     sctx.shift_top = 0;
1924     for (i=0; i<n; i++) {
1925       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1926       rs    = 0.0;
1927       ajtmp = aj + ai[i];
1928       rtmp1 = aa + ai[i];
1929       nz = ai[i+1] - ai[i];
1930       for (j=0; j<nz; j++) {
1931         if (*ajtmp != i) {
1932           rs += PetscAbsScalar(*rtmp1++);
1933         } else {
1934           rs -= PetscRealPart(*rtmp1++);
1935         }
1936         ajtmp++;
1937       }
1938       if (rs>sctx.shift_top) sctx.shift_top = rs;
1939     }
1940     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1941     sctx.shift_top *= 1.1;
1942     sctx.nshift_max = 5;
1943     sctx.shift_lo   = 0.;
1944     sctx.shift_hi   = 1.;
1945   }
1946   sctx.shift_amount = 0;
1947   sctx.nshift       = 0;
1948 
1949   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1950   ierr  = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1951   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1952   ierr  = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr);
1953   ierr  = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1954   ics   = ic ;
1955   rtmp22 = rtmp11 + n;
1956   rtmp33 = rtmp22 + n;
1957 
1958   node_max = a->inode.node_count;
1959   ns       = a->inode.size;
1960   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1961 
1962   /* If max inode size > 3, split it into two inodes.*/
1963   /* also map the inode sizes according to the ordering */
1964   ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr);
1965   for (i=0,j=0; i<node_max; ++i,++j) {
1966     if (ns[i]>3) {
1967       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1968       ++j;
1969       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1970     } else {
1971       tmp_vec1[j] = ns[i];
1972     }
1973   }
1974   /* Use the correct node_max */
1975   node_max = j;
1976 
1977   /* Now reorder the inode info based on mat re-ordering info */
1978   /* First create a row -> inode_size_array_index map */
1979   ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr);
1980   ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr);
1981   for (i=0,row=0; i<node_max; i++) {
1982     nodesz = tmp_vec1[i];
1983     for (j=0; j<nodesz; j++,row++) {
1984       nsmap[row] = i;
1985     }
1986   }
1987   /* Using nsmap, create a reordered ns structure */
1988   for (i=0,j=0; i< node_max; i++) {
1989     nodesz       = tmp_vec1[nsmap[r[j]]];    /* here the reordered row_no is in r[] */
1990     tmp_vec2[i]  = nodesz;
1991     j           += nodesz;
1992   }
1993   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1994   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1995   /* Now use the correct ns */
1996   ns = tmp_vec2;
1997 
1998   do {
1999     sctx.newshift = PETSC_FALSE;
2000     /* Now loop over each block-row, and do the factorization */
2001     for (i=0,row=0; i<node_max; i++) {
2002       nodesz = ns[i];
2003       nz     = bi[row+1] - bi[row];
2004       bjtmp  = bj + bi[row];
2005 
2006       switch (nodesz) {
2007       case 1:
2008         for  (j=0; j<nz; j++) {
2009           idx        = bjtmp[j];
2010           rtmp11[idx] = 0.0;
2011         }
2012 
2013         /* load in initial (unfactored row) */
2014         idx    = r[row];
2015         nz_tmp = ai[idx+1] - ai[idx];
2016         ajtmp  = aj + ai[idx];
2017         v1     = aa + ai[idx];
2018 
2019         for (j=0; j<nz_tmp; j++) {
2020           idx        = ics[ajtmp[j]];
2021           rtmp11[idx] = v1[j];
2022         }
2023         rtmp11[ics[r[row]]] += sctx.shift_amount;
2024 
2025         prow = *bjtmp++ ;
2026         while (prow < row) {
2027           pc1 = rtmp11 + prow;
2028           if (*pc1 != 0.0) {
2029             pv   = ba + bd[prow];
2030             pj   = nbj + bd[prow];
2031             mul1 = *pc1 * *pv++;
2032             *pc1 = mul1;
2033             nz_tmp = bi[prow+1] - bd[prow] - 1;
2034             ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2035             for (j=0; j<nz_tmp; j++) {
2036               tmp = pv[j];
2037               idx = pj[j];
2038               rtmp11[idx] -= mul1 * tmp;
2039             }
2040           }
2041           prow = *bjtmp++ ;
2042         }
2043         pj  = bj + bi[row];
2044         pc1 = ba + bi[row];
2045 
2046         sctx.pv    = rtmp11[row];
2047         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2048         rs         = 0.0;
2049         for (j=0; j<nz; j++) {
2050           idx    = pj[j];
2051           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2052           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2053         }
2054         sctx.rs  = rs;
2055         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2056         if (sctx.newshift) goto endofwhile;
2057         break;
2058 
2059       case 2:
2060         for (j=0; j<nz; j++) {
2061           idx        = bjtmp[j];
2062           rtmp11[idx] = 0.0;
2063           rtmp22[idx] = 0.0;
2064         }
2065 
2066         /* load in initial (unfactored row) */
2067         idx    = r[row];
2068         nz_tmp = ai[idx+1] - ai[idx];
2069         ajtmp  = aj + ai[idx];
2070         v1     = aa + ai[idx];
2071         v2     = aa + ai[idx+1];
2072         for (j=0; j<nz_tmp; j++) {
2073           idx        = ics[ajtmp[j]];
2074           rtmp11[idx] = v1[j];
2075           rtmp22[idx] = v2[j];
2076         }
2077         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2078         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2079 
2080         prow = *bjtmp++ ;
2081         while (prow < row) {
2082           pc1 = rtmp11 + prow;
2083           pc2 = rtmp22 + prow;
2084           if (*pc1 != 0.0 || *pc2 != 0.0) {
2085             pv   = ba + bd[prow];
2086             pj   = nbj + bd[prow];
2087             mul1 = *pc1 * *pv;
2088             mul2 = *pc2 * *pv;
2089             ++pv;
2090             *pc1 = mul1;
2091             *pc2 = mul2;
2092 
2093             nz_tmp = bi[prow+1] - bd[prow] - 1;
2094             for (j=0; j<nz_tmp; j++) {
2095               tmp = pv[j];
2096               idx = pj[j];
2097               rtmp11[idx] -= mul1 * tmp;
2098               rtmp22[idx] -= mul2 * tmp;
2099             }
2100             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2101           }
2102           prow = *bjtmp++ ;
2103         }
2104 
2105         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2106         pc1 = rtmp11 + prow;
2107         pc2 = rtmp22 + prow;
2108 
2109         sctx.pv = *pc1;
2110         pj      = bj + bi[prow];
2111         rs      = 0.0;
2112         for (j=0; j<nz; j++) {
2113           idx = pj[j];
2114           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2115         }
2116         sctx.rs = rs;
2117         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2118         if (sctx.newshift) goto endofwhile;
2119 
2120         if (*pc2 != 0.0) {
2121           pj     = nbj + bd[prow];
2122           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2123           *pc2   = mul2;
2124           nz_tmp = bi[prow+1] - bd[prow] - 1;
2125           for (j=0; j<nz_tmp; j++) {
2126             idx = pj[j] ;
2127             tmp = rtmp11[idx];
2128             rtmp22[idx] -= mul2 * tmp;
2129           }
2130           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2131         }
2132 
2133         pj  = bj + bi[row];
2134         pc1 = ba + bi[row];
2135         pc2 = ba + bi[row+1];
2136 
2137         sctx.pv = rtmp22[row+1];
2138         rs = 0.0;
2139         rtmp11[row]   = 1.0/rtmp11[row];
2140         rtmp22[row+1] = 1.0/rtmp22[row+1];
2141         /* copy row entries from dense representation to sparse */
2142         for (j=0; j<nz; j++) {
2143           idx    = pj[j];
2144           pc1[j] = rtmp11[idx];
2145           pc2[j] = rtmp22[idx];
2146           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2147         }
2148         sctx.rs = rs;
2149         ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2150         if (sctx.newshift) goto endofwhile;
2151         break;
2152 
2153       case 3:
2154         for  (j=0; j<nz; j++) {
2155           idx        = bjtmp[j];
2156           rtmp11[idx] = 0.0;
2157           rtmp22[idx] = 0.0;
2158           rtmp33[idx] = 0.0;
2159         }
2160         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2161         idx    = r[row];
2162         nz_tmp = ai[idx+1] - ai[idx];
2163         ajtmp = aj + ai[idx];
2164         v1    = aa + ai[idx];
2165         v2    = aa + ai[idx+1];
2166         v3    = aa + ai[idx+2];
2167         for (j=0; j<nz_tmp; j++) {
2168           idx        = ics[ajtmp[j]];
2169           rtmp11[idx] = v1[j];
2170           rtmp22[idx] = v2[j];
2171           rtmp33[idx] = v3[j];
2172         }
2173         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2174         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2175         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2176 
2177         /* loop over all pivot row blocks above this row block */
2178         prow = *bjtmp++ ;
2179         while (prow < row) {
2180           pc1 = rtmp11 + prow;
2181           pc2 = rtmp22 + prow;
2182           pc3 = rtmp33 + prow;
2183           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2184             pv   = ba  + bd[prow];
2185             pj   = nbj + bd[prow];
2186             mul1 = *pc1 * *pv;
2187             mul2 = *pc2 * *pv;
2188             mul3 = *pc3 * *pv;
2189             ++pv;
2190             *pc1 = mul1;
2191             *pc2 = mul2;
2192             *pc3 = mul3;
2193 
2194             nz_tmp = bi[prow+1] - bd[prow] - 1;
2195             /* update this row based on pivot row */
2196             for (j=0; j<nz_tmp; j++) {
2197               tmp = pv[j];
2198               idx = pj[j];
2199               rtmp11[idx] -= mul1 * tmp;
2200               rtmp22[idx] -= mul2 * tmp;
2201               rtmp33[idx] -= mul3 * tmp;
2202             }
2203             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2204           }
2205           prow = *bjtmp++ ;
2206         }
2207 
2208         /* Now take care of diagonal 3x3 block in this set of rows */
2209         /* note: prow = row here */
2210         pc1 = rtmp11 + prow;
2211         pc2 = rtmp22 + prow;
2212         pc3 = rtmp33 + prow;
2213 
2214         sctx.pv = *pc1;
2215         pj      = bj + bi[prow];
2216         rs      = 0.0;
2217         for (j=0; j<nz; j++) {
2218           idx = pj[j];
2219           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2220         }
2221         sctx.rs = rs;
2222         ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2223         if (sctx.newshift) goto endofwhile;
2224 
2225         if (*pc2 != 0.0 || *pc3 != 0.0) {
2226           mul2 = (*pc2)/(*pc1);
2227           mul3 = (*pc3)/(*pc1);
2228           *pc2 = mul2;
2229           *pc3 = mul3;
2230           nz_tmp = bi[prow+1] - bd[prow] - 1;
2231           pj     = nbj + bd[prow];
2232           for (j=0; j<nz_tmp; j++) {
2233             idx = pj[j] ;
2234             tmp = rtmp11[idx];
2235             rtmp22[idx] -= mul2 * tmp;
2236             rtmp33[idx] -= mul3 * tmp;
2237           }
2238           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2239         }
2240         ++prow;
2241 
2242         pc2 = rtmp22 + prow;
2243         pc3 = rtmp33 + prow;
2244         sctx.pv = *pc2;
2245         pj      = bj + bi[prow];
2246         rs      = 0.0;
2247         for (j=0; j<nz; j++) {
2248           idx = pj[j];
2249           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2250         }
2251         sctx.rs = rs;
2252         ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2253         if (sctx.newshift) goto endofwhile;
2254 
2255         if (*pc3 != 0.0) {
2256           mul3   = (*pc3)/(*pc2);
2257           *pc3   = mul3;
2258           pj     = nbj + bd[prow];
2259           nz_tmp = bi[prow+1] - bd[prow] - 1;
2260           for (j=0; j<nz_tmp; j++) {
2261             idx = pj[j] ;
2262             tmp = rtmp22[idx];
2263             rtmp33[idx] -= mul3 * tmp;
2264           }
2265           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2266         }
2267 
2268         pj  = bj + bi[row];
2269         pc1 = ba + bi[row];
2270         pc2 = ba + bi[row+1];
2271         pc3 = ba + bi[row+2];
2272 
2273         sctx.pv = rtmp33[row+2];
2274         rs = 0.0;
2275         rtmp11[row]   = 1.0/rtmp11[row];
2276         rtmp22[row+1] = 1.0/rtmp22[row+1];
2277         rtmp33[row+2] = 1.0/rtmp33[row+2];
2278         /* copy row entries from dense representation to sparse */
2279         for (j=0; j<nz; j++) {
2280           idx    = pj[j];
2281           pc1[j] = rtmp11[idx];
2282           pc2[j] = rtmp22[idx];
2283           pc3[j] = rtmp33[idx];
2284           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2285         }
2286 
2287         sctx.rs = rs;
2288         ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2289         if (sctx.newshift) goto endofwhile;
2290         break;
2291 
2292       default:
2293         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2294       }
2295       row += nodesz;                 /* Update the row */
2296     }
2297     endofwhile:;
2298   } while (sctx.newshift);
2299   ierr = PetscFree(rtmp11);CHKERRQ(ierr);
2300   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2301   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2302   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2303   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2304   (B)->ops->solve           = MatSolve_SeqAIJ_inplace;
2305   /* do not set solve add, since MatSolve_Inode + Add is faster */
2306   C->ops->solvetranspose     = MatSolveTranspose_SeqAIJ_inplace;
2307   C->ops->solvetransposeadd  = MatSolveTransposeAdd_SeqAIJ_inplace;
2308   C->assembled   = PETSC_TRUE;
2309   C->preallocated = PETSC_TRUE;
2310   if (sctx.nshift) {
2311     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2312       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr);
2313     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2314       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
2315     }
2316   }
2317   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2318   ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr);
2319   PetscFunctionReturn(0);
2320 }
2321 
2322 
2323 /* ----------------------------------------------------------- */
2324 #undef __FUNCT__
2325 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2326 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2327 {
2328   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2329   IS                iscol = a->col,isrow = a->row;
2330   PetscErrorCode    ierr;
2331   const PetscInt    *r,*c,*rout,*cout;
2332   PetscInt          i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j;
2333   PetscInt          node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1;
2334   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2335   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2336   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2337   const PetscScalar *b;
2338 
2339   PetscFunctionBegin;
2340   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2341   node_max = a->inode.node_count;
2342   ns       = a->inode.size;     /* Node Size array */
2343 
2344   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2345   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2346   tmp  = a->solve_work;
2347 
2348   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2349   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2350 
2351   /* forward solve the lower triangular */
2352   tmps = tmp ;
2353   aa   = a_a ;
2354   aj   = a_j ;
2355   ad   = a->diag;
2356 
2357   for (i = 0,row = 0; i< node_max; ++i) {
2358     nsz = ns[i];
2359     aii = ai[row];
2360     v1  = aa + aii;
2361     vi  = aj + aii;
2362     nz  = ai[row+1]- ai[row];
2363 
2364     if (i < node_max-1) {
2365       /* Prefetch the indices for the next block */
2366       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2367       /* Prefetch the data for the next block */
2368       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2369     }
2370 
2371     switch (nsz) {               /* Each loop in 'case' is unrolled */
2372     case 1 :
2373       sum1 = b[r[row]];
2374       for (j=0; j<nz-1; j+=2) {
2375         i0   = vi[j];
2376         i1   = vi[j+1];
2377         tmp0 = tmps[i0];
2378         tmp1 = tmps[i1];
2379         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2380       }
2381       if (j == nz-1) {
2382         tmp0 = tmps[vi[j]];
2383         sum1 -= v1[j]*tmp0;
2384       }
2385       tmp[row++]=sum1;
2386       break;
2387     case 2:
2388       sum1 = b[r[row]];
2389       sum2 = b[r[row+1]];
2390       v2   = aa + ai[row+1];
2391 
2392       for (j=0; j<nz-1; j+=2) {
2393         i0   = vi[j];
2394         i1   = vi[j+1];
2395         tmp0 = tmps[i0];
2396         tmp1 = tmps[i1];
2397         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2398         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2399       }
2400       if (j == nz-1) {
2401         tmp0 = tmps[vi[j]];
2402         sum1 -= v1[j] *tmp0;
2403         sum2 -= v2[j] *tmp0;
2404       }
2405       sum2 -= v2[nz] * sum1;
2406       tmp[row ++]=sum1;
2407       tmp[row ++]=sum2;
2408       break;
2409     case 3:
2410       sum1 = b[r[row]];
2411       sum2 = b[r[row+1]];
2412       sum3 = b[r[row+2]];
2413       v2   = aa + ai[row+1];
2414       v3   = aa + ai[row+2];
2415 
2416       for (j=0; j<nz-1; j+=2) {
2417         i0   = vi[j];
2418         i1   = vi[j+1];
2419         tmp0 = tmps[i0];
2420         tmp1 = tmps[i1];
2421         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2422         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2423         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2424       }
2425       if (j == nz-1) {
2426         tmp0 = tmps[vi[j]];
2427         sum1 -= v1[j] *tmp0;
2428         sum2 -= v2[j] *tmp0;
2429         sum3 -= v3[j] *tmp0;
2430       }
2431       sum2 -= v2[nz] * sum1;
2432       sum3 -= v3[nz] * sum1;
2433       sum3 -= v3[nz+1] * sum2;
2434       tmp[row ++]=sum1;
2435       tmp[row ++]=sum2;
2436       tmp[row ++]=sum3;
2437       break;
2438 
2439     case 4:
2440       sum1 = b[r[row]];
2441       sum2 = b[r[row+1]];
2442       sum3 = b[r[row+2]];
2443       sum4 = b[r[row+3]];
2444       v2   = aa + ai[row+1];
2445       v3   = aa + ai[row+2];
2446       v4   = aa + ai[row+3];
2447 
2448       for (j=0; j<nz-1; j+=2) {
2449         i0   = vi[j];
2450         i1   = vi[j+1];
2451         tmp0 = tmps[i0];
2452         tmp1 = tmps[i1];
2453         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2454         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2455         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2456         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2457       }
2458       if (j == nz-1) {
2459         tmp0 = tmps[vi[j]];
2460         sum1 -= v1[j] *tmp0;
2461         sum2 -= v2[j] *tmp0;
2462         sum3 -= v3[j] *tmp0;
2463         sum4 -= v4[j] *tmp0;
2464       }
2465       sum2 -= v2[nz] * sum1;
2466       sum3 -= v3[nz] * sum1;
2467       sum4 -= v4[nz] * sum1;
2468       sum3 -= v3[nz+1] * sum2;
2469       sum4 -= v4[nz+1] * sum2;
2470       sum4 -= v4[nz+2] * sum3;
2471 
2472       tmp[row ++]=sum1;
2473       tmp[row ++]=sum2;
2474       tmp[row ++]=sum3;
2475       tmp[row ++]=sum4;
2476       break;
2477     case 5:
2478       sum1 = b[r[row]];
2479       sum2 = b[r[row+1]];
2480       sum3 = b[r[row+2]];
2481       sum4 = b[r[row+3]];
2482       sum5 = b[r[row+4]];
2483       v2   = aa + ai[row+1];
2484       v3   = aa + ai[row+2];
2485       v4   = aa + ai[row+3];
2486       v5   = aa + ai[row+4];
2487 
2488       for (j=0; j<nz-1; j+=2) {
2489         i0   = vi[j];
2490         i1   = vi[j+1];
2491         tmp0 = tmps[i0];
2492         tmp1 = tmps[i1];
2493         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2494         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2495         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2496         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2497         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2498       }
2499       if (j == nz-1) {
2500         tmp0 = tmps[vi[j]];
2501         sum1 -= v1[j] *tmp0;
2502         sum2 -= v2[j] *tmp0;
2503         sum3 -= v3[j] *tmp0;
2504         sum4 -= v4[j] *tmp0;
2505         sum5 -= v5[j] *tmp0;
2506       }
2507 
2508       sum2 -= v2[nz] * sum1;
2509       sum3 -= v3[nz] * sum1;
2510       sum4 -= v4[nz] * sum1;
2511       sum5 -= v5[nz] * sum1;
2512       sum3 -= v3[nz+1] * sum2;
2513       sum4 -= v4[nz+1] * sum2;
2514       sum5 -= v5[nz+1] * sum2;
2515       sum4 -= v4[nz+2] * sum3;
2516       sum5 -= v5[nz+2] * sum3;
2517       sum5 -= v5[nz+3] * sum4;
2518 
2519       tmp[row ++]=sum1;
2520       tmp[row ++]=sum2;
2521       tmp[row ++]=sum3;
2522       tmp[row ++]=sum4;
2523       tmp[row ++]=sum5;
2524       break;
2525     default:
2526       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2527     }
2528   }
2529   /* backward solve the upper triangular */
2530   for (i=node_max -1 ,row = n-1 ; i>=0; i--) {
2531     nsz = ns[i];
2532     aii = ad[row+1] + 1;
2533     v1  = aa + aii;
2534     vi  = aj + aii;
2535     nz  = ad[row]- ad[row+1] - 1;
2536 
2537     if (i > 0) {
2538       /* Prefetch the indices for the next block */
2539       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2540       /* Prefetch the data for the next block */
2541       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2542     }
2543 
2544     switch (nsz) {               /* Each loop in 'case' is unrolled */
2545     case 1 :
2546       sum1 = tmp[row];
2547 
2548       for (j=0 ; j<nz-1; j+=2) {
2549         i0   = vi[j];
2550         i1   = vi[j+1];
2551         tmp0 = tmps[i0];
2552         tmp1 = tmps[i1];
2553         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2554       }
2555       if (j == nz-1) {
2556         tmp0  = tmps[vi[j]];
2557         sum1 -= v1[j]*tmp0;
2558       }
2559       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2560       break;
2561     case 2 :
2562       sum1 = tmp[row];
2563       sum2 = tmp[row-1];
2564       v2   = aa + ad[row] + 1;
2565       for (j=0 ; j<nz-1; j+=2) {
2566         i0   = vi[j];
2567         i1   = vi[j+1];
2568         tmp0 = tmps[i0];
2569         tmp1 = tmps[i1];
2570         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2571         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2572       }
2573       if (j == nz-1) {
2574         tmp0  = tmps[vi[j]];
2575         sum1 -= v1[j]* tmp0;
2576         sum2 -= v2[j+1]* tmp0;
2577       }
2578 
2579       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2580       sum2   -= v2[0] * tmp0;
2581       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2582       break;
2583     case 3 :
2584       sum1 = tmp[row];
2585       sum2 = tmp[row -1];
2586       sum3 = tmp[row -2];
2587       v2   = aa + ad[row] + 1;
2588       v3   = aa + ad[row -1] + 1;
2589       for (j=0 ; j<nz-1; j+=2) {
2590         i0   = vi[j];
2591         i1   = vi[j+1];
2592         tmp0 = tmps[i0];
2593         tmp1 = tmps[i1];
2594         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2595         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2596         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2597       }
2598       if (j== nz-1) {
2599         tmp0  = tmps[vi[j]];
2600         sum1 -= v1[j] * tmp0;
2601         sum2 -= v2[j+1] * tmp0;
2602         sum3 -= v3[j+2] * tmp0;
2603       }
2604       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2605       sum2   -= v2[0]* tmp0;
2606       sum3   -= v3[1] * tmp0;
2607       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2608       sum3   -= v3[0]* tmp0;
2609       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2610 
2611       break;
2612     case 4 :
2613       sum1 = tmp[row];
2614       sum2 = tmp[row -1];
2615       sum3 = tmp[row -2];
2616       sum4 = tmp[row -3];
2617       v2   = aa + ad[row]+1;
2618       v3   = aa + ad[row -1]+1;
2619       v4   = aa + ad[row -2]+1;
2620 
2621       for (j=0 ; j<nz-1; j+=2) {
2622         i0   = vi[j];
2623         i1   = vi[j+1];
2624         tmp0 = tmps[i0];
2625         tmp1 = tmps[i1];
2626         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2627         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2628         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2629         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2630       }
2631       if (j== nz-1) {
2632         tmp0  = tmps[vi[j]];
2633         sum1 -= v1[j] * tmp0;
2634         sum2 -= v2[j+1] * tmp0;
2635         sum3 -= v3[j+2] * tmp0;
2636         sum4 -= v4[j+3] * tmp0;
2637       }
2638 
2639       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2640       sum2   -= v2[0] * tmp0;
2641       sum3   -= v3[1] * tmp0;
2642       sum4   -= v4[2] * tmp0;
2643       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2644       sum3   -= v3[0] * tmp0;
2645       sum4   -= v4[1] * tmp0;
2646       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2647       sum4   -= v4[0] * tmp0;
2648       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2649       break;
2650     case 5 :
2651       sum1 = tmp[row];
2652       sum2 = tmp[row -1];
2653       sum3 = tmp[row -2];
2654       sum4 = tmp[row -3];
2655       sum5 = tmp[row -4];
2656       v2   = aa + ad[row]+1;
2657       v3   = aa + ad[row -1]+1;
2658       v4   = aa + ad[row -2]+1;
2659       v5   = aa + ad[row -3]+1;
2660       for (j=0 ; j<nz-1; j+=2) {
2661         i0   = vi[j];
2662         i1   = vi[j+1];
2663         tmp0 = tmps[i0];
2664         tmp1 = tmps[i1];
2665         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2666         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2667         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2668         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2669         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2670       }
2671       if (j==nz-1) {
2672         tmp0  = tmps[vi[j]];
2673         sum1 -= v1[j] * tmp0;
2674         sum2 -= v2[j+1] * tmp0;
2675         sum3 -= v3[j+2] * tmp0;
2676         sum4 -= v4[j+3] * tmp0;
2677         sum5 -= v5[j+4] * tmp0;
2678       }
2679 
2680       tmp0    = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2681       sum2   -= v2[0] * tmp0;
2682       sum3   -= v3[1] * tmp0;
2683       sum4   -= v4[2] * tmp0;
2684       sum5   -= v5[3] * tmp0;
2685       tmp0    = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2686       sum3   -= v3[0] * tmp0;
2687       sum4   -= v4[1] * tmp0;
2688       sum5   -= v5[2] * tmp0;
2689       tmp0    = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2690       sum4   -= v4[0] * tmp0;
2691       sum5   -= v5[1] * tmp0;
2692       tmp0    = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2693       sum5   -= v5[0] * tmp0;
2694       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2695       break;
2696     default:
2697       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2698     }
2699   }
2700   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2701   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2702   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2703   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2704   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2705   PetscFunctionReturn(0);
2706 }
2707 
2708 
2709 /*
2710      Makes a longer coloring[] array and calls the usual code with that
2711 */
2712 #undef __FUNCT__
2713 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2714 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2715 {
2716   Mat_SeqAIJ       *a = (Mat_SeqAIJ*)mat->data;
2717   PetscErrorCode  ierr;
2718   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2719   PetscInt        *colorused,i;
2720   ISColoringValue *newcolor;
2721 
2722   PetscFunctionBegin;
2723   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr);
2724   /* loop over inodes, marking a color for each column*/
2725   row = 0;
2726   for (i=0; i<m; i++) {
2727     for (j=0; j<ns[i]; j++) {
2728       newcolor[row++] = coloring[i] + j*ncolors;
2729     }
2730   }
2731 
2732   /* eliminate unneeded colors */
2733   ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr);
2734   ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr);
2735   for (i=0; i<n; i++) {
2736     colorused[newcolor[i]] = 1;
2737   }
2738 
2739   for (i=1; i<5*ncolors; i++) {
2740     colorused[i] += colorused[i-1];
2741   }
2742   ncolors = colorused[5*ncolors-1];
2743   for (i=0; i<n; i++) {
2744     newcolor[i] = colorused[newcolor[i]]-1;
2745   }
2746   ierr = PetscFree(colorused);CHKERRQ(ierr);
2747   ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr);
2748   ierr = PetscFree(coloring);CHKERRQ(ierr);
2749   PetscFunctionReturn(0);
2750 }
2751 
2752 #include <../src/mat/blockinvert.h>
2753 
2754 #undef __FUNCT__
2755 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2756 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2757 {
2758   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2759   PetscScalar        sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3;
2760   MatScalar          *ibdiag,*bdiag,work[25],*t;
2761   PetscScalar        *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2762   const MatScalar    *v = a->a,*v1,*v2,*v3,*v4,*v5;
2763   const PetscScalar  *xb, *b;
2764   PetscReal          zeropivot = 1.0e-15, shift = 0.0;
2765   PetscErrorCode     ierr;
2766   PetscInt           n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2;
2767   PetscInt           *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5];
2768 
2769   PetscFunctionBegin;
2770   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2771   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2772 
2773   if (!a->inode.ibdiagvalid) {
2774     if (!a->inode.ibdiag) {
2775       /* calculate space needed for diagonal blocks */
2776       for (i=0; i<m; i++) {
2777         cnt += sizes[i]*sizes[i];
2778       }
2779       a->inode.bdiagsize = cnt;
2780       ierr   = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr);
2781     }
2782 
2783     /* copy over the diagonal blocks and invert them */
2784     ibdiag = a->inode.ibdiag;
2785     bdiag  = a->inode.bdiag;
2786     cnt = 0;
2787     for (i=0, row = 0; i<m; i++) {
2788       for (j=0; j<sizes[i]; j++) {
2789         for (k=0; k<sizes[i]; k++) {
2790           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2791         }
2792       }
2793       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2794 
2795       switch (sizes[i]) {
2796         case 1:
2797           /* Create matrix data structure */
2798           if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2799           ibdiag[cnt] = 1.0/ibdiag[cnt];
2800           break;
2801         case 2:
2802           ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2803           break;
2804         case 3:
2805           ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2806           break;
2807         case 4:
2808           ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2809           break;
2810         case 5:
2811           ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2812           break;
2813        default:
2814          SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2815       }
2816       cnt += sizes[i]*sizes[i];
2817       row += sizes[i];
2818     }
2819     a->inode.ibdiagvalid = PETSC_TRUE;
2820   }
2821   ibdiag = a->inode.ibdiag;
2822   bdiag  = a->inode.bdiag;
2823   t = a->inode.ssor_work;
2824 
2825   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2826   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2827   CHKMEMQ;
2828   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2829   if (flag & SOR_ZERO_INITIAL_GUESS) {
2830     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2831 
2832       for (i=0, row=0; i<m; i++) {
2833         sz  = diag[row] - ii[row];
2834         v1  = a->a + ii[row];
2835         idx = a->j + ii[row];
2836 
2837         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2838         switch (sizes[i]) {
2839           case 1:
2840 
2841             sum1  = b[row];
2842             for (n = 0; n<sz-1; n+=2) {
2843               i1   = idx[0];
2844               i2   = idx[1];
2845               idx += 2;
2846               tmp0 = x[i1];
2847               tmp1 = x[i2];
2848               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2849             }
2850 
2851             if (n == sz-1) {
2852               tmp0  = x[*idx];
2853               sum1 -= *v1 * tmp0;
2854             }
2855             t[row] = sum1;
2856             x[row++] = sum1*(*ibdiag++);
2857             break;
2858           case 2:
2859             v2    = a->a + ii[row+1];
2860             sum1  = b[row];
2861             sum2  = b[row+1];
2862             for (n = 0; n<sz-1; n+=2) {
2863               i1   = idx[0];
2864               i2   = idx[1];
2865               idx += 2;
2866               tmp0 = x[i1];
2867               tmp1 = x[i2];
2868               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2869               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2870             }
2871 
2872             if (n == sz-1) {
2873               tmp0  = x[*idx];
2874               sum1 -= v1[0] * tmp0;
2875               sum2 -= v2[0] * tmp0;
2876             }
2877             t[row]   = sum1;
2878             t[row+1] = sum2;
2879             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2880             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2881             ibdiag  += 4;
2882             break;
2883           case 3:
2884             v2    = a->a + ii[row+1];
2885             v3    = a->a + ii[row+2];
2886             sum1  = b[row];
2887             sum2  = b[row+1];
2888             sum3  = b[row+2];
2889             for (n = 0; n<sz-1; n+=2) {
2890               i1   = idx[0];
2891               i2   = idx[1];
2892               idx += 2;
2893               tmp0 = x[i1];
2894               tmp1 = x[i2];
2895               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2896               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2897               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2898             }
2899 
2900             if (n == sz-1) {
2901               tmp0  = x[*idx];
2902               sum1 -= v1[0] * tmp0;
2903               sum2 -= v2[0] * tmp0;
2904               sum3 -= v3[0] * tmp0;
2905             }
2906             t[row]   = sum1;
2907             t[row+1] = sum2;
2908             t[row+2] = sum3;
2909             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2910             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2911             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2912             ibdiag  += 9;
2913             break;
2914           case 4:
2915             v2    = a->a + ii[row+1];
2916             v3    = a->a + ii[row+2];
2917             v4    = a->a + ii[row+3];
2918             sum1  = b[row];
2919             sum2  = b[row+1];
2920             sum3  = b[row+2];
2921             sum4  = b[row+3];
2922             for (n = 0; n<sz-1; n+=2) {
2923               i1   = idx[0];
2924               i2   = idx[1];
2925               idx += 2;
2926               tmp0 = x[i1];
2927               tmp1 = x[i2];
2928               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2929               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2930               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2931               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2932             }
2933 
2934             if (n == sz-1) {
2935               tmp0  = x[*idx];
2936               sum1 -= v1[0] * tmp0;
2937               sum2 -= v2[0] * tmp0;
2938               sum3 -= v3[0] * tmp0;
2939               sum4 -= v4[0] * tmp0;
2940             }
2941             t[row]   = sum1;
2942             t[row+1] = sum2;
2943             t[row+2] = sum3;
2944             t[row+3] = sum4;
2945             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2946             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2947             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2948             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2949             ibdiag  += 16;
2950             break;
2951           case 5:
2952             v2    = a->a + ii[row+1];
2953             v3    = a->a + ii[row+2];
2954             v4    = a->a + ii[row+3];
2955             v5    = a->a + ii[row+4];
2956             sum1  = b[row];
2957             sum2  = b[row+1];
2958             sum3  = b[row+2];
2959             sum4  = b[row+3];
2960             sum5  = b[row+4];
2961             for (n = 0; n<sz-1; n+=2) {
2962               i1   = idx[0];
2963               i2   = idx[1];
2964               idx += 2;
2965               tmp0 = x[i1];
2966               tmp1 = x[i2];
2967               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2968               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2969               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2970               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2971               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2972             }
2973 
2974             if (n == sz-1) {
2975               tmp0  = x[*idx];
2976               sum1 -= v1[0] * tmp0;
2977               sum2 -= v2[0] * tmp0;
2978               sum3 -= v3[0] * tmp0;
2979               sum4 -= v4[0] * tmp0;
2980               sum5 -= v5[0] * tmp0;
2981             }
2982             t[row]   = sum1;
2983             t[row+1] = sum2;
2984             t[row+2] = sum3;
2985             t[row+3] = sum4;
2986             t[row+4] = sum5;
2987             x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2988             x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2989             x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2990             x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2991             x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2992             ibdiag  += 25;
2993             break;
2994           default:
2995             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2996         }
2997       }
2998 
2999       xb = t;
3000       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3001     } else xb = b;
3002     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3003 
3004       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3005       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3006         ibdiag -= sizes[i]*sizes[i];
3007         sz      = ii[row+1] - diag[row] - 1;
3008         v1      = a->a + diag[row] + 1;
3009         idx     = a->j + diag[row] + 1;
3010 
3011         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3012         switch (sizes[i]) {
3013           case 1:
3014 
3015             sum1  = xb[row];
3016             for (n = 0; n<sz-1; n+=2) {
3017               i1   = idx[0];
3018               i2   = idx[1];
3019               idx += 2;
3020               tmp0 = x[i1];
3021               tmp1 = x[i2];
3022               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3023             }
3024 
3025             if (n == sz-1) {
3026               tmp0  = x[*idx];
3027               sum1 -= *v1*tmp0;
3028             }
3029             x[row--] = sum1*(*ibdiag);
3030             break;
3031 
3032           case 2:
3033 
3034             sum1  = xb[row];
3035             sum2  = xb[row-1];
3036             /* note that sum1 is associated with the second of the two rows */
3037             v2    = a->a + diag[row-1] + 2;
3038             for (n = 0; n<sz-1; n+=2) {
3039               i1   = idx[0];
3040               i2   = idx[1];
3041               idx += 2;
3042               tmp0 = x[i1];
3043               tmp1 = x[i2];
3044               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3045               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3046             }
3047 
3048             if (n == sz-1) {
3049               tmp0  = x[*idx];
3050               sum1 -= *v1*tmp0;
3051               sum2 -= *v2*tmp0;
3052             }
3053             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3054             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3055             break;
3056           case 3:
3057 
3058             sum1  = xb[row];
3059             sum2  = xb[row-1];
3060             sum3  = xb[row-2];
3061             v2    = a->a + diag[row-1] + 2;
3062             v3    = a->a + diag[row-2] + 3;
3063             for (n = 0; n<sz-1; n+=2) {
3064               i1   = idx[0];
3065               i2   = idx[1];
3066               idx += 2;
3067               tmp0 = x[i1];
3068               tmp1 = x[i2];
3069               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3070               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3071               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3072             }
3073 
3074             if (n == sz-1) {
3075               tmp0  = x[*idx];
3076               sum1 -= *v1*tmp0;
3077               sum2 -= *v2*tmp0;
3078               sum3 -= *v3*tmp0;
3079             }
3080             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3081             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3082             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3083             break;
3084           case 4:
3085 
3086             sum1  = xb[row];
3087             sum2  = xb[row-1];
3088             sum3  = xb[row-2];
3089             sum4  = xb[row-3];
3090             v2    = a->a + diag[row-1] + 2;
3091             v3    = a->a + diag[row-2] + 3;
3092             v4    = a->a + diag[row-3] + 4;
3093             for (n = 0; n<sz-1; n+=2) {
3094               i1   = idx[0];
3095               i2   = idx[1];
3096               idx += 2;
3097               tmp0 = x[i1];
3098               tmp1 = x[i2];
3099               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3100               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3101               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3102               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3103             }
3104 
3105             if (n == sz-1) {
3106               tmp0  = x[*idx];
3107               sum1 -= *v1*tmp0;
3108               sum2 -= *v2*tmp0;
3109               sum3 -= *v3*tmp0;
3110               sum4 -= *v4*tmp0;
3111             }
3112             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3113             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3114             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3115             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3116             break;
3117           case 5:
3118 
3119             sum1  = xb[row];
3120             sum2  = xb[row-1];
3121             sum3  = xb[row-2];
3122             sum4  = xb[row-3];
3123             sum5  = xb[row-4];
3124             v2    = a->a + diag[row-1] + 2;
3125             v3    = a->a + diag[row-2] + 3;
3126             v4    = a->a + diag[row-3] + 4;
3127             v5    = a->a + diag[row-4] + 5;
3128             for (n = 0; n<sz-1; n+=2) {
3129               i1   = idx[0];
3130               i2   = idx[1];
3131               idx += 2;
3132               tmp0 = x[i1];
3133               tmp1 = x[i2];
3134               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3135               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3136               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3137               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3138               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3139             }
3140 
3141             if (n == sz-1) {
3142               tmp0  = x[*idx];
3143               sum1 -= *v1*tmp0;
3144               sum2 -= *v2*tmp0;
3145               sum3 -= *v3*tmp0;
3146               sum4 -= *v4*tmp0;
3147               sum5 -= *v5*tmp0;
3148             }
3149             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3150             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3151             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3152             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3153             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3154             break;
3155           default:
3156             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3157         }
3158       }
3159 
3160       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3161     }
3162     its--;
3163   }
3164   while (its--) {
3165 
3166     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3167       ibdiag = a->inode.ibdiag;
3168 
3169       for (i=0, row=0; i<m; i++) {
3170         sz  = ii[row + 1] - ii[row];
3171         v1  = a->a + ii[row];
3172         idx = a->j + ii[row];
3173 
3174         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3175         switch (sizes[i]) {
3176           case 1:
3177 
3178             sum1  = b[row];
3179             for (n = 0; n<sz-1; n+=2) {
3180               i1   = idx[0];
3181               i2   = idx[1];
3182               idx += 2;
3183               tmp0 = x[i1];
3184               tmp1 = x[i2];
3185               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3186             }
3187 
3188             if (n == sz-1) {
3189               tmp0  = x[*idx];
3190               sum1 -= *v1 * tmp0;
3191             }
3192 
3193             /* in MatSOR_SeqAIJ this line would be
3194              *
3195              * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3196              *
3197              * but omega == 1, so this becomes
3198              *
3199              * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++);
3200              *
3201              * but bdiag and ibdiag cancel each other, so we can change this
3202              * to adding sum1*(*ibdiag++).  We can skip bdiag for the larger
3203              * block sizes as well
3204              */
3205             x[row++] += sum1*(*ibdiag++);
3206             break;
3207           case 2:
3208             v2    = a->a + ii[row+1];
3209             sum1  = b[row];
3210             sum2  = b[row+1];
3211             for (n = 0; n<sz-1; n+=2) {
3212               i1   = idx[0];
3213               i2   = idx[1];
3214               idx += 2;
3215               tmp0 = x[i1];
3216               tmp1 = x[i2];
3217               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3218               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3219             }
3220 
3221             if (n == sz-1) {
3222               tmp0  = x[*idx];
3223               sum1 -= v1[0] * tmp0;
3224               sum2 -= v2[0] * tmp0;
3225             }
3226             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2];
3227             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3];
3228             ibdiag  += 4;
3229             break;
3230           case 3:
3231             v2    = a->a + ii[row+1];
3232             v3    = a->a + ii[row+2];
3233             sum1  = b[row];
3234             sum2  = b[row+1];
3235             sum3  = b[row+2];
3236             for (n = 0; n<sz-1; n+=2) {
3237               i1   = idx[0];
3238               i2   = idx[1];
3239               idx += 2;
3240               tmp0 = x[i1];
3241               tmp1 = x[i2];
3242               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3243               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3244               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3245             }
3246 
3247             if (n == sz-1) {
3248               tmp0  = x[*idx];
3249               sum1 -= v1[0] * tmp0;
3250               sum2 -= v2[0] * tmp0;
3251               sum3 -= v3[0] * tmp0;
3252             }
3253             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3254             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3255             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3256             ibdiag  += 9;
3257             break;
3258           case 4:
3259             v2    = a->a + ii[row+1];
3260             v3    = a->a + ii[row+2];
3261             v4    = a->a + ii[row+3];
3262             sum1  = b[row];
3263             sum2  = b[row+1];
3264             sum3  = b[row+2];
3265             sum4  = b[row+3];
3266             for (n = 0; n<sz-1; n+=2) {
3267               i1   = idx[0];
3268               i2   = idx[1];
3269               idx += 2;
3270               tmp0 = x[i1];
3271               tmp1 = x[i2];
3272               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3273               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3274               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3275               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3276             }
3277 
3278             if (n == sz-1) {
3279               tmp0  = x[*idx];
3280               sum1 -= v1[0] * tmp0;
3281               sum2 -= v2[0] * tmp0;
3282               sum3 -= v3[0] * tmp0;
3283               sum4 -= v4[0] * tmp0;
3284             }
3285             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3286             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3287             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3288             x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3289             ibdiag  += 16;
3290             break;
3291           case 5:
3292             v2    = a->a + ii[row+1];
3293             v3    = a->a + ii[row+2];
3294             v4    = a->a + ii[row+3];
3295             v5    = a->a + ii[row+4];
3296             sum1  = b[row];
3297             sum2  = b[row+1];
3298             sum3  = b[row+2];
3299             sum4  = b[row+3];
3300             sum5  = b[row+4];
3301             for (n = 0; n<sz-1; n+=2) {
3302               i1   = idx[0];
3303               i2   = idx[1];
3304               idx += 2;
3305               tmp0 = x[i1];
3306               tmp1 = x[i2];
3307               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3308               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3309               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3310               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3311               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3312             }
3313 
3314             if (n == sz-1) {
3315               tmp0  = x[*idx];
3316               sum1 -= v1[0] * tmp0;
3317               sum2 -= v2[0] * tmp0;
3318               sum3 -= v3[0] * tmp0;
3319               sum4 -= v4[0] * tmp0;
3320               sum5 -= v5[0] * tmp0;
3321             }
3322             x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3323             x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3324             x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3325             x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3326             x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3327             ibdiag  += 25;
3328             break;
3329           default:
3330             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3331         }
3332       }
3333 
3334       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3335     }
3336     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3337 
3338       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3339       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3340         ibdiag -= sizes[i]*sizes[i];
3341         sz      = ii[row+1] - ii[row];
3342         v1      = a->a + ii[row];
3343         idx     = a->j + ii[row];
3344 
3345         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3346         switch (sizes[i]) {
3347           case 1:
3348 
3349             sum1  = b[row];
3350             for (n = 0; n<sz-1; n+=2) {
3351               i1   = idx[0];
3352               i2   = idx[1];
3353               idx += 2;
3354               tmp0 = x[i1];
3355               tmp1 = x[i2];
3356               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3357             }
3358 
3359             if (n == sz-1) {
3360               tmp0  = x[*idx];
3361               sum1 -= *v1*tmp0;
3362             }
3363             x[row--] += sum1*(*ibdiag);
3364             break;
3365 
3366           case 2:
3367 
3368             sum1  = b[row];
3369             sum2  = b[row-1];
3370             /* note that sum1 is associated with the second of the two rows */
3371             v2    = a->a + ii[row - 1];
3372             for (n = 0; n<sz-1; n+=2) {
3373               i1   = idx[0];
3374               i2   = idx[1];
3375               idx += 2;
3376               tmp0 = x[i1];
3377               tmp1 = x[i2];
3378               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3379               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3380             }
3381 
3382             if (n == sz-1) {
3383               tmp0  = x[*idx];
3384               sum1 -= *v1*tmp0;
3385               sum2 -= *v2*tmp0;
3386             }
3387             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3388             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3389             break;
3390           case 3:
3391 
3392             sum1  = b[row];
3393             sum2  = b[row-1];
3394             sum3  = b[row-2];
3395             v2    = a->a + ii[row-1];
3396             v3    = a->a + ii[row-2];
3397             for (n = 0; n<sz-1; n+=2) {
3398               i1   = idx[0];
3399               i2   = idx[1];
3400               idx += 2;
3401               tmp0 = x[i1];
3402               tmp1 = x[i2];
3403               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3404               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3405               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3406             }
3407 
3408             if (n == sz-1) {
3409               tmp0  = x[*idx];
3410               sum1 -= *v1*tmp0;
3411               sum2 -= *v2*tmp0;
3412               sum3 -= *v3*tmp0;
3413             }
3414             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3415             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3416             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3417             break;
3418           case 4:
3419 
3420             sum1  = b[row];
3421             sum2  = b[row-1];
3422             sum3  = b[row-2];
3423             sum4  = b[row-3];
3424             v2    = a->a + ii[row-1];
3425             v3    = a->a + ii[row-2];
3426             v4    = a->a + ii[row-3];
3427             for (n = 0; n<sz-1; n+=2) {
3428               i1   = idx[0];
3429               i2   = idx[1];
3430               idx += 2;
3431               tmp0 = x[i1];
3432               tmp1 = x[i2];
3433               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3434               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3435               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3436               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3437             }
3438 
3439             if (n == sz-1) {
3440               tmp0  = x[*idx];
3441               sum1 -= *v1*tmp0;
3442               sum2 -= *v2*tmp0;
3443               sum3 -= *v3*tmp0;
3444               sum4 -= *v4*tmp0;
3445             }
3446             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3447             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3448             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3449             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3450             break;
3451           case 5:
3452 
3453             sum1  = b[row];
3454             sum2  = b[row-1];
3455             sum3  = b[row-2];
3456             sum4  = b[row-3];
3457             sum5  = b[row-4];
3458             v2    = a->a + ii[row-1];
3459             v3    = a->a + ii[row-2];
3460             v4    = a->a + ii[row-3];
3461             v5    = a->a + ii[row-4];
3462             for (n = 0; n<sz-1; n+=2) {
3463               i1   = idx[0];
3464               i2   = idx[1];
3465               idx += 2;
3466               tmp0 = x[i1];
3467               tmp1 = x[i2];
3468               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3469               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3470               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3471               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3472               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3473             }
3474 
3475             if (n == sz-1) {
3476               tmp0  = x[*idx];
3477               sum1 -= *v1*tmp0;
3478               sum2 -= *v2*tmp0;
3479               sum3 -= *v3*tmp0;
3480               sum4 -= *v4*tmp0;
3481               sum5 -= *v5*tmp0;
3482             }
3483             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3484             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3485             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3486             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3487             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3488             break;
3489           default:
3490             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3491         }
3492       }
3493 
3494       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3495     }
3496   }
3497   if (flag & SOR_EISENSTAT) {
3498     /*
3499           Apply  (U + D)^-1  where D is now the block diagonal
3500     */
3501     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3502     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3503       ibdiag -= sizes[i]*sizes[i];
3504       sz      = ii[row+1] - diag[row] - 1;
3505       v1      = a->a + diag[row] + 1;
3506       idx     = a->j + diag[row] + 1;
3507       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3508       switch (sizes[i]) {
3509         case 1:
3510 
3511           sum1  = b[row];
3512           for (n = 0; n<sz-1; n+=2) {
3513             i1   = idx[0];
3514             i2   = idx[1];
3515             idx += 2;
3516             tmp0 = x[i1];
3517             tmp1 = x[i2];
3518             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3519           }
3520 
3521           if (n == sz-1) {
3522             tmp0  = x[*idx];
3523             sum1 -= *v1*tmp0;
3524           }
3525           x[row] = sum1*(*ibdiag);row--;
3526           break;
3527 
3528         case 2:
3529 
3530           sum1  = b[row];
3531           sum2  = b[row-1];
3532           /* note that sum1 is associated with the second of the two rows */
3533           v2    = a->a + diag[row-1] + 2;
3534           for (n = 0; n<sz-1; n+=2) {
3535             i1   = idx[0];
3536             i2   = idx[1];
3537             idx += 2;
3538             tmp0 = x[i1];
3539             tmp1 = x[i2];
3540             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3541             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3542           }
3543 
3544           if (n == sz-1) {
3545             tmp0  = x[*idx];
3546             sum1 -= *v1*tmp0;
3547             sum2 -= *v2*tmp0;
3548           }
3549           x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3550           x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3551           row -= 2;
3552           break;
3553         case 3:
3554 
3555           sum1  = b[row];
3556           sum2  = b[row-1];
3557           sum3  = b[row-2];
3558           v2    = a->a + diag[row-1] + 2;
3559           v3    = a->a + diag[row-2] + 3;
3560           for (n = 0; n<sz-1; n+=2) {
3561             i1   = idx[0];
3562             i2   = idx[1];
3563             idx += 2;
3564             tmp0 = x[i1];
3565             tmp1 = x[i2];
3566             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3567             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3568             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3569           }
3570 
3571           if (n == sz-1) {
3572             tmp0  = x[*idx];
3573             sum1 -= *v1*tmp0;
3574             sum2 -= *v2*tmp0;
3575             sum3 -= *v3*tmp0;
3576           }
3577           x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3578           x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3579           x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3580           row -= 3;
3581           break;
3582         case 4:
3583 
3584           sum1  = b[row];
3585           sum2  = b[row-1];
3586           sum3  = b[row-2];
3587           sum4  = b[row-3];
3588           v2    = a->a + diag[row-1] + 2;
3589           v3    = a->a + diag[row-2] + 3;
3590           v4    = a->a + diag[row-3] + 4;
3591           for (n = 0; n<sz-1; n+=2) {
3592             i1   = idx[0];
3593             i2   = idx[1];
3594             idx += 2;
3595             tmp0 = x[i1];
3596             tmp1 = x[i2];
3597             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3598             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3599             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3600             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3601           }
3602 
3603           if (n == sz-1) {
3604             tmp0  = x[*idx];
3605             sum1 -= *v1*tmp0;
3606             sum2 -= *v2*tmp0;
3607             sum3 -= *v3*tmp0;
3608             sum4 -= *v4*tmp0;
3609           }
3610           x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3611           x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3612           x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3613           x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3614           row -= 4;
3615           break;
3616         case 5:
3617 
3618           sum1  = b[row];
3619           sum2  = b[row-1];
3620           sum3  = b[row-2];
3621           sum4  = b[row-3];
3622           sum5  = b[row-4];
3623           v2    = a->a + diag[row-1] + 2;
3624           v3    = a->a + diag[row-2] + 3;
3625           v4    = a->a + diag[row-3] + 4;
3626           v5    = a->a + diag[row-4] + 5;
3627           for (n = 0; n<sz-1; n+=2) {
3628             i1   = idx[0];
3629             i2   = idx[1];
3630             idx += 2;
3631             tmp0 = x[i1];
3632             tmp1 = x[i2];
3633             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3634             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3635             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3636             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3637             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3638           }
3639 
3640           if (n == sz-1) {
3641             tmp0  = x[*idx];
3642             sum1 -= *v1*tmp0;
3643             sum2 -= *v2*tmp0;
3644             sum3 -= *v3*tmp0;
3645             sum4 -= *v4*tmp0;
3646             sum5 -= *v5*tmp0;
3647           }
3648           x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3649           x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3650           x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3651           x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3652           x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3653           row -= 5;
3654           break;
3655         default:
3656           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3657       }
3658     }
3659     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3660 
3661     /*
3662            t = b - D x    where D is the block diagonal
3663     */
3664     cnt = 0;
3665     for (i=0, row=0; i<m; i++) {
3666       switch (sizes[i]) {
3667         case 1:
3668           t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3669           break;
3670         case 2:
3671           x1   = x[row]; x2 = x[row+1];
3672           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3673           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3674           t[row]   = b[row] - tmp1;
3675           t[row+1] = b[row+1] - tmp2; row += 2;
3676           cnt += 4;
3677           break;
3678         case 3:
3679           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2];
3680           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3681           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3682           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3683           t[row] = b[row] - tmp1;
3684           t[row+1] = b[row+1] - tmp2;
3685           t[row+2] = b[row+2] - tmp3; row += 3;
3686           cnt += 9;
3687           break;
3688         case 4:
3689           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3690           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3691           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3692           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3693           tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3694           t[row] = b[row] - tmp1;
3695           t[row+1] = b[row+1] - tmp2;
3696           t[row+2] = b[row+2] - tmp3;
3697           t[row+3] = b[row+3] - tmp4; row += 4;
3698           cnt += 16;
3699           break;
3700         case 5:
3701           x1   = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3702           tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3703           tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3704           tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3705           tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3706           tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3707           t[row] = b[row] - tmp1;
3708           t[row+1] = b[row+1] - tmp2;
3709           t[row+2] = b[row+2] - tmp3;
3710           t[row+3] = b[row+3] - tmp4;
3711           t[row+4] = b[row+4] - tmp5;row += 5;
3712           cnt += 25;
3713           break;
3714         default:
3715           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3716       }
3717     }
3718     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3719 
3720 
3721 
3722     /*
3723           Apply (L + D)^-1 where D is the block diagonal
3724     */
3725     for (i=0, row=0; i<m; i++) {
3726       sz  = diag[row] - ii[row];
3727       v1  = a->a + ii[row];
3728       idx = a->j + ii[row];
3729       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3730       switch (sizes[i]) {
3731         case 1:
3732 
3733           sum1  = t[row];
3734           for (n = 0; n<sz-1; n+=2) {
3735             i1   = idx[0];
3736             i2   = idx[1];
3737             idx += 2;
3738             tmp0 = t[i1];
3739             tmp1 = t[i2];
3740             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3741           }
3742 
3743           if (n == sz-1) {
3744             tmp0  = t[*idx];
3745             sum1 -= *v1 * tmp0;
3746           }
3747           x[row] += t[row] = sum1*(*ibdiag++); row++;
3748           break;
3749         case 2:
3750           v2    = a->a + ii[row+1];
3751           sum1  = t[row];
3752           sum2  = t[row+1];
3753           for (n = 0; n<sz-1; n+=2) {
3754             i1   = idx[0];
3755             i2   = idx[1];
3756             idx += 2;
3757             tmp0 = t[i1];
3758             tmp1 = t[i2];
3759             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3760             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3761           }
3762 
3763           if (n == sz-1) {
3764             tmp0  = t[*idx];
3765             sum1 -= v1[0] * tmp0;
3766             sum2 -= v2[0] * tmp0;
3767           }
3768           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3769           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3770           ibdiag  += 4; row += 2;
3771           break;
3772         case 3:
3773           v2    = a->a + ii[row+1];
3774           v3    = a->a + ii[row+2];
3775           sum1  = t[row];
3776           sum2  = t[row+1];
3777           sum3  = t[row+2];
3778           for (n = 0; n<sz-1; n+=2) {
3779             i1   = idx[0];
3780             i2   = idx[1];
3781             idx += 2;
3782             tmp0 = t[i1];
3783             tmp1 = t[i2];
3784             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3785             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3786             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3787           }
3788 
3789           if (n == sz-1) {
3790             tmp0  = t[*idx];
3791             sum1 -= v1[0] * tmp0;
3792             sum2 -= v2[0] * tmp0;
3793             sum3 -= v3[0] * tmp0;
3794           }
3795           x[row]  += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3796           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3797           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3798           ibdiag  += 9; row += 3;
3799           break;
3800         case 4:
3801           v2    = a->a + ii[row+1];
3802           v3    = a->a + ii[row+2];
3803           v4    = a->a + ii[row+3];
3804           sum1  = t[row];
3805           sum2  = t[row+1];
3806           sum3  = t[row+2];
3807           sum4  = t[row+3];
3808           for (n = 0; n<sz-1; n+=2) {
3809             i1   = idx[0];
3810             i2   = idx[1];
3811             idx += 2;
3812             tmp0 = t[i1];
3813             tmp1 = t[i2];
3814             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3815             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3816             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3817             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3818           }
3819 
3820           if (n == sz-1) {
3821             tmp0  = t[*idx];
3822             sum1 -= v1[0] * tmp0;
3823             sum2 -= v2[0] * tmp0;
3824             sum3 -= v3[0] * tmp0;
3825             sum4 -= v4[0] * tmp0;
3826           }
3827           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3828           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3829           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3830           x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3831           ibdiag  += 16; row += 4;
3832           break;
3833         case 5:
3834           v2    = a->a + ii[row+1];
3835           v3    = a->a + ii[row+2];
3836           v4    = a->a + ii[row+3];
3837           v5    = a->a + ii[row+4];
3838           sum1  = t[row];
3839           sum2  = t[row+1];
3840           sum3  = t[row+2];
3841           sum4  = t[row+3];
3842           sum5  = t[row+4];
3843           for (n = 0; n<sz-1; n+=2) {
3844             i1   = idx[0];
3845             i2   = idx[1];
3846             idx += 2;
3847             tmp0 = t[i1];
3848             tmp1 = t[i2];
3849             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3850             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3851             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3852             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3853             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3854           }
3855 
3856           if (n == sz-1) {
3857             tmp0  = t[*idx];
3858             sum1 -= v1[0] * tmp0;
3859             sum2 -= v2[0] * tmp0;
3860             sum3 -= v3[0] * tmp0;
3861             sum4 -= v4[0] * tmp0;
3862             sum5 -= v5[0] * tmp0;
3863           }
3864           x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3865           x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3866           x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3867           x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3868           x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3869           ibdiag  += 25; row += 5;
3870           break;
3871         default:
3872           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3873       }
3874     }
3875     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3876   }
3877   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3878   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3879   CHKMEMQ;  PetscFunctionReturn(0);
3880 }
3881 
3882 #undef __FUNCT__
3883 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3884 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3885 {
3886   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
3887   PetscScalar        *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3888   const MatScalar    *bdiag = a->inode.bdiag;
3889   const PetscScalar  *b;
3890   PetscErrorCode      ierr;
3891   PetscInt            m = a->inode.node_count,cnt = 0,i,row;
3892   const PetscInt      *sizes = a->inode.size;
3893 
3894   PetscFunctionBegin;
3895   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3896   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3897   cnt = 0;
3898   for (i=0, row=0; i<m; i++) {
3899     switch (sizes[i]) {
3900       case 1:
3901         x[row] = b[row]*bdiag[cnt++];row++;
3902         break;
3903       case 2:
3904         x1   = b[row]; x2 = b[row+1];
3905         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3906         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3907         x[row++] = tmp1;
3908         x[row++] = tmp2;
3909         cnt += 4;
3910         break;
3911       case 3:
3912         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2];
3913         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3914         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3915         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3916         x[row++] = tmp1;
3917         x[row++] = tmp2;
3918         x[row++] = tmp3;
3919         cnt += 9;
3920         break;
3921       case 4:
3922         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
3923         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3924         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3925         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3926         tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3927         x[row++] = tmp1;
3928         x[row++] = tmp2;
3929         x[row++] = tmp3;
3930         x[row++] = tmp4;
3931         cnt += 16;
3932         break;
3933       case 5:
3934         x1   = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
3935         tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3936         tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3937         tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3938         tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3939         tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3940         x[row++] = tmp1;
3941         x[row++] = tmp2;
3942         x[row++] = tmp3;
3943         x[row++] = tmp4;
3944         x[row++] = tmp5;
3945         cnt += 25;
3946         break;
3947       default:
3948         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3949     }
3950   }
3951   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
3952   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3953   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3954   PetscFunctionReturn(0);
3955 }
3956 
3957 /*
3958     samestructure indicates that the matrix has not changed its nonzero structure so we
3959     do not need to recompute the inodes
3960 */
3961 #undef __FUNCT__
3962 #define __FUNCT__ "Mat_CheckInode"
3963 PetscErrorCode Mat_CheckInode(Mat A,PetscBool  samestructure)
3964 {
3965   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3966   PetscErrorCode ierr;
3967   PetscInt       i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
3968   PetscBool      flag;
3969 
3970   PetscFunctionBegin;
3971   if (!a->inode.use)                     PetscFunctionReturn(0);
3972   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
3973 
3974 
3975   m = A->rmap->n;
3976   if (a->inode.size) {ns = a->inode.size;}
3977   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
3978 
3979   i          = 0;
3980   node_count = 0;
3981   idx        = a->j;
3982   ii         = a->i;
3983   while (i < m) {                /* For each row */
3984     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
3985     /* Limits the number of elements in a node to 'a->inode.limit' */
3986     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
3987       nzy     = ii[j+1] - ii[j]; /* Same number of nonzeros */
3988       if (nzy != nzx) break;
3989       idy  += nzx;             /* Same nonzero pattern */
3990       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
3991       if (!flag) break;
3992     }
3993     ns[node_count++] = blk_size;
3994     idx += blk_size*nzx;
3995     i    = j;
3996   }
3997   /* If not enough inodes found,, do not use inode version of the routines */
3998   if (!m || node_count > .8*m) {
3999     ierr = PetscFree(ns);CHKERRQ(ierr);
4000     a->inode.node_count       = 0;
4001     a->inode.size             = PETSC_NULL;
4002     a->inode.use              = PETSC_FALSE;
4003     A->ops->mult              = MatMult_SeqAIJ;
4004     A->ops->sor               = MatSOR_SeqAIJ;
4005     A->ops->multadd           = MatMultAdd_SeqAIJ;
4006     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4007     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4008     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4009     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4010     A->ops->coloringpatch     = 0;
4011     A->ops->multdiagonalblock = 0;
4012     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4013   } else {
4014     if (!A->factortype) {
4015       A->ops->mult              = MatMult_SeqAIJ_Inode;
4016       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4017       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4018       A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4019       A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4020       A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4021       A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4022       A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4023       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4024     } else {
4025       A->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
4026     }
4027     a->inode.node_count       = node_count;
4028     a->inode.size             = ns;
4029     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4030   }
4031   a->inode.checked = PETSC_TRUE;
4032   PetscFunctionReturn(0);
4033 }
4034 
4035 #undef __FUNCT__
4036 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4037 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4038 {
4039   Mat            B=*C;
4040   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4041   PetscErrorCode ierr;
4042   PetscInt       m=A->rmap->n;
4043 
4044   PetscFunctionBegin;
4045   c->inode.use          = a->inode.use;
4046   c->inode.limit        = a->inode.limit;
4047   c->inode.max_limit    = a->inode.max_limit;
4048   if (a->inode.size) {
4049     ierr                = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr);
4050     c->inode.node_count = a->inode.node_count;
4051     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4052     /* note the table of functions below should match that in Mat_CheckInode() */
4053     if (!B->factortype) {
4054       B->ops->mult              = MatMult_SeqAIJ_Inode;
4055       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4056       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4057       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4058       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4059       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4060       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4061       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4062       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4063     } else {
4064       B->ops->solve             = MatSolve_SeqAIJ_Inode_inplace;
4065     }
4066   } else {
4067     c->inode.size       = 0;
4068     c->inode.node_count = 0;
4069   }
4070   c->inode.ibdiagvalid = PETSC_FALSE;
4071   c->inode.ibdiag      = 0;
4072   c->inode.bdiag       = 0;
4073   PetscFunctionReturn(0);
4074 }
4075 
4076 #undef __FUNCT__
4077 #define __FUNCT__ "MatGetRow_FactoredLU"
4078 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row)
4079 {
4080   PetscInt k, *vi;
4081 
4082   PetscFunctionBegin;
4083   vi = aj + ai[row];
4084   for (k=0;k<nzl;k++) cols[k] = vi[k];
4085   vi = aj + adiag[row];
4086   cols[nzl] = vi[0];
4087   vi = aj + adiag[row+1]+1;
4088   for (k=0;k<nzu;k++) cols[nzl+1+k] = vi[k];
4089   PetscFunctionReturn(0);
4090 }
4091 /*
4092    Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix.
4093    Modified from Mat_CheckInode().
4094 
4095    Input Parameters:
4096 +  Mat A - ILU or LU matrix factor
4097 -  samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we
4098     do not need to recompute the inodes
4099 */
4100 #undef __FUNCT__
4101 #define __FUNCT__ "Mat_CheckInode_FactorLU"
4102 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool  samestructure)
4103 {
4104   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4105   PetscErrorCode ierr;
4106   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4107   PetscInt       *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag;
4108   PetscBool      flag;
4109 
4110   PetscFunctionBegin;
4111   if (!a->inode.use)                     PetscFunctionReturn(0);
4112   if (a->inode.checked && samestructure) PetscFunctionReturn(0);
4113 
4114   m = A->rmap->n;
4115   if (a->inode.size) {ns = a->inode.size;}
4116   else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);}
4117 
4118   i          = 0;
4119   node_count = 0;
4120   while (i < m) {                /* For each row */
4121     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4122     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4123     nzx  = nzl1 + nzu1 + 1;
4124     ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr);
4125     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4126 
4127     /* Limits the number of elements in a node to 'a->inode.limit' */
4128     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4129       nzl2    = ai[j+1] - ai[j];
4130       nzu2    = adiag[j] - adiag[j+1] - 1;
4131       nzy     = nzl2 + nzu2 + 1;
4132       if (nzy != nzx) break;
4133       ierr    = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr);
4134       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4135       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4136       if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;}
4137       ierr = PetscFree(cols2);CHKERRQ(ierr);
4138     }
4139     ns[node_count++] = blk_size;
4140     ierr = PetscFree(cols1);CHKERRQ(ierr);
4141     i    = j;
4142   }
4143   /* If not enough inodes found,, do not use inode version of the routines */
4144   if (!m || node_count > .8*m) {
4145     ierr = PetscFree(ns);CHKERRQ(ierr);
4146     a->inode.node_count     = 0;
4147     a->inode.size           = PETSC_NULL;
4148     a->inode.use            = PETSC_FALSE;
4149     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4150   } else {
4151     A->ops->mult              = 0;
4152     A->ops->sor               = 0;
4153     A->ops->multadd           = 0;
4154     A->ops->getrowij          = 0;
4155     A->ops->restorerowij      = 0;
4156     A->ops->getcolumnij       = 0;
4157     A->ops->restorecolumnij   = 0;
4158     A->ops->coloringpatch     = 0;
4159     A->ops->multdiagonalblock = 0;
4160     A->ops->solve             = MatSolve_SeqAIJ_Inode;
4161     a->inode.node_count       = node_count;
4162     a->inode.size             = ns;
4163     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4164   }
4165   a->inode.checked = PETSC_TRUE;
4166   PetscFunctionReturn(0);
4167 }
4168 
4169 #undef __FUNCT__
4170 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4171 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4172 {
4173   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
4174 
4175   PetscFunctionBegin;
4176   a->inode.ibdiagvalid = PETSC_FALSE;
4177   PetscFunctionReturn(0);
4178 }
4179 
4180 /*
4181      This is really ugly. if inodes are used this replaces the
4182   permutations with ones that correspond to rows/cols of the matrix
4183   rather then inode blocks
4184 */
4185 #undef __FUNCT__
4186 #define __FUNCT__ "MatInodeAdjustForInodes"
4187 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4188 {
4189   PetscErrorCode ierr;
4190 
4191   PetscFunctionBegin;
4192   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4193   PetscFunctionReturn(0);
4194 }
4195 
4196 EXTERN_C_BEGIN
4197 #undef __FUNCT__
4198 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4199 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4200 {
4201   Mat_SeqAIJ      *a=(Mat_SeqAIJ*)A->data;
4202   PetscErrorCode ierr;
4203   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4204   const PetscInt *ridx,*cidx;
4205   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4206   PetscInt       nslim_col,*ns_col;
4207   IS             ris = *rperm,cis = *cperm;
4208 
4209   PetscFunctionBegin;
4210   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4211   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4212 
4213   ierr  = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4214   ierr  = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr);
4215   ierr  = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr);
4216 
4217   ierr  = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4218   ierr  = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4219 
4220   /* Form the inode structure for the rows of permuted matric using inv perm*/
4221   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4222 
4223   /* Construct the permutations for rows*/
4224   for (i=0,row = 0; i<nslim_row; ++i) {
4225     indx      = ridx[i];
4226     start_val = tns[indx];
4227     end_val   = tns[indx + 1];
4228     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4229   }
4230 
4231   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4232   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4233 
4234  /* Construct permutations for columns */
4235   for (i=0,col=0; i<nslim_col; ++i) {
4236     indx      = cidx[i];
4237     start_val = tns[indx];
4238     end_val   = tns[indx + 1];
4239     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4240   }
4241 
4242   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4243   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4244   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4245   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4246 
4247   ierr  = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4248   ierr  = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4249 
4250   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4251   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4252   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4253   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4254   ierr = PetscFree(tns);CHKERRQ(ierr);
4255   PetscFunctionReturn(0);
4256 }
4257 EXTERN_C_END
4258 
4259 #undef __FUNCT__
4260 #define __FUNCT__ "MatInodeGetInodeSizes"
4261 /*@C
4262    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4263 
4264    Not Collective
4265 
4266    Input Parameter:
4267 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4268 
4269    Output Parameter:
4270 +  node_count - no of inodes present in the matrix.
4271 .  sizes      - an array of size node_count,with sizes of each inode.
4272 -  limit      - the max size used to generate the inodes.
4273 
4274    Level: advanced
4275 
4276    Notes: This routine returns some internal storage information
4277    of the matrix, it is intended to be used by advanced users.
4278    It should be called after the matrix is assembled.
4279    The contents of the sizes[] array should not be changed.
4280    PETSC_NULL may be passed for information not requested.
4281 
4282 .keywords: matrix, seqaij, get, inode
4283 
4284 .seealso: MatGetInfo()
4285 @*/
4286 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4287 {
4288   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4289 
4290   PetscFunctionBegin;
4291   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4292   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr);
4293   if (f) {
4294     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4295   }
4296   PetscFunctionReturn(0);
4297 }
4298 
4299 EXTERN_C_BEGIN
4300 #undef __FUNCT__
4301 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4302 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4303 {
4304   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4305 
4306   PetscFunctionBegin;
4307   if (node_count) *node_count = a->inode.node_count;
4308   if (sizes)      *sizes      = a->inode.size;
4309   if (limit)      *limit      = a->inode.limit;
4310   PetscFunctionReturn(0);
4311 }
4312 EXTERN_C_END
4313