xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 6baedc03412ed09645b40a041aa76a8d30c8acee)
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1) {
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     col  = *j++ + ishift;
101     i2   = tvc[col];
102     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103       ia[i1+1]++;
104       ia[i2+1]++;
105       i2++;                     /* Start col of next node */
106       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107       i2 = tvc[col];
108     }
109     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110   }
111 
112   /* shift ia[i] to point to next row */
113   for (i1=1; i1<nslim_row+1; i1++) {
114     row        = ia[i1-1];
115     ia[i1]    += row;
116     work[i1-1] = row - oshift;
117   }
118 
119   /* allocate space for column pointers */
120   nz   = ia[nslim_row] + (!ishift);
121   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
122   *jja = ja;
123 
124   /* loop over lower triangular part putting into ja */
125   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126     j    = aj + ai[row] + ishift;
127     jmax = aj + ai[row+1] + ishift;
128     col  = *j++ + ishift;
129     i2   = tvc[col];
130     while (i2<i1 && j<jmax) {
131       ja[work[i2]++] = i1 + oshift;
132       ja[work[i1]++] = i2 + oshift;
133       ++i2;
134       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135       i2 = tvc[col];
136     }
137     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138 
139   }
140   ierr = PetscFree(work);CHKERRQ(ierr);
141   ierr = PetscFree(tns);CHKERRQ(ierr);
142   ierr = PetscFree(tvc);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*
147       This builds nonsymmetric version of nonzero structure,
148 */
149 #undef __FUNCT__
150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152 {
153   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
154   PetscErrorCode ierr;
155   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156   PetscInt       *tns,*tvc,nsz,i1,i2;
157   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
158 
159   PetscFunctionBegin;
160   nslim_row = a->inode.node_count;
161   n         = A->cmap->n;
162 
163   /* Create The column_inode for this matrix */
164   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
165 
166   /* allocate space for reformated column_inode structure */
167   ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr);
168   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
169   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
170 
171   for (i1=0,col=0; i1<nslim_col; ++i1) {
172     nsz = ns_col[i1];
173     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174   }
175   /* allocate space for row pointers */
176   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
177   *iia = ia;
178   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
179   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
180 
181   /* determine the number of columns in each row */
182   ia[0] = oshift;
183   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184     j   = aj + ai[row] + ishift;
185     col = *j++ + ishift;
186     i2  = tvc[col];
187     nz  = ai[row+1] - ai[row];
188     while (nz-- > 0) {           /* off-diagonal elemets */
189       ia[i1+1]++;
190       i2++;                     /* Start col of next node */
191       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192       if (nz > 0) i2 = tvc[col];
193     }
194   }
195 
196   /* shift ia[i] to point to next row */
197   for (i1=1; i1<nslim_row+1; i1++) {
198     row        = ia[i1-1];
199     ia[i1]    += row;
200     work[i1-1] = row - oshift;
201   }
202 
203   /* allocate space for column pointers */
204   nz   = ia[nslim_row] + (!ishift);
205   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
206   *jja = ja;
207 
208   /* loop over matrix putting into ja */
209   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210     j   = aj + ai[row] + ishift;
211     col = *j++ + ishift;
212     i2  = tvc[col];
213     nz  = ai[row+1] - ai[row];
214     while (nz-- > 0) {
215       ja[work[i1]++] = i2 + oshift;
216       ++i2;
217       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218       if (nz > 0) i2 = tvc[col];
219     }
220   }
221   ierr = PetscFree(ns_col);CHKERRQ(ierr);
222   ierr = PetscFree(work);CHKERRQ(ierr);
223   ierr = PetscFree(tns);CHKERRQ(ierr);
224   ierr = PetscFree(tvc);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231 {
232   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   *n = a->inode.node_count;
237   if (!ia) PetscFunctionReturn(0);
238   if (!blockcompressed) {
239     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
240   } else if (symmetric) {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   } else {
243     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   if (!ia) PetscFunctionReturn(0);
256 
257   if (!blockcompressed) {
258     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
259   } else {
260     ierr = PetscFree(*ia);CHKERRQ(ierr);
261     ierr = PetscFree(*ja);CHKERRQ(ierr);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /* ----------------------------------------------------------- */
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271 {
272   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
273   PetscErrorCode ierr;
274   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276 
277   PetscFunctionBegin;
278   nslim_row = a->inode.node_count;
279   n         = A->cmap->n;
280 
281   /* Create The column_inode for this matrix */
282   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
283 
284   /* allocate space for reformated column_inode structure */
285   ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr);
286   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
287   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
288 
289   for (i1=0,col=0; i1<nslim_col; ++i1) {
290     nsz = ns_col[i1];
291     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292   }
293   /* allocate space for column pointers */
294   ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
295   *iia = ia;
296   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
297   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
298 
299   /* determine the number of columns in each row */
300   ia[0] = oshift;
301   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302     j   = aj + ai[row] + ishift;
303     col = *j++ + ishift;
304     i2  = tvc[col];
305     nz  = ai[row+1] - ai[row];
306     while (nz-- > 0) {           /* off-diagonal elemets */
307       /* ia[i1+1]++; */
308       ia[i2+1]++;
309       i2++;
310       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311       if (nz > 0) i2 = tvc[col];
312     }
313   }
314 
315   /* shift ia[i] to point to next col */
316   for (i1=1; i1<nslim_col+1; i1++) {
317     col        = ia[i1-1];
318     ia[i1]    += col;
319     work[i1-1] = col - oshift;
320   }
321 
322   /* allocate space for column pointers */
323   nz   = ia[nslim_col] + (!ishift);
324   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
325   *jja = ja;
326 
327   /* loop over matrix putting into ja */
328   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329     j   = aj + ai[row] + ishift;
330     col = *j++ + ishift;
331     i2  = tvc[col];
332     nz  = ai[row+1] - ai[row];
333     while (nz-- > 0) {
334       /* ja[work[i1]++] = i2 + oshift; */
335       ja[work[i2]++] = i1 + oshift;
336       i2++;
337       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338       if (nz > 0) i2 = tvc[col];
339     }
340   }
341   ierr = PetscFree(ns_col);CHKERRQ(ierr);
342   ierr = PetscFree(work);CHKERRQ(ierr);
343   ierr = PetscFree(tns);CHKERRQ(ierr);
344   ierr = PetscFree(tvc);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
356   if (!ia) PetscFunctionReturn(0);
357 
358   if (!blockcompressed) {
359     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
360   } else if (symmetric) {
361     /* Since the indices are symmetric it does'nt matter */
362     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   } else {
364     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
365   }
366   PetscFunctionReturn(0);
367 }
368 
369 #undef __FUNCT__
370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   if (!ia) PetscFunctionReturn(0);
377   if (!blockcompressed) {
378     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
379   } else {
380     ierr = PetscFree(*ia);CHKERRQ(ierr);
381     ierr = PetscFree(*ja);CHKERRQ(ierr);
382   }
383   PetscFunctionReturn(0);
384 }
385 
386 /* ----------------------------------------------------------- */
387 
388 #undef __FUNCT__
389 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391 {
392   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394   PetscScalar       *y;
395   const PetscScalar *x;
396   const MatScalar   *v1,*v2,*v3,*v4,*v5;
397   PetscErrorCode    ierr;
398   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399   const PetscInt    *idx,*ns,*ii;
400 
401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403 #endif
404 
405   PetscFunctionBegin;
406   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407   node_max = a->inode.node_count;
408   ns       = a->inode.size;     /* Node Size array */
409   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
411   idx      = a->j;
412   v1       = a->a;
413   ii       = a->i;
414 
415   for (i = 0,row = 0; i< node_max; ++i) {
416     nsz         = ns[i];
417     n           = ii[1] - ii[0];
418     nonzerorow += (n>0)*nsz;
419     ii         += nsz;
420     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422     sz = n;                     /* No of non zeros in this row */
423                                 /* Switch on the size of Node */
424     switch (nsz) {               /* Each loop in 'case' is unrolled */
425     case 1:
426       sum1 = 0.;
427 
428       for (n = 0; n< sz-1; n+=2) {
429         i1    = idx[0];         /* The instructions are ordered to */
430         i2    = idx[1];         /* make the compiler's job easy */
431         idx  += 2;
432         tmp0  = x[i1];
433         tmp1  = x[i2];
434         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435       }
436 
437       if (n == sz-1) {          /* Take care of the last nonzero  */
438         tmp0  = x[*idx++];
439         sum1 += *v1++ *tmp0;
440       }
441       y[row++]=sum1;
442       break;
443     case 2:
444       sum1 = 0.;
445       sum2 = 0.;
446       v2   = v1 + n;
447 
448       for (n = 0; n< sz-1; n+=2) {
449         i1    = idx[0];
450         i2    = idx[1];
451         idx  += 2;
452         tmp0  = x[i1];
453         tmp1  = x[i2];
454         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456       }
457       if (n == sz-1) {
458         tmp0  = x[*idx++];
459         sum1 += *v1++ * tmp0;
460         sum2 += *v2++ * tmp0;
461       }
462       y[row++]=sum1;
463       y[row++]=sum2;
464       v1      =v2;              /* Since the next block to be processed starts there*/
465       idx    +=sz;
466       break;
467     case 3:
468       sum1 = 0.;
469       sum2 = 0.;
470       sum3 = 0.;
471       v2   = v1 + n;
472       v3   = v2 + n;
473 
474       for (n = 0; n< sz-1; n+=2) {
475         i1    = idx[0];
476         i2    = idx[1];
477         idx  += 2;
478         tmp0  = x[i1];
479         tmp1  = x[i2];
480         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483       }
484       if (n == sz-1) {
485         tmp0  = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489       }
490       y[row++]=sum1;
491       y[row++]=sum2;
492       y[row++]=sum3;
493       v1      =v3;              /* Since the next block to be processed starts there*/
494       idx    +=2*sz;
495       break;
496     case 4:
497       sum1 = 0.;
498       sum2 = 0.;
499       sum3 = 0.;
500       sum4 = 0.;
501       v2   = v1 + n;
502       v3   = v2 + n;
503       v4   = v3 + n;
504 
505       for (n = 0; n< sz-1; n+=2) {
506         i1    = idx[0];
507         i2    = idx[1];
508         idx  += 2;
509         tmp0  = x[i1];
510         tmp1  = x[i2];
511         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515       }
516       if (n == sz-1) {
517         tmp0  = x[*idx++];
518         sum1 += *v1++ * tmp0;
519         sum2 += *v2++ * tmp0;
520         sum3 += *v3++ * tmp0;
521         sum4 += *v4++ * tmp0;
522       }
523       y[row++]=sum1;
524       y[row++]=sum2;
525       y[row++]=sum3;
526       y[row++]=sum4;
527       v1      =v4;              /* Since the next block to be processed starts there*/
528       idx    +=3*sz;
529       break;
530     case 5:
531       sum1 = 0.;
532       sum2 = 0.;
533       sum3 = 0.;
534       sum4 = 0.;
535       sum5 = 0.;
536       v2   = v1 + n;
537       v3   = v2 + n;
538       v4   = v3 + n;
539       v5   = v4 + n;
540 
541       for (n = 0; n<sz-1; n+=2) {
542         i1    = idx[0];
543         i2    = idx[1];
544         idx  += 2;
545         tmp0  = x[i1];
546         tmp1  = x[i2];
547         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552       }
553       if (n == sz-1) {
554         tmp0  = x[*idx++];
555         sum1 += *v1++ * tmp0;
556         sum2 += *v2++ * tmp0;
557         sum3 += *v3++ * tmp0;
558         sum4 += *v4++ * tmp0;
559         sum5 += *v5++ * tmp0;
560       }
561       y[row++]=sum1;
562       y[row++]=sum2;
563       y[row++]=sum3;
564       y[row++]=sum4;
565       y[row++]=sum5;
566       v1      =v5;       /* Since the next block to be processed starts there */
567       idx    +=4*sz;
568       break;
569     default:
570       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571     }
572   }
573   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
575   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 /* ----------------------------------------------------------- */
579 /* Almost same code as the MatMult_SeqAIJ_Inode() */
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583 {
584   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586   const MatScalar   *v1,*v2,*v3,*v4,*v5;
587   const PetscScalar *x;
588   PetscScalar       *y,*z,*zt;
589   PetscErrorCode    ierr;
590   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
591   const PetscInt    *idx,*ns,*ii;
592 
593   PetscFunctionBegin;
594   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595   node_max = a->inode.node_count;
596   ns       = a->inode.size;     /* Node Size array */
597 
598   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
600   zt = z;
601 
602   idx = a->j;
603   v1  = a->a;
604   ii  = a->i;
605 
606   for (i = 0,row = 0; i< node_max; ++i) {
607     nsz = ns[i];
608     n   = ii[1] - ii[0];
609     ii += nsz;
610     sz  = n;                    /* No of non zeros in this row */
611                                 /* Switch on the size of Node */
612     switch (nsz) {               /* Each loop in 'case' is unrolled */
613     case 1:
614       sum1 = *zt++;
615 
616       for (n = 0; n< sz-1; n+=2) {
617         i1    = idx[0];         /* The instructions are ordered to */
618         i2    = idx[1];         /* make the compiler's job easy */
619         idx  += 2;
620         tmp0  = x[i1];
621         tmp1  = x[i2];
622         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623       }
624 
625       if (n   == sz-1) {          /* Take care of the last nonzero  */
626         tmp0  = x[*idx++];
627         sum1 += *v1++ * tmp0;
628       }
629       y[row++]=sum1;
630       break;
631     case 2:
632       sum1 = *zt++;
633       sum2 = *zt++;
634       v2   = v1 + n;
635 
636       for (n = 0; n< sz-1; n+=2) {
637         i1    = idx[0];
638         i2    = idx[1];
639         idx  += 2;
640         tmp0  = x[i1];
641         tmp1  = x[i2];
642         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644       }
645       if (n   == sz-1) {
646         tmp0  = x[*idx++];
647         sum1 += *v1++ * tmp0;
648         sum2 += *v2++ * tmp0;
649       }
650       y[row++]=sum1;
651       y[row++]=sum2;
652       v1      =v2;              /* Since the next block to be processed starts there*/
653       idx    +=sz;
654       break;
655     case 3:
656       sum1 = *zt++;
657       sum2 = *zt++;
658       sum3 = *zt++;
659       v2   = v1 + n;
660       v3   = v2 + n;
661 
662       for (n = 0; n< sz-1; n+=2) {
663         i1    = idx[0];
664         i2    = idx[1];
665         idx  += 2;
666         tmp0  = x[i1];
667         tmp1  = x[i2];
668         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671       }
672       if (n == sz-1) {
673         tmp0  = x[*idx++];
674         sum1 += *v1++ * tmp0;
675         sum2 += *v2++ * tmp0;
676         sum3 += *v3++ * tmp0;
677       }
678       y[row++]=sum1;
679       y[row++]=sum2;
680       y[row++]=sum3;
681       v1      =v3;              /* Since the next block to be processed starts there*/
682       idx    +=2*sz;
683       break;
684     case 4:
685       sum1 = *zt++;
686       sum2 = *zt++;
687       sum3 = *zt++;
688       sum4 = *zt++;
689       v2   = v1 + n;
690       v3   = v2 + n;
691       v4   = v3 + n;
692 
693       for (n = 0; n< sz-1; n+=2) {
694         i1    = idx[0];
695         i2    = idx[1];
696         idx  += 2;
697         tmp0  = x[i1];
698         tmp1  = x[i2];
699         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703       }
704       if (n == sz-1) {
705         tmp0  = x[*idx++];
706         sum1 += *v1++ * tmp0;
707         sum2 += *v2++ * tmp0;
708         sum3 += *v3++ * tmp0;
709         sum4 += *v4++ * tmp0;
710       }
711       y[row++]=sum1;
712       y[row++]=sum2;
713       y[row++]=sum3;
714       y[row++]=sum4;
715       v1      =v4;              /* Since the next block to be processed starts there*/
716       idx    +=3*sz;
717       break;
718     case 5:
719       sum1 = *zt++;
720       sum2 = *zt++;
721       sum3 = *zt++;
722       sum4 = *zt++;
723       sum5 = *zt++;
724       v2   = v1 + n;
725       v3   = v2 + n;
726       v4   = v3 + n;
727       v5   = v4 + n;
728 
729       for (n = 0; n<sz-1; n+=2) {
730         i1    = idx[0];
731         i2    = idx[1];
732         idx  += 2;
733         tmp0  = x[i1];
734         tmp1  = x[i2];
735         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740       }
741       if (n   == sz-1) {
742         tmp0  = x[*idx++];
743         sum1 += *v1++ * tmp0;
744         sum2 += *v2++ * tmp0;
745         sum3 += *v3++ * tmp0;
746         sum4 += *v4++ * tmp0;
747         sum5 += *v5++ * tmp0;
748       }
749       y[row++]=sum1;
750       y[row++]=sum2;
751       y[row++]=sum3;
752       y[row++]=sum4;
753       y[row++]=sum5;
754       v1      =v5;       /* Since the next block to be processed starts there */
755       idx    +=4*sz;
756       break;
757     default:
758       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759     }
760   }
761   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
762   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
763   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
764   PetscFunctionReturn(0);
765 }
766 
767 /* ----------------------------------------------------------- */
768 #undef __FUNCT__
769 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
770 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
771 {
772   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
773   IS                iscol = a->col,isrow = a->row;
774   PetscErrorCode    ierr;
775   const PetscInt    *r,*c,*rout,*cout;
776   PetscInt          i,j,n = A->rmap->n,nz;
777   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
778   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
779   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
780   PetscScalar       sum1,sum2,sum3,sum4,sum5;
781   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
782   const PetscScalar *b;
783 
784   PetscFunctionBegin;
785   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
786   node_max = a->inode.node_count;
787   ns       = a->inode.size;     /* Node Size array */
788 
789   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
790   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
791   tmp  = a->solve_work;
792 
793   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
794   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
795 
796   /* forward solve the lower triangular */
797   tmps = tmp;
798   aa   = a_a;
799   aj   = a_j;
800   ad   = a->diag;
801 
802   for (i = 0,row = 0; i< node_max; ++i) {
803     nsz = ns[i];
804     aii = ai[row];
805     v1  = aa + aii;
806     vi  = aj + aii;
807     nz  = ad[row]- aii;
808     if (i < node_max-1) {
809       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
810       * but our indexing to determine it's size could. */
811       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
812       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
813       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
814       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
815     }
816 
817     switch (nsz) {               /* Each loop in 'case' is unrolled */
818     case 1:
819       sum1 = b[*r++];
820       for (j=0; j<nz-1; j+=2) {
821         i0    = vi[0];
822         i1    = vi[1];
823         vi   +=2;
824         tmp0  = tmps[i0];
825         tmp1  = tmps[i1];
826         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
827       }
828       if (j == nz-1) {
829         tmp0  = tmps[*vi++];
830         sum1 -= *v1++ *tmp0;
831       }
832       tmp[row++]=sum1;
833       break;
834     case 2:
835       sum1 = b[*r++];
836       sum2 = b[*r++];
837       v2   = aa + ai[row+1];
838 
839       for (j=0; j<nz-1; j+=2) {
840         i0    = vi[0];
841         i1    = vi[1];
842         vi   +=2;
843         tmp0  = tmps[i0];
844         tmp1  = tmps[i1];
845         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
846         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
847       }
848       if (j == nz-1) {
849         tmp0  = tmps[*vi++];
850         sum1 -= *v1++ *tmp0;
851         sum2 -= *v2++ *tmp0;
852       }
853       sum2     -= *v2++ *sum1;
854       tmp[row++]=sum1;
855       tmp[row++]=sum2;
856       break;
857     case 3:
858       sum1 = b[*r++];
859       sum2 = b[*r++];
860       sum3 = b[*r++];
861       v2   = aa + ai[row+1];
862       v3   = aa + ai[row+2];
863 
864       for (j=0; j<nz-1; j+=2) {
865         i0    = vi[0];
866         i1    = vi[1];
867         vi   +=2;
868         tmp0  = tmps[i0];
869         tmp1  = tmps[i1];
870         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
871         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
872         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
873       }
874       if (j == nz-1) {
875         tmp0  = tmps[*vi++];
876         sum1 -= *v1++ *tmp0;
877         sum2 -= *v2++ *tmp0;
878         sum3 -= *v3++ *tmp0;
879       }
880       sum2 -= *v2++ * sum1;
881       sum3 -= *v3++ * sum1;
882       sum3 -= *v3++ * sum2;
883 
884       tmp[row++]=sum1;
885       tmp[row++]=sum2;
886       tmp[row++]=sum3;
887       break;
888 
889     case 4:
890       sum1 = b[*r++];
891       sum2 = b[*r++];
892       sum3 = b[*r++];
893       sum4 = b[*r++];
894       v2   = aa + ai[row+1];
895       v3   = aa + ai[row+2];
896       v4   = aa + ai[row+3];
897 
898       for (j=0; j<nz-1; j+=2) {
899         i0    = vi[0];
900         i1    = vi[1];
901         vi   +=2;
902         tmp0  = tmps[i0];
903         tmp1  = tmps[i1];
904         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908       }
909       if (j == nz-1) {
910         tmp0  = tmps[*vi++];
911         sum1 -= *v1++ *tmp0;
912         sum2 -= *v2++ *tmp0;
913         sum3 -= *v3++ *tmp0;
914         sum4 -= *v4++ *tmp0;
915       }
916       sum2 -= *v2++ * sum1;
917       sum3 -= *v3++ * sum1;
918       sum4 -= *v4++ * sum1;
919       sum3 -= *v3++ * sum2;
920       sum4 -= *v4++ * sum2;
921       sum4 -= *v4++ * sum3;
922 
923       tmp[row++]=sum1;
924       tmp[row++]=sum2;
925       tmp[row++]=sum3;
926       tmp[row++]=sum4;
927       break;
928     case 5:
929       sum1 = b[*r++];
930       sum2 = b[*r++];
931       sum3 = b[*r++];
932       sum4 = b[*r++];
933       sum5 = b[*r++];
934       v2   = aa + ai[row+1];
935       v3   = aa + ai[row+2];
936       v4   = aa + ai[row+3];
937       v5   = aa + ai[row+4];
938 
939       for (j=0; j<nz-1; j+=2) {
940         i0    = vi[0];
941         i1    = vi[1];
942         vi   +=2;
943         tmp0  = tmps[i0];
944         tmp1  = tmps[i1];
945         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950       }
951       if (j == nz-1) {
952         tmp0  = tmps[*vi++];
953         sum1 -= *v1++ *tmp0;
954         sum2 -= *v2++ *tmp0;
955         sum3 -= *v3++ *tmp0;
956         sum4 -= *v4++ *tmp0;
957         sum5 -= *v5++ *tmp0;
958       }
959 
960       sum2 -= *v2++ * sum1;
961       sum3 -= *v3++ * sum1;
962       sum4 -= *v4++ * sum1;
963       sum5 -= *v5++ * sum1;
964       sum3 -= *v3++ * sum2;
965       sum4 -= *v4++ * sum2;
966       sum5 -= *v5++ * sum2;
967       sum4 -= *v4++ * sum3;
968       sum5 -= *v5++ * sum3;
969       sum5 -= *v5++ * sum4;
970 
971       tmp[row++]=sum1;
972       tmp[row++]=sum2;
973       tmp[row++]=sum3;
974       tmp[row++]=sum4;
975       tmp[row++]=sum5;
976       break;
977     default:
978       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
979     }
980   }
981   /* backward solve the upper triangular */
982   for (i=node_max -1,row = n-1; i>=0; i--) {
983     nsz = ns[i];
984     aii = ai[row+1] -1;
985     v1  = aa + aii;
986     vi  = aj + aii;
987     nz  = aii- ad[row];
988     switch (nsz) {               /* Each loop in 'case' is unrolled */
989     case 1:
990       sum1 = tmp[row];
991 
992       for (j=nz; j>1; j-=2) {
993         vi   -=2;
994         i0    = vi[2];
995         i1    = vi[1];
996         tmp0  = tmps[i0];
997         tmp1  = tmps[i1];
998         v1   -= 2;
999         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000       }
1001       if (j==1) {
1002         tmp0  = tmps[*vi--];
1003         sum1 -= *v1-- * tmp0;
1004       }
1005       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006       break;
1007     case 2:
1008       sum1 = tmp[row];
1009       sum2 = tmp[row -1];
1010       v2   = aa + ai[row]-1;
1011       for (j=nz; j>1; j-=2) {
1012         vi   -=2;
1013         i0    = vi[2];
1014         i1    = vi[1];
1015         tmp0  = tmps[i0];
1016         tmp1  = tmps[i1];
1017         v1   -= 2;
1018         v2   -= 2;
1019         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021       }
1022       if (j==1) {
1023         tmp0  = tmps[*vi--];
1024         sum1 -= *v1-- * tmp0;
1025         sum2 -= *v2-- * tmp0;
1026       }
1027 
1028       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029       sum2   -= *v2-- * tmp0;
1030       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031       break;
1032     case 3:
1033       sum1 = tmp[row];
1034       sum2 = tmp[row -1];
1035       sum3 = tmp[row -2];
1036       v2   = aa + ai[row]-1;
1037       v3   = aa + ai[row -1]-1;
1038       for (j=nz; j>1; j-=2) {
1039         vi   -=2;
1040         i0    = vi[2];
1041         i1    = vi[1];
1042         tmp0  = tmps[i0];
1043         tmp1  = tmps[i1];
1044         v1   -= 2;
1045         v2   -= 2;
1046         v3   -= 2;
1047         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050       }
1051       if (j==1) {
1052         tmp0  = tmps[*vi--];
1053         sum1 -= *v1-- * tmp0;
1054         sum2 -= *v2-- * tmp0;
1055         sum3 -= *v3-- * tmp0;
1056       }
1057       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058       sum2   -= *v2-- * tmp0;
1059       sum3   -= *v3-- * tmp0;
1060       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061       sum3   -= *v3-- * tmp0;
1062       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1063 
1064       break;
1065     case 4:
1066       sum1 = tmp[row];
1067       sum2 = tmp[row -1];
1068       sum3 = tmp[row -2];
1069       sum4 = tmp[row -3];
1070       v2   = aa + ai[row]-1;
1071       v3   = aa + ai[row -1]-1;
1072       v4   = aa + ai[row -2]-1;
1073 
1074       for (j=nz; j>1; j-=2) {
1075         vi   -=2;
1076         i0    = vi[2];
1077         i1    = vi[1];
1078         tmp0  = tmps[i0];
1079         tmp1  = tmps[i1];
1080         v1   -= 2;
1081         v2   -= 2;
1082         v3   -= 2;
1083         v4   -= 2;
1084         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088       }
1089       if (j==1) {
1090         tmp0  = tmps[*vi--];
1091         sum1 -= *v1-- * tmp0;
1092         sum2 -= *v2-- * tmp0;
1093         sum3 -= *v3-- * tmp0;
1094         sum4 -= *v4-- * tmp0;
1095       }
1096 
1097       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098       sum2   -= *v2-- * tmp0;
1099       sum3   -= *v3-- * tmp0;
1100       sum4   -= *v4-- * tmp0;
1101       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102       sum3   -= *v3-- * tmp0;
1103       sum4   -= *v4-- * tmp0;
1104       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105       sum4   -= *v4-- * tmp0;
1106       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107       break;
1108     case 5:
1109       sum1 = tmp[row];
1110       sum2 = tmp[row -1];
1111       sum3 = tmp[row -2];
1112       sum4 = tmp[row -3];
1113       sum5 = tmp[row -4];
1114       v2   = aa + ai[row]-1;
1115       v3   = aa + ai[row -1]-1;
1116       v4   = aa + ai[row -2]-1;
1117       v5   = aa + ai[row -3]-1;
1118       for (j=nz; j>1; j-=2) {
1119         vi   -= 2;
1120         i0    = vi[2];
1121         i1    = vi[1];
1122         tmp0  = tmps[i0];
1123         tmp1  = tmps[i1];
1124         v1   -= 2;
1125         v2   -= 2;
1126         v3   -= 2;
1127         v4   -= 2;
1128         v5   -= 2;
1129         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134       }
1135       if (j==1) {
1136         tmp0  = tmps[*vi--];
1137         sum1 -= *v1-- * tmp0;
1138         sum2 -= *v2-- * tmp0;
1139         sum3 -= *v3-- * tmp0;
1140         sum4 -= *v4-- * tmp0;
1141         sum5 -= *v5-- * tmp0;
1142       }
1143 
1144       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145       sum2   -= *v2-- * tmp0;
1146       sum3   -= *v3-- * tmp0;
1147       sum4   -= *v4-- * tmp0;
1148       sum5   -= *v5-- * tmp0;
1149       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150       sum3   -= *v3-- * tmp0;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154       sum4   -= *v4-- * tmp0;
1155       sum5   -= *v5-- * tmp0;
1156       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157       sum5   -= *v5-- * tmp0;
1158       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159       break;
1160     default:
1161       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1162     }
1163   }
1164   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1165   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1166   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1168   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1169   PetscFunctionReturn(0);
1170 }
1171 
1172 #undef __FUNCT__
1173 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1174 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1175 {
1176   Mat             C     =B;
1177   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1178   IS              isrow = b->row,isicol = b->icol;
1179   PetscErrorCode  ierr;
1180   const PetscInt  *r,*ic,*ics;
1181   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1182   PetscInt        i,j,k,nz,nzL,row,*pj;
1183   const PetscInt  *ajtmp,*bjtmp;
1184   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1185   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1186   FactorShiftCtx  sctx;
1187   const PetscInt  *ddiag;
1188   PetscReal       rs;
1189   MatScalar       d;
1190   PetscInt        inod,nodesz,node_max,col;
1191   const PetscInt  *ns;
1192   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1193 
1194   PetscFunctionBegin;
1195   /* MatPivotSetUp(): initialize shift context sctx */
1196   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1197 
1198   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1199     ddiag          = a->diag;
1200     sctx.shift_top = info->zeropivot;
1201     for (i=0; i<n; i++) {
1202       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1203       d  = (aa)[ddiag[i]];
1204       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1205       v  = aa+ai[i];
1206       nz = ai[i+1] - ai[i];
1207       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1208       if (rs>sctx.shift_top) sctx.shift_top = rs;
1209     }
1210     sctx.shift_top *= 1.1;
1211     sctx.nshift_max = 5;
1212     sctx.shift_lo   = 0.;
1213     sctx.shift_hi   = 1.;
1214   }
1215 
1216   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1217   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1218 
1219   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1220   ics   = ic;
1221 
1222   node_max = a->inode.node_count;
1223   ns       = a->inode.size;
1224   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1225 
1226   /* If max inode size > 4, split it into two inodes.*/
1227   /* also map the inode sizes according to the ordering */
1228   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1229   for (i=0,j=0; i<node_max; ++i,++j) {
1230     if (ns[i] > 4) {
1231       tmp_vec1[j] = 4;
1232       ++j;
1233       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1234     } else {
1235       tmp_vec1[j] = ns[i];
1236     }
1237   }
1238   /* Use the correct node_max */
1239   node_max = j;
1240 
1241   /* Now reorder the inode info based on mat re-ordering info */
1242   /* First create a row -> inode_size_array_index map */
1243   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1244   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1245   for (i=0,row=0; i<node_max; i++) {
1246     nodesz = tmp_vec1[i];
1247     for (j=0; j<nodesz; j++,row++) {
1248       nsmap[row] = i;
1249     }
1250   }
1251   /* Using nsmap, create a reordered ns structure */
1252   for (i=0,j=0; i< node_max; i++) {
1253     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1254     tmp_vec2[i] = nodesz;
1255     j          += nodesz;
1256   }
1257   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1258   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1259 
1260   /* Now use the correct ns */
1261   ns = tmp_vec2;
1262 
1263   do {
1264     sctx.newshift = PETSC_FALSE;
1265     /* Now loop over each block-row, and do the factorization */
1266     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1267       nodesz = ns[inod];
1268 
1269       switch (nodesz) {
1270       case 1:
1271         /*----------*/
1272         /* zero rtmp1 */
1273         /* L part */
1274         nz    = bi[i+1] - bi[i];
1275         bjtmp = bj + bi[i];
1276         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1277 
1278         /* U part */
1279         nz    = bdiag[i]-bdiag[i+1];
1280         bjtmp = bj + bdiag[i+1]+1;
1281         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1282 
1283         /* load in initial (unfactored row) */
1284         nz    = ai[r[i]+1] - ai[r[i]];
1285         ajtmp = aj + ai[r[i]];
1286         v     = aa + ai[r[i]];
1287         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1288 
1289         /* ZeropivotApply() */
1290         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1291 
1292         /* elimination */
1293         bjtmp = bj + bi[i];
1294         row   = *bjtmp++;
1295         nzL   = bi[i+1] - bi[i];
1296         for (k=0; k < nzL; k++) {
1297           pc = rtmp1 + row;
1298           if (*pc != 0.0) {
1299             pv   = b->a + bdiag[row];
1300             mul1 = *pc * (*pv);
1301             *pc  = mul1;
1302             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1303             pv   = b->a + bdiag[row+1]+1;
1304             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1305             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1306             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1307           }
1308           row = *bjtmp++;
1309         }
1310 
1311         /* finished row so stick it into b->a */
1312         rs = 0.0;
1313         /* L part */
1314         pv = b->a + bi[i];
1315         pj = b->j + bi[i];
1316         nz = bi[i+1] - bi[i];
1317         for (j=0; j<nz; j++) {
1318           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1319         }
1320 
1321         /* U part */
1322         pv = b->a + bdiag[i+1]+1;
1323         pj = b->j + bdiag[i+1]+1;
1324         nz = bdiag[i] - bdiag[i+1]-1;
1325         for (j=0; j<nz; j++) {
1326           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327         }
1328 
1329         /* Check zero pivot */
1330         sctx.rs = rs;
1331         sctx.pv = rtmp1[i];
1332         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1333         if (sctx.newshift) break;
1334 
1335         /* Mark diagonal and invert diagonal for simplier triangular solves */
1336         pv  = b->a + bdiag[i];
1337         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1338         break;
1339 
1340       case 2:
1341         /*----------*/
1342         /* zero rtmp1 and rtmp2 */
1343         /* L part */
1344         nz    = bi[i+1] - bi[i];
1345         bjtmp = bj + bi[i];
1346         for  (j=0; j<nz; j++) {
1347           col        = bjtmp[j];
1348           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1349         }
1350 
1351         /* U part */
1352         nz    = bdiag[i]-bdiag[i+1];
1353         bjtmp = bj + bdiag[i+1]+1;
1354         for  (j=0; j<nz; j++) {
1355           col        = bjtmp[j];
1356           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357         }
1358 
1359         /* load in initial (unfactored row) */
1360         nz    = ai[r[i]+1] - ai[r[i]];
1361         ajtmp = aj + ai[r[i]];
1362         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1363         for (j=0; j<nz; j++) {
1364           col        = ics[ajtmp[j]];
1365           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1366         }
1367         /* ZeropivotApply(): shift the diagonal of the matrix  */
1368         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1369 
1370         /* elimination */
1371         bjtmp = bj + bi[i];
1372         row   = *bjtmp++; /* pivot row */
1373         nzL   = bi[i+1] - bi[i];
1374         for (k=0; k < nzL; k++) {
1375           pc1 = rtmp1 + row;
1376           pc2 = rtmp2 + row;
1377           if (*pc1 != 0.0 || *pc2 != 0.0) {
1378             pv   = b->a + bdiag[row];
1379             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1380             *pc1 = mul1;       *pc2 = mul2;
1381 
1382             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1383             pv = b->a + bdiag[row+1]+1;
1384             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1385             for (j=0; j<nz; j++) {
1386               col         = pj[j];
1387               rtmp1[col] -= mul1 * pv[j];
1388               rtmp2[col] -= mul2 * pv[j];
1389             }
1390             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1391           }
1392           row = *bjtmp++;
1393         }
1394 
1395         /* finished row i; check zero pivot, then stick row i into b->a */
1396         rs = 0.0;
1397         /* L part */
1398         pc1 = b->a + bi[i];
1399         pj  = b->j + bi[i];
1400         nz  = bi[i+1] - bi[i];
1401         for (j=0; j<nz; j++) {
1402           col    = pj[j];
1403           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1404         }
1405         /* U part */
1406         pc1 = b->a + bdiag[i+1]+1;
1407         pj  = b->j + bdiag[i+1]+1;
1408         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1409         for (j=0; j<nz; j++) {
1410           col    = pj[j];
1411           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412         }
1413 
1414         sctx.rs = rs;
1415         sctx.pv = rtmp1[i];
1416         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1417         if (sctx.newshift) break;
1418         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1419         *pc1 = 1.0/sctx.pv;
1420 
1421         /* Now take care of diagonal 2x2 block. */
1422         pc2 = rtmp2 + i;
1423         if (*pc2 != 0.0) {
1424           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1425           *pc2 = mul1;          /* insert L entry */
1426           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1427           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1428           for (j=0; j<nz; j++) {
1429             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1430           }
1431           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1432         }
1433 
1434         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1435         rs = 0.0;
1436         /* L part */
1437         pc2 = b->a + bi[i+1];
1438         pj  = b->j + bi[i+1];
1439         nz  = bi[i+2] - bi[i+1];
1440         for (j=0; j<nz; j++) {
1441           col    = pj[j];
1442           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1443         }
1444         /* U part */
1445         pc2 = b->a + bdiag[i+2]+1;
1446         pj  = b->j + bdiag[i+2]+1;
1447         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1448         for (j=0; j<nz; j++) {
1449           col    = pj[j];
1450           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451         }
1452 
1453         sctx.rs = rs;
1454         sctx.pv = rtmp2[i+1];
1455         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1456         if (sctx.newshift) break;
1457         pc2  = b->a + bdiag[i+1];
1458         *pc2 = 1.0/sctx.pv;
1459         break;
1460 
1461       case 3:
1462         /*----------*/
1463         /* zero rtmp */
1464         /* L part */
1465         nz    = bi[i+1] - bi[i];
1466         bjtmp = bj + bi[i];
1467         for  (j=0; j<nz; j++) {
1468           col        = bjtmp[j];
1469           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1470         }
1471 
1472         /* U part */
1473         nz    = bdiag[i]-bdiag[i+1];
1474         bjtmp = bj + bdiag[i+1]+1;
1475         for  (j=0; j<nz; j++) {
1476           col        = bjtmp[j];
1477           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478         }
1479 
1480         /* load in initial (unfactored row) */
1481         nz    = ai[r[i]+1] - ai[r[i]];
1482         ajtmp = aj + ai[r[i]];
1483         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1484         for (j=0; j<nz; j++) {
1485           col        = ics[ajtmp[j]];
1486           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1487         }
1488         /* ZeropivotApply(): shift the diagonal of the matrix  */
1489         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1490 
1491         /* elimination */
1492         bjtmp = bj + bi[i];
1493         row   = *bjtmp++; /* pivot row */
1494         nzL   = bi[i+1] - bi[i];
1495         for (k=0; k < nzL; k++) {
1496           pc1 = rtmp1 + row;
1497           pc2 = rtmp2 + row;
1498           pc3 = rtmp3 + row;
1499           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1500             pv   = b->a + bdiag[row];
1501             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1502             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1503 
1504             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1505             pv = b->a + bdiag[row+1]+1;
1506             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1507             for (j=0; j<nz; j++) {
1508               col         = pj[j];
1509               rtmp1[col] -= mul1 * pv[j];
1510               rtmp2[col] -= mul2 * pv[j];
1511               rtmp3[col] -= mul3 * pv[j];
1512             }
1513             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1514           }
1515           row = *bjtmp++;
1516         }
1517 
1518         /* finished row i; check zero pivot, then stick row i into b->a */
1519         rs = 0.0;
1520         /* L part */
1521         pc1 = b->a + bi[i];
1522         pj  = b->j + bi[i];
1523         nz  = bi[i+1] - bi[i];
1524         for (j=0; j<nz; j++) {
1525           col    = pj[j];
1526           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1527         }
1528         /* U part */
1529         pc1 = b->a + bdiag[i+1]+1;
1530         pj  = b->j + bdiag[i+1]+1;
1531         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1532         for (j=0; j<nz; j++) {
1533           col    = pj[j];
1534           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535         }
1536 
1537         sctx.rs = rs;
1538         sctx.pv = rtmp1[i];
1539         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1540         if (sctx.newshift) break;
1541         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1542         *pc1 = 1.0/sctx.pv;
1543 
1544         /* Now take care of 1st column of diagonal 3x3 block. */
1545         pc2 = rtmp2 + i;
1546         pc3 = rtmp3 + i;
1547         if (*pc2 != 0.0 || *pc3 != 0.0) {
1548           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1549           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1550           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1551           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1552           for (j=0; j<nz; j++) {
1553             col         = pj[j];
1554             rtmp2[col] -= mul2 * rtmp1[col];
1555             rtmp3[col] -= mul3 * rtmp1[col];
1556           }
1557           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1558         }
1559 
1560         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1561         rs = 0.0;
1562         /* L part */
1563         pc2 = b->a + bi[i+1];
1564         pj  = b->j + bi[i+1];
1565         nz  = bi[i+2] - bi[i+1];
1566         for (j=0; j<nz; j++) {
1567           col    = pj[j];
1568           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1569         }
1570         /* U part */
1571         pc2 = b->a + bdiag[i+2]+1;
1572         pj  = b->j + bdiag[i+2]+1;
1573         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1574         for (j=0; j<nz; j++) {
1575           col    = pj[j];
1576           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577         }
1578 
1579         sctx.rs = rs;
1580         sctx.pv = rtmp2[i+1];
1581         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1582         if (sctx.newshift) break;
1583         pc2  = b->a + bdiag[i+1];
1584         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1585 
1586         /* Now take care of 2nd column of diagonal 3x3 block. */
1587         pc3 = rtmp3 + i+1;
1588         if (*pc3 != 0.0) {
1589           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1590           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1591           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1592           for (j=0; j<nz; j++) {
1593             col         = pj[j];
1594             rtmp3[col] -= mul3 * rtmp2[col];
1595           }
1596           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1597         }
1598 
1599         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1600         rs = 0.0;
1601         /* L part */
1602         pc3 = b->a + bi[i+2];
1603         pj  = b->j + bi[i+2];
1604         nz  = bi[i+3] - bi[i+2];
1605         for (j=0; j<nz; j++) {
1606           col    = pj[j];
1607           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1608         }
1609         /* U part */
1610         pc3 = b->a + bdiag[i+3]+1;
1611         pj  = b->j + bdiag[i+3]+1;
1612         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1613         for (j=0; j<nz; j++) {
1614           col    = pj[j];
1615           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616         }
1617 
1618         sctx.rs = rs;
1619         sctx.pv = rtmp3[i+2];
1620         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1621         if (sctx.newshift) break;
1622         pc3  = b->a + bdiag[i+2];
1623         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1624         break;
1625       case 4:
1626         /*----------*/
1627         /* zero rtmp */
1628         /* L part */
1629         nz    = bi[i+1] - bi[i];
1630         bjtmp = bj + bi[i];
1631         for  (j=0; j<nz; j++) {
1632           col        = bjtmp[j];
1633           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1634         }
1635 
1636         /* U part */
1637         nz    = bdiag[i]-bdiag[i+1];
1638         bjtmp = bj + bdiag[i+1]+1;
1639         for  (j=0; j<nz; j++) {
1640           col        = bjtmp[j];
1641           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1642         }
1643 
1644         /* load in initial (unfactored row) */
1645         nz    = ai[r[i]+1] - ai[r[i]];
1646         ajtmp = aj + ai[r[i]];
1647         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1648         for (j=0; j<nz; j++) {
1649           col        = ics[ajtmp[j]];
1650           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1651         }
1652         /* ZeropivotApply(): shift the diagonal of the matrix  */
1653         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1654 
1655         /* elimination */
1656         bjtmp = bj + bi[i];
1657         row   = *bjtmp++; /* pivot row */
1658         nzL   = bi[i+1] - bi[i];
1659         for (k=0; k < nzL; k++) {
1660           pc1 = rtmp1 + row;
1661           pc2 = rtmp2 + row;
1662           pc3 = rtmp3 + row;
1663           pc4 = rtmp4 + row;
1664           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1665             pv   = b->a + bdiag[row];
1666             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1667             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1668 
1669             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1670             pv = b->a + bdiag[row+1]+1;
1671             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1672             for (j=0; j<nz; j++) {
1673               col         = pj[j];
1674               rtmp1[col] -= mul1 * pv[j];
1675               rtmp2[col] -= mul2 * pv[j];
1676               rtmp3[col] -= mul3 * pv[j];
1677               rtmp4[col] -= mul4 * pv[j];
1678             }
1679             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1680           }
1681           row = *bjtmp++;
1682         }
1683 
1684         /* finished row i; check zero pivot, then stick row i into b->a */
1685         rs = 0.0;
1686         /* L part */
1687         pc1 = b->a + bi[i];
1688         pj  = b->j + bi[i];
1689         nz  = bi[i+1] - bi[i];
1690         for (j=0; j<nz; j++) {
1691           col    = pj[j];
1692           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1693         }
1694         /* U part */
1695         pc1 = b->a + bdiag[i+1]+1;
1696         pj  = b->j + bdiag[i+1]+1;
1697         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1698         for (j=0; j<nz; j++) {
1699           col    = pj[j];
1700           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701         }
1702 
1703         sctx.rs = rs;
1704         sctx.pv = rtmp1[i];
1705         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1706         if (sctx.newshift) break;
1707         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1708         *pc1 = 1.0/sctx.pv;
1709 
1710         /* Now take care of 1st column of diagonal 4x4 block. */
1711         pc2 = rtmp2 + i;
1712         pc3 = rtmp3 + i;
1713         pc4 = rtmp4 + i;
1714         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1715           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1716           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1717           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1718           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1719           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1720           for (j=0; j<nz; j++) {
1721             col         = pj[j];
1722             rtmp2[col] -= mul2 * rtmp1[col];
1723             rtmp3[col] -= mul3 * rtmp1[col];
1724             rtmp4[col] -= mul4 * rtmp1[col];
1725           }
1726           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1727         }
1728 
1729         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1730         rs = 0.0;
1731         /* L part */
1732         pc2 = b->a + bi[i+1];
1733         pj  = b->j + bi[i+1];
1734         nz  = bi[i+2] - bi[i+1];
1735         for (j=0; j<nz; j++) {
1736           col    = pj[j];
1737           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1738         }
1739         /* U part */
1740         pc2 = b->a + bdiag[i+2]+1;
1741         pj  = b->j + bdiag[i+2]+1;
1742         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1743         for (j=0; j<nz; j++) {
1744           col    = pj[j];
1745           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746         }
1747 
1748         sctx.rs = rs;
1749         sctx.pv = rtmp2[i+1];
1750         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1751         if (sctx.newshift) break;
1752         pc2  = b->a + bdiag[i+1];
1753         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1754 
1755         /* Now take care of 2nd column of diagonal 4x4 block. */
1756         pc3 = rtmp3 + i+1;
1757         pc4 = rtmp4 + i+1;
1758         if (*pc3 != 0.0 || *pc4 != 0.0) {
1759           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1760           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1761           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1762           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1763           for (j=0; j<nz; j++) {
1764             col         = pj[j];
1765             rtmp3[col] -= mul3 * rtmp2[col];
1766             rtmp4[col] -= mul4 * rtmp2[col];
1767           }
1768           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1769         }
1770 
1771         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1772         rs = 0.0;
1773         /* L part */
1774         pc3 = b->a + bi[i+2];
1775         pj  = b->j + bi[i+2];
1776         nz  = bi[i+3] - bi[i+2];
1777         for (j=0; j<nz; j++) {
1778           col    = pj[j];
1779           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1780         }
1781         /* U part */
1782         pc3 = b->a + bdiag[i+3]+1;
1783         pj  = b->j + bdiag[i+3]+1;
1784         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1785         for (j=0; j<nz; j++) {
1786           col    = pj[j];
1787           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788         }
1789 
1790         sctx.rs = rs;
1791         sctx.pv = rtmp3[i+2];
1792         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1793         if (sctx.newshift) break;
1794         pc3  = b->a + bdiag[i+2];
1795         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1796 
1797         /* Now take care of 3rd column of diagonal 4x4 block. */
1798         pc4 = rtmp4 + i+2;
1799         if (*pc4 != 0.0) {
1800           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1801           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1802           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1803           for (j=0; j<nz; j++) {
1804             col         = pj[j];
1805             rtmp4[col] -= mul4 * rtmp3[col];
1806           }
1807           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1808         }
1809 
1810         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1811         rs = 0.0;
1812         /* L part */
1813         pc4 = b->a + bi[i+3];
1814         pj  = b->j + bi[i+3];
1815         nz  = bi[i+4] - bi[i+3];
1816         for (j=0; j<nz; j++) {
1817           col    = pj[j];
1818           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1819         }
1820         /* U part */
1821         pc4 = b->a + bdiag[i+4]+1;
1822         pj  = b->j + bdiag[i+4]+1;
1823         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1824         for (j=0; j<nz; j++) {
1825           col    = pj[j];
1826           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827         }
1828 
1829         sctx.rs = rs;
1830         sctx.pv = rtmp4[i+3];
1831         ierr    = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1832         if (sctx.newshift) break;
1833         pc4  = b->a + bdiag[i+3];
1834         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1835         break;
1836 
1837       default:
1838         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1839       }
1840       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1841       i += nodesz;                 /* Update the row */
1842     }
1843 
1844     /* MatPivotRefine() */
1845     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1846       /*
1847        * if no shift in this attempt & shifting & started shifting & can refine,
1848        * then try lower shift
1849        */
1850       sctx.shift_hi       = sctx.shift_fraction;
1851       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1852       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1853       sctx.newshift       = PETSC_TRUE;
1854       sctx.nshift++;
1855     }
1856   } while (sctx.newshift);
1857 
1858   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1859   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1860   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1861   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1862 
1863   if (b->inode.size) {
1864     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1865   } else {
1866     C->ops->solve           = MatSolve_SeqAIJ;
1867   }
1868   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1869   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1870   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1871   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1872   C->assembled              = PETSC_TRUE;
1873   C->preallocated           = PETSC_TRUE;
1874 
1875   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1876 
1877   /* MatShiftView(A,info,&sctx) */
1878   if (sctx.nshift) {
1879     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1880       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1881     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1882       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1883     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1884       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1885     }
1886   }
1887   PetscFunctionReturn(0);
1888 }
1889 
1890 #undef __FUNCT__
1891 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1892 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1893 {
1894   Mat             C     = B;
1895   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1896   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1897   PetscErrorCode  ierr;
1898   const PetscInt  *r,*ic,*c,*ics;
1899   PetscInt        n   = A->rmap->n,*bi = b->i;
1900   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1901   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1902   PetscInt        *ai = a->i,*aj = a->j;
1903   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1904   PetscScalar     mul1,mul2,mul3,tmp;
1905   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1906   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1907   PetscReal       rs=0.0;
1908   FactorShiftCtx  sctx;
1909 
1910   PetscFunctionBegin;
1911   sctx.shift_top      = 0;
1912   sctx.nshift_max     = 0;
1913   sctx.shift_lo       = 0;
1914   sctx.shift_hi       = 0;
1915   sctx.shift_fraction = 0;
1916 
1917   /* if both shift schemes are chosen by user, only use info->shiftpd */
1918   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1919     sctx.shift_top = 0;
1920     for (i=0; i<n; i++) {
1921       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1922       rs    = 0.0;
1923       ajtmp = aj + ai[i];
1924       rtmp1 = aa + ai[i];
1925       nz    = ai[i+1] - ai[i];
1926       for (j=0; j<nz; j++) {
1927         if (*ajtmp != i) {
1928           rs += PetscAbsScalar(*rtmp1++);
1929         } else {
1930           rs -= PetscRealPart(*rtmp1++);
1931         }
1932         ajtmp++;
1933       }
1934       if (rs>sctx.shift_top) sctx.shift_top = rs;
1935     }
1936     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1937     sctx.shift_top *= 1.1;
1938     sctx.nshift_max = 5;
1939     sctx.shift_lo   = 0.;
1940     sctx.shift_hi   = 1.;
1941   }
1942   sctx.shift_amount = 0;
1943   sctx.nshift       = 0;
1944 
1945   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1946   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1947   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1948   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1949   ics    = ic;
1950 
1951   node_max = a->inode.node_count;
1952   ns       = a->inode.size;
1953   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1954 
1955   /* If max inode size > 3, split it into two inodes.*/
1956   /* also map the inode sizes according to the ordering */
1957   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1958   for (i=0,j=0; i<node_max; ++i,++j) {
1959     if (ns[i]>3) {
1960       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1961       ++j;
1962       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1963     } else {
1964       tmp_vec1[j] = ns[i];
1965     }
1966   }
1967   /* Use the correct node_max */
1968   node_max = j;
1969 
1970   /* Now reorder the inode info based on mat re-ordering info */
1971   /* First create a row -> inode_size_array_index map */
1972   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1973   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1974   for (i=0,row=0; i<node_max; i++) {
1975     nodesz = tmp_vec1[i];
1976     for (j=0; j<nodesz; j++,row++) {
1977       nsmap[row] = i;
1978     }
1979   }
1980   /* Using nsmap, create a reordered ns structure */
1981   for (i=0,j=0; i< node_max; i++) {
1982     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1983     tmp_vec2[i] = nodesz;
1984     j          += nodesz;
1985   }
1986   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1987   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1988   /* Now use the correct ns */
1989   ns = tmp_vec2;
1990 
1991   do {
1992     sctx.newshift = PETSC_FALSE;
1993     /* Now loop over each block-row, and do the factorization */
1994     for (i=0,row=0; i<node_max; i++) {
1995       nodesz = ns[i];
1996       nz     = bi[row+1] - bi[row];
1997       bjtmp  = bj + bi[row];
1998 
1999       switch (nodesz) {
2000       case 1:
2001         for  (j=0; j<nz; j++) {
2002           idx         = bjtmp[j];
2003           rtmp11[idx] = 0.0;
2004         }
2005 
2006         /* load in initial (unfactored row) */
2007         idx    = r[row];
2008         nz_tmp = ai[idx+1] - ai[idx];
2009         ajtmp  = aj + ai[idx];
2010         v1     = aa + ai[idx];
2011 
2012         for (j=0; j<nz_tmp; j++) {
2013           idx         = ics[ajtmp[j]];
2014           rtmp11[idx] = v1[j];
2015         }
2016         rtmp11[ics[r[row]]] += sctx.shift_amount;
2017 
2018         prow = *bjtmp++;
2019         while (prow < row) {
2020           pc1 = rtmp11 + prow;
2021           if (*pc1 != 0.0) {
2022             pv     = ba + bd[prow];
2023             pj     = nbj + bd[prow];
2024             mul1   = *pc1 * *pv++;
2025             *pc1   = mul1;
2026             nz_tmp = bi[prow+1] - bd[prow] - 1;
2027             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2028             for (j=0; j<nz_tmp; j++) {
2029               tmp          = pv[j];
2030               idx          = pj[j];
2031               rtmp11[idx] -= mul1 * tmp;
2032             }
2033           }
2034           prow = *bjtmp++;
2035         }
2036         pj  = bj + bi[row];
2037         pc1 = ba + bi[row];
2038 
2039         sctx.pv     = rtmp11[row];
2040         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2041         rs          = 0.0;
2042         for (j=0; j<nz; j++) {
2043           idx    = pj[j];
2044           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2045           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2046         }
2047         sctx.rs = rs;
2048         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2049         if (sctx.newshift) goto endofwhile;
2050         break;
2051 
2052       case 2:
2053         for (j=0; j<nz; j++) {
2054           idx         = bjtmp[j];
2055           rtmp11[idx] = 0.0;
2056           rtmp22[idx] = 0.0;
2057         }
2058 
2059         /* load in initial (unfactored row) */
2060         idx    = r[row];
2061         nz_tmp = ai[idx+1] - ai[idx];
2062         ajtmp  = aj + ai[idx];
2063         v1     = aa + ai[idx];
2064         v2     = aa + ai[idx+1];
2065         for (j=0; j<nz_tmp; j++) {
2066           idx         = ics[ajtmp[j]];
2067           rtmp11[idx] = v1[j];
2068           rtmp22[idx] = v2[j];
2069         }
2070         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2071         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2072 
2073         prow = *bjtmp++;
2074         while (prow < row) {
2075           pc1 = rtmp11 + prow;
2076           pc2 = rtmp22 + prow;
2077           if (*pc1 != 0.0 || *pc2 != 0.0) {
2078             pv   = ba + bd[prow];
2079             pj   = nbj + bd[prow];
2080             mul1 = *pc1 * *pv;
2081             mul2 = *pc2 * *pv;
2082             ++pv;
2083             *pc1 = mul1;
2084             *pc2 = mul2;
2085 
2086             nz_tmp = bi[prow+1] - bd[prow] - 1;
2087             for (j=0; j<nz_tmp; j++) {
2088               tmp          = pv[j];
2089               idx          = pj[j];
2090               rtmp11[idx] -= mul1 * tmp;
2091               rtmp22[idx] -= mul2 * tmp;
2092             }
2093             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2094           }
2095           prow = *bjtmp++;
2096         }
2097 
2098         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2099         pc1 = rtmp11 + prow;
2100         pc2 = rtmp22 + prow;
2101 
2102         sctx.pv = *pc1;
2103         pj      = bj + bi[prow];
2104         rs      = 0.0;
2105         for (j=0; j<nz; j++) {
2106           idx = pj[j];
2107           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2108         }
2109         sctx.rs = rs;
2110         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2111         if (sctx.newshift) goto endofwhile;
2112 
2113         if (*pc2 != 0.0) {
2114           pj     = nbj + bd[prow];
2115           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2116           *pc2   = mul2;
2117           nz_tmp = bi[prow+1] - bd[prow] - 1;
2118           for (j=0; j<nz_tmp; j++) {
2119             idx          = pj[j];
2120             tmp          = rtmp11[idx];
2121             rtmp22[idx] -= mul2 * tmp;
2122           }
2123           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2124         }
2125 
2126         pj  = bj + bi[row];
2127         pc1 = ba + bi[row];
2128         pc2 = ba + bi[row+1];
2129 
2130         sctx.pv       = rtmp22[row+1];
2131         rs            = 0.0;
2132         rtmp11[row]   = 1.0/rtmp11[row];
2133         rtmp22[row+1] = 1.0/rtmp22[row+1];
2134         /* copy row entries from dense representation to sparse */
2135         for (j=0; j<nz; j++) {
2136           idx    = pj[j];
2137           pc1[j] = rtmp11[idx];
2138           pc2[j] = rtmp22[idx];
2139           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2140         }
2141         sctx.rs = rs;
2142         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2143         if (sctx.newshift) goto endofwhile;
2144         break;
2145 
2146       case 3:
2147         for  (j=0; j<nz; j++) {
2148           idx         = bjtmp[j];
2149           rtmp11[idx] = 0.0;
2150           rtmp22[idx] = 0.0;
2151           rtmp33[idx] = 0.0;
2152         }
2153         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2154         idx    = r[row];
2155         nz_tmp = ai[idx+1] - ai[idx];
2156         ajtmp  = aj + ai[idx];
2157         v1     = aa + ai[idx];
2158         v2     = aa + ai[idx+1];
2159         v3     = aa + ai[idx+2];
2160         for (j=0; j<nz_tmp; j++) {
2161           idx         = ics[ajtmp[j]];
2162           rtmp11[idx] = v1[j];
2163           rtmp22[idx] = v2[j];
2164           rtmp33[idx] = v3[j];
2165         }
2166         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2167         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2168         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2169 
2170         /* loop over all pivot row blocks above this row block */
2171         prow = *bjtmp++;
2172         while (prow < row) {
2173           pc1 = rtmp11 + prow;
2174           pc2 = rtmp22 + prow;
2175           pc3 = rtmp33 + prow;
2176           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2177             pv   = ba  + bd[prow];
2178             pj   = nbj + bd[prow];
2179             mul1 = *pc1 * *pv;
2180             mul2 = *pc2 * *pv;
2181             mul3 = *pc3 * *pv;
2182             ++pv;
2183             *pc1 = mul1;
2184             *pc2 = mul2;
2185             *pc3 = mul3;
2186 
2187             nz_tmp = bi[prow+1] - bd[prow] - 1;
2188             /* update this row based on pivot row */
2189             for (j=0; j<nz_tmp; j++) {
2190               tmp          = pv[j];
2191               idx          = pj[j];
2192               rtmp11[idx] -= mul1 * tmp;
2193               rtmp22[idx] -= mul2 * tmp;
2194               rtmp33[idx] -= mul3 * tmp;
2195             }
2196             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2197           }
2198           prow = *bjtmp++;
2199         }
2200 
2201         /* Now take care of diagonal 3x3 block in this set of rows */
2202         /* note: prow = row here */
2203         pc1 = rtmp11 + prow;
2204         pc2 = rtmp22 + prow;
2205         pc3 = rtmp33 + prow;
2206 
2207         sctx.pv = *pc1;
2208         pj      = bj + bi[prow];
2209         rs      = 0.0;
2210         for (j=0; j<nz; j++) {
2211           idx = pj[j];
2212           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2213         }
2214         sctx.rs = rs;
2215         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2216         if (sctx.newshift) goto endofwhile;
2217 
2218         if (*pc2 != 0.0 || *pc3 != 0.0) {
2219           mul2   = (*pc2)/(*pc1);
2220           mul3   = (*pc3)/(*pc1);
2221           *pc2   = mul2;
2222           *pc3   = mul3;
2223           nz_tmp = bi[prow+1] - bd[prow] - 1;
2224           pj     = nbj + bd[prow];
2225           for (j=0; j<nz_tmp; j++) {
2226             idx          = pj[j];
2227             tmp          = rtmp11[idx];
2228             rtmp22[idx] -= mul2 * tmp;
2229             rtmp33[idx] -= mul3 * tmp;
2230           }
2231           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2232         }
2233         ++prow;
2234 
2235         pc2     = rtmp22 + prow;
2236         pc3     = rtmp33 + prow;
2237         sctx.pv = *pc2;
2238         pj      = bj + bi[prow];
2239         rs      = 0.0;
2240         for (j=0; j<nz; j++) {
2241           idx = pj[j];
2242           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2243         }
2244         sctx.rs = rs;
2245         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2246         if (sctx.newshift) goto endofwhile;
2247 
2248         if (*pc3 != 0.0) {
2249           mul3   = (*pc3)/(*pc2);
2250           *pc3   = mul3;
2251           pj     = nbj + bd[prow];
2252           nz_tmp = bi[prow+1] - bd[prow] - 1;
2253           for (j=0; j<nz_tmp; j++) {
2254             idx          = pj[j];
2255             tmp          = rtmp22[idx];
2256             rtmp33[idx] -= mul3 * tmp;
2257           }
2258           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2259         }
2260 
2261         pj  = bj + bi[row];
2262         pc1 = ba + bi[row];
2263         pc2 = ba + bi[row+1];
2264         pc3 = ba + bi[row+2];
2265 
2266         sctx.pv       = rtmp33[row+2];
2267         rs            = 0.0;
2268         rtmp11[row]   = 1.0/rtmp11[row];
2269         rtmp22[row+1] = 1.0/rtmp22[row+1];
2270         rtmp33[row+2] = 1.0/rtmp33[row+2];
2271         /* copy row entries from dense representation to sparse */
2272         for (j=0; j<nz; j++) {
2273           idx    = pj[j];
2274           pc1[j] = rtmp11[idx];
2275           pc2[j] = rtmp22[idx];
2276           pc3[j] = rtmp33[idx];
2277           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2278         }
2279 
2280         sctx.rs = rs;
2281         ierr    = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2282         if (sctx.newshift) goto endofwhile;
2283         break;
2284 
2285       default:
2286         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2287       }
2288       row += nodesz;                 /* Update the row */
2289     }
2290 endofwhile:;
2291   } while (sctx.newshift);
2292   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2293   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2294   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2295   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2296   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2297 
2298   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2299   /* do not set solve add, since MatSolve_Inode + Add is faster */
2300   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2301   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2302   C->assembled              = PETSC_TRUE;
2303   C->preallocated           = PETSC_TRUE;
2304   if (sctx.nshift) {
2305     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2306       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2307     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2308       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2309     }
2310   }
2311   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2312   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2313   PetscFunctionReturn(0);
2314 }
2315 
2316 
2317 /* ----------------------------------------------------------- */
2318 #undef __FUNCT__
2319 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2320 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2321 {
2322   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2323   IS                iscol = a->col,isrow = a->row;
2324   PetscErrorCode    ierr;
2325   const PetscInt    *r,*c,*rout,*cout;
2326   PetscInt          i,j,n = A->rmap->n;
2327   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2328   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2329   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2330   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2331   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2332   const PetscScalar *b;
2333 
2334   PetscFunctionBegin;
2335   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2336   node_max = a->inode.node_count;
2337   ns       = a->inode.size;     /* Node Size array */
2338 
2339   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2340   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2341   tmp  = a->solve_work;
2342 
2343   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2344   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2345 
2346   /* forward solve the lower triangular */
2347   tmps = tmp;
2348   aa   = a_a;
2349   aj   = a_j;
2350   ad   = a->diag;
2351 
2352   for (i = 0,row = 0; i< node_max; ++i) {
2353     nsz = ns[i];
2354     aii = ai[row];
2355     v1  = aa + aii;
2356     vi  = aj + aii;
2357     nz  = ai[row+1]- ai[row];
2358 
2359     if (i < node_max-1) {
2360       /* Prefetch the indices for the next block */
2361       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2362       /* Prefetch the data for the next block */
2363       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2364     }
2365 
2366     switch (nsz) {               /* Each loop in 'case' is unrolled */
2367     case 1:
2368       sum1 = b[r[row]];
2369       for (j=0; j<nz-1; j+=2) {
2370         i0    = vi[j];
2371         i1    = vi[j+1];
2372         tmp0  = tmps[i0];
2373         tmp1  = tmps[i1];
2374         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2375       }
2376       if (j == nz-1) {
2377         tmp0  = tmps[vi[j]];
2378         sum1 -= v1[j]*tmp0;
2379       }
2380       tmp[row++]=sum1;
2381       break;
2382     case 2:
2383       sum1 = b[r[row]];
2384       sum2 = b[r[row+1]];
2385       v2   = aa + ai[row+1];
2386 
2387       for (j=0; j<nz-1; j+=2) {
2388         i0    = vi[j];
2389         i1    = vi[j+1];
2390         tmp0  = tmps[i0];
2391         tmp1  = tmps[i1];
2392         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394       }
2395       if (j == nz-1) {
2396         tmp0  = tmps[vi[j]];
2397         sum1 -= v1[j] *tmp0;
2398         sum2 -= v2[j] *tmp0;
2399       }
2400       sum2     -= v2[nz] * sum1;
2401       tmp[row++]=sum1;
2402       tmp[row++]=sum2;
2403       break;
2404     case 3:
2405       sum1 = b[r[row]];
2406       sum2 = b[r[row+1]];
2407       sum3 = b[r[row+2]];
2408       v2   = aa + ai[row+1];
2409       v3   = aa + ai[row+2];
2410 
2411       for (j=0; j<nz-1; j+=2) {
2412         i0    = vi[j];
2413         i1    = vi[j+1];
2414         tmp0  = tmps[i0];
2415         tmp1  = tmps[i1];
2416         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2417         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2418         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2419       }
2420       if (j == nz-1) {
2421         tmp0  = tmps[vi[j]];
2422         sum1 -= v1[j] *tmp0;
2423         sum2 -= v2[j] *tmp0;
2424         sum3 -= v3[j] *tmp0;
2425       }
2426       sum2     -= v2[nz] * sum1;
2427       sum3     -= v3[nz] * sum1;
2428       sum3     -= v3[nz+1] * sum2;
2429       tmp[row++]=sum1;
2430       tmp[row++]=sum2;
2431       tmp[row++]=sum3;
2432       break;
2433 
2434     case 4:
2435       sum1 = b[r[row]];
2436       sum2 = b[r[row+1]];
2437       sum3 = b[r[row+2]];
2438       sum4 = b[r[row+3]];
2439       v2   = aa + ai[row+1];
2440       v3   = aa + ai[row+2];
2441       v4   = aa + ai[row+3];
2442 
2443       for (j=0; j<nz-1; j+=2) {
2444         i0    = vi[j];
2445         i1    = vi[j+1];
2446         tmp0  = tmps[i0];
2447         tmp1  = tmps[i1];
2448         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2449         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2450         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2451         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2452       }
2453       if (j == nz-1) {
2454         tmp0  = tmps[vi[j]];
2455         sum1 -= v1[j] *tmp0;
2456         sum2 -= v2[j] *tmp0;
2457         sum3 -= v3[j] *tmp0;
2458         sum4 -= v4[j] *tmp0;
2459       }
2460       sum2 -= v2[nz] * sum1;
2461       sum3 -= v3[nz] * sum1;
2462       sum4 -= v4[nz] * sum1;
2463       sum3 -= v3[nz+1] * sum2;
2464       sum4 -= v4[nz+1] * sum2;
2465       sum4 -= v4[nz+2] * sum3;
2466 
2467       tmp[row++]=sum1;
2468       tmp[row++]=sum2;
2469       tmp[row++]=sum3;
2470       tmp[row++]=sum4;
2471       break;
2472     case 5:
2473       sum1 = b[r[row]];
2474       sum2 = b[r[row+1]];
2475       sum3 = b[r[row+2]];
2476       sum4 = b[r[row+3]];
2477       sum5 = b[r[row+4]];
2478       v2   = aa + ai[row+1];
2479       v3   = aa + ai[row+2];
2480       v4   = aa + ai[row+3];
2481       v5   = aa + ai[row+4];
2482 
2483       for (j=0; j<nz-1; j+=2) {
2484         i0    = vi[j];
2485         i1    = vi[j+1];
2486         tmp0  = tmps[i0];
2487         tmp1  = tmps[i1];
2488         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2489         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2490         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2491         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2492         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2493       }
2494       if (j == nz-1) {
2495         tmp0  = tmps[vi[j]];
2496         sum1 -= v1[j] *tmp0;
2497         sum2 -= v2[j] *tmp0;
2498         sum3 -= v3[j] *tmp0;
2499         sum4 -= v4[j] *tmp0;
2500         sum5 -= v5[j] *tmp0;
2501       }
2502 
2503       sum2 -= v2[nz] * sum1;
2504       sum3 -= v3[nz] * sum1;
2505       sum4 -= v4[nz] * sum1;
2506       sum5 -= v5[nz] * sum1;
2507       sum3 -= v3[nz+1] * sum2;
2508       sum4 -= v4[nz+1] * sum2;
2509       sum5 -= v5[nz+1] * sum2;
2510       sum4 -= v4[nz+2] * sum3;
2511       sum5 -= v5[nz+2] * sum3;
2512       sum5 -= v5[nz+3] * sum4;
2513 
2514       tmp[row++]=sum1;
2515       tmp[row++]=sum2;
2516       tmp[row++]=sum3;
2517       tmp[row++]=sum4;
2518       tmp[row++]=sum5;
2519       break;
2520     default:
2521       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2522     }
2523   }
2524   /* backward solve the upper triangular */
2525   for (i=node_max -1,row = n-1; i>=0; i--) {
2526     nsz = ns[i];
2527     aii = ad[row+1] + 1;
2528     v1  = aa + aii;
2529     vi  = aj + aii;
2530     nz  = ad[row]- ad[row+1] - 1;
2531 
2532     if (i > 0) {
2533       /* Prefetch the indices for the next block */
2534       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2535       /* Prefetch the data for the next block */
2536       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2537     }
2538 
2539     switch (nsz) {               /* Each loop in 'case' is unrolled */
2540     case 1:
2541       sum1 = tmp[row];
2542 
2543       for (j=0; j<nz-1; j+=2) {
2544         i0    = vi[j];
2545         i1    = vi[j+1];
2546         tmp0  = tmps[i0];
2547         tmp1  = tmps[i1];
2548         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2549       }
2550       if (j == nz-1) {
2551         tmp0  = tmps[vi[j]];
2552         sum1 -= v1[j]*tmp0;
2553       }
2554       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2555       break;
2556     case 2:
2557       sum1 = tmp[row];
2558       sum2 = tmp[row-1];
2559       v2   = aa + ad[row] + 1;
2560       for (j=0; j<nz-1; j+=2) {
2561         i0    = vi[j];
2562         i1    = vi[j+1];
2563         tmp0  = tmps[i0];
2564         tmp1  = tmps[i1];
2565         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567       }
2568       if (j == nz-1) {
2569         tmp0  = tmps[vi[j]];
2570         sum1 -= v1[j]* tmp0;
2571         sum2 -= v2[j+1]* tmp0;
2572       }
2573 
2574       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2575       sum2     -= v2[0] * tmp0;
2576       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2577       break;
2578     case 3:
2579       sum1 = tmp[row];
2580       sum2 = tmp[row -1];
2581       sum3 = tmp[row -2];
2582       v2   = aa + ad[row] + 1;
2583       v3   = aa + ad[row -1] + 1;
2584       for (j=0; j<nz-1; j+=2) {
2585         i0    = vi[j];
2586         i1    = vi[j+1];
2587         tmp0  = tmps[i0];
2588         tmp1  = tmps[i1];
2589         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2590         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2591         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2592       }
2593       if (j== nz-1) {
2594         tmp0  = tmps[vi[j]];
2595         sum1 -= v1[j] * tmp0;
2596         sum2 -= v2[j+1] * tmp0;
2597         sum3 -= v3[j+2] * tmp0;
2598       }
2599       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2600       sum2     -= v2[0]* tmp0;
2601       sum3     -= v3[1] * tmp0;
2602       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2603       sum3     -= v3[0]* tmp0;
2604       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2605 
2606       break;
2607     case 4:
2608       sum1 = tmp[row];
2609       sum2 = tmp[row -1];
2610       sum3 = tmp[row -2];
2611       sum4 = tmp[row -3];
2612       v2   = aa + ad[row]+1;
2613       v3   = aa + ad[row -1]+1;
2614       v4   = aa + ad[row -2]+1;
2615 
2616       for (j=0; j<nz-1; j+=2) {
2617         i0    = vi[j];
2618         i1    = vi[j+1];
2619         tmp0  = tmps[i0];
2620         tmp1  = tmps[i1];
2621         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2622         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2623         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2624         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2625       }
2626       if (j== nz-1) {
2627         tmp0  = tmps[vi[j]];
2628         sum1 -= v1[j] * tmp0;
2629         sum2 -= v2[j+1] * tmp0;
2630         sum3 -= v3[j+2] * tmp0;
2631         sum4 -= v4[j+3] * tmp0;
2632       }
2633 
2634       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2635       sum2     -= v2[0] * tmp0;
2636       sum3     -= v3[1] * tmp0;
2637       sum4     -= v4[2] * tmp0;
2638       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2639       sum3     -= v3[0] * tmp0;
2640       sum4     -= v4[1] * tmp0;
2641       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2642       sum4     -= v4[0] * tmp0;
2643       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2644       break;
2645     case 5:
2646       sum1 = tmp[row];
2647       sum2 = tmp[row -1];
2648       sum3 = tmp[row -2];
2649       sum4 = tmp[row -3];
2650       sum5 = tmp[row -4];
2651       v2   = aa + ad[row]+1;
2652       v3   = aa + ad[row -1]+1;
2653       v4   = aa + ad[row -2]+1;
2654       v5   = aa + ad[row -3]+1;
2655       for (j=0; j<nz-1; j+=2) {
2656         i0    = vi[j];
2657         i1    = vi[j+1];
2658         tmp0  = tmps[i0];
2659         tmp1  = tmps[i1];
2660         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2661         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2662         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2663         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2664         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2665       }
2666       if (j==nz-1) {
2667         tmp0  = tmps[vi[j]];
2668         sum1 -= v1[j] * tmp0;
2669         sum2 -= v2[j+1] * tmp0;
2670         sum3 -= v3[j+2] * tmp0;
2671         sum4 -= v4[j+3] * tmp0;
2672         sum5 -= v5[j+4] * tmp0;
2673       }
2674 
2675       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2676       sum2     -= v2[0] * tmp0;
2677       sum3     -= v3[1] * tmp0;
2678       sum4     -= v4[2] * tmp0;
2679       sum5     -= v5[3] * tmp0;
2680       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2681       sum3     -= v3[0] * tmp0;
2682       sum4     -= v4[1] * tmp0;
2683       sum5     -= v5[2] * tmp0;
2684       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2685       sum4     -= v4[0] * tmp0;
2686       sum5     -= v5[1] * tmp0;
2687       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2688       sum5     -= v5[0] * tmp0;
2689       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2690       break;
2691     default:
2692       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2693     }
2694   }
2695   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2696   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2697   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2698   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2699   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2700   PetscFunctionReturn(0);
2701 }
2702 
2703 
2704 /*
2705      Makes a longer coloring[] array and calls the usual code with that
2706 */
2707 #undef __FUNCT__
2708 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2709 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2710 {
2711   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2712   PetscErrorCode  ierr;
2713   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2714   PetscInt        *colorused,i;
2715   ISColoringValue *newcolor;
2716 
2717   PetscFunctionBegin;
2718   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2719   /* loop over inodes, marking a color for each column*/
2720   row = 0;
2721   for (i=0; i<m; i++) {
2722     for (j=0; j<ns[i]; j++) {
2723       newcolor[row++] = coloring[i] + j*ncolors;
2724     }
2725   }
2726 
2727   /* eliminate unneeded colors */
2728   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2729   for (i=0; i<n; i++) {
2730     colorused[newcolor[i]] = 1;
2731   }
2732 
2733   for (i=1; i<5*ncolors; i++) {
2734     colorused[i] += colorused[i-1];
2735   }
2736   ncolors = colorused[5*ncolors-1];
2737   for (i=0; i<n; i++) {
2738     newcolor[i] = colorused[newcolor[i]]-1;
2739   }
2740   ierr = PetscFree(colorused);CHKERRQ(ierr);
2741   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2742   ierr = PetscFree(coloring);CHKERRQ(ierr);
2743   PetscFunctionReturn(0);
2744 }
2745 
2746 #include <petsc/private/kernels/blockinvert.h>
2747 
2748 #undef __FUNCT__
2749 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2750 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2751 {
2752   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2753   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2754   MatScalar         *ibdiag,*bdiag,work[25],*t;
2755   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2756   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2757   const PetscScalar *xb, *b;
2758   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2759   PetscErrorCode    ierr;
2760   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2761   PetscInt          sz,k,ipvt[5];
2762   PetscBool         zeropivotdetected=PETSC_FALSE;
2763   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2764 
2765   PetscFunctionBegin;
2766   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2767   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2768   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2769 
2770   if (!a->inode.ibdiagvalid) {
2771     if (!a->inode.ibdiag) {
2772       /* calculate space needed for diagonal blocks */
2773       for (i=0; i<m; i++) {
2774         cnt += sizes[i]*sizes[i];
2775       }
2776       a->inode.bdiagsize = cnt;
2777 
2778       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2779     }
2780 
2781     /* copy over the diagonal blocks and invert them */
2782     ibdiag = a->inode.ibdiag;
2783     bdiag  = a->inode.bdiag;
2784     cnt    = 0;
2785     for (i=0, row = 0; i<m; i++) {
2786       for (j=0; j<sizes[i]; j++) {
2787         for (k=0; k<sizes[i]; k++) {
2788           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2789         }
2790       }
2791       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2792 
2793       switch (sizes[i]) {
2794       case 1:
2795         /* Create matrix data structure */
2796         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2797         ibdiag[cnt] = 1.0/ibdiag[cnt];
2798         break;
2799       case 2:
2800         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2801         break;
2802       case 3:
2803         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr);
2804         break;
2805       case 4:
2806         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2807         break;
2808       case 5:
2809         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2810         break;
2811       default:
2812         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2813       }
2814       if (zeropivotdetected) {
2815         A->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2816       }
2817 
2818       cnt += sizes[i]*sizes[i];
2819       row += sizes[i];
2820     }
2821     a->inode.ibdiagvalid = PETSC_TRUE;
2822   }
2823   ibdiag = a->inode.ibdiag;
2824   bdiag  = a->inode.bdiag;
2825   t      = a->inode.ssor_work;
2826 
2827   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2828   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2829   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2830   if (flag & SOR_ZERO_INITIAL_GUESS) {
2831     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2832 
2833       for (i=0, row=0; i<m; i++) {
2834         sz  = diag[row] - ii[row];
2835         v1  = a->a + ii[row];
2836         idx = a->j + ii[row];
2837 
2838         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2839         switch (sizes[i]) {
2840         case 1:
2841 
2842           sum1 = b[row];
2843           for (n = 0; n<sz-1; n+=2) {
2844             i1    = idx[0];
2845             i2    = idx[1];
2846             idx  += 2;
2847             tmp0  = x[i1];
2848             tmp1  = x[i2];
2849             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2850           }
2851 
2852           if (n == sz-1) {
2853             tmp0  = x[*idx];
2854             sum1 -= *v1 * tmp0;
2855           }
2856           t[row]   = sum1;
2857           x[row++] = sum1*(*ibdiag++);
2858           break;
2859         case 2:
2860           v2   = a->a + ii[row+1];
2861           sum1 = b[row];
2862           sum2 = b[row+1];
2863           for (n = 0; n<sz-1; n+=2) {
2864             i1    = idx[0];
2865             i2    = idx[1];
2866             idx  += 2;
2867             tmp0  = x[i1];
2868             tmp1  = x[i2];
2869             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2870             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2871           }
2872 
2873           if (n == sz-1) {
2874             tmp0  = x[*idx];
2875             sum1 -= v1[0] * tmp0;
2876             sum2 -= v2[0] * tmp0;
2877           }
2878           t[row]   = sum1;
2879           t[row+1] = sum2;
2880           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2881           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2882           ibdiag  += 4;
2883           break;
2884         case 3:
2885           v2   = a->a + ii[row+1];
2886           v3   = a->a + ii[row+2];
2887           sum1 = b[row];
2888           sum2 = b[row+1];
2889           sum3 = b[row+2];
2890           for (n = 0; n<sz-1; n+=2) {
2891             i1    = idx[0];
2892             i2    = idx[1];
2893             idx  += 2;
2894             tmp0  = x[i1];
2895             tmp1  = x[i2];
2896             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2897             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2898             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2899           }
2900 
2901           if (n == sz-1) {
2902             tmp0  = x[*idx];
2903             sum1 -= v1[0] * tmp0;
2904             sum2 -= v2[0] * tmp0;
2905             sum3 -= v3[0] * tmp0;
2906           }
2907           t[row]   = sum1;
2908           t[row+1] = sum2;
2909           t[row+2] = sum3;
2910           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2911           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2912           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2913           ibdiag  += 9;
2914           break;
2915         case 4:
2916           v2   = a->a + ii[row+1];
2917           v3   = a->a + ii[row+2];
2918           v4   = a->a + ii[row+3];
2919           sum1 = b[row];
2920           sum2 = b[row+1];
2921           sum3 = b[row+2];
2922           sum4 = b[row+3];
2923           for (n = 0; n<sz-1; n+=2) {
2924             i1    = idx[0];
2925             i2    = idx[1];
2926             idx  += 2;
2927             tmp0  = x[i1];
2928             tmp1  = x[i2];
2929             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2930             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2931             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2932             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2933           }
2934 
2935           if (n == sz-1) {
2936             tmp0  = x[*idx];
2937             sum1 -= v1[0] * tmp0;
2938             sum2 -= v2[0] * tmp0;
2939             sum3 -= v3[0] * tmp0;
2940             sum4 -= v4[0] * tmp0;
2941           }
2942           t[row]   = sum1;
2943           t[row+1] = sum2;
2944           t[row+2] = sum3;
2945           t[row+3] = sum4;
2946           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2947           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2948           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2949           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2950           ibdiag  += 16;
2951           break;
2952         case 5:
2953           v2   = a->a + ii[row+1];
2954           v3   = a->a + ii[row+2];
2955           v4   = a->a + ii[row+3];
2956           v5   = a->a + ii[row+4];
2957           sum1 = b[row];
2958           sum2 = b[row+1];
2959           sum3 = b[row+2];
2960           sum4 = b[row+3];
2961           sum5 = b[row+4];
2962           for (n = 0; n<sz-1; n+=2) {
2963             i1    = idx[0];
2964             i2    = idx[1];
2965             idx  += 2;
2966             tmp0  = x[i1];
2967             tmp1  = x[i2];
2968             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2969             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2970             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2971             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2972             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2973           }
2974 
2975           if (n == sz-1) {
2976             tmp0  = x[*idx];
2977             sum1 -= v1[0] * tmp0;
2978             sum2 -= v2[0] * tmp0;
2979             sum3 -= v3[0] * tmp0;
2980             sum4 -= v4[0] * tmp0;
2981             sum5 -= v5[0] * tmp0;
2982           }
2983           t[row]   = sum1;
2984           t[row+1] = sum2;
2985           t[row+2] = sum3;
2986           t[row+3] = sum4;
2987           t[row+4] = sum5;
2988           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2989           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2990           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2991           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2992           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2993           ibdiag  += 25;
2994           break;
2995         default:
2996           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2997         }
2998       }
2999 
3000       xb   = t;
3001       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3002     } else xb = b;
3003     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3004 
3005       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3006       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3007         ibdiag -= sizes[i]*sizes[i];
3008         sz      = ii[row+1] - diag[row] - 1;
3009         v1      = a->a + diag[row] + 1;
3010         idx     = a->j + diag[row] + 1;
3011 
3012         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3013         switch (sizes[i]) {
3014         case 1:
3015 
3016           sum1 = xb[row];
3017           for (n = 0; n<sz-1; n+=2) {
3018             i1    = idx[0];
3019             i2    = idx[1];
3020             idx  += 2;
3021             tmp0  = x[i1];
3022             tmp1  = x[i2];
3023             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3024           }
3025 
3026           if (n == sz-1) {
3027             tmp0  = x[*idx];
3028             sum1 -= *v1*tmp0;
3029           }
3030           x[row--] = sum1*(*ibdiag);
3031           break;
3032 
3033         case 2:
3034 
3035           sum1 = xb[row];
3036           sum2 = xb[row-1];
3037           /* note that sum1 is associated with the second of the two rows */
3038           v2 = a->a + diag[row-1] + 2;
3039           for (n = 0; n<sz-1; n+=2) {
3040             i1    = idx[0];
3041             i2    = idx[1];
3042             idx  += 2;
3043             tmp0  = x[i1];
3044             tmp1  = x[i2];
3045             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3046             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3047           }
3048 
3049           if (n == sz-1) {
3050             tmp0  = x[*idx];
3051             sum1 -= *v1*tmp0;
3052             sum2 -= *v2*tmp0;
3053           }
3054           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3055           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3056           break;
3057         case 3:
3058 
3059           sum1 = xb[row];
3060           sum2 = xb[row-1];
3061           sum3 = xb[row-2];
3062           v2   = a->a + diag[row-1] + 2;
3063           v3   = a->a + diag[row-2] + 3;
3064           for (n = 0; n<sz-1; n+=2) {
3065             i1    = idx[0];
3066             i2    = idx[1];
3067             idx  += 2;
3068             tmp0  = x[i1];
3069             tmp1  = x[i2];
3070             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3071             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3072             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3073           }
3074 
3075           if (n == sz-1) {
3076             tmp0  = x[*idx];
3077             sum1 -= *v1*tmp0;
3078             sum2 -= *v2*tmp0;
3079             sum3 -= *v3*tmp0;
3080           }
3081           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3082           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3083           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3084           break;
3085         case 4:
3086 
3087           sum1 = xb[row];
3088           sum2 = xb[row-1];
3089           sum3 = xb[row-2];
3090           sum4 = xb[row-3];
3091           v2   = a->a + diag[row-1] + 2;
3092           v3   = a->a + diag[row-2] + 3;
3093           v4   = a->a + diag[row-3] + 4;
3094           for (n = 0; n<sz-1; n+=2) {
3095             i1    = idx[0];
3096             i2    = idx[1];
3097             idx  += 2;
3098             tmp0  = x[i1];
3099             tmp1  = x[i2];
3100             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3101             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3102             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3103             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3104           }
3105 
3106           if (n == sz-1) {
3107             tmp0  = x[*idx];
3108             sum1 -= *v1*tmp0;
3109             sum2 -= *v2*tmp0;
3110             sum3 -= *v3*tmp0;
3111             sum4 -= *v4*tmp0;
3112           }
3113           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3114           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3115           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3116           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3117           break;
3118         case 5:
3119 
3120           sum1 = xb[row];
3121           sum2 = xb[row-1];
3122           sum3 = xb[row-2];
3123           sum4 = xb[row-3];
3124           sum5 = xb[row-4];
3125           v2   = a->a + diag[row-1] + 2;
3126           v3   = a->a + diag[row-2] + 3;
3127           v4   = a->a + diag[row-3] + 4;
3128           v5   = a->a + diag[row-4] + 5;
3129           for (n = 0; n<sz-1; n+=2) {
3130             i1    = idx[0];
3131             i2    = idx[1];
3132             idx  += 2;
3133             tmp0  = x[i1];
3134             tmp1  = x[i2];
3135             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3136             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3137             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3138             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3139             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3140           }
3141 
3142           if (n == sz-1) {
3143             tmp0  = x[*idx];
3144             sum1 -= *v1*tmp0;
3145             sum2 -= *v2*tmp0;
3146             sum3 -= *v3*tmp0;
3147             sum4 -= *v4*tmp0;
3148             sum5 -= *v5*tmp0;
3149           }
3150           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3151           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3152           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3153           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3154           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3155           break;
3156         default:
3157           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3158         }
3159       }
3160 
3161       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3162     }
3163     its--;
3164   }
3165   while (its--) {
3166 
3167     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3168       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3169            i<m;
3170            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3171 
3172         sz  = diag[row] - ii[row];
3173         v1  = a->a + ii[row];
3174         idx = a->j + ii[row];
3175         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3176         switch (sizes[i]) {
3177         case 1:
3178           sum1 = b[row];
3179           for (n = 0; n<sz-1; n+=2) {
3180             i1    = idx[0];
3181             i2    = idx[1];
3182             idx  += 2;
3183             tmp0  = x[i1];
3184             tmp1  = x[i2];
3185             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3186           }
3187           if (n == sz-1) {
3188             tmp0  = x[*idx++];
3189             sum1 -= *v1 * tmp0;
3190             v1++;
3191           }
3192           t[row]   = sum1;
3193           sz      = ii[row+1] - diag[row] - 1;
3194           idx     = a->j + diag[row] + 1;
3195           v1 += 1;
3196           for (n = 0; n<sz-1; n+=2) {
3197             i1    = idx[0];
3198             i2    = idx[1];
3199             idx  += 2;
3200             tmp0  = x[i1];
3201             tmp1  = x[i2];
3202             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3203           }
3204           if (n == sz-1) {
3205             tmp0  = x[*idx++];
3206             sum1 -= *v1 * tmp0;
3207           }
3208           /* in MatSOR_SeqAIJ this line would be
3209            *
3210            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3211            *
3212            * but omega == 1, so this becomes
3213            *
3214            * x[row] = sum1*(*ibdiag++);
3215            *
3216            */
3217           x[row] = sum1*(*ibdiag);
3218           break;
3219         case 2:
3220           v2   = a->a + ii[row+1];
3221           sum1 = b[row];
3222           sum2 = b[row+1];
3223           for (n = 0; n<sz-1; n+=2) {
3224             i1    = idx[0];
3225             i2    = idx[1];
3226             idx  += 2;
3227             tmp0  = x[i1];
3228             tmp1  = x[i2];
3229             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3230             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3231           }
3232           if (n == sz-1) {
3233             tmp0  = x[*idx++];
3234             sum1 -= v1[0] * tmp0;
3235             sum2 -= v2[0] * tmp0;
3236             v1++; v2++;
3237           }
3238           t[row]   = sum1;
3239           t[row+1] = sum2;
3240           sz      = ii[row+1] - diag[row] - 2;
3241           idx     = a->j + diag[row] + 2;
3242           v1 += 2;
3243           v2 += 2;
3244           for (n = 0; n<sz-1; n+=2) {
3245             i1    = idx[0];
3246             i2    = idx[1];
3247             idx  += 2;
3248             tmp0  = x[i1];
3249             tmp1  = x[i2];
3250             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3251             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3252           }
3253           if (n == sz-1) {
3254             tmp0  = x[*idx];
3255             sum1 -= v1[0] * tmp0;
3256             sum2 -= v2[0] * tmp0;
3257           }
3258           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3259           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3260           break;
3261         case 3:
3262           v2   = a->a + ii[row+1];
3263           v3   = a->a + ii[row+2];
3264           sum1 = b[row];
3265           sum2 = b[row+1];
3266           sum3 = b[row+2];
3267           for (n = 0; n<sz-1; n+=2) {
3268             i1    = idx[0];
3269             i2    = idx[1];
3270             idx  += 2;
3271             tmp0  = x[i1];
3272             tmp1  = x[i2];
3273             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3274             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3275             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3276           }
3277           if (n == sz-1) {
3278             tmp0  = x[*idx++];
3279             sum1 -= v1[0] * tmp0;
3280             sum2 -= v2[0] * tmp0;
3281             sum3 -= v3[0] * tmp0;
3282             v1++; v2++; v3++;
3283           }
3284           t[row]   = sum1;
3285           t[row+1] = sum2;
3286           t[row+2] = sum3;
3287           sz      = ii[row+1] - diag[row] - 3;
3288           idx     = a->j + diag[row] + 3;
3289           v1 += 3;
3290           v2 += 3;
3291           v3 += 3;
3292           for (n = 0; n<sz-1; n+=2) {
3293             i1    = idx[0];
3294             i2    = idx[1];
3295             idx  += 2;
3296             tmp0  = x[i1];
3297             tmp1  = x[i2];
3298             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3299             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3300             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3301           }
3302           if (n == sz-1) {
3303             tmp0  = x[*idx];
3304             sum1 -= v1[0] * tmp0;
3305             sum2 -= v2[0] * tmp0;
3306             sum3 -= v3[0] * tmp0;
3307           }
3308           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3309           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3310           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3311           break;
3312         case 4:
3313           v2   = a->a + ii[row+1];
3314           v3   = a->a + ii[row+2];
3315           v4   = a->a + ii[row+3];
3316           sum1 = b[row];
3317           sum2 = b[row+1];
3318           sum3 = b[row+2];
3319           sum4 = b[row+3];
3320           for (n = 0; n<sz-1; n+=2) {
3321             i1    = idx[0];
3322             i2    = idx[1];
3323             idx  += 2;
3324             tmp0  = x[i1];
3325             tmp1  = x[i2];
3326             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3327             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3328             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3329             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3330           }
3331           if (n == sz-1) {
3332             tmp0  = x[*idx++];
3333             sum1 -= v1[0] * tmp0;
3334             sum2 -= v2[0] * tmp0;
3335             sum3 -= v3[0] * tmp0;
3336             sum4 -= v4[0] * tmp0;
3337             v1++; v2++; v3++; v4++;
3338           }
3339           t[row]   = sum1;
3340           t[row+1] = sum2;
3341           t[row+2] = sum3;
3342           t[row+3] = sum4;
3343           sz      = ii[row+1] - diag[row] - 4;
3344           idx     = a->j + diag[row] + 4;
3345           v1 += 4;
3346           v2 += 4;
3347           v3 += 4;
3348           v4 += 4;
3349           for (n = 0; n<sz-1; n+=2) {
3350             i1    = idx[0];
3351             i2    = idx[1];
3352             idx  += 2;
3353             tmp0  = x[i1];
3354             tmp1  = x[i2];
3355             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3356             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3357             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3358             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3359           }
3360           if (n == sz-1) {
3361             tmp0  = x[*idx];
3362             sum1 -= v1[0] * tmp0;
3363             sum2 -= v2[0] * tmp0;
3364             sum3 -= v3[0] * tmp0;
3365             sum4 -= v4[0] * tmp0;
3366           }
3367           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3368           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3369           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3370           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3371           break;
3372         case 5:
3373           v2   = a->a + ii[row+1];
3374           v3   = a->a + ii[row+2];
3375           v4   = a->a + ii[row+3];
3376           v5   = a->a + ii[row+4];
3377           sum1 = b[row];
3378           sum2 = b[row+1];
3379           sum3 = b[row+2];
3380           sum4 = b[row+3];
3381           sum5 = b[row+4];
3382           for (n = 0; n<sz-1; n+=2) {
3383             i1    = idx[0];
3384             i2    = idx[1];
3385             idx  += 2;
3386             tmp0  = x[i1];
3387             tmp1  = x[i2];
3388             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3389             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3390             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3391             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3392             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3393           }
3394           if (n == sz-1) {
3395             tmp0  = x[*idx++];
3396             sum1 -= v1[0] * tmp0;
3397             sum2 -= v2[0] * tmp0;
3398             sum3 -= v3[0] * tmp0;
3399             sum4 -= v4[0] * tmp0;
3400             sum5 -= v5[0] * tmp0;
3401             v1++; v2++; v3++; v4++; v5++;
3402           }
3403           t[row]   = sum1;
3404           t[row+1] = sum2;
3405           t[row+2] = sum3;
3406           t[row+3] = sum4;
3407           t[row+4] = sum5;
3408           sz      = ii[row+1] - diag[row] - 5;
3409           idx     = a->j + diag[row] + 5;
3410           v1 += 5;
3411           v2 += 5;
3412           v3 += 5;
3413           v4 += 5;
3414           v5 += 5;
3415           for (n = 0; n<sz-1; n+=2) {
3416             i1    = idx[0];
3417             i2    = idx[1];
3418             idx  += 2;
3419             tmp0  = x[i1];
3420             tmp1  = x[i2];
3421             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3422             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3423             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3424             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3425             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3426           }
3427           if (n == sz-1) {
3428             tmp0  = x[*idx];
3429             sum1 -= v1[0] * tmp0;
3430             sum2 -= v2[0] * tmp0;
3431             sum3 -= v3[0] * tmp0;
3432             sum4 -= v4[0] * tmp0;
3433             sum5 -= v5[0] * tmp0;
3434           }
3435           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3436           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3437           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3438           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3439           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3440           break;
3441         default:
3442           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3443         }
3444       }
3445       xb   = t;
3446       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3447     } else xb = b;
3448 
3449     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3450 
3451       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3452       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3453         ibdiag -= sizes[i]*sizes[i];
3454 
3455         /* set RHS */
3456         if (xb == b) {
3457           /* whole (old way) */
3458           sz      = ii[row+1] - ii[row];
3459           idx     = a->j + ii[row];
3460           switch (sizes[i]) {
3461           case 5:
3462             v5      = a->a + ii[row-4];
3463           case 4: /* fall through */
3464             v4      = a->a + ii[row-3];
3465           case 3:
3466             v3      = a->a + ii[row-2];
3467           case 2:
3468             v2      = a->a + ii[row-1];
3469           case 1:
3470             v1      = a->a + ii[row];
3471             break;
3472           default:
3473             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3474           }
3475         } else {
3476           /* upper, no diag */
3477           sz      = ii[row+1] - diag[row] - 1;
3478           idx     = a->j + diag[row] + 1;
3479           switch (sizes[i]) {
3480           case 5:
3481             v5      = a->a + diag[row-4] + 5;
3482           case 4: /* fall through */
3483             v4      = a->a + diag[row-3] + 4;
3484           case 3:
3485             v3      = a->a + diag[row-2] + 3;
3486           case 2:
3487             v2      = a->a + diag[row-1] + 2;
3488           case 1:
3489             v1      = a->a + diag[row] + 1;
3490           }
3491         }
3492         /* set sum */
3493         switch (sizes[i]) {
3494         case 5:
3495           sum5 = xb[row-4];
3496         case 4: /* fall through */
3497           sum4 = xb[row-3];
3498         case 3:
3499           sum3 = xb[row-2];
3500         case 2:
3501           sum2 = xb[row-1];
3502         case 1:
3503           /* note that sum1 is associated with the last row */
3504           sum1 = xb[row];
3505         }
3506         /* do sums */
3507         for (n = 0; n<sz-1; n+=2) {
3508             i1    = idx[0];
3509             i2    = idx[1];
3510             idx  += 2;
3511             tmp0  = x[i1];
3512             tmp1  = x[i2];
3513             switch (sizes[i]) {
3514             case 5:
3515               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3516             case 4: /* fall through */
3517               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3518             case 3:
3519               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3520             case 2:
3521               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3522             case 1:
3523               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3524             }
3525         }
3526         /* ragged edge */
3527         if (n == sz-1) {
3528           tmp0  = x[*idx];
3529           switch (sizes[i]) {
3530           case 5:
3531             sum5 -= *v5*tmp0;
3532           case 4: /* fall through */
3533             sum4 -= *v4*tmp0;
3534           case 3:
3535             sum3 -= *v3*tmp0;
3536           case 2:
3537             sum2 -= *v2*tmp0;
3538           case 1:
3539             sum1 -= *v1*tmp0;
3540           }
3541         }
3542         /* update */
3543         if (xb == b) {
3544           /* whole (old way) w/ diag */
3545           switch (sizes[i]) {
3546           case 5:
3547             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3548             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3549             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3550             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3551             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3552             break;
3553           case 4:
3554             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3555             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3556             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3557             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3558             break;
3559           case 3:
3560             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3561             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3562             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3563             break;
3564           case 2:
3565             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3566             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3567             break;
3568           case 1:
3569             x[row--] += sum1*(*ibdiag);
3570             break;
3571           }
3572         } else {
3573           /* no diag so set =  */
3574           switch (sizes[i]) {
3575           case 5:
3576             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3577             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3578             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3579             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3580             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3581             break;
3582           case 4:
3583             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3584             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3585             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3586             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3587             break;
3588           case 3:
3589             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3590             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3591             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3592             break;
3593           case 2:
3594             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3595             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3596             break;
3597           case 1:
3598             x[row--] = sum1*(*ibdiag);
3599             break;
3600           }
3601         }
3602       }
3603       if (xb == b) {
3604         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3605       } else {
3606         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3607       }
3608     }
3609   }
3610   if (flag & SOR_EISENSTAT) {
3611     /*
3612           Apply  (U + D)^-1  where D is now the block diagonal
3613     */
3614     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3615     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3616       ibdiag -= sizes[i]*sizes[i];
3617       sz      = ii[row+1] - diag[row] - 1;
3618       v1      = a->a + diag[row] + 1;
3619       idx     = a->j + diag[row] + 1;
3620       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3621       switch (sizes[i]) {
3622       case 1:
3623 
3624         sum1 = b[row];
3625         for (n = 0; n<sz-1; n+=2) {
3626           i1    = idx[0];
3627           i2    = idx[1];
3628           idx  += 2;
3629           tmp0  = x[i1];
3630           tmp1  = x[i2];
3631           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3632         }
3633 
3634         if (n == sz-1) {
3635           tmp0  = x[*idx];
3636           sum1 -= *v1*tmp0;
3637         }
3638         x[row] = sum1*(*ibdiag);row--;
3639         break;
3640 
3641       case 2:
3642 
3643         sum1 = b[row];
3644         sum2 = b[row-1];
3645         /* note that sum1 is associated with the second of the two rows */
3646         v2 = a->a + diag[row-1] + 2;
3647         for (n = 0; n<sz-1; n+=2) {
3648           i1    = idx[0];
3649           i2    = idx[1];
3650           idx  += 2;
3651           tmp0  = x[i1];
3652           tmp1  = x[i2];
3653           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3654           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3655         }
3656 
3657         if (n == sz-1) {
3658           tmp0  = x[*idx];
3659           sum1 -= *v1*tmp0;
3660           sum2 -= *v2*tmp0;
3661         }
3662         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3663         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3664         row     -= 2;
3665         break;
3666       case 3:
3667 
3668         sum1 = b[row];
3669         sum2 = b[row-1];
3670         sum3 = b[row-2];
3671         v2   = a->a + diag[row-1] + 2;
3672         v3   = a->a + diag[row-2] + 3;
3673         for (n = 0; n<sz-1; n+=2) {
3674           i1    = idx[0];
3675           i2    = idx[1];
3676           idx  += 2;
3677           tmp0  = x[i1];
3678           tmp1  = x[i2];
3679           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3680           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3681           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3682         }
3683 
3684         if (n == sz-1) {
3685           tmp0  = x[*idx];
3686           sum1 -= *v1*tmp0;
3687           sum2 -= *v2*tmp0;
3688           sum3 -= *v3*tmp0;
3689         }
3690         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3691         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3692         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3693         row     -= 3;
3694         break;
3695       case 4:
3696 
3697         sum1 = b[row];
3698         sum2 = b[row-1];
3699         sum3 = b[row-2];
3700         sum4 = b[row-3];
3701         v2   = a->a + diag[row-1] + 2;
3702         v3   = a->a + diag[row-2] + 3;
3703         v4   = a->a + diag[row-3] + 4;
3704         for (n = 0; n<sz-1; n+=2) {
3705           i1    = idx[0];
3706           i2    = idx[1];
3707           idx  += 2;
3708           tmp0  = x[i1];
3709           tmp1  = x[i2];
3710           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3711           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3712           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3713           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3714         }
3715 
3716         if (n == sz-1) {
3717           tmp0  = x[*idx];
3718           sum1 -= *v1*tmp0;
3719           sum2 -= *v2*tmp0;
3720           sum3 -= *v3*tmp0;
3721           sum4 -= *v4*tmp0;
3722         }
3723         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3724         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3725         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3726         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3727         row     -= 4;
3728         break;
3729       case 5:
3730 
3731         sum1 = b[row];
3732         sum2 = b[row-1];
3733         sum3 = b[row-2];
3734         sum4 = b[row-3];
3735         sum5 = b[row-4];
3736         v2   = a->a + diag[row-1] + 2;
3737         v3   = a->a + diag[row-2] + 3;
3738         v4   = a->a + diag[row-3] + 4;
3739         v5   = a->a + diag[row-4] + 5;
3740         for (n = 0; n<sz-1; n+=2) {
3741           i1    = idx[0];
3742           i2    = idx[1];
3743           idx  += 2;
3744           tmp0  = x[i1];
3745           tmp1  = x[i2];
3746           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3747           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3748           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3749           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3750           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3751         }
3752 
3753         if (n == sz-1) {
3754           tmp0  = x[*idx];
3755           sum1 -= *v1*tmp0;
3756           sum2 -= *v2*tmp0;
3757           sum3 -= *v3*tmp0;
3758           sum4 -= *v4*tmp0;
3759           sum5 -= *v5*tmp0;
3760         }
3761         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3762         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3763         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3764         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3765         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3766         row     -= 5;
3767         break;
3768       default:
3769         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3770       }
3771     }
3772     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3773 
3774     /*
3775            t = b - D x    where D is the block diagonal
3776     */
3777     cnt = 0;
3778     for (i=0, row=0; i<m; i++) {
3779       switch (sizes[i]) {
3780       case 1:
3781         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3782         break;
3783       case 2:
3784         x1       = x[row]; x2 = x[row+1];
3785         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3786         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3787         t[row]   = b[row] - tmp1;
3788         t[row+1] = b[row+1] - tmp2; row += 2;
3789         cnt     += 4;
3790         break;
3791       case 3:
3792         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3793         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3794         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3795         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3796         t[row]   = b[row] - tmp1;
3797         t[row+1] = b[row+1] - tmp2;
3798         t[row+2] = b[row+2] - tmp3; row += 3;
3799         cnt     += 9;
3800         break;
3801       case 4:
3802         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3803         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3804         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3805         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3806         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3807         t[row]   = b[row] - tmp1;
3808         t[row+1] = b[row+1] - tmp2;
3809         t[row+2] = b[row+2] - tmp3;
3810         t[row+3] = b[row+3] - tmp4; row += 4;
3811         cnt     += 16;
3812         break;
3813       case 5:
3814         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3815         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3816         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3817         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3818         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3819         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3820         t[row]   = b[row] - tmp1;
3821         t[row+1] = b[row+1] - tmp2;
3822         t[row+2] = b[row+2] - tmp3;
3823         t[row+3] = b[row+3] - tmp4;
3824         t[row+4] = b[row+4] - tmp5;row += 5;
3825         cnt     += 25;
3826         break;
3827       default:
3828         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3829       }
3830     }
3831     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3832 
3833 
3834 
3835     /*
3836           Apply (L + D)^-1 where D is the block diagonal
3837     */
3838     for (i=0, row=0; i<m; i++) {
3839       sz  = diag[row] - ii[row];
3840       v1  = a->a + ii[row];
3841       idx = a->j + ii[row];
3842       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3843       switch (sizes[i]) {
3844       case 1:
3845 
3846         sum1 = t[row];
3847         for (n = 0; n<sz-1; n+=2) {
3848           i1    = idx[0];
3849           i2    = idx[1];
3850           idx  += 2;
3851           tmp0  = t[i1];
3852           tmp1  = t[i2];
3853           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3854         }
3855 
3856         if (n == sz-1) {
3857           tmp0  = t[*idx];
3858           sum1 -= *v1 * tmp0;
3859         }
3860         x[row] += t[row] = sum1*(*ibdiag++); row++;
3861         break;
3862       case 2:
3863         v2   = a->a + ii[row+1];
3864         sum1 = t[row];
3865         sum2 = t[row+1];
3866         for (n = 0; n<sz-1; n+=2) {
3867           i1    = idx[0];
3868           i2    = idx[1];
3869           idx  += 2;
3870           tmp0  = t[i1];
3871           tmp1  = t[i2];
3872           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3873           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3874         }
3875 
3876         if (n == sz-1) {
3877           tmp0  = t[*idx];
3878           sum1 -= v1[0] * tmp0;
3879           sum2 -= v2[0] * tmp0;
3880         }
3881         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3882         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3883         ibdiag   += 4; row += 2;
3884         break;
3885       case 3:
3886         v2   = a->a + ii[row+1];
3887         v3   = a->a + ii[row+2];
3888         sum1 = t[row];
3889         sum2 = t[row+1];
3890         sum3 = t[row+2];
3891         for (n = 0; n<sz-1; n+=2) {
3892           i1    = idx[0];
3893           i2    = idx[1];
3894           idx  += 2;
3895           tmp0  = t[i1];
3896           tmp1  = t[i2];
3897           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3898           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3899           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3900         }
3901 
3902         if (n == sz-1) {
3903           tmp0  = t[*idx];
3904           sum1 -= v1[0] * tmp0;
3905           sum2 -= v2[0] * tmp0;
3906           sum3 -= v3[0] * tmp0;
3907         }
3908         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3909         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3910         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3911         ibdiag   += 9; row += 3;
3912         break;
3913       case 4:
3914         v2   = a->a + ii[row+1];
3915         v3   = a->a + ii[row+2];
3916         v4   = a->a + ii[row+3];
3917         sum1 = t[row];
3918         sum2 = t[row+1];
3919         sum3 = t[row+2];
3920         sum4 = t[row+3];
3921         for (n = 0; n<sz-1; n+=2) {
3922           i1    = idx[0];
3923           i2    = idx[1];
3924           idx  += 2;
3925           tmp0  = t[i1];
3926           tmp1  = t[i2];
3927           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3928           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3929           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3930           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3931         }
3932 
3933         if (n == sz-1) {
3934           tmp0  = t[*idx];
3935           sum1 -= v1[0] * tmp0;
3936           sum2 -= v2[0] * tmp0;
3937           sum3 -= v3[0] * tmp0;
3938           sum4 -= v4[0] * tmp0;
3939         }
3940         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3941         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3942         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3943         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3944         ibdiag   += 16; row += 4;
3945         break;
3946       case 5:
3947         v2   = a->a + ii[row+1];
3948         v3   = a->a + ii[row+2];
3949         v4   = a->a + ii[row+3];
3950         v5   = a->a + ii[row+4];
3951         sum1 = t[row];
3952         sum2 = t[row+1];
3953         sum3 = t[row+2];
3954         sum4 = t[row+3];
3955         sum5 = t[row+4];
3956         for (n = 0; n<sz-1; n+=2) {
3957           i1    = idx[0];
3958           i2    = idx[1];
3959           idx  += 2;
3960           tmp0  = t[i1];
3961           tmp1  = t[i2];
3962           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3963           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3964           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3965           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3966           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3967         }
3968 
3969         if (n == sz-1) {
3970           tmp0  = t[*idx];
3971           sum1 -= v1[0] * tmp0;
3972           sum2 -= v2[0] * tmp0;
3973           sum3 -= v3[0] * tmp0;
3974           sum4 -= v4[0] * tmp0;
3975           sum5 -= v5[0] * tmp0;
3976         }
3977         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3978         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3979         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3980         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3981         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3982         ibdiag   += 25; row += 5;
3983         break;
3984       default:
3985         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3986       }
3987     }
3988     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3989   }
3990   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3991   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3992   PetscFunctionReturn(0);
3993 }
3994 
3995 #undef __FUNCT__
3996 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3997 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3998 {
3999   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
4000   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
4001   const MatScalar   *bdiag = a->inode.bdiag;
4002   const PetscScalar *b;
4003   PetscErrorCode    ierr;
4004   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4005   const PetscInt    *sizes = a->inode.size;
4006 
4007   PetscFunctionBegin;
4008   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4009   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4010   cnt  = 0;
4011   for (i=0, row=0; i<m; i++) {
4012     switch (sizes[i]) {
4013     case 1:
4014       x[row] = b[row]*bdiag[cnt++];row++;
4015       break;
4016     case 2:
4017       x1       = b[row]; x2 = b[row+1];
4018       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4019       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4020       x[row++] = tmp1;
4021       x[row++] = tmp2;
4022       cnt     += 4;
4023       break;
4024     case 3:
4025       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4026       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4027       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4028       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4029       x[row++] = tmp1;
4030       x[row++] = tmp2;
4031       x[row++] = tmp3;
4032       cnt     += 9;
4033       break;
4034     case 4:
4035       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4036       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4037       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4038       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4039       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4040       x[row++] = tmp1;
4041       x[row++] = tmp2;
4042       x[row++] = tmp3;
4043       x[row++] = tmp4;
4044       cnt     += 16;
4045       break;
4046     case 5:
4047       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4048       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4049       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4050       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4051       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4052       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4053       x[row++] = tmp1;
4054       x[row++] = tmp2;
4055       x[row++] = tmp3;
4056       x[row++] = tmp4;
4057       x[row++] = tmp5;
4058       cnt     += 25;
4059       break;
4060     default:
4061       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4062     }
4063   }
4064   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4065   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4066   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4067   PetscFunctionReturn(0);
4068 }
4069 
4070 /*
4071     samestructure indicates that the matrix has not changed its nonzero structure so we
4072     do not need to recompute the inodes
4073 */
4074 #undef __FUNCT__
4075 #define __FUNCT__ "MatSeqAIJCheckInode"
4076 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4077 {
4078   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4079   PetscErrorCode ierr;
4080   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4081   PetscBool      flag;
4082   const PetscInt *idx,*idy,*ii;
4083 
4084   PetscFunctionBegin;
4085   if (!a->inode.use) PetscFunctionReturn(0);
4086   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4087 
4088   m = A->rmap->n;
4089   if (a->inode.size) ns = a->inode.size;
4090   else {
4091     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4092   }
4093 
4094   i          = 0;
4095   node_count = 0;
4096   idx        = a->j;
4097   ii         = a->i;
4098   while (i < m) {                /* For each row */
4099     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4100     /* Limits the number of elements in a node to 'a->inode.limit' */
4101     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4102       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4103       if (nzy != nzx) break;
4104       idy += nzx;              /* Same nonzero pattern */
4105       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4106       if (!flag) break;
4107     }
4108     ns[node_count++] = blk_size;
4109     idx             += blk_size*nzx;
4110     i                = j;
4111   }
4112   /* If not enough inodes found,, do not use inode version of the routines */
4113   if (!m || node_count > .8*m) {
4114     ierr = PetscFree(ns);CHKERRQ(ierr);
4115 
4116     a->inode.node_count       = 0;
4117     a->inode.size             = NULL;
4118     a->inode.use              = PETSC_FALSE;
4119     A->ops->mult              = MatMult_SeqAIJ;
4120     A->ops->sor               = MatSOR_SeqAIJ;
4121     A->ops->multadd           = MatMultAdd_SeqAIJ;
4122     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4123     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4124     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4125     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4126     A->ops->coloringpatch     = 0;
4127     A->ops->multdiagonalblock = 0;
4128 
4129     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4130   } else {
4131     if (!A->factortype) {
4132       A->ops->mult              = MatMult_SeqAIJ_Inode;
4133       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4134       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4135       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4136       if (A->rmap->n == A->cmap->n) {
4137         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4138         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4139         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4140         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4141         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4142       }
4143     } else {
4144       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4145     }
4146     a->inode.node_count = node_count;
4147     a->inode.size       = ns;
4148     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4149   }
4150   a->inode.checked          = PETSC_TRUE;
4151   a->inode.mat_nonzerostate = A->nonzerostate;
4152   PetscFunctionReturn(0);
4153 }
4154 
4155 #undef __FUNCT__
4156 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4157 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4158 {
4159   Mat            B =*C;
4160   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4161   PetscErrorCode ierr;
4162   PetscInt       m=A->rmap->n;
4163 
4164   PetscFunctionBegin;
4165   c->inode.use       = a->inode.use;
4166   c->inode.limit     = a->inode.limit;
4167   c->inode.max_limit = a->inode.max_limit;
4168   if (a->inode.size) {
4169     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4170     c->inode.node_count = a->inode.node_count;
4171     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4172     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4173     if (!B->factortype) {
4174       B->ops->mult              = MatMult_SeqAIJ_Inode;
4175       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4176       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4177       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4178       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4179       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4180       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4181       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4182       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4183     } else {
4184       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4185     }
4186   } else {
4187     c->inode.size       = 0;
4188     c->inode.node_count = 0;
4189   }
4190   c->inode.ibdiagvalid = PETSC_FALSE;
4191   c->inode.ibdiag      = 0;
4192   c->inode.bdiag       = 0;
4193   PetscFunctionReturn(0);
4194 }
4195 
4196 #undef __FUNCT__
4197 #define __FUNCT__ "MatGetRow_FactoredLU"
4198 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4199 {
4200   PetscInt       k;
4201   const PetscInt *vi;
4202 
4203   PetscFunctionBegin;
4204   vi = aj + ai[row];
4205   for (k=0; k<nzl; k++) cols[k] = vi[k];
4206   vi        = aj + adiag[row];
4207   cols[nzl] = vi[0];
4208   vi        = aj + adiag[row+1]+1;
4209   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4210   PetscFunctionReturn(0);
4211 }
4212 /*
4213    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4214    Modified from MatSeqAIJCheckInode().
4215 
4216    Input Parameters:
4217 .  Mat A - ILU or LU matrix factor
4218 
4219 */
4220 #undef __FUNCT__
4221 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU"
4222 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4223 {
4224   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4225   PetscErrorCode ierr;
4226   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4227   PetscInt       *cols1,*cols2,*ns;
4228   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4229   PetscBool      flag;
4230 
4231   PetscFunctionBegin;
4232   if (!a->inode.use)    PetscFunctionReturn(0);
4233   if (a->inode.checked) PetscFunctionReturn(0);
4234 
4235   m = A->rmap->n;
4236   if (a->inode.size) ns = a->inode.size;
4237   else {
4238     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4239   }
4240 
4241   i          = 0;
4242   node_count = 0;
4243   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4244   while (i < m) {                /* For each row */
4245     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4246     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4247     nzx  = nzl1 + nzu1 + 1;
4248     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4249 
4250     /* Limits the number of elements in a node to 'a->inode.limit' */
4251     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4252       nzl2 = ai[j+1] - ai[j];
4253       nzu2 = adiag[j] - adiag[j+1] - 1;
4254       nzy  = nzl2 + nzu2 + 1;
4255       if (nzy != nzx) break;
4256       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4257       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4258       if (!flag) break;
4259     }
4260     ns[node_count++] = blk_size;
4261     i                = j;
4262   }
4263   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4264   /* If not enough inodes found,, do not use inode version of the routines */
4265   if (!m || node_count > .8*m) {
4266     ierr = PetscFree(ns);CHKERRQ(ierr);
4267 
4268     a->inode.node_count = 0;
4269     a->inode.size       = NULL;
4270     a->inode.use        = PETSC_FALSE;
4271 
4272     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4273   } else {
4274     A->ops->mult              = 0;
4275     A->ops->sor               = 0;
4276     A->ops->multadd           = 0;
4277     A->ops->getrowij          = 0;
4278     A->ops->restorerowij      = 0;
4279     A->ops->getcolumnij       = 0;
4280     A->ops->restorecolumnij   = 0;
4281     A->ops->coloringpatch     = 0;
4282     A->ops->multdiagonalblock = 0;
4283     a->inode.node_count       = node_count;
4284     a->inode.size             = ns;
4285 
4286     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4287   }
4288   a->inode.checked = PETSC_TRUE;
4289   PetscFunctionReturn(0);
4290 }
4291 
4292 #undef __FUNCT__
4293 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4294 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4295 {
4296   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4297 
4298   PetscFunctionBegin;
4299   a->inode.ibdiagvalid = PETSC_FALSE;
4300   PetscFunctionReturn(0);
4301 }
4302 
4303 /*
4304      This is really ugly. if inodes are used this replaces the
4305   permutations with ones that correspond to rows/cols of the matrix
4306   rather then inode blocks
4307 */
4308 #undef __FUNCT__
4309 #define __FUNCT__ "MatInodeAdjustForInodes"
4310 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4311 {
4312   PetscErrorCode ierr;
4313 
4314   PetscFunctionBegin;
4315   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4316   PetscFunctionReturn(0);
4317 }
4318 
4319 #undef __FUNCT__
4320 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4321 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4322 {
4323   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4324   PetscErrorCode ierr;
4325   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4326   const PetscInt *ridx,*cidx;
4327   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4328   PetscInt       nslim_col,*ns_col;
4329   IS             ris = *rperm,cis = *cperm;
4330 
4331   PetscFunctionBegin;
4332   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4333   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4334 
4335   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4336   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4337   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4338 
4339   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4340   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4341 
4342   /* Form the inode structure for the rows of permuted matric using inv perm*/
4343   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4344 
4345   /* Construct the permutations for rows*/
4346   for (i=0,row = 0; i<nslim_row; ++i) {
4347     indx      = ridx[i];
4348     start_val = tns[indx];
4349     end_val   = tns[indx + 1];
4350     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4351   }
4352 
4353   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4354   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4355 
4356   /* Construct permutations for columns */
4357   for (i=0,col=0; i<nslim_col; ++i) {
4358     indx      = cidx[i];
4359     start_val = tns[indx];
4360     end_val   = tns[indx + 1];
4361     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4362   }
4363 
4364   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4365   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4366   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4367   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4368 
4369   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4370   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4371 
4372   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4373   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4374   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4375   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4376   ierr = PetscFree(tns);CHKERRQ(ierr);
4377   PetscFunctionReturn(0);
4378 }
4379 
4380 #undef __FUNCT__
4381 #define __FUNCT__ "MatInodeGetInodeSizes"
4382 /*@C
4383    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4384 
4385    Not Collective
4386 
4387    Input Parameter:
4388 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4389 
4390    Output Parameter:
4391 +  node_count - no of inodes present in the matrix.
4392 .  sizes      - an array of size node_count,with sizes of each inode.
4393 -  limit      - the max size used to generate the inodes.
4394 
4395    Level: advanced
4396 
4397    Notes: This routine returns some internal storage information
4398    of the matrix, it is intended to be used by advanced users.
4399    It should be called after the matrix is assembled.
4400    The contents of the sizes[] array should not be changed.
4401    NULL may be passed for information not requested.
4402 
4403 .keywords: matrix, seqaij, get, inode
4404 
4405 .seealso: MatGetInfo()
4406 @*/
4407 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4408 {
4409   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4410 
4411   PetscFunctionBegin;
4412   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4413   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4414   if (f) {
4415     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4416   }
4417   PetscFunctionReturn(0);
4418 }
4419 
4420 #undef __FUNCT__
4421 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4422 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4423 {
4424   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4425 
4426   PetscFunctionBegin;
4427   if (node_count) *node_count = a->inode.node_count;
4428   if (sizes)      *sizes      = a->inode.size;
4429   if (limit)      *limit      = a->inode.limit;
4430   PetscFunctionReturn(0);
4431 }
4432