xref: /petsc/src/mat/impls/aij/seq/inode.c (revision dced61a5cfeeeda68dfa4ee3e53b34d3cfb8da9f) !
1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
8 #undef __FUNCT__
9 #define __FUNCT__ "Mat_CreateColInode"
10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns)
11 {
12   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
13   PetscErrorCode ierr;
14   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
15 
16   PetscFunctionBegin;
17   n      = A->cmap->n;
18   m      = A->rmap->n;
19   ns_row = a->inode.size;
20 
21   min_mn = (m < n) ? m : n;
22   if (!ns) {
23     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
24     for (; count+1 < n; count++,i++) ;
25     if (count < n)  {
26       i++;
27     }
28     *size = i;
29     PetscFunctionReturn(0);
30   }
31   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
32 
33   /* Use the same row structure wherever feasible. */
34   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
35     ns_col[i] = ns_row[i];
36   }
37 
38   /* if m < n; pad up the remainder with inode_limit */
39   for (; count+1 < n; count++,i++) {
40     ns_col[i] = 1;
41   }
42   /* The last node is the odd ball. padd it up with the remaining rows; */
43   if (count < n) {
44     ns_col[i] = n - count;
45     i++;
46   } else if (count > n) {
47     /* Adjust for the over estimation */
48     ns_col[i-1] += n - count;
49   }
50   *size = i;
51   *ns   = ns_col;
52   PetscFunctionReturn(0);
53 }
54 
55 
56 /*
57       This builds symmetric version of nonzero structure,
58 */
59 #undef __FUNCT__
60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric"
61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
62 {
63   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
64   PetscErrorCode ierr;
65   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
66   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
67   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
68 
69   PetscFunctionBegin;
70   nslim_row = a->inode.node_count;
71   m         = A->rmap->n;
72   n         = A->cmap->n;
73   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
74 
75   /* Use the row_inode as column_inode */
76   nslim_col = nslim_row;
77   ns_col    = ns_row;
78 
79   /* allocate space for reformated inode structure */
80   ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr);
81   ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr);
82   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
83 
84   for (i1=0,col=0; i1<nslim_col; ++i1) {
85     nsz = ns_col[i1];
86     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
87   }
88   /* allocate space for row pointers */
89   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
90   *iia = ia;
91   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
92   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
93 
94   /* determine the number of columns in each row */
95   ia[0] = oshift;
96   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
97 
98     j    = aj + ai[row] + ishift;
99     jmax = aj + ai[row+1] + ishift;
100     col  = *j++ + ishift;
101     i2   = tvc[col];
102     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
103       ia[i1+1]++;
104       ia[i2+1]++;
105       i2++;                     /* Start col of next node */
106       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
107       i2 = tvc[col];
108     }
109     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
110   }
111 
112   /* shift ia[i] to point to next row */
113   for (i1=1; i1<nslim_row+1; i1++) {
114     row        = ia[i1-1];
115     ia[i1]    += row;
116     work[i1-1] = row - oshift;
117   }
118 
119   /* allocate space for column pointers */
120   nz   = ia[nslim_row] + (!ishift);
121   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
122   *jja = ja;
123 
124   /* loop over lower triangular part putting into ja */
125   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
126     j    = aj + ai[row] + ishift;
127     jmax = aj + ai[row+1] + ishift;
128     col  = *j++ + ishift;
129     i2   = tvc[col];
130     while (i2<i1 && j<jmax) {
131       ja[work[i2]++] = i1 + oshift;
132       ja[work[i1]++] = i2 + oshift;
133       ++i2;
134       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
135       i2 = tvc[col];
136     }
137     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
138 
139   }
140   ierr = PetscFree(work);CHKERRQ(ierr);
141   ierr = PetscFree(tns);CHKERRQ(ierr);
142   ierr = PetscFree(tvc);CHKERRQ(ierr);
143   PetscFunctionReturn(0);
144 }
145 
146 /*
147       This builds nonsymmetric version of nonzero structure,
148 */
149 #undef __FUNCT__
150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric"
151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
152 {
153   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
154   PetscErrorCode ierr;
155   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
156   PetscInt       *tns,*tvc,nsz,i1,i2;
157   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
158 
159   PetscFunctionBegin;
160   nslim_row = a->inode.node_count;
161   n         = A->cmap->n;
162 
163   /* Create The column_inode for this matrix */
164   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
165 
166   /* allocate space for reformated column_inode structure */
167   ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr);
168   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
169   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
170 
171   for (i1=0,col=0; i1<nslim_col; ++i1) {
172     nsz = ns_col[i1];
173     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
174   }
175   /* allocate space for row pointers */
176   ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
177   *iia = ia;
178   ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr);
179   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
180 
181   /* determine the number of columns in each row */
182   ia[0] = oshift;
183   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
184     j   = aj + ai[row] + ishift;
185     col = *j++ + ishift;
186     i2  = tvc[col];
187     nz  = ai[row+1] - ai[row];
188     while (nz-- > 0) {           /* off-diagonal elemets */
189       ia[i1+1]++;
190       i2++;                     /* Start col of next node */
191       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
192       if (nz > 0) i2 = tvc[col];
193     }
194   }
195 
196   /* shift ia[i] to point to next row */
197   for (i1=1; i1<nslim_row+1; i1++) {
198     row        = ia[i1-1];
199     ia[i1]    += row;
200     work[i1-1] = row - oshift;
201   }
202 
203   /* allocate space for column pointers */
204   nz   = ia[nslim_row] + (!ishift);
205   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
206   *jja = ja;
207 
208   /* loop over matrix putting into ja */
209   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
210     j   = aj + ai[row] + ishift;
211     col = *j++ + ishift;
212     i2  = tvc[col];
213     nz  = ai[row+1] - ai[row];
214     while (nz-- > 0) {
215       ja[work[i1]++] = i2 + oshift;
216       ++i2;
217       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
218       if (nz > 0) i2 = tvc[col];
219     }
220   }
221   ierr = PetscFree(ns_col);CHKERRQ(ierr);
222   ierr = PetscFree(work);CHKERRQ(ierr);
223   ierr = PetscFree(tns);CHKERRQ(ierr);
224   ierr = PetscFree(tvc);CHKERRQ(ierr);
225   PetscFunctionReturn(0);
226 }
227 
228 #undef __FUNCT__
229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode"
230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
231 {
232   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
233   PetscErrorCode ierr;
234 
235   PetscFunctionBegin;
236   *n = a->inode.node_count;
237   if (!ia) PetscFunctionReturn(0);
238   if (!blockcompressed) {
239     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
240   } else if (symmetric) {
241     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
242   } else {
243     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #undef __FUNCT__
249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode"
250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
251 {
252   PetscErrorCode ierr;
253 
254   PetscFunctionBegin;
255   if (!ia) PetscFunctionReturn(0);
256 
257   if (!blockcompressed) {
258     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
259   } else {
260     ierr = PetscFree(*ia);CHKERRQ(ierr);
261     ierr = PetscFree(*ja);CHKERRQ(ierr);
262   }
263   PetscFunctionReturn(0);
264 }
265 
266 /* ----------------------------------------------------------- */
267 
268 #undef __FUNCT__
269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric"
270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
271 {
272   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
273   PetscErrorCode ierr;
274   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
275   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276 
277   PetscFunctionBegin;
278   nslim_row = a->inode.node_count;
279   n         = A->cmap->n;
280 
281   /* Create The column_inode for this matrix */
282   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
283 
284   /* allocate space for reformated column_inode structure */
285   ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr);
286   ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr);
287   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
288 
289   for (i1=0,col=0; i1<nslim_col; ++i1) {
290     nsz = ns_col[i1];
291     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
292   }
293   /* allocate space for column pointers */
294   ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
295   *iia = ia;
296   ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr);
297   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
298 
299   /* determine the number of columns in each row */
300   ia[0] = oshift;
301   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
302     j   = aj + ai[row] + ishift;
303     col = *j++ + ishift;
304     i2  = tvc[col];
305     nz  = ai[row+1] - ai[row];
306     while (nz-- > 0) {           /* off-diagonal elemets */
307       /* ia[i1+1]++; */
308       ia[i2+1]++;
309       i2++;
310       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
311       if (nz > 0) i2 = tvc[col];
312     }
313   }
314 
315   /* shift ia[i] to point to next col */
316   for (i1=1; i1<nslim_col+1; i1++) {
317     col        = ia[i1-1];
318     ia[i1]    += col;
319     work[i1-1] = col - oshift;
320   }
321 
322   /* allocate space for column pointers */
323   nz   = ia[nslim_col] + (!ishift);
324   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
325   *jja = ja;
326 
327   /* loop over matrix putting into ja */
328   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
329     j   = aj + ai[row] + ishift;
330     col = *j++ + ishift;
331     i2  = tvc[col];
332     nz  = ai[row+1] - ai[row];
333     while (nz-- > 0) {
334       /* ja[work[i1]++] = i2 + oshift; */
335       ja[work[i2]++] = i1 + oshift;
336       i2++;
337       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
338       if (nz > 0) i2 = tvc[col];
339     }
340   }
341   ierr = PetscFree(ns_col);CHKERRQ(ierr);
342   ierr = PetscFree(work);CHKERRQ(ierr);
343   ierr = PetscFree(tns);CHKERRQ(ierr);
344   ierr = PetscFree(tvc);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode"
350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
351 {
352   PetscErrorCode ierr;
353 
354   PetscFunctionBegin;
355   ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr);
356   if (!ia) PetscFunctionReturn(0);
357 
358   if (!blockcompressed) {
359     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
360   } else if (symmetric) {
361     /* Since the indices are symmetric it does'nt matter */
362     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
363   } else {
364     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
365   }
366   PetscFunctionReturn(0);
367 }
368 
369 #undef __FUNCT__
370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode"
371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   if (!ia) PetscFunctionReturn(0);
377   if (!blockcompressed) {
378     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);;
379   } else {
380     ierr = PetscFree(*ia);CHKERRQ(ierr);
381     ierr = PetscFree(*ja);CHKERRQ(ierr);
382   }
383   PetscFunctionReturn(0);
384 }
385 
386 /* ----------------------------------------------------------- */
387 
388 #undef __FUNCT__
389 #define __FUNCT__ "MatMult_SeqAIJ_Inode"
390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
391 {
392   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
393   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
394   PetscScalar       *y;
395   const PetscScalar *x;
396   const MatScalar   *v1,*v2,*v3,*v4,*v5;
397   PetscErrorCode    ierr;
398   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
399   const PetscInt    *idx,*ns,*ii;
400 
401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
403 #endif
404 
405   PetscFunctionBegin;
406   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
407   node_max = a->inode.node_count;
408   ns       = a->inode.size;     /* Node Size array */
409   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
411   idx      = a->j;
412   v1       = a->a;
413   ii       = a->i;
414 
415   for (i = 0,row = 0; i< node_max; ++i) {
416     nsz         = ns[i];
417     n           = ii[1] - ii[0];
418     nonzerorow += (n>0)*nsz;
419     ii         += nsz;
420     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
421     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
422     sz = n;                     /* No of non zeros in this row */
423                                 /* Switch on the size of Node */
424     switch (nsz) {               /* Each loop in 'case' is unrolled */
425     case 1:
426       sum1 = 0.;
427 
428       for (n = 0; n< sz-1; n+=2) {
429         i1    = idx[0];         /* The instructions are ordered to */
430         i2    = idx[1];         /* make the compiler's job easy */
431         idx  += 2;
432         tmp0  = x[i1];
433         tmp1  = x[i2];
434         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
435       }
436 
437       if (n == sz-1) {          /* Take care of the last nonzero  */
438         tmp0  = x[*idx++];
439         sum1 += *v1++ *tmp0;
440       }
441       y[row++]=sum1;
442       break;
443     case 2:
444       sum1 = 0.;
445       sum2 = 0.;
446       v2   = v1 + n;
447 
448       for (n = 0; n< sz-1; n+=2) {
449         i1    = idx[0];
450         i2    = idx[1];
451         idx  += 2;
452         tmp0  = x[i1];
453         tmp1  = x[i2];
454         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
455         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
456       }
457       if (n == sz-1) {
458         tmp0  = x[*idx++];
459         sum1 += *v1++ * tmp0;
460         sum2 += *v2++ * tmp0;
461       }
462       y[row++]=sum1;
463       y[row++]=sum2;
464       v1      =v2;              /* Since the next block to be processed starts there*/
465       idx    +=sz;
466       break;
467     case 3:
468       sum1 = 0.;
469       sum2 = 0.;
470       sum3 = 0.;
471       v2   = v1 + n;
472       v3   = v2 + n;
473 
474       for (n = 0; n< sz-1; n+=2) {
475         i1    = idx[0];
476         i2    = idx[1];
477         idx  += 2;
478         tmp0  = x[i1];
479         tmp1  = x[i2];
480         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
481         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
482         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
483       }
484       if (n == sz-1) {
485         tmp0  = x[*idx++];
486         sum1 += *v1++ * tmp0;
487         sum2 += *v2++ * tmp0;
488         sum3 += *v3++ * tmp0;
489       }
490       y[row++]=sum1;
491       y[row++]=sum2;
492       y[row++]=sum3;
493       v1      =v3;              /* Since the next block to be processed starts there*/
494       idx    +=2*sz;
495       break;
496     case 4:
497       sum1 = 0.;
498       sum2 = 0.;
499       sum3 = 0.;
500       sum4 = 0.;
501       v2   = v1 + n;
502       v3   = v2 + n;
503       v4   = v3 + n;
504 
505       for (n = 0; n< sz-1; n+=2) {
506         i1    = idx[0];
507         i2    = idx[1];
508         idx  += 2;
509         tmp0  = x[i1];
510         tmp1  = x[i2];
511         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
512         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
513         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
514         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
515       }
516       if (n == sz-1) {
517         tmp0  = x[*idx++];
518         sum1 += *v1++ * tmp0;
519         sum2 += *v2++ * tmp0;
520         sum3 += *v3++ * tmp0;
521         sum4 += *v4++ * tmp0;
522       }
523       y[row++]=sum1;
524       y[row++]=sum2;
525       y[row++]=sum3;
526       y[row++]=sum4;
527       v1      =v4;              /* Since the next block to be processed starts there*/
528       idx    +=3*sz;
529       break;
530     case 5:
531       sum1 = 0.;
532       sum2 = 0.;
533       sum3 = 0.;
534       sum4 = 0.;
535       sum5 = 0.;
536       v2   = v1 + n;
537       v3   = v2 + n;
538       v4   = v3 + n;
539       v5   = v4 + n;
540 
541       for (n = 0; n<sz-1; n+=2) {
542         i1    = idx[0];
543         i2    = idx[1];
544         idx  += 2;
545         tmp0  = x[i1];
546         tmp1  = x[i2];
547         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
548         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
549         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
550         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
551         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
552       }
553       if (n == sz-1) {
554         tmp0  = x[*idx++];
555         sum1 += *v1++ * tmp0;
556         sum2 += *v2++ * tmp0;
557         sum3 += *v3++ * tmp0;
558         sum4 += *v4++ * tmp0;
559         sum5 += *v5++ * tmp0;
560       }
561       y[row++]=sum1;
562       y[row++]=sum2;
563       y[row++]=sum3;
564       y[row++]=sum4;
565       y[row++]=sum5;
566       v1      =v5;       /* Since the next block to be processed starts there */
567       idx    +=4*sz;
568       break;
569     default:
570       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
571     }
572   }
573   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
574   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
575   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
576   PetscFunctionReturn(0);
577 }
578 /* ----------------------------------------------------------- */
579 /* Almost same code as the MatMult_SeqAIJ_Inode() */
580 #undef __FUNCT__
581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode"
582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
583 {
584   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
585   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
586   const MatScalar   *v1,*v2,*v3,*v4,*v5;
587   const PetscScalar *x;
588   PetscScalar       *y,*z,*zt;
589   PetscErrorCode    ierr;
590   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
591   const PetscInt    *idx,*ns,*ii;
592 
593   PetscFunctionBegin;
594   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
595   node_max = a->inode.node_count;
596   ns       = a->inode.size;     /* Node Size array */
597 
598   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
599   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
600   zt = z;
601 
602   idx = a->j;
603   v1  = a->a;
604   ii  = a->i;
605 
606   for (i = 0,row = 0; i< node_max; ++i) {
607     nsz = ns[i];
608     n   = ii[1] - ii[0];
609     ii += nsz;
610     sz  = n;                    /* No of non zeros in this row */
611                                 /* Switch on the size of Node */
612     switch (nsz) {               /* Each loop in 'case' is unrolled */
613     case 1:
614       sum1 = *zt++;
615 
616       for (n = 0; n< sz-1; n+=2) {
617         i1    = idx[0];         /* The instructions are ordered to */
618         i2    = idx[1];         /* make the compiler's job easy */
619         idx  += 2;
620         tmp0  = x[i1];
621         tmp1  = x[i2];
622         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
623       }
624 
625       if (n   == sz-1) {          /* Take care of the last nonzero  */
626         tmp0  = x[*idx++];
627         sum1 += *v1++ * tmp0;
628       }
629       y[row++]=sum1;
630       break;
631     case 2:
632       sum1 = *zt++;
633       sum2 = *zt++;
634       v2   = v1 + n;
635 
636       for (n = 0; n< sz-1; n+=2) {
637         i1    = idx[0];
638         i2    = idx[1];
639         idx  += 2;
640         tmp0  = x[i1];
641         tmp1  = x[i2];
642         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
643         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
644       }
645       if (n   == sz-1) {
646         tmp0  = x[*idx++];
647         sum1 += *v1++ * tmp0;
648         sum2 += *v2++ * tmp0;
649       }
650       y[row++]=sum1;
651       y[row++]=sum2;
652       v1      =v2;              /* Since the next block to be processed starts there*/
653       idx    +=sz;
654       break;
655     case 3:
656       sum1 = *zt++;
657       sum2 = *zt++;
658       sum3 = *zt++;
659       v2   = v1 + n;
660       v3   = v2 + n;
661 
662       for (n = 0; n< sz-1; n+=2) {
663         i1    = idx[0];
664         i2    = idx[1];
665         idx  += 2;
666         tmp0  = x[i1];
667         tmp1  = x[i2];
668         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
669         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
670         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
671       }
672       if (n == sz-1) {
673         tmp0  = x[*idx++];
674         sum1 += *v1++ * tmp0;
675         sum2 += *v2++ * tmp0;
676         sum3 += *v3++ * tmp0;
677       }
678       y[row++]=sum1;
679       y[row++]=sum2;
680       y[row++]=sum3;
681       v1      =v3;              /* Since the next block to be processed starts there*/
682       idx    +=2*sz;
683       break;
684     case 4:
685       sum1 = *zt++;
686       sum2 = *zt++;
687       sum3 = *zt++;
688       sum4 = *zt++;
689       v2   = v1 + n;
690       v3   = v2 + n;
691       v4   = v3 + n;
692 
693       for (n = 0; n< sz-1; n+=2) {
694         i1    = idx[0];
695         i2    = idx[1];
696         idx  += 2;
697         tmp0  = x[i1];
698         tmp1  = x[i2];
699         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
700         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
701         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
702         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
703       }
704       if (n == sz-1) {
705         tmp0  = x[*idx++];
706         sum1 += *v1++ * tmp0;
707         sum2 += *v2++ * tmp0;
708         sum3 += *v3++ * tmp0;
709         sum4 += *v4++ * tmp0;
710       }
711       y[row++]=sum1;
712       y[row++]=sum2;
713       y[row++]=sum3;
714       y[row++]=sum4;
715       v1      =v4;              /* Since the next block to be processed starts there*/
716       idx    +=3*sz;
717       break;
718     case 5:
719       sum1 = *zt++;
720       sum2 = *zt++;
721       sum3 = *zt++;
722       sum4 = *zt++;
723       sum5 = *zt++;
724       v2   = v1 + n;
725       v3   = v2 + n;
726       v4   = v3 + n;
727       v5   = v4 + n;
728 
729       for (n = 0; n<sz-1; n+=2) {
730         i1    = idx[0];
731         i2    = idx[1];
732         idx  += 2;
733         tmp0  = x[i1];
734         tmp1  = x[i2];
735         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
736         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
737         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
738         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
739         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
740       }
741       if (n   == sz-1) {
742         tmp0  = x[*idx++];
743         sum1 += *v1++ * tmp0;
744         sum2 += *v2++ * tmp0;
745         sum3 += *v3++ * tmp0;
746         sum4 += *v4++ * tmp0;
747         sum5 += *v5++ * tmp0;
748       }
749       y[row++]=sum1;
750       y[row++]=sum2;
751       y[row++]=sum3;
752       y[row++]=sum4;
753       y[row++]=sum5;
754       v1      =v5;       /* Since the next block to be processed starts there */
755       idx    +=4*sz;
756       break;
757     default:
758       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
759     }
760   }
761   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
762   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
763   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
764   PetscFunctionReturn(0);
765 }
766 
767 /* ----------------------------------------------------------- */
768 #undef __FUNCT__
769 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace"
770 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
771 {
772   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
773   IS                iscol = a->col,isrow = a->row;
774   PetscErrorCode    ierr;
775   const PetscInt    *r,*c,*rout,*cout;
776   PetscInt          i,j,n = A->rmap->n,nz;
777   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
778   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
779   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
780   PetscScalar       sum1,sum2,sum3,sum4,sum5;
781   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
782   const PetscScalar *b;
783 
784   PetscFunctionBegin;
785   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
786   node_max = a->inode.node_count;
787   ns       = a->inode.size;     /* Node Size array */
788 
789   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
790   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
791   tmp  = a->solve_work;
792 
793   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
794   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
795 
796   /* forward solve the lower triangular */
797   tmps = tmp;
798   aa   = a_a;
799   aj   = a_j;
800   ad   = a->diag;
801 
802   for (i = 0,row = 0; i< node_max; ++i) {
803     nsz = ns[i];
804     aii = ai[row];
805     v1  = aa + aii;
806     vi  = aj + aii;
807     nz  = ad[row]- aii;
808     if (i < node_max-1) {
809       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
810       * but our indexing to determine it's size could. */
811       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
812       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
813       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
814       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
815     }
816 
817     switch (nsz) {               /* Each loop in 'case' is unrolled */
818     case 1:
819       sum1 = b[*r++];
820       for (j=0; j<nz-1; j+=2) {
821         i0    = vi[0];
822         i1    = vi[1];
823         vi   +=2;
824         tmp0  = tmps[i0];
825         tmp1  = tmps[i1];
826         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
827       }
828       if (j == nz-1) {
829         tmp0  = tmps[*vi++];
830         sum1 -= *v1++ *tmp0;
831       }
832       tmp[row++]=sum1;
833       break;
834     case 2:
835       sum1 = b[*r++];
836       sum2 = b[*r++];
837       v2   = aa + ai[row+1];
838 
839       for (j=0; j<nz-1; j+=2) {
840         i0    = vi[0];
841         i1    = vi[1];
842         vi   +=2;
843         tmp0  = tmps[i0];
844         tmp1  = tmps[i1];
845         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
846         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
847       }
848       if (j == nz-1) {
849         tmp0  = tmps[*vi++];
850         sum1 -= *v1++ *tmp0;
851         sum2 -= *v2++ *tmp0;
852       }
853       sum2     -= *v2++ *sum1;
854       tmp[row++]=sum1;
855       tmp[row++]=sum2;
856       break;
857     case 3:
858       sum1 = b[*r++];
859       sum2 = b[*r++];
860       sum3 = b[*r++];
861       v2   = aa + ai[row+1];
862       v3   = aa + ai[row+2];
863 
864       for (j=0; j<nz-1; j+=2) {
865         i0    = vi[0];
866         i1    = vi[1];
867         vi   +=2;
868         tmp0  = tmps[i0];
869         tmp1  = tmps[i1];
870         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
871         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
872         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
873       }
874       if (j == nz-1) {
875         tmp0  = tmps[*vi++];
876         sum1 -= *v1++ *tmp0;
877         sum2 -= *v2++ *tmp0;
878         sum3 -= *v3++ *tmp0;
879       }
880       sum2 -= *v2++ * sum1;
881       sum3 -= *v3++ * sum1;
882       sum3 -= *v3++ * sum2;
883 
884       tmp[row++]=sum1;
885       tmp[row++]=sum2;
886       tmp[row++]=sum3;
887       break;
888 
889     case 4:
890       sum1 = b[*r++];
891       sum2 = b[*r++];
892       sum3 = b[*r++];
893       sum4 = b[*r++];
894       v2   = aa + ai[row+1];
895       v3   = aa + ai[row+2];
896       v4   = aa + ai[row+3];
897 
898       for (j=0; j<nz-1; j+=2) {
899         i0    = vi[0];
900         i1    = vi[1];
901         vi   +=2;
902         tmp0  = tmps[i0];
903         tmp1  = tmps[i1];
904         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
905         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
906         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
907         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
908       }
909       if (j == nz-1) {
910         tmp0  = tmps[*vi++];
911         sum1 -= *v1++ *tmp0;
912         sum2 -= *v2++ *tmp0;
913         sum3 -= *v3++ *tmp0;
914         sum4 -= *v4++ *tmp0;
915       }
916       sum2 -= *v2++ * sum1;
917       sum3 -= *v3++ * sum1;
918       sum4 -= *v4++ * sum1;
919       sum3 -= *v3++ * sum2;
920       sum4 -= *v4++ * sum2;
921       sum4 -= *v4++ * sum3;
922 
923       tmp[row++]=sum1;
924       tmp[row++]=sum2;
925       tmp[row++]=sum3;
926       tmp[row++]=sum4;
927       break;
928     case 5:
929       sum1 = b[*r++];
930       sum2 = b[*r++];
931       sum3 = b[*r++];
932       sum4 = b[*r++];
933       sum5 = b[*r++];
934       v2   = aa + ai[row+1];
935       v3   = aa + ai[row+2];
936       v4   = aa + ai[row+3];
937       v5   = aa + ai[row+4];
938 
939       for (j=0; j<nz-1; j+=2) {
940         i0    = vi[0];
941         i1    = vi[1];
942         vi   +=2;
943         tmp0  = tmps[i0];
944         tmp1  = tmps[i1];
945         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
946         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
947         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
948         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
949         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
950       }
951       if (j == nz-1) {
952         tmp0  = tmps[*vi++];
953         sum1 -= *v1++ *tmp0;
954         sum2 -= *v2++ *tmp0;
955         sum3 -= *v3++ *tmp0;
956         sum4 -= *v4++ *tmp0;
957         sum5 -= *v5++ *tmp0;
958       }
959 
960       sum2 -= *v2++ * sum1;
961       sum3 -= *v3++ * sum1;
962       sum4 -= *v4++ * sum1;
963       sum5 -= *v5++ * sum1;
964       sum3 -= *v3++ * sum2;
965       sum4 -= *v4++ * sum2;
966       sum5 -= *v5++ * sum2;
967       sum4 -= *v4++ * sum3;
968       sum5 -= *v5++ * sum3;
969       sum5 -= *v5++ * sum4;
970 
971       tmp[row++]=sum1;
972       tmp[row++]=sum2;
973       tmp[row++]=sum3;
974       tmp[row++]=sum4;
975       tmp[row++]=sum5;
976       break;
977     default:
978       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
979     }
980   }
981   /* backward solve the upper triangular */
982   for (i=node_max -1,row = n-1; i>=0; i--) {
983     nsz = ns[i];
984     aii = ai[row+1] -1;
985     v1  = aa + aii;
986     vi  = aj + aii;
987     nz  = aii- ad[row];
988     switch (nsz) {               /* Each loop in 'case' is unrolled */
989     case 1:
990       sum1 = tmp[row];
991 
992       for (j=nz; j>1; j-=2) {
993         vi   -=2;
994         i0    = vi[2];
995         i1    = vi[1];
996         tmp0  = tmps[i0];
997         tmp1  = tmps[i1];
998         v1   -= 2;
999         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1000       }
1001       if (j==1) {
1002         tmp0  = tmps[*vi--];
1003         sum1 -= *v1-- * tmp0;
1004       }
1005       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006       break;
1007     case 2:
1008       sum1 = tmp[row];
1009       sum2 = tmp[row -1];
1010       v2   = aa + ai[row]-1;
1011       for (j=nz; j>1; j-=2) {
1012         vi   -=2;
1013         i0    = vi[2];
1014         i1    = vi[1];
1015         tmp0  = tmps[i0];
1016         tmp1  = tmps[i1];
1017         v1   -= 2;
1018         v2   -= 2;
1019         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1020         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1021       }
1022       if (j==1) {
1023         tmp0  = tmps[*vi--];
1024         sum1 -= *v1-- * tmp0;
1025         sum2 -= *v2-- * tmp0;
1026       }
1027 
1028       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1029       sum2   -= *v2-- * tmp0;
1030       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1031       break;
1032     case 3:
1033       sum1 = tmp[row];
1034       sum2 = tmp[row -1];
1035       sum3 = tmp[row -2];
1036       v2   = aa + ai[row]-1;
1037       v3   = aa + ai[row -1]-1;
1038       for (j=nz; j>1; j-=2) {
1039         vi   -=2;
1040         i0    = vi[2];
1041         i1    = vi[1];
1042         tmp0  = tmps[i0];
1043         tmp1  = tmps[i1];
1044         v1   -= 2;
1045         v2   -= 2;
1046         v3   -= 2;
1047         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1048         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1049         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1050       }
1051       if (j==1) {
1052         tmp0  = tmps[*vi--];
1053         sum1 -= *v1-- * tmp0;
1054         sum2 -= *v2-- * tmp0;
1055         sum3 -= *v3-- * tmp0;
1056       }
1057       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1058       sum2   -= *v2-- * tmp0;
1059       sum3   -= *v3-- * tmp0;
1060       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1061       sum3   -= *v3-- * tmp0;
1062       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1063 
1064       break;
1065     case 4:
1066       sum1 = tmp[row];
1067       sum2 = tmp[row -1];
1068       sum3 = tmp[row -2];
1069       sum4 = tmp[row -3];
1070       v2   = aa + ai[row]-1;
1071       v3   = aa + ai[row -1]-1;
1072       v4   = aa + ai[row -2]-1;
1073 
1074       for (j=nz; j>1; j-=2) {
1075         vi   -=2;
1076         i0    = vi[2];
1077         i1    = vi[1];
1078         tmp0  = tmps[i0];
1079         tmp1  = tmps[i1];
1080         v1   -= 2;
1081         v2   -= 2;
1082         v3   -= 2;
1083         v4   -= 2;
1084         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1085         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1086         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1087         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1088       }
1089       if (j==1) {
1090         tmp0  = tmps[*vi--];
1091         sum1 -= *v1-- * tmp0;
1092         sum2 -= *v2-- * tmp0;
1093         sum3 -= *v3-- * tmp0;
1094         sum4 -= *v4-- * tmp0;
1095       }
1096 
1097       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1098       sum2   -= *v2-- * tmp0;
1099       sum3   -= *v3-- * tmp0;
1100       sum4   -= *v4-- * tmp0;
1101       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1102       sum3   -= *v3-- * tmp0;
1103       sum4   -= *v4-- * tmp0;
1104       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1105       sum4   -= *v4-- * tmp0;
1106       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1107       break;
1108     case 5:
1109       sum1 = tmp[row];
1110       sum2 = tmp[row -1];
1111       sum3 = tmp[row -2];
1112       sum4 = tmp[row -3];
1113       sum5 = tmp[row -4];
1114       v2   = aa + ai[row]-1;
1115       v3   = aa + ai[row -1]-1;
1116       v4   = aa + ai[row -2]-1;
1117       v5   = aa + ai[row -3]-1;
1118       for (j=nz; j>1; j-=2) {
1119         vi   -= 2;
1120         i0    = vi[2];
1121         i1    = vi[1];
1122         tmp0  = tmps[i0];
1123         tmp1  = tmps[i1];
1124         v1   -= 2;
1125         v2   -= 2;
1126         v3   -= 2;
1127         v4   -= 2;
1128         v5   -= 2;
1129         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1130         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1131         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1132         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1133         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1134       }
1135       if (j==1) {
1136         tmp0  = tmps[*vi--];
1137         sum1 -= *v1-- * tmp0;
1138         sum2 -= *v2-- * tmp0;
1139         sum3 -= *v3-- * tmp0;
1140         sum4 -= *v4-- * tmp0;
1141         sum5 -= *v5-- * tmp0;
1142       }
1143 
1144       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1145       sum2   -= *v2-- * tmp0;
1146       sum3   -= *v3-- * tmp0;
1147       sum4   -= *v4-- * tmp0;
1148       sum5   -= *v5-- * tmp0;
1149       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1150       sum3   -= *v3-- * tmp0;
1151       sum4   -= *v4-- * tmp0;
1152       sum5   -= *v5-- * tmp0;
1153       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1154       sum4   -= *v4-- * tmp0;
1155       sum5   -= *v5-- * tmp0;
1156       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1157       sum5   -= *v5-- * tmp0;
1158       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1159       break;
1160     default:
1161       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1162     }
1163   }
1164   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1165   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1166   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1167   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1168   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1169   PetscFunctionReturn(0);
1170 }
1171 
1172 #undef __FUNCT__
1173 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode"
1174 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1175 {
1176   Mat             C     =B;
1177   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1178   IS              isrow = b->row,isicol = b->icol;
1179   PetscErrorCode  ierr;
1180   const PetscInt  *r,*ic,*ics;
1181   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1182   PetscInt        i,j,k,nz,nzL,row,*pj;
1183   const PetscInt  *ajtmp,*bjtmp;
1184   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1185   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1186   FactorShiftCtx  sctx;
1187   const PetscInt  *ddiag;
1188   PetscReal       rs;
1189   MatScalar       d;
1190   PetscInt        inod,nodesz,node_max,col;
1191   const PetscInt  *ns;
1192   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1193 
1194   PetscFunctionBegin;
1195   /* MatPivotSetUp(): initialize shift context sctx */
1196   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1197 
1198   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1199     ddiag          = a->diag;
1200     sctx.shift_top = info->zeropivot;
1201     for (i=0; i<n; i++) {
1202       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1203       d  = (aa)[ddiag[i]];
1204       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1205       v  = aa+ai[i];
1206       nz = ai[i+1] - ai[i];
1207       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1208       if (rs>sctx.shift_top) sctx.shift_top = rs;
1209     }
1210     sctx.shift_top *= 1.1;
1211     sctx.nshift_max = 5;
1212     sctx.shift_lo   = 0.;
1213     sctx.shift_hi   = 1.;
1214   }
1215 
1216   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1217   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1218 
1219   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1220   ics   = ic;
1221 
1222   node_max = a->inode.node_count;
1223   ns       = a->inode.size;
1224   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1225 
1226   /* If max inode size > 4, split it into two inodes.*/
1227   /* also map the inode sizes according to the ordering */
1228   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1229   for (i=0,j=0; i<node_max; ++i,++j) {
1230     if (ns[i] > 4) {
1231       tmp_vec1[j] = 4;
1232       ++j;
1233       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1234     } else {
1235       tmp_vec1[j] = ns[i];
1236     }
1237   }
1238   /* Use the correct node_max */
1239   node_max = j;
1240 
1241   /* Now reorder the inode info based on mat re-ordering info */
1242   /* First create a row -> inode_size_array_index map */
1243   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1244   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1245   for (i=0,row=0; i<node_max; i++) {
1246     nodesz = tmp_vec1[i];
1247     for (j=0; j<nodesz; j++,row++) {
1248       nsmap[row] = i;
1249     }
1250   }
1251   /* Using nsmap, create a reordered ns structure */
1252   for (i=0,j=0; i< node_max; i++) {
1253     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1254     tmp_vec2[i] = nodesz;
1255     j          += nodesz;
1256   }
1257   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1258   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1259 
1260   /* Now use the correct ns */
1261   ns = tmp_vec2;
1262 
1263   do {
1264     sctx.newshift = PETSC_FALSE;
1265     /* Now loop over each block-row, and do the factorization */
1266     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1267       nodesz = ns[inod];
1268 
1269       switch (nodesz) {
1270       case 1:
1271         /*----------*/
1272         /* zero rtmp1 */
1273         /* L part */
1274         nz    = bi[i+1] - bi[i];
1275         bjtmp = bj + bi[i];
1276         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1277 
1278         /* U part */
1279         nz    = bdiag[i]-bdiag[i+1];
1280         bjtmp = bj + bdiag[i+1]+1;
1281         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1282 
1283         /* load in initial (unfactored row) */
1284         nz    = ai[r[i]+1] - ai[r[i]];
1285         ajtmp = aj + ai[r[i]];
1286         v     = aa + ai[r[i]];
1287         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1288 
1289         /* ZeropivotApply() */
1290         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1291 
1292         /* elimination */
1293         bjtmp = bj + bi[i];
1294         row   = *bjtmp++;
1295         nzL   = bi[i+1] - bi[i];
1296         for (k=0; k < nzL; k++) {
1297           pc = rtmp1 + row;
1298           if (*pc != 0.0) {
1299             pv   = b->a + bdiag[row];
1300             mul1 = *pc * (*pv);
1301             *pc  = mul1;
1302             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1303             pv   = b->a + bdiag[row+1]+1;
1304             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1305             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1306             ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1307           }
1308           row = *bjtmp++;
1309         }
1310 
1311         /* finished row so stick it into b->a */
1312         rs = 0.0;
1313         /* L part */
1314         pv = b->a + bi[i];
1315         pj = b->j + bi[i];
1316         nz = bi[i+1] - bi[i];
1317         for (j=0; j<nz; j++) {
1318           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1319         }
1320 
1321         /* U part */
1322         pv = b->a + bdiag[i+1]+1;
1323         pj = b->j + bdiag[i+1]+1;
1324         nz = bdiag[i] - bdiag[i+1]-1;
1325         for (j=0; j<nz; j++) {
1326           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1327         }
1328 
1329         /* Check zero pivot */
1330         sctx.rs = rs;
1331         sctx.pv = rtmp1[i];
1332         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1333         if (sctx.newshift) break;
1334 
1335         /* Mark diagonal and invert diagonal for simplier triangular solves */
1336         pv  = b->a + bdiag[i];
1337         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1338         break;
1339 
1340       case 2:
1341         /*----------*/
1342         /* zero rtmp1 and rtmp2 */
1343         /* L part */
1344         nz    = bi[i+1] - bi[i];
1345         bjtmp = bj + bi[i];
1346         for  (j=0; j<nz; j++) {
1347           col        = bjtmp[j];
1348           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1349         }
1350 
1351         /* U part */
1352         nz    = bdiag[i]-bdiag[i+1];
1353         bjtmp = bj + bdiag[i+1]+1;
1354         for  (j=0; j<nz; j++) {
1355           col        = bjtmp[j];
1356           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1357         }
1358 
1359         /* load in initial (unfactored row) */
1360         nz    = ai[r[i]+1] - ai[r[i]];
1361         ajtmp = aj + ai[r[i]];
1362         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1363         for (j=0; j<nz; j++) {
1364           col        = ics[ajtmp[j]];
1365           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1366         }
1367         /* ZeropivotApply(): shift the diagonal of the matrix  */
1368         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1369 
1370         /* elimination */
1371         bjtmp = bj + bi[i];
1372         row   = *bjtmp++; /* pivot row */
1373         nzL   = bi[i+1] - bi[i];
1374         for (k=0; k < nzL; k++) {
1375           pc1 = rtmp1 + row;
1376           pc2 = rtmp2 + row;
1377           if (*pc1 != 0.0 || *pc2 != 0.0) {
1378             pv   = b->a + bdiag[row];
1379             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1380             *pc1 = mul1;       *pc2 = mul2;
1381 
1382             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1383             pv = b->a + bdiag[row+1]+1;
1384             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1385             for (j=0; j<nz; j++) {
1386               col         = pj[j];
1387               rtmp1[col] -= mul1 * pv[j];
1388               rtmp2[col] -= mul2 * pv[j];
1389             }
1390             ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1391           }
1392           row = *bjtmp++;
1393         }
1394 
1395         /* finished row i; check zero pivot, then stick row i into b->a */
1396         rs = 0.0;
1397         /* L part */
1398         pc1 = b->a + bi[i];
1399         pj  = b->j + bi[i];
1400         nz  = bi[i+1] - bi[i];
1401         for (j=0; j<nz; j++) {
1402           col    = pj[j];
1403           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1404         }
1405         /* U part */
1406         pc1 = b->a + bdiag[i+1]+1;
1407         pj  = b->j + bdiag[i+1]+1;
1408         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1409         for (j=0; j<nz; j++) {
1410           col    = pj[j];
1411           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1412         }
1413 
1414         sctx.rs = rs;
1415         sctx.pv = rtmp1[i];
1416         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1417         if (sctx.newshift) break;
1418         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1419         *pc1 = 1.0/sctx.pv;
1420 
1421         /* Now take care of diagonal 2x2 block. */
1422         pc2 = rtmp2 + i;
1423         if (*pc2 != 0.0) {
1424           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1425           *pc2 = mul1;          /* insert L entry */
1426           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1427           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1428           for (j=0; j<nz; j++) {
1429             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1430           }
1431           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1432         }
1433 
1434         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1435         rs = 0.0;
1436         /* L part */
1437         pc2 = b->a + bi[i+1];
1438         pj  = b->j + bi[i+1];
1439         nz  = bi[i+2] - bi[i+1];
1440         for (j=0; j<nz; j++) {
1441           col    = pj[j];
1442           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1443         }
1444         /* U part */
1445         pc2 = b->a + bdiag[i+2]+1;
1446         pj  = b->j + bdiag[i+2]+1;
1447         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1448         for (j=0; j<nz; j++) {
1449           col    = pj[j];
1450           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1451         }
1452 
1453         sctx.rs = rs;
1454         sctx.pv = rtmp2[i+1];
1455         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1456         if (sctx.newshift) break;
1457         pc2  = b->a + bdiag[i+1];
1458         *pc2 = 1.0/sctx.pv;
1459         break;
1460 
1461       case 3:
1462         /*----------*/
1463         /* zero rtmp */
1464         /* L part */
1465         nz    = bi[i+1] - bi[i];
1466         bjtmp = bj + bi[i];
1467         for  (j=0; j<nz; j++) {
1468           col        = bjtmp[j];
1469           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1470         }
1471 
1472         /* U part */
1473         nz    = bdiag[i]-bdiag[i+1];
1474         bjtmp = bj + bdiag[i+1]+1;
1475         for  (j=0; j<nz; j++) {
1476           col        = bjtmp[j];
1477           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1478         }
1479 
1480         /* load in initial (unfactored row) */
1481         nz    = ai[r[i]+1] - ai[r[i]];
1482         ajtmp = aj + ai[r[i]];
1483         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1484         for (j=0; j<nz; j++) {
1485           col        = ics[ajtmp[j]];
1486           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1487         }
1488         /* ZeropivotApply(): shift the diagonal of the matrix  */
1489         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1490 
1491         /* elimination */
1492         bjtmp = bj + bi[i];
1493         row   = *bjtmp++; /* pivot row */
1494         nzL   = bi[i+1] - bi[i];
1495         for (k=0; k < nzL; k++) {
1496           pc1 = rtmp1 + row;
1497           pc2 = rtmp2 + row;
1498           pc3 = rtmp3 + row;
1499           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1500             pv   = b->a + bdiag[row];
1501             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1502             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1503 
1504             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1505             pv = b->a + bdiag[row+1]+1;
1506             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1507             for (j=0; j<nz; j++) {
1508               col         = pj[j];
1509               rtmp1[col] -= mul1 * pv[j];
1510               rtmp2[col] -= mul2 * pv[j];
1511               rtmp3[col] -= mul3 * pv[j];
1512             }
1513             ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1514           }
1515           row = *bjtmp++;
1516         }
1517 
1518         /* finished row i; check zero pivot, then stick row i into b->a */
1519         rs = 0.0;
1520         /* L part */
1521         pc1 = b->a + bi[i];
1522         pj  = b->j + bi[i];
1523         nz  = bi[i+1] - bi[i];
1524         for (j=0; j<nz; j++) {
1525           col    = pj[j];
1526           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1527         }
1528         /* U part */
1529         pc1 = b->a + bdiag[i+1]+1;
1530         pj  = b->j + bdiag[i+1]+1;
1531         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1532         for (j=0; j<nz; j++) {
1533           col    = pj[j];
1534           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1535         }
1536 
1537         sctx.rs = rs;
1538         sctx.pv = rtmp1[i];
1539         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1540         if (sctx.newshift) break;
1541         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1542         *pc1 = 1.0/sctx.pv;
1543 
1544         /* Now take care of 1st column of diagonal 3x3 block. */
1545         pc2 = rtmp2 + i;
1546         pc3 = rtmp3 + i;
1547         if (*pc2 != 0.0 || *pc3 != 0.0) {
1548           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1549           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1550           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1551           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1552           for (j=0; j<nz; j++) {
1553             col         = pj[j];
1554             rtmp2[col] -= mul2 * rtmp1[col];
1555             rtmp3[col] -= mul3 * rtmp1[col];
1556           }
1557           ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr);
1558         }
1559 
1560         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1561         rs = 0.0;
1562         /* L part */
1563         pc2 = b->a + bi[i+1];
1564         pj  = b->j + bi[i+1];
1565         nz  = bi[i+2] - bi[i+1];
1566         for (j=0; j<nz; j++) {
1567           col    = pj[j];
1568           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1569         }
1570         /* U part */
1571         pc2 = b->a + bdiag[i+2]+1;
1572         pj  = b->j + bdiag[i+2]+1;
1573         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1574         for (j=0; j<nz; j++) {
1575           col    = pj[j];
1576           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1577         }
1578 
1579         sctx.rs = rs;
1580         sctx.pv = rtmp2[i+1];
1581         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1582         if (sctx.newshift) break;
1583         pc2  = b->a + bdiag[i+1];
1584         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1585 
1586         /* Now take care of 2nd column of diagonal 3x3 block. */
1587         pc3 = rtmp3 + i+1;
1588         if (*pc3 != 0.0) {
1589           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1590           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1591           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1592           for (j=0; j<nz; j++) {
1593             col         = pj[j];
1594             rtmp3[col] -= mul3 * rtmp2[col];
1595           }
1596           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1597         }
1598 
1599         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1600         rs = 0.0;
1601         /* L part */
1602         pc3 = b->a + bi[i+2];
1603         pj  = b->j + bi[i+2];
1604         nz  = bi[i+3] - bi[i+2];
1605         for (j=0; j<nz; j++) {
1606           col    = pj[j];
1607           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1608         }
1609         /* U part */
1610         pc3 = b->a + bdiag[i+3]+1;
1611         pj  = b->j + bdiag[i+3]+1;
1612         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1613         for (j=0; j<nz; j++) {
1614           col    = pj[j];
1615           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1616         }
1617 
1618         sctx.rs = rs;
1619         sctx.pv = rtmp3[i+2];
1620         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1621         if (sctx.newshift) break;
1622         pc3  = b->a + bdiag[i+2];
1623         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1624         break;
1625       case 4:
1626         /*----------*/
1627         /* zero rtmp */
1628         /* L part */
1629         nz    = bi[i+1] - bi[i];
1630         bjtmp = bj + bi[i];
1631         for  (j=0; j<nz; j++) {
1632           col        = bjtmp[j];
1633           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1634         }
1635 
1636         /* U part */
1637         nz    = bdiag[i]-bdiag[i+1];
1638         bjtmp = bj + bdiag[i+1]+1;
1639         for  (j=0; j<nz; j++) {
1640           col        = bjtmp[j];
1641           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1642         }
1643 
1644         /* load in initial (unfactored row) */
1645         nz    = ai[r[i]+1] - ai[r[i]];
1646         ajtmp = aj + ai[r[i]];
1647         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1648         for (j=0; j<nz; j++) {
1649           col        = ics[ajtmp[j]];
1650           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1651         }
1652         /* ZeropivotApply(): shift the diagonal of the matrix  */
1653         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1654 
1655         /* elimination */
1656         bjtmp = bj + bi[i];
1657         row   = *bjtmp++; /* pivot row */
1658         nzL   = bi[i+1] - bi[i];
1659         for (k=0; k < nzL; k++) {
1660           pc1 = rtmp1 + row;
1661           pc2 = rtmp2 + row;
1662           pc3 = rtmp3 + row;
1663           pc4 = rtmp4 + row;
1664           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1665             pv   = b->a + bdiag[row];
1666             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1667             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1668 
1669             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1670             pv = b->a + bdiag[row+1]+1;
1671             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1672             for (j=0; j<nz; j++) {
1673               col         = pj[j];
1674               rtmp1[col] -= mul1 * pv[j];
1675               rtmp2[col] -= mul2 * pv[j];
1676               rtmp3[col] -= mul3 * pv[j];
1677               rtmp4[col] -= mul4 * pv[j];
1678             }
1679             ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr);
1680           }
1681           row = *bjtmp++;
1682         }
1683 
1684         /* finished row i; check zero pivot, then stick row i into b->a */
1685         rs = 0.0;
1686         /* L part */
1687         pc1 = b->a + bi[i];
1688         pj  = b->j + bi[i];
1689         nz  = bi[i+1] - bi[i];
1690         for (j=0; j<nz; j++) {
1691           col    = pj[j];
1692           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1693         }
1694         /* U part */
1695         pc1 = b->a + bdiag[i+1]+1;
1696         pj  = b->j + bdiag[i+1]+1;
1697         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1698         for (j=0; j<nz; j++) {
1699           col    = pj[j];
1700           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1701         }
1702 
1703         sctx.rs = rs;
1704         sctx.pv = rtmp1[i];
1705         ierr    = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr);
1706         if (sctx.newshift) break;
1707         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1708         *pc1 = 1.0/sctx.pv;
1709 
1710         /* Now take care of 1st column of diagonal 4x4 block. */
1711         pc2 = rtmp2 + i;
1712         pc3 = rtmp3 + i;
1713         pc4 = rtmp4 + i;
1714         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1715           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1716           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1717           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1718           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1719           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1720           for (j=0; j<nz; j++) {
1721             col         = pj[j];
1722             rtmp2[col] -= mul2 * rtmp1[col];
1723             rtmp3[col] -= mul3 * rtmp1[col];
1724             rtmp4[col] -= mul4 * rtmp1[col];
1725           }
1726           ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr);
1727         }
1728 
1729         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1730         rs = 0.0;
1731         /* L part */
1732         pc2 = b->a + bi[i+1];
1733         pj  = b->j + bi[i+1];
1734         nz  = bi[i+2] - bi[i+1];
1735         for (j=0; j<nz; j++) {
1736           col    = pj[j];
1737           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1738         }
1739         /* U part */
1740         pc2 = b->a + bdiag[i+2]+1;
1741         pj  = b->j + bdiag[i+2]+1;
1742         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1743         for (j=0; j<nz; j++) {
1744           col    = pj[j];
1745           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1746         }
1747 
1748         sctx.rs = rs;
1749         sctx.pv = rtmp2[i+1];
1750         ierr    = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr);
1751         if (sctx.newshift) break;
1752         pc2  = b->a + bdiag[i+1];
1753         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1754 
1755         /* Now take care of 2nd column of diagonal 4x4 block. */
1756         pc3 = rtmp3 + i+1;
1757         pc4 = rtmp4 + i+1;
1758         if (*pc3 != 0.0 || *pc4 != 0.0) {
1759           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1760           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1761           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1762           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1763           for (j=0; j<nz; j++) {
1764             col         = pj[j];
1765             rtmp3[col] -= mul3 * rtmp2[col];
1766             rtmp4[col] -= mul4 * rtmp2[col];
1767           }
1768           ierr = PetscLogFlops(4*nz);CHKERRQ(ierr);
1769         }
1770 
1771         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1772         rs = 0.0;
1773         /* L part */
1774         pc3 = b->a + bi[i+2];
1775         pj  = b->j + bi[i+2];
1776         nz  = bi[i+3] - bi[i+2];
1777         for (j=0; j<nz; j++) {
1778           col    = pj[j];
1779           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1780         }
1781         /* U part */
1782         pc3 = b->a + bdiag[i+3]+1;
1783         pj  = b->j + bdiag[i+3]+1;
1784         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1785         for (j=0; j<nz; j++) {
1786           col    = pj[j];
1787           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1788         }
1789 
1790         sctx.rs = rs;
1791         sctx.pv = rtmp3[i+2];
1792         ierr    = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr);
1793         if (sctx.newshift) break;
1794         pc3  = b->a + bdiag[i+2];
1795         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1796 
1797         /* Now take care of 3rd column of diagonal 4x4 block. */
1798         pc4 = rtmp4 + i+2;
1799         if (*pc4 != 0.0) {
1800           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1801           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1802           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1803           for (j=0; j<nz; j++) {
1804             col         = pj[j];
1805             rtmp4[col] -= mul4 * rtmp3[col];
1806           }
1807           ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr);
1808         }
1809 
1810         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1811         rs = 0.0;
1812         /* L part */
1813         pc4 = b->a + bi[i+3];
1814         pj  = b->j + bi[i+3];
1815         nz  = bi[i+4] - bi[i+3];
1816         for (j=0; j<nz; j++) {
1817           col    = pj[j];
1818           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1819         }
1820         /* U part */
1821         pc4 = b->a + bdiag[i+4]+1;
1822         pj  = b->j + bdiag[i+4]+1;
1823         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1824         for (j=0; j<nz; j++) {
1825           col    = pj[j];
1826           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1827         }
1828 
1829         sctx.rs = rs;
1830         sctx.pv = rtmp4[i+3];
1831         ierr    = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr);
1832         if (sctx.newshift) break;
1833         pc4  = b->a + bdiag[i+3];
1834         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1835         break;
1836 
1837       default:
1838         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1839       }
1840       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1841       i += nodesz;                 /* Update the row */
1842     }
1843 
1844     /* MatPivotRefine() */
1845     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1846       /*
1847        * if no shift in this attempt & shifting & started shifting & can refine,
1848        * then try lower shift
1849        */
1850       sctx.shift_hi       = sctx.shift_fraction;
1851       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1852       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1853       sctx.newshift       = PETSC_TRUE;
1854       sctx.nshift++;
1855     }
1856   } while (sctx.newshift);
1857 
1858   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1859   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1860   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1861   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1862 
1863   if (b->inode.size) {
1864     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1865   } else {
1866     C->ops->solve           = MatSolve_SeqAIJ;
1867   }
1868   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1869   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1870   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1871   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1872   C->assembled              = PETSC_TRUE;
1873   C->preallocated           = PETSC_TRUE;
1874 
1875   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1876 
1877   /* MatShiftView(A,info,&sctx) */
1878   if (sctx.nshift) {
1879     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1880       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1881     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1882       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1883     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1884       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1885     }
1886   }
1887   PetscFunctionReturn(0);
1888 }
1889 
1890 #undef __FUNCT__
1891 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace"
1892 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1893 {
1894   Mat             C     = B;
1895   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1896   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1897   PetscErrorCode  ierr;
1898   const PetscInt  *r,*ic,*c,*ics;
1899   PetscInt        n   = A->rmap->n,*bi = b->i;
1900   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1901   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1902   PetscInt        *ai = a->i,*aj = a->j;
1903   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1904   PetscScalar     mul1,mul2,mul3,tmp;
1905   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1906   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1907   PetscReal       rs=0.0;
1908   FactorShiftCtx  sctx;
1909 
1910   PetscFunctionBegin;
1911   sctx.shift_top      = 0;
1912   sctx.nshift_max     = 0;
1913   sctx.shift_lo       = 0;
1914   sctx.shift_hi       = 0;
1915   sctx.shift_fraction = 0;
1916 
1917   /* if both shift schemes are chosen by user, only use info->shiftpd */
1918   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1919     sctx.shift_top = 0;
1920     for (i=0; i<n; i++) {
1921       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1922       rs    = 0.0;
1923       ajtmp = aj + ai[i];
1924       rtmp1 = aa + ai[i];
1925       nz    = ai[i+1] - ai[i];
1926       for (j=0; j<nz; j++) {
1927         if (*ajtmp != i) {
1928           rs += PetscAbsScalar(*rtmp1++);
1929         } else {
1930           rs -= PetscRealPart(*rtmp1++);
1931         }
1932         ajtmp++;
1933       }
1934       if (rs>sctx.shift_top) sctx.shift_top = rs;
1935     }
1936     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1937     sctx.shift_top *= 1.1;
1938     sctx.nshift_max = 5;
1939     sctx.shift_lo   = 0.;
1940     sctx.shift_hi   = 1.;
1941   }
1942   sctx.shift_amount = 0;
1943   sctx.nshift       = 0;
1944 
1945   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1946   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1947   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1948   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1949   ics    = ic;
1950 
1951   node_max = a->inode.node_count;
1952   ns       = a->inode.size;
1953   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1954 
1955   /* If max inode size > 3, split it into two inodes.*/
1956   /* also map the inode sizes according to the ordering */
1957   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1958   for (i=0,j=0; i<node_max; ++i,++j) {
1959     if (ns[i]>3) {
1960       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1961       ++j;
1962       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1963     } else {
1964       tmp_vec1[j] = ns[i];
1965     }
1966   }
1967   /* Use the correct node_max */
1968   node_max = j;
1969 
1970   /* Now reorder the inode info based on mat re-ordering info */
1971   /* First create a row -> inode_size_array_index map */
1972   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1973   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1974   for (i=0,row=0; i<node_max; i++) {
1975     nodesz = tmp_vec1[i];
1976     for (j=0; j<nodesz; j++,row++) {
1977       nsmap[row] = i;
1978     }
1979   }
1980   /* Using nsmap, create a reordered ns structure */
1981   for (i=0,j=0; i< node_max; i++) {
1982     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1983     tmp_vec2[i] = nodesz;
1984     j          += nodesz;
1985   }
1986   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1987   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1988   /* Now use the correct ns */
1989   ns = tmp_vec2;
1990 
1991   do {
1992     sctx.newshift = PETSC_FALSE;
1993     /* Now loop over each block-row, and do the factorization */
1994     for (i=0,row=0; i<node_max; i++) {
1995       nodesz = ns[i];
1996       nz     = bi[row+1] - bi[row];
1997       bjtmp  = bj + bi[row];
1998 
1999       switch (nodesz) {
2000       case 1:
2001         for  (j=0; j<nz; j++) {
2002           idx         = bjtmp[j];
2003           rtmp11[idx] = 0.0;
2004         }
2005 
2006         /* load in initial (unfactored row) */
2007         idx    = r[row];
2008         nz_tmp = ai[idx+1] - ai[idx];
2009         ajtmp  = aj + ai[idx];
2010         v1     = aa + ai[idx];
2011 
2012         for (j=0; j<nz_tmp; j++) {
2013           idx         = ics[ajtmp[j]];
2014           rtmp11[idx] = v1[j];
2015         }
2016         rtmp11[ics[r[row]]] += sctx.shift_amount;
2017 
2018         prow = *bjtmp++;
2019         while (prow < row) {
2020           pc1 = rtmp11 + prow;
2021           if (*pc1 != 0.0) {
2022             pv     = ba + bd[prow];
2023             pj     = nbj + bd[prow];
2024             mul1   = *pc1 * *pv++;
2025             *pc1   = mul1;
2026             nz_tmp = bi[prow+1] - bd[prow] - 1;
2027             ierr   = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2028             for (j=0; j<nz_tmp; j++) {
2029               tmp          = pv[j];
2030               idx          = pj[j];
2031               rtmp11[idx] -= mul1 * tmp;
2032             }
2033           }
2034           prow = *bjtmp++;
2035         }
2036         pj  = bj + bi[row];
2037         pc1 = ba + bi[row];
2038 
2039         sctx.pv     = rtmp11[row];
2040         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2041         rs          = 0.0;
2042         for (j=0; j<nz; j++) {
2043           idx    = pj[j];
2044           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2045           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2046         }
2047         sctx.rs = rs;
2048         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2049         if (sctx.newshift) goto endofwhile;
2050         break;
2051 
2052       case 2:
2053         for (j=0; j<nz; j++) {
2054           idx         = bjtmp[j];
2055           rtmp11[idx] = 0.0;
2056           rtmp22[idx] = 0.0;
2057         }
2058 
2059         /* load in initial (unfactored row) */
2060         idx    = r[row];
2061         nz_tmp = ai[idx+1] - ai[idx];
2062         ajtmp  = aj + ai[idx];
2063         v1     = aa + ai[idx];
2064         v2     = aa + ai[idx+1];
2065         for (j=0; j<nz_tmp; j++) {
2066           idx         = ics[ajtmp[j]];
2067           rtmp11[idx] = v1[j];
2068           rtmp22[idx] = v2[j];
2069         }
2070         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2071         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2072 
2073         prow = *bjtmp++;
2074         while (prow < row) {
2075           pc1 = rtmp11 + prow;
2076           pc2 = rtmp22 + prow;
2077           if (*pc1 != 0.0 || *pc2 != 0.0) {
2078             pv   = ba + bd[prow];
2079             pj   = nbj + bd[prow];
2080             mul1 = *pc1 * *pv;
2081             mul2 = *pc2 * *pv;
2082             ++pv;
2083             *pc1 = mul1;
2084             *pc2 = mul2;
2085 
2086             nz_tmp = bi[prow+1] - bd[prow] - 1;
2087             for (j=0; j<nz_tmp; j++) {
2088               tmp          = pv[j];
2089               idx          = pj[j];
2090               rtmp11[idx] -= mul1 * tmp;
2091               rtmp22[idx] -= mul2 * tmp;
2092             }
2093             ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2094           }
2095           prow = *bjtmp++;
2096         }
2097 
2098         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2099         pc1 = rtmp11 + prow;
2100         pc2 = rtmp22 + prow;
2101 
2102         sctx.pv = *pc1;
2103         pj      = bj + bi[prow];
2104         rs      = 0.0;
2105         for (j=0; j<nz; j++) {
2106           idx = pj[j];
2107           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2108         }
2109         sctx.rs = rs;
2110         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2111         if (sctx.newshift) goto endofwhile;
2112 
2113         if (*pc2 != 0.0) {
2114           pj     = nbj + bd[prow];
2115           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2116           *pc2   = mul2;
2117           nz_tmp = bi[prow+1] - bd[prow] - 1;
2118           for (j=0; j<nz_tmp; j++) {
2119             idx          = pj[j];
2120             tmp          = rtmp11[idx];
2121             rtmp22[idx] -= mul2 * tmp;
2122           }
2123           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2124         }
2125 
2126         pj  = bj + bi[row];
2127         pc1 = ba + bi[row];
2128         pc2 = ba + bi[row+1];
2129 
2130         sctx.pv       = rtmp22[row+1];
2131         rs            = 0.0;
2132         rtmp11[row]   = 1.0/rtmp11[row];
2133         rtmp22[row+1] = 1.0/rtmp22[row+1];
2134         /* copy row entries from dense representation to sparse */
2135         for (j=0; j<nz; j++) {
2136           idx    = pj[j];
2137           pc1[j] = rtmp11[idx];
2138           pc2[j] = rtmp22[idx];
2139           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2140         }
2141         sctx.rs = rs;
2142         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2143         if (sctx.newshift) goto endofwhile;
2144         break;
2145 
2146       case 3:
2147         for  (j=0; j<nz; j++) {
2148           idx         = bjtmp[j];
2149           rtmp11[idx] = 0.0;
2150           rtmp22[idx] = 0.0;
2151           rtmp33[idx] = 0.0;
2152         }
2153         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2154         idx    = r[row];
2155         nz_tmp = ai[idx+1] - ai[idx];
2156         ajtmp  = aj + ai[idx];
2157         v1     = aa + ai[idx];
2158         v2     = aa + ai[idx+1];
2159         v3     = aa + ai[idx+2];
2160         for (j=0; j<nz_tmp; j++) {
2161           idx         = ics[ajtmp[j]];
2162           rtmp11[idx] = v1[j];
2163           rtmp22[idx] = v2[j];
2164           rtmp33[idx] = v3[j];
2165         }
2166         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2167         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2168         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2169 
2170         /* loop over all pivot row blocks above this row block */
2171         prow = *bjtmp++;
2172         while (prow < row) {
2173           pc1 = rtmp11 + prow;
2174           pc2 = rtmp22 + prow;
2175           pc3 = rtmp33 + prow;
2176           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2177             pv   = ba  + bd[prow];
2178             pj   = nbj + bd[prow];
2179             mul1 = *pc1 * *pv;
2180             mul2 = *pc2 * *pv;
2181             mul3 = *pc3 * *pv;
2182             ++pv;
2183             *pc1 = mul1;
2184             *pc2 = mul2;
2185             *pc3 = mul3;
2186 
2187             nz_tmp = bi[prow+1] - bd[prow] - 1;
2188             /* update this row based on pivot row */
2189             for (j=0; j<nz_tmp; j++) {
2190               tmp          = pv[j];
2191               idx          = pj[j];
2192               rtmp11[idx] -= mul1 * tmp;
2193               rtmp22[idx] -= mul2 * tmp;
2194               rtmp33[idx] -= mul3 * tmp;
2195             }
2196             ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr);
2197           }
2198           prow = *bjtmp++;
2199         }
2200 
2201         /* Now take care of diagonal 3x3 block in this set of rows */
2202         /* note: prow = row here */
2203         pc1 = rtmp11 + prow;
2204         pc2 = rtmp22 + prow;
2205         pc3 = rtmp33 + prow;
2206 
2207         sctx.pv = *pc1;
2208         pj      = bj + bi[prow];
2209         rs      = 0.0;
2210         for (j=0; j<nz; j++) {
2211           idx = pj[j];
2212           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2213         }
2214         sctx.rs = rs;
2215         ierr    = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr);
2216         if (sctx.newshift) goto endofwhile;
2217 
2218         if (*pc2 != 0.0 || *pc3 != 0.0) {
2219           mul2   = (*pc2)/(*pc1);
2220           mul3   = (*pc3)/(*pc1);
2221           *pc2   = mul2;
2222           *pc3   = mul3;
2223           nz_tmp = bi[prow+1] - bd[prow] - 1;
2224           pj     = nbj + bd[prow];
2225           for (j=0; j<nz_tmp; j++) {
2226             idx          = pj[j];
2227             tmp          = rtmp11[idx];
2228             rtmp22[idx] -= mul2 * tmp;
2229             rtmp33[idx] -= mul3 * tmp;
2230           }
2231           ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr);
2232         }
2233         ++prow;
2234 
2235         pc2     = rtmp22 + prow;
2236         pc3     = rtmp33 + prow;
2237         sctx.pv = *pc2;
2238         pj      = bj + bi[prow];
2239         rs      = 0.0;
2240         for (j=0; j<nz; j++) {
2241           idx = pj[j];
2242           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2243         }
2244         sctx.rs = rs;
2245         ierr    = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr);
2246         if (sctx.newshift) goto endofwhile;
2247 
2248         if (*pc3 != 0.0) {
2249           mul3   = (*pc3)/(*pc2);
2250           *pc3   = mul3;
2251           pj     = nbj + bd[prow];
2252           nz_tmp = bi[prow+1] - bd[prow] - 1;
2253           for (j=0; j<nz_tmp; j++) {
2254             idx          = pj[j];
2255             tmp          = rtmp22[idx];
2256             rtmp33[idx] -= mul3 * tmp;
2257           }
2258           ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr);
2259         }
2260 
2261         pj  = bj + bi[row];
2262         pc1 = ba + bi[row];
2263         pc2 = ba + bi[row+1];
2264         pc3 = ba + bi[row+2];
2265 
2266         sctx.pv       = rtmp33[row+2];
2267         rs            = 0.0;
2268         rtmp11[row]   = 1.0/rtmp11[row];
2269         rtmp22[row+1] = 1.0/rtmp22[row+1];
2270         rtmp33[row+2] = 1.0/rtmp33[row+2];
2271         /* copy row entries from dense representation to sparse */
2272         for (j=0; j<nz; j++) {
2273           idx    = pj[j];
2274           pc1[j] = rtmp11[idx];
2275           pc2[j] = rtmp22[idx];
2276           pc3[j] = rtmp33[idx];
2277           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2278         }
2279 
2280         sctx.rs = rs;
2281         ierr    = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr);
2282         if (sctx.newshift) goto endofwhile;
2283         break;
2284 
2285       default:
2286         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2287       }
2288       row += nodesz;                 /* Update the row */
2289     }
2290 endofwhile:;
2291   } while (sctx.newshift);
2292   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2293   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2294   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2295   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2296   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2297 
2298   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2299   /* do not set solve add, since MatSolve_Inode + Add is faster */
2300   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2301   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2302   C->assembled              = PETSC_TRUE;
2303   C->preallocated           = PETSC_TRUE;
2304   if (sctx.nshift) {
2305     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2306       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2307     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2308       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2309     }
2310   }
2311   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2312   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2313   PetscFunctionReturn(0);
2314 }
2315 
2316 
2317 /* ----------------------------------------------------------- */
2318 #undef __FUNCT__
2319 #define __FUNCT__ "MatSolve_SeqAIJ_Inode"
2320 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2321 {
2322   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2323   IS                iscol = a->col,isrow = a->row;
2324   PetscErrorCode    ierr;
2325   const PetscInt    *r,*c,*rout,*cout;
2326   PetscInt          i,j,n = A->rmap->n;
2327   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2328   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2329   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2330   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2331   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2332   const PetscScalar *b;
2333 
2334   PetscFunctionBegin;
2335   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2336   node_max = a->inode.node_count;
2337   ns       = a->inode.size;     /* Node Size array */
2338 
2339   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2340   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2341   tmp  = a->solve_work;
2342 
2343   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2344   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2345 
2346   /* forward solve the lower triangular */
2347   tmps = tmp;
2348   aa   = a_a;
2349   aj   = a_j;
2350   ad   = a->diag;
2351 
2352   for (i = 0,row = 0; i< node_max; ++i) {
2353     nsz = ns[i];
2354     aii = ai[row];
2355     v1  = aa + aii;
2356     vi  = aj + aii;
2357     nz  = ai[row+1]- ai[row];
2358 
2359     if (i < node_max-1) {
2360       /* Prefetch the indices for the next block */
2361       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2362       /* Prefetch the data for the next block */
2363       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2364     }
2365 
2366     switch (nsz) {               /* Each loop in 'case' is unrolled */
2367     case 1:
2368       sum1 = b[r[row]];
2369       for (j=0; j<nz-1; j+=2) {
2370         i0    = vi[j];
2371         i1    = vi[j+1];
2372         tmp0  = tmps[i0];
2373         tmp1  = tmps[i1];
2374         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2375       }
2376       if (j == nz-1) {
2377         tmp0  = tmps[vi[j]];
2378         sum1 -= v1[j]*tmp0;
2379       }
2380       tmp[row++]=sum1;
2381       break;
2382     case 2:
2383       sum1 = b[r[row]];
2384       sum2 = b[r[row+1]];
2385       v2   = aa + ai[row+1];
2386 
2387       for (j=0; j<nz-1; j+=2) {
2388         i0    = vi[j];
2389         i1    = vi[j+1];
2390         tmp0  = tmps[i0];
2391         tmp1  = tmps[i1];
2392         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2393         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2394       }
2395       if (j == nz-1) {
2396         tmp0  = tmps[vi[j]];
2397         sum1 -= v1[j] *tmp0;
2398         sum2 -= v2[j] *tmp0;
2399       }
2400       sum2     -= v2[nz] * sum1;
2401       tmp[row++]=sum1;
2402       tmp[row++]=sum2;
2403       break;
2404     case 3:
2405       sum1 = b[r[row]];
2406       sum2 = b[r[row+1]];
2407       sum3 = b[r[row+2]];
2408       v2   = aa + ai[row+1];
2409       v3   = aa + ai[row+2];
2410 
2411       for (j=0; j<nz-1; j+=2) {
2412         i0    = vi[j];
2413         i1    = vi[j+1];
2414         tmp0  = tmps[i0];
2415         tmp1  = tmps[i1];
2416         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2417         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2418         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2419       }
2420       if (j == nz-1) {
2421         tmp0  = tmps[vi[j]];
2422         sum1 -= v1[j] *tmp0;
2423         sum2 -= v2[j] *tmp0;
2424         sum3 -= v3[j] *tmp0;
2425       }
2426       sum2     -= v2[nz] * sum1;
2427       sum3     -= v3[nz] * sum1;
2428       sum3     -= v3[nz+1] * sum2;
2429       tmp[row++]=sum1;
2430       tmp[row++]=sum2;
2431       tmp[row++]=sum3;
2432       break;
2433 
2434     case 4:
2435       sum1 = b[r[row]];
2436       sum2 = b[r[row+1]];
2437       sum3 = b[r[row+2]];
2438       sum4 = b[r[row+3]];
2439       v2   = aa + ai[row+1];
2440       v3   = aa + ai[row+2];
2441       v4   = aa + ai[row+3];
2442 
2443       for (j=0; j<nz-1; j+=2) {
2444         i0    = vi[j];
2445         i1    = vi[j+1];
2446         tmp0  = tmps[i0];
2447         tmp1  = tmps[i1];
2448         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2449         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2450         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2451         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2452       }
2453       if (j == nz-1) {
2454         tmp0  = tmps[vi[j]];
2455         sum1 -= v1[j] *tmp0;
2456         sum2 -= v2[j] *tmp0;
2457         sum3 -= v3[j] *tmp0;
2458         sum4 -= v4[j] *tmp0;
2459       }
2460       sum2 -= v2[nz] * sum1;
2461       sum3 -= v3[nz] * sum1;
2462       sum4 -= v4[nz] * sum1;
2463       sum3 -= v3[nz+1] * sum2;
2464       sum4 -= v4[nz+1] * sum2;
2465       sum4 -= v4[nz+2] * sum3;
2466 
2467       tmp[row++]=sum1;
2468       tmp[row++]=sum2;
2469       tmp[row++]=sum3;
2470       tmp[row++]=sum4;
2471       break;
2472     case 5:
2473       sum1 = b[r[row]];
2474       sum2 = b[r[row+1]];
2475       sum3 = b[r[row+2]];
2476       sum4 = b[r[row+3]];
2477       sum5 = b[r[row+4]];
2478       v2   = aa + ai[row+1];
2479       v3   = aa + ai[row+2];
2480       v4   = aa + ai[row+3];
2481       v5   = aa + ai[row+4];
2482 
2483       for (j=0; j<nz-1; j+=2) {
2484         i0    = vi[j];
2485         i1    = vi[j+1];
2486         tmp0  = tmps[i0];
2487         tmp1  = tmps[i1];
2488         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2489         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2490         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2491         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2492         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2493       }
2494       if (j == nz-1) {
2495         tmp0  = tmps[vi[j]];
2496         sum1 -= v1[j] *tmp0;
2497         sum2 -= v2[j] *tmp0;
2498         sum3 -= v3[j] *tmp0;
2499         sum4 -= v4[j] *tmp0;
2500         sum5 -= v5[j] *tmp0;
2501       }
2502 
2503       sum2 -= v2[nz] * sum1;
2504       sum3 -= v3[nz] * sum1;
2505       sum4 -= v4[nz] * sum1;
2506       sum5 -= v5[nz] * sum1;
2507       sum3 -= v3[nz+1] * sum2;
2508       sum4 -= v4[nz+1] * sum2;
2509       sum5 -= v5[nz+1] * sum2;
2510       sum4 -= v4[nz+2] * sum3;
2511       sum5 -= v5[nz+2] * sum3;
2512       sum5 -= v5[nz+3] * sum4;
2513 
2514       tmp[row++]=sum1;
2515       tmp[row++]=sum2;
2516       tmp[row++]=sum3;
2517       tmp[row++]=sum4;
2518       tmp[row++]=sum5;
2519       break;
2520     default:
2521       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2522     }
2523   }
2524   /* backward solve the upper triangular */
2525   for (i=node_max -1,row = n-1; i>=0; i--) {
2526     nsz = ns[i];
2527     aii = ad[row+1] + 1;
2528     v1  = aa + aii;
2529     vi  = aj + aii;
2530     nz  = ad[row]- ad[row+1] - 1;
2531 
2532     if (i > 0) {
2533       /* Prefetch the indices for the next block */
2534       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2535       /* Prefetch the data for the next block */
2536       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2537     }
2538 
2539     switch (nsz) {               /* Each loop in 'case' is unrolled */
2540     case 1:
2541       sum1 = tmp[row];
2542 
2543       for (j=0; j<nz-1; j+=2) {
2544         i0    = vi[j];
2545         i1    = vi[j+1];
2546         tmp0  = tmps[i0];
2547         tmp1  = tmps[i1];
2548         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2549       }
2550       if (j == nz-1) {
2551         tmp0  = tmps[vi[j]];
2552         sum1 -= v1[j]*tmp0;
2553       }
2554       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2555       break;
2556     case 2:
2557       sum1 = tmp[row];
2558       sum2 = tmp[row-1];
2559       v2   = aa + ad[row] + 1;
2560       for (j=0; j<nz-1; j+=2) {
2561         i0    = vi[j];
2562         i1    = vi[j+1];
2563         tmp0  = tmps[i0];
2564         tmp1  = tmps[i1];
2565         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2566         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2567       }
2568       if (j == nz-1) {
2569         tmp0  = tmps[vi[j]];
2570         sum1 -= v1[j]* tmp0;
2571         sum2 -= v2[j+1]* tmp0;
2572       }
2573 
2574       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2575       sum2     -= v2[0] * tmp0;
2576       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2577       break;
2578     case 3:
2579       sum1 = tmp[row];
2580       sum2 = tmp[row -1];
2581       sum3 = tmp[row -2];
2582       v2   = aa + ad[row] + 1;
2583       v3   = aa + ad[row -1] + 1;
2584       for (j=0; j<nz-1; j+=2) {
2585         i0    = vi[j];
2586         i1    = vi[j+1];
2587         tmp0  = tmps[i0];
2588         tmp1  = tmps[i1];
2589         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2590         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2591         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2592       }
2593       if (j== nz-1) {
2594         tmp0  = tmps[vi[j]];
2595         sum1 -= v1[j] * tmp0;
2596         sum2 -= v2[j+1] * tmp0;
2597         sum3 -= v3[j+2] * tmp0;
2598       }
2599       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2600       sum2     -= v2[0]* tmp0;
2601       sum3     -= v3[1] * tmp0;
2602       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2603       sum3     -= v3[0]* tmp0;
2604       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2605 
2606       break;
2607     case 4:
2608       sum1 = tmp[row];
2609       sum2 = tmp[row -1];
2610       sum3 = tmp[row -2];
2611       sum4 = tmp[row -3];
2612       v2   = aa + ad[row]+1;
2613       v3   = aa + ad[row -1]+1;
2614       v4   = aa + ad[row -2]+1;
2615 
2616       for (j=0; j<nz-1; j+=2) {
2617         i0    = vi[j];
2618         i1    = vi[j+1];
2619         tmp0  = tmps[i0];
2620         tmp1  = tmps[i1];
2621         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2622         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2623         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2624         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2625       }
2626       if (j== nz-1) {
2627         tmp0  = tmps[vi[j]];
2628         sum1 -= v1[j] * tmp0;
2629         sum2 -= v2[j+1] * tmp0;
2630         sum3 -= v3[j+2] * tmp0;
2631         sum4 -= v4[j+3] * tmp0;
2632       }
2633 
2634       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2635       sum2     -= v2[0] * tmp0;
2636       sum3     -= v3[1] * tmp0;
2637       sum4     -= v4[2] * tmp0;
2638       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2639       sum3     -= v3[0] * tmp0;
2640       sum4     -= v4[1] * tmp0;
2641       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2642       sum4     -= v4[0] * tmp0;
2643       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2644       break;
2645     case 5:
2646       sum1 = tmp[row];
2647       sum2 = tmp[row -1];
2648       sum3 = tmp[row -2];
2649       sum4 = tmp[row -3];
2650       sum5 = tmp[row -4];
2651       v2   = aa + ad[row]+1;
2652       v3   = aa + ad[row -1]+1;
2653       v4   = aa + ad[row -2]+1;
2654       v5   = aa + ad[row -3]+1;
2655       for (j=0; j<nz-1; j+=2) {
2656         i0    = vi[j];
2657         i1    = vi[j+1];
2658         tmp0  = tmps[i0];
2659         tmp1  = tmps[i1];
2660         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2661         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2662         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2663         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2664         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2665       }
2666       if (j==nz-1) {
2667         tmp0  = tmps[vi[j]];
2668         sum1 -= v1[j] * tmp0;
2669         sum2 -= v2[j+1] * tmp0;
2670         sum3 -= v3[j+2] * tmp0;
2671         sum4 -= v4[j+3] * tmp0;
2672         sum5 -= v5[j+4] * tmp0;
2673       }
2674 
2675       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2676       sum2     -= v2[0] * tmp0;
2677       sum3     -= v3[1] * tmp0;
2678       sum4     -= v4[2] * tmp0;
2679       sum5     -= v5[3] * tmp0;
2680       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2681       sum3     -= v3[0] * tmp0;
2682       sum4     -= v4[1] * tmp0;
2683       sum5     -= v5[2] * tmp0;
2684       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2685       sum4     -= v4[0] * tmp0;
2686       sum5     -= v5[1] * tmp0;
2687       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2688       sum5     -= v5[0] * tmp0;
2689       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2690       break;
2691     default:
2692       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2693     }
2694   }
2695   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2696   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2697   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2698   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2699   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2700   PetscFunctionReturn(0);
2701 }
2702 
2703 
2704 /*
2705      Makes a longer coloring[] array and calls the usual code with that
2706 */
2707 #undef __FUNCT__
2708 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode"
2709 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2710 {
2711   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2712   PetscErrorCode  ierr;
2713   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2714   PetscInt        *colorused,i;
2715   ISColoringValue *newcolor;
2716 
2717   PetscFunctionBegin;
2718   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2719   /* loop over inodes, marking a color for each column*/
2720   row = 0;
2721   for (i=0; i<m; i++) {
2722     for (j=0; j<ns[i]; j++) {
2723       newcolor[row++] = coloring[i] + j*ncolors;
2724     }
2725   }
2726 
2727   /* eliminate unneeded colors */
2728   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2729   for (i=0; i<n; i++) {
2730     colorused[newcolor[i]] = 1;
2731   }
2732 
2733   for (i=1; i<5*ncolors; i++) {
2734     colorused[i] += colorused[i-1];
2735   }
2736   ncolors = colorused[5*ncolors-1];
2737   for (i=0; i<n; i++) {
2738     newcolor[i] = colorused[newcolor[i]]-1;
2739   }
2740   ierr = PetscFree(colorused);CHKERRQ(ierr);
2741   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2742   ierr = PetscFree(coloring);CHKERRQ(ierr);
2743   PetscFunctionReturn(0);
2744 }
2745 
2746 #include <petsc/private/kernels/blockinvert.h>
2747 
2748 #undef __FUNCT__
2749 #define __FUNCT__ "MatSOR_SeqAIJ_Inode"
2750 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2751 {
2752   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2753   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2754   MatScalar         *ibdiag,*bdiag,work[25],*t;
2755   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2756   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2757   const PetscScalar *xb, *b;
2758   PetscReal         zeropivot = 1.0e-15, shift = 0.0;
2759   PetscErrorCode    ierr;
2760   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2761   PetscInt          sz,k,ipvt[5];
2762   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2763 
2764   PetscFunctionBegin;
2765   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2766   if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
2767   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2768 
2769   if (!a->inode.ibdiagvalid) {
2770     if (!a->inode.ibdiag) {
2771       /* calculate space needed for diagonal blocks */
2772       for (i=0; i<m; i++) {
2773         cnt += sizes[i]*sizes[i];
2774       }
2775       a->inode.bdiagsize = cnt;
2776 
2777       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2778     }
2779 
2780     /* copy over the diagonal blocks and invert them */
2781     ibdiag = a->inode.ibdiag;
2782     bdiag  = a->inode.bdiag;
2783     cnt    = 0;
2784     for (i=0, row = 0; i<m; i++) {
2785       for (j=0; j<sizes[i]; j++) {
2786         for (k=0; k<sizes[i]; k++) {
2787           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2788         }
2789       }
2790       ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr);
2791 
2792       switch (sizes[i]) {
2793       case 1:
2794         /* Create matrix data structure */
2795         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2796         ibdiag[cnt] = 1.0/ibdiag[cnt];
2797         break;
2798       case 2:
2799         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr);
2800         break;
2801       case 3:
2802         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr);
2803         break;
2804       case 4:
2805         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr);
2806         break;
2807       case 5:
2808         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr);
2809         break;
2810       default:
2811         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2812       }
2813       cnt += sizes[i]*sizes[i];
2814       row += sizes[i];
2815     }
2816     a->inode.ibdiagvalid = PETSC_TRUE;
2817   }
2818   ibdiag = a->inode.ibdiag;
2819   bdiag  = a->inode.bdiag;
2820   t      = a->inode.ssor_work;
2821 
2822   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2823   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2824   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2825   if (flag & SOR_ZERO_INITIAL_GUESS) {
2826     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2827 
2828       for (i=0, row=0; i<m; i++) {
2829         sz  = diag[row] - ii[row];
2830         v1  = a->a + ii[row];
2831         idx = a->j + ii[row];
2832 
2833         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2834         switch (sizes[i]) {
2835         case 1:
2836 
2837           sum1 = b[row];
2838           for (n = 0; n<sz-1; n+=2) {
2839             i1    = idx[0];
2840             i2    = idx[1];
2841             idx  += 2;
2842             tmp0  = x[i1];
2843             tmp1  = x[i2];
2844             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2845           }
2846 
2847           if (n == sz-1) {
2848             tmp0  = x[*idx];
2849             sum1 -= *v1 * tmp0;
2850           }
2851           t[row]   = sum1;
2852           x[row++] = sum1*(*ibdiag++);
2853           break;
2854         case 2:
2855           v2   = a->a + ii[row+1];
2856           sum1 = b[row];
2857           sum2 = b[row+1];
2858           for (n = 0; n<sz-1; n+=2) {
2859             i1    = idx[0];
2860             i2    = idx[1];
2861             idx  += 2;
2862             tmp0  = x[i1];
2863             tmp1  = x[i2];
2864             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2865             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2866           }
2867 
2868           if (n == sz-1) {
2869             tmp0  = x[*idx];
2870             sum1 -= v1[0] * tmp0;
2871             sum2 -= v2[0] * tmp0;
2872           }
2873           t[row]   = sum1;
2874           t[row+1] = sum2;
2875           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2876           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2877           ibdiag  += 4;
2878           break;
2879         case 3:
2880           v2   = a->a + ii[row+1];
2881           v3   = a->a + ii[row+2];
2882           sum1 = b[row];
2883           sum2 = b[row+1];
2884           sum3 = b[row+2];
2885           for (n = 0; n<sz-1; n+=2) {
2886             i1    = idx[0];
2887             i2    = idx[1];
2888             idx  += 2;
2889             tmp0  = x[i1];
2890             tmp1  = x[i2];
2891             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2892             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2893             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2894           }
2895 
2896           if (n == sz-1) {
2897             tmp0  = x[*idx];
2898             sum1 -= v1[0] * tmp0;
2899             sum2 -= v2[0] * tmp0;
2900             sum3 -= v3[0] * tmp0;
2901           }
2902           t[row]   = sum1;
2903           t[row+1] = sum2;
2904           t[row+2] = sum3;
2905           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2906           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2907           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2908           ibdiag  += 9;
2909           break;
2910         case 4:
2911           v2   = a->a + ii[row+1];
2912           v3   = a->a + ii[row+2];
2913           v4   = a->a + ii[row+3];
2914           sum1 = b[row];
2915           sum2 = b[row+1];
2916           sum3 = b[row+2];
2917           sum4 = b[row+3];
2918           for (n = 0; n<sz-1; n+=2) {
2919             i1    = idx[0];
2920             i2    = idx[1];
2921             idx  += 2;
2922             tmp0  = x[i1];
2923             tmp1  = x[i2];
2924             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2925             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2926             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2927             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2928           }
2929 
2930           if (n == sz-1) {
2931             tmp0  = x[*idx];
2932             sum1 -= v1[0] * tmp0;
2933             sum2 -= v2[0] * tmp0;
2934             sum3 -= v3[0] * tmp0;
2935             sum4 -= v4[0] * tmp0;
2936           }
2937           t[row]   = sum1;
2938           t[row+1] = sum2;
2939           t[row+2] = sum3;
2940           t[row+3] = sum4;
2941           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2942           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2943           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2944           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2945           ibdiag  += 16;
2946           break;
2947         case 5:
2948           v2   = a->a + ii[row+1];
2949           v3   = a->a + ii[row+2];
2950           v4   = a->a + ii[row+3];
2951           v5   = a->a + ii[row+4];
2952           sum1 = b[row];
2953           sum2 = b[row+1];
2954           sum3 = b[row+2];
2955           sum4 = b[row+3];
2956           sum5 = b[row+4];
2957           for (n = 0; n<sz-1; n+=2) {
2958             i1    = idx[0];
2959             i2    = idx[1];
2960             idx  += 2;
2961             tmp0  = x[i1];
2962             tmp1  = x[i2];
2963             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2964             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2965             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2966             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2967             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2968           }
2969 
2970           if (n == sz-1) {
2971             tmp0  = x[*idx];
2972             sum1 -= v1[0] * tmp0;
2973             sum2 -= v2[0] * tmp0;
2974             sum3 -= v3[0] * tmp0;
2975             sum4 -= v4[0] * tmp0;
2976             sum5 -= v5[0] * tmp0;
2977           }
2978           t[row]   = sum1;
2979           t[row+1] = sum2;
2980           t[row+2] = sum3;
2981           t[row+3] = sum4;
2982           t[row+4] = sum5;
2983           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2984           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2985           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2986           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2987           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2988           ibdiag  += 25;
2989           break;
2990         default:
2991           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2992         }
2993       }
2994 
2995       xb   = t;
2996       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2997     } else xb = b;
2998     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2999 
3000       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3001       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3002         ibdiag -= sizes[i]*sizes[i];
3003         sz      = ii[row+1] - diag[row] - 1;
3004         v1      = a->a + diag[row] + 1;
3005         idx     = a->j + diag[row] + 1;
3006 
3007         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3008         switch (sizes[i]) {
3009         case 1:
3010 
3011           sum1 = xb[row];
3012           for (n = 0; n<sz-1; n+=2) {
3013             i1    = idx[0];
3014             i2    = idx[1];
3015             idx  += 2;
3016             tmp0  = x[i1];
3017             tmp1  = x[i2];
3018             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3019           }
3020 
3021           if (n == sz-1) {
3022             tmp0  = x[*idx];
3023             sum1 -= *v1*tmp0;
3024           }
3025           x[row--] = sum1*(*ibdiag);
3026           break;
3027 
3028         case 2:
3029 
3030           sum1 = xb[row];
3031           sum2 = xb[row-1];
3032           /* note that sum1 is associated with the second of the two rows */
3033           v2 = a->a + diag[row-1] + 2;
3034           for (n = 0; n<sz-1; n+=2) {
3035             i1    = idx[0];
3036             i2    = idx[1];
3037             idx  += 2;
3038             tmp0  = x[i1];
3039             tmp1  = x[i2];
3040             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3041             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3042           }
3043 
3044           if (n == sz-1) {
3045             tmp0  = x[*idx];
3046             sum1 -= *v1*tmp0;
3047             sum2 -= *v2*tmp0;
3048           }
3049           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3050           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3051           break;
3052         case 3:
3053 
3054           sum1 = xb[row];
3055           sum2 = xb[row-1];
3056           sum3 = xb[row-2];
3057           v2   = a->a + diag[row-1] + 2;
3058           v3   = a->a + diag[row-2] + 3;
3059           for (n = 0; n<sz-1; n+=2) {
3060             i1    = idx[0];
3061             i2    = idx[1];
3062             idx  += 2;
3063             tmp0  = x[i1];
3064             tmp1  = x[i2];
3065             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3066             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3067             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3068           }
3069 
3070           if (n == sz-1) {
3071             tmp0  = x[*idx];
3072             sum1 -= *v1*tmp0;
3073             sum2 -= *v2*tmp0;
3074             sum3 -= *v3*tmp0;
3075           }
3076           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3077           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3078           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3079           break;
3080         case 4:
3081 
3082           sum1 = xb[row];
3083           sum2 = xb[row-1];
3084           sum3 = xb[row-2];
3085           sum4 = xb[row-3];
3086           v2   = a->a + diag[row-1] + 2;
3087           v3   = a->a + diag[row-2] + 3;
3088           v4   = a->a + diag[row-3] + 4;
3089           for (n = 0; n<sz-1; n+=2) {
3090             i1    = idx[0];
3091             i2    = idx[1];
3092             idx  += 2;
3093             tmp0  = x[i1];
3094             tmp1  = x[i2];
3095             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3096             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3097             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3098             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3099           }
3100 
3101           if (n == sz-1) {
3102             tmp0  = x[*idx];
3103             sum1 -= *v1*tmp0;
3104             sum2 -= *v2*tmp0;
3105             sum3 -= *v3*tmp0;
3106             sum4 -= *v4*tmp0;
3107           }
3108           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3109           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3110           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3111           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3112           break;
3113         case 5:
3114 
3115           sum1 = xb[row];
3116           sum2 = xb[row-1];
3117           sum3 = xb[row-2];
3118           sum4 = xb[row-3];
3119           sum5 = xb[row-4];
3120           v2   = a->a + diag[row-1] + 2;
3121           v3   = a->a + diag[row-2] + 3;
3122           v4   = a->a + diag[row-3] + 4;
3123           v5   = a->a + diag[row-4] + 5;
3124           for (n = 0; n<sz-1; n+=2) {
3125             i1    = idx[0];
3126             i2    = idx[1];
3127             idx  += 2;
3128             tmp0  = x[i1];
3129             tmp1  = x[i2];
3130             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3131             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3132             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3133             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3134             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3135           }
3136 
3137           if (n == sz-1) {
3138             tmp0  = x[*idx];
3139             sum1 -= *v1*tmp0;
3140             sum2 -= *v2*tmp0;
3141             sum3 -= *v3*tmp0;
3142             sum4 -= *v4*tmp0;
3143             sum5 -= *v5*tmp0;
3144           }
3145           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3146           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3147           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3148           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3149           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3150           break;
3151         default:
3152           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3153         }
3154       }
3155 
3156       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3157     }
3158     its--;
3159   }
3160   while (its--) {
3161 
3162     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3163       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3164            i<m;
3165            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3166 
3167         sz  = diag[row] - ii[row];
3168         v1  = a->a + ii[row];
3169         idx = a->j + ii[row];
3170         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3171         switch (sizes[i]) {
3172         case 1:
3173           sum1 = b[row];
3174           for (n = 0; n<sz-1; n+=2) {
3175             i1    = idx[0];
3176             i2    = idx[1];
3177             idx  += 2;
3178             tmp0  = x[i1];
3179             tmp1  = x[i2];
3180             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3181           }
3182           if (n == sz-1) {
3183             tmp0  = x[*idx++];
3184             sum1 -= *v1 * tmp0;
3185             v1++;
3186           }
3187           t[row]   = sum1;
3188           sz      = ii[row+1] - diag[row] - 1;
3189           idx     = a->j + diag[row] + 1;
3190           v1 += 1;
3191           for (n = 0; n<sz-1; n+=2) {
3192             i1    = idx[0];
3193             i2    = idx[1];
3194             idx  += 2;
3195             tmp0  = x[i1];
3196             tmp1  = x[i2];
3197             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3198           }
3199           if (n == sz-1) {
3200             tmp0  = x[*idx++];
3201             sum1 -= *v1 * tmp0;
3202           }
3203           /* in MatSOR_SeqAIJ this line would be
3204            *
3205            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3206            *
3207            * but omega == 1, so this becomes
3208            *
3209            * x[row] = sum1*(*ibdiag++);
3210            *
3211            */
3212           x[row] = sum1*(*ibdiag);
3213           break;
3214         case 2:
3215           v2   = a->a + ii[row+1];
3216           sum1 = b[row];
3217           sum2 = b[row+1];
3218           for (n = 0; n<sz-1; n+=2) {
3219             i1    = idx[0];
3220             i2    = idx[1];
3221             idx  += 2;
3222             tmp0  = x[i1];
3223             tmp1  = x[i2];
3224             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3225             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3226           }
3227           if (n == sz-1) {
3228             tmp0  = x[*idx++];
3229             sum1 -= v1[0] * tmp0;
3230             sum2 -= v2[0] * tmp0;
3231             v1++; v2++;
3232           }
3233           t[row]   = sum1;
3234           t[row+1] = sum2;
3235           sz      = ii[row+1] - diag[row] - 2;
3236           idx     = a->j + diag[row] + 2;
3237           v1 += 2;
3238           v2 += 2;
3239           for (n = 0; n<sz-1; n+=2) {
3240             i1    = idx[0];
3241             i2    = idx[1];
3242             idx  += 2;
3243             tmp0  = x[i1];
3244             tmp1  = x[i2];
3245             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3246             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3247           }
3248           if (n == sz-1) {
3249             tmp0  = x[*idx];
3250             sum1 -= v1[0] * tmp0;
3251             sum2 -= v2[0] * tmp0;
3252           }
3253           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3254           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3255           break;
3256         case 3:
3257           v2   = a->a + ii[row+1];
3258           v3   = a->a + ii[row+2];
3259           sum1 = b[row];
3260           sum2 = b[row+1];
3261           sum3 = b[row+2];
3262           for (n = 0; n<sz-1; n+=2) {
3263             i1    = idx[0];
3264             i2    = idx[1];
3265             idx  += 2;
3266             tmp0  = x[i1];
3267             tmp1  = x[i2];
3268             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3269             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3270             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3271           }
3272           if (n == sz-1) {
3273             tmp0  = x[*idx++];
3274             sum1 -= v1[0] * tmp0;
3275             sum2 -= v2[0] * tmp0;
3276             sum3 -= v3[0] * tmp0;
3277             v1++; v2++; v3++;
3278           }
3279           t[row]   = sum1;
3280           t[row+1] = sum2;
3281           t[row+2] = sum3;
3282           sz      = ii[row+1] - diag[row] - 3;
3283           idx     = a->j + diag[row] + 3;
3284           v1 += 3;
3285           v2 += 3;
3286           v3 += 3;
3287           for (n = 0; n<sz-1; n+=2) {
3288             i1    = idx[0];
3289             i2    = idx[1];
3290             idx  += 2;
3291             tmp0  = x[i1];
3292             tmp1  = x[i2];
3293             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3294             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3295             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3296           }
3297           if (n == sz-1) {
3298             tmp0  = x[*idx];
3299             sum1 -= v1[0] * tmp0;
3300             sum2 -= v2[0] * tmp0;
3301             sum3 -= v3[0] * tmp0;
3302           }
3303           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3304           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3305           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3306           break;
3307         case 4:
3308           v2   = a->a + ii[row+1];
3309           v3   = a->a + ii[row+2];
3310           v4   = a->a + ii[row+3];
3311           sum1 = b[row];
3312           sum2 = b[row+1];
3313           sum3 = b[row+2];
3314           sum4 = b[row+3];
3315           for (n = 0; n<sz-1; n+=2) {
3316             i1    = idx[0];
3317             i2    = idx[1];
3318             idx  += 2;
3319             tmp0  = x[i1];
3320             tmp1  = x[i2];
3321             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3322             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3323             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3324             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3325           }
3326           if (n == sz-1) {
3327             tmp0  = x[*idx++];
3328             sum1 -= v1[0] * tmp0;
3329             sum2 -= v2[0] * tmp0;
3330             sum3 -= v3[0] * tmp0;
3331             sum4 -= v4[0] * tmp0;
3332             v1++; v2++; v3++; v4++;
3333           }
3334           t[row]   = sum1;
3335           t[row+1] = sum2;
3336           t[row+2] = sum3;
3337           t[row+3] = sum4;
3338           sz      = ii[row+1] - diag[row] - 4;
3339           idx     = a->j + diag[row] + 4;
3340           v1 += 4;
3341           v2 += 4;
3342           v3 += 4;
3343           v4 += 4;
3344           for (n = 0; n<sz-1; n+=2) {
3345             i1    = idx[0];
3346             i2    = idx[1];
3347             idx  += 2;
3348             tmp0  = x[i1];
3349             tmp1  = x[i2];
3350             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3351             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3352             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3353             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3354           }
3355           if (n == sz-1) {
3356             tmp0  = x[*idx];
3357             sum1 -= v1[0] * tmp0;
3358             sum2 -= v2[0] * tmp0;
3359             sum3 -= v3[0] * tmp0;
3360             sum4 -= v4[0] * tmp0;
3361           }
3362           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3363           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3364           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3365           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3366           break;
3367         case 5:
3368           v2   = a->a + ii[row+1];
3369           v3   = a->a + ii[row+2];
3370           v4   = a->a + ii[row+3];
3371           v5   = a->a + ii[row+4];
3372           sum1 = b[row];
3373           sum2 = b[row+1];
3374           sum3 = b[row+2];
3375           sum4 = b[row+3];
3376           sum5 = b[row+4];
3377           for (n = 0; n<sz-1; n+=2) {
3378             i1    = idx[0];
3379             i2    = idx[1];
3380             idx  += 2;
3381             tmp0  = x[i1];
3382             tmp1  = x[i2];
3383             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3384             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3385             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3386             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3387             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3388           }
3389           if (n == sz-1) {
3390             tmp0  = x[*idx++];
3391             sum1 -= v1[0] * tmp0;
3392             sum2 -= v2[0] * tmp0;
3393             sum3 -= v3[0] * tmp0;
3394             sum4 -= v4[0] * tmp0;
3395             sum5 -= v5[0] * tmp0;
3396             v1++; v2++; v3++; v4++; v5++;
3397           }
3398           t[row]   = sum1;
3399           t[row+1] = sum2;
3400           t[row+2] = sum3;
3401           t[row+3] = sum4;
3402           t[row+4] = sum5;
3403           sz      = ii[row+1] - diag[row] - 5;
3404           idx     = a->j + diag[row] + 5;
3405           v1 += 5;
3406           v2 += 5;
3407           v3 += 5;
3408           v4 += 5;
3409           v5 += 5;
3410           for (n = 0; n<sz-1; n+=2) {
3411             i1    = idx[0];
3412             i2    = idx[1];
3413             idx  += 2;
3414             tmp0  = x[i1];
3415             tmp1  = x[i2];
3416             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3417             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3418             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3419             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3420             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3421           }
3422           if (n == sz-1) {
3423             tmp0  = x[*idx];
3424             sum1 -= v1[0] * tmp0;
3425             sum2 -= v2[0] * tmp0;
3426             sum3 -= v3[0] * tmp0;
3427             sum4 -= v4[0] * tmp0;
3428             sum5 -= v5[0] * tmp0;
3429           }
3430           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3431           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3432           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3433           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3434           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3435           break;
3436         default:
3437           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3438         }
3439       }
3440       xb   = t;
3441       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3442     } else xb = b;
3443 
3444     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3445 
3446       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3447       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3448         ibdiag -= sizes[i]*sizes[i];
3449 
3450         /* set RHS */
3451         if (xb == b) {
3452           /* whole (old way) */
3453           sz      = ii[row+1] - ii[row];
3454           idx     = a->j + ii[row];
3455           switch (sizes[i]) {
3456           case 5:
3457             v5      = a->a + ii[row-4];
3458           case 4: /* fall through */
3459             v4      = a->a + ii[row-3];
3460           case 3:
3461             v3      = a->a + ii[row-2];
3462           case 2:
3463             v2      = a->a + ii[row-1];
3464           case 1:
3465             v1      = a->a + ii[row];
3466             break;
3467           default:
3468             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3469           }
3470         } else {
3471           /* upper, no diag */
3472           sz      = ii[row+1] - diag[row] - 1;
3473           idx     = a->j + diag[row] + 1;
3474           switch (sizes[i]) {
3475           case 5:
3476             v5      = a->a + diag[row-4] + 5;
3477           case 4: /* fall through */
3478             v4      = a->a + diag[row-3] + 4;
3479           case 3:
3480             v3      = a->a + diag[row-2] + 3;
3481           case 2:
3482             v2      = a->a + diag[row-1] + 2;
3483           case 1:
3484             v1      = a->a + diag[row] + 1;
3485           }
3486         }
3487         /* set sum */
3488         switch (sizes[i]) {
3489         case 5:
3490           sum5 = xb[row-4];
3491         case 4: /* fall through */
3492           sum4 = xb[row-3];
3493         case 3:
3494           sum3 = xb[row-2];
3495         case 2:
3496           sum2 = xb[row-1];
3497         case 1:
3498           /* note that sum1 is associated with the last row */
3499           sum1 = xb[row];
3500         }
3501         /* do sums */
3502         for (n = 0; n<sz-1; n+=2) {
3503             i1    = idx[0];
3504             i2    = idx[1];
3505             idx  += 2;
3506             tmp0  = x[i1];
3507             tmp1  = x[i2];
3508             switch (sizes[i]) {
3509             case 5:
3510               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3511             case 4: /* fall through */
3512               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3513             case 3:
3514               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3515             case 2:
3516               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3517             case 1:
3518               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3519             }
3520         }
3521         /* ragged edge */
3522         if (n == sz-1) {
3523           tmp0  = x[*idx];
3524           switch (sizes[i]) {
3525           case 5:
3526             sum5 -= *v5*tmp0;
3527           case 4: /* fall through */
3528             sum4 -= *v4*tmp0;
3529           case 3:
3530             sum3 -= *v3*tmp0;
3531           case 2:
3532             sum2 -= *v2*tmp0;
3533           case 1:
3534             sum1 -= *v1*tmp0;
3535           }
3536         }
3537         /* update */
3538         if (xb == b) {
3539           /* whole (old way) w/ diag */
3540           switch (sizes[i]) {
3541           case 5:
3542             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3543             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3544             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3545             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3546             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3547             break;
3548           case 4:
3549             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3550             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3551             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3552             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3553             break;
3554           case 3:
3555             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3556             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3557             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3558             break;
3559           case 2:
3560             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3561             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3562             break;
3563           case 1:
3564             x[row--] += sum1*(*ibdiag);
3565             break;
3566           }
3567         } else {
3568           /* no diag so set =  */
3569           switch (sizes[i]) {
3570           case 5:
3571             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3572             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3573             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3574             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3575             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3576             break;
3577           case 4:
3578             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3579             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3580             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3581             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3582             break;
3583           case 3:
3584             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3585             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3586             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3587             break;
3588           case 2:
3589             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3590             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3591             break;
3592           case 1:
3593             x[row--] = sum1*(*ibdiag);
3594             break;
3595           }
3596         }
3597       }
3598       if (xb == b) {
3599         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3600       } else {
3601         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3602       }
3603     }
3604   }
3605   if (flag & SOR_EISENSTAT) {
3606     /*
3607           Apply  (U + D)^-1  where D is now the block diagonal
3608     */
3609     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3610     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3611       ibdiag -= sizes[i]*sizes[i];
3612       sz      = ii[row+1] - diag[row] - 1;
3613       v1      = a->a + diag[row] + 1;
3614       idx     = a->j + diag[row] + 1;
3615       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3616       switch (sizes[i]) {
3617       case 1:
3618 
3619         sum1 = b[row];
3620         for (n = 0; n<sz-1; n+=2) {
3621           i1    = idx[0];
3622           i2    = idx[1];
3623           idx  += 2;
3624           tmp0  = x[i1];
3625           tmp1  = x[i2];
3626           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3627         }
3628 
3629         if (n == sz-1) {
3630           tmp0  = x[*idx];
3631           sum1 -= *v1*tmp0;
3632         }
3633         x[row] = sum1*(*ibdiag);row--;
3634         break;
3635 
3636       case 2:
3637 
3638         sum1 = b[row];
3639         sum2 = b[row-1];
3640         /* note that sum1 is associated with the second of the two rows */
3641         v2 = a->a + diag[row-1] + 2;
3642         for (n = 0; n<sz-1; n+=2) {
3643           i1    = idx[0];
3644           i2    = idx[1];
3645           idx  += 2;
3646           tmp0  = x[i1];
3647           tmp1  = x[i2];
3648           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3649           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3650         }
3651 
3652         if (n == sz-1) {
3653           tmp0  = x[*idx];
3654           sum1 -= *v1*tmp0;
3655           sum2 -= *v2*tmp0;
3656         }
3657         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3658         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3659         row     -= 2;
3660         break;
3661       case 3:
3662 
3663         sum1 = b[row];
3664         sum2 = b[row-1];
3665         sum3 = b[row-2];
3666         v2   = a->a + diag[row-1] + 2;
3667         v3   = a->a + diag[row-2] + 3;
3668         for (n = 0; n<sz-1; n+=2) {
3669           i1    = idx[0];
3670           i2    = idx[1];
3671           idx  += 2;
3672           tmp0  = x[i1];
3673           tmp1  = x[i2];
3674           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3675           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3676           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3677         }
3678 
3679         if (n == sz-1) {
3680           tmp0  = x[*idx];
3681           sum1 -= *v1*tmp0;
3682           sum2 -= *v2*tmp0;
3683           sum3 -= *v3*tmp0;
3684         }
3685         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3686         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3687         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3688         row     -= 3;
3689         break;
3690       case 4:
3691 
3692         sum1 = b[row];
3693         sum2 = b[row-1];
3694         sum3 = b[row-2];
3695         sum4 = b[row-3];
3696         v2   = a->a + diag[row-1] + 2;
3697         v3   = a->a + diag[row-2] + 3;
3698         v4   = a->a + diag[row-3] + 4;
3699         for (n = 0; n<sz-1; n+=2) {
3700           i1    = idx[0];
3701           i2    = idx[1];
3702           idx  += 2;
3703           tmp0  = x[i1];
3704           tmp1  = x[i2];
3705           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3706           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3707           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3708           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3709         }
3710 
3711         if (n == sz-1) {
3712           tmp0  = x[*idx];
3713           sum1 -= *v1*tmp0;
3714           sum2 -= *v2*tmp0;
3715           sum3 -= *v3*tmp0;
3716           sum4 -= *v4*tmp0;
3717         }
3718         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3719         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3720         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3721         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3722         row     -= 4;
3723         break;
3724       case 5:
3725 
3726         sum1 = b[row];
3727         sum2 = b[row-1];
3728         sum3 = b[row-2];
3729         sum4 = b[row-3];
3730         sum5 = b[row-4];
3731         v2   = a->a + diag[row-1] + 2;
3732         v3   = a->a + diag[row-2] + 3;
3733         v4   = a->a + diag[row-3] + 4;
3734         v5   = a->a + diag[row-4] + 5;
3735         for (n = 0; n<sz-1; n+=2) {
3736           i1    = idx[0];
3737           i2    = idx[1];
3738           idx  += 2;
3739           tmp0  = x[i1];
3740           tmp1  = x[i2];
3741           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3742           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3743           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3744           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3745           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3746         }
3747 
3748         if (n == sz-1) {
3749           tmp0  = x[*idx];
3750           sum1 -= *v1*tmp0;
3751           sum2 -= *v2*tmp0;
3752           sum3 -= *v3*tmp0;
3753           sum4 -= *v4*tmp0;
3754           sum5 -= *v5*tmp0;
3755         }
3756         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3757         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3758         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3759         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3760         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3761         row     -= 5;
3762         break;
3763       default:
3764         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3765       }
3766     }
3767     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3768 
3769     /*
3770            t = b - D x    where D is the block diagonal
3771     */
3772     cnt = 0;
3773     for (i=0, row=0; i<m; i++) {
3774       switch (sizes[i]) {
3775       case 1:
3776         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3777         break;
3778       case 2:
3779         x1       = x[row]; x2 = x[row+1];
3780         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3781         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3782         t[row]   = b[row] - tmp1;
3783         t[row+1] = b[row+1] - tmp2; row += 2;
3784         cnt     += 4;
3785         break;
3786       case 3:
3787         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3788         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3789         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3790         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3791         t[row]   = b[row] - tmp1;
3792         t[row+1] = b[row+1] - tmp2;
3793         t[row+2] = b[row+2] - tmp3; row += 3;
3794         cnt     += 9;
3795         break;
3796       case 4:
3797         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3798         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3799         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3800         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3801         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3802         t[row]   = b[row] - tmp1;
3803         t[row+1] = b[row+1] - tmp2;
3804         t[row+2] = b[row+2] - tmp3;
3805         t[row+3] = b[row+3] - tmp4; row += 4;
3806         cnt     += 16;
3807         break;
3808       case 5:
3809         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3810         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3811         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3812         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3813         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3814         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3815         t[row]   = b[row] - tmp1;
3816         t[row+1] = b[row+1] - tmp2;
3817         t[row+2] = b[row+2] - tmp3;
3818         t[row+3] = b[row+3] - tmp4;
3819         t[row+4] = b[row+4] - tmp5;row += 5;
3820         cnt     += 25;
3821         break;
3822       default:
3823         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3824       }
3825     }
3826     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3827 
3828 
3829 
3830     /*
3831           Apply (L + D)^-1 where D is the block diagonal
3832     */
3833     for (i=0, row=0; i<m; i++) {
3834       sz  = diag[row] - ii[row];
3835       v1  = a->a + ii[row];
3836       idx = a->j + ii[row];
3837       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3838       switch (sizes[i]) {
3839       case 1:
3840 
3841         sum1 = t[row];
3842         for (n = 0; n<sz-1; n+=2) {
3843           i1    = idx[0];
3844           i2    = idx[1];
3845           idx  += 2;
3846           tmp0  = t[i1];
3847           tmp1  = t[i2];
3848           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3849         }
3850 
3851         if (n == sz-1) {
3852           tmp0  = t[*idx];
3853           sum1 -= *v1 * tmp0;
3854         }
3855         x[row] += t[row] = sum1*(*ibdiag++); row++;
3856         break;
3857       case 2:
3858         v2   = a->a + ii[row+1];
3859         sum1 = t[row];
3860         sum2 = t[row+1];
3861         for (n = 0; n<sz-1; n+=2) {
3862           i1    = idx[0];
3863           i2    = idx[1];
3864           idx  += 2;
3865           tmp0  = t[i1];
3866           tmp1  = t[i2];
3867           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3868           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3869         }
3870 
3871         if (n == sz-1) {
3872           tmp0  = t[*idx];
3873           sum1 -= v1[0] * tmp0;
3874           sum2 -= v2[0] * tmp0;
3875         }
3876         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3877         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3878         ibdiag   += 4; row += 2;
3879         break;
3880       case 3:
3881         v2   = a->a + ii[row+1];
3882         v3   = a->a + ii[row+2];
3883         sum1 = t[row];
3884         sum2 = t[row+1];
3885         sum3 = t[row+2];
3886         for (n = 0; n<sz-1; n+=2) {
3887           i1    = idx[0];
3888           i2    = idx[1];
3889           idx  += 2;
3890           tmp0  = t[i1];
3891           tmp1  = t[i2];
3892           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3893           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3894           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3895         }
3896 
3897         if (n == sz-1) {
3898           tmp0  = t[*idx];
3899           sum1 -= v1[0] * tmp0;
3900           sum2 -= v2[0] * tmp0;
3901           sum3 -= v3[0] * tmp0;
3902         }
3903         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3904         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3905         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3906         ibdiag   += 9; row += 3;
3907         break;
3908       case 4:
3909         v2   = a->a + ii[row+1];
3910         v3   = a->a + ii[row+2];
3911         v4   = a->a + ii[row+3];
3912         sum1 = t[row];
3913         sum2 = t[row+1];
3914         sum3 = t[row+2];
3915         sum4 = t[row+3];
3916         for (n = 0; n<sz-1; n+=2) {
3917           i1    = idx[0];
3918           i2    = idx[1];
3919           idx  += 2;
3920           tmp0  = t[i1];
3921           tmp1  = t[i2];
3922           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3923           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3924           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3925           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3926         }
3927 
3928         if (n == sz-1) {
3929           tmp0  = t[*idx];
3930           sum1 -= v1[0] * tmp0;
3931           sum2 -= v2[0] * tmp0;
3932           sum3 -= v3[0] * tmp0;
3933           sum4 -= v4[0] * tmp0;
3934         }
3935         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3936         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3937         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3938         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3939         ibdiag   += 16; row += 4;
3940         break;
3941       case 5:
3942         v2   = a->a + ii[row+1];
3943         v3   = a->a + ii[row+2];
3944         v4   = a->a + ii[row+3];
3945         v5   = a->a + ii[row+4];
3946         sum1 = t[row];
3947         sum2 = t[row+1];
3948         sum3 = t[row+2];
3949         sum4 = t[row+3];
3950         sum5 = t[row+4];
3951         for (n = 0; n<sz-1; n+=2) {
3952           i1    = idx[0];
3953           i2    = idx[1];
3954           idx  += 2;
3955           tmp0  = t[i1];
3956           tmp1  = t[i2];
3957           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3958           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3959           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3960           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3961           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3962         }
3963 
3964         if (n == sz-1) {
3965           tmp0  = t[*idx];
3966           sum1 -= v1[0] * tmp0;
3967           sum2 -= v2[0] * tmp0;
3968           sum3 -= v3[0] * tmp0;
3969           sum4 -= v4[0] * tmp0;
3970           sum5 -= v5[0] * tmp0;
3971         }
3972         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3973         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3974         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3975         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3976         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3977         ibdiag   += 25; row += 5;
3978         break;
3979       default:
3980         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3981       }
3982     }
3983     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3984   }
3985   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3986   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3987   PetscFunctionReturn(0);
3988 }
3989 
3990 #undef __FUNCT__
3991 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode"
3992 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3993 {
3994   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3995   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3996   const MatScalar   *bdiag = a->inode.bdiag;
3997   const PetscScalar *b;
3998   PetscErrorCode    ierr;
3999   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
4000   const PetscInt    *sizes = a->inode.size;
4001 
4002   PetscFunctionBegin;
4003   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4004   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4005   cnt  = 0;
4006   for (i=0, row=0; i<m; i++) {
4007     switch (sizes[i]) {
4008     case 1:
4009       x[row] = b[row]*bdiag[cnt++];row++;
4010       break;
4011     case 2:
4012       x1       = b[row]; x2 = b[row+1];
4013       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
4014       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
4015       x[row++] = tmp1;
4016       x[row++] = tmp2;
4017       cnt     += 4;
4018       break;
4019     case 3:
4020       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
4021       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4022       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4023       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4024       x[row++] = tmp1;
4025       x[row++] = tmp2;
4026       x[row++] = tmp3;
4027       cnt     += 9;
4028       break;
4029     case 4:
4030       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4031       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4032       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4033       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4034       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4035       x[row++] = tmp1;
4036       x[row++] = tmp2;
4037       x[row++] = tmp3;
4038       x[row++] = tmp4;
4039       cnt     += 16;
4040       break;
4041     case 5:
4042       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4043       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4044       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4045       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4046       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4047       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4048       x[row++] = tmp1;
4049       x[row++] = tmp2;
4050       x[row++] = tmp3;
4051       x[row++] = tmp4;
4052       x[row++] = tmp5;
4053       cnt     += 25;
4054       break;
4055     default:
4056       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4057     }
4058   }
4059   ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr);
4060   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4061   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4062   PetscFunctionReturn(0);
4063 }
4064 
4065 /*
4066     samestructure indicates that the matrix has not changed its nonzero structure so we
4067     do not need to recompute the inodes
4068 */
4069 #undef __FUNCT__
4070 #define __FUNCT__ "MatSeqAIJCheckInode"
4071 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4072 {
4073   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4074   PetscErrorCode ierr;
4075   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4076   PetscBool      flag;
4077   const PetscInt *idx,*idy,*ii;
4078 
4079   PetscFunctionBegin;
4080   if (!a->inode.use) PetscFunctionReturn(0);
4081   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4082 
4083   m = A->rmap->n;
4084   if (a->inode.size) ns = a->inode.size;
4085   else {
4086     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4087   }
4088 
4089   i          = 0;
4090   node_count = 0;
4091   idx        = a->j;
4092   ii         = a->i;
4093   while (i < m) {                /* For each row */
4094     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4095     /* Limits the number of elements in a node to 'a->inode.limit' */
4096     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4097       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4098       if (nzy != nzx) break;
4099       idy += nzx;              /* Same nonzero pattern */
4100       ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4101       if (!flag) break;
4102     }
4103     ns[node_count++] = blk_size;
4104     idx             += blk_size*nzx;
4105     i                = j;
4106   }
4107   /* If not enough inodes found,, do not use inode version of the routines */
4108   if (!m || node_count > .8*m) {
4109     ierr = PetscFree(ns);CHKERRQ(ierr);
4110 
4111     a->inode.node_count       = 0;
4112     a->inode.size             = NULL;
4113     a->inode.use              = PETSC_FALSE;
4114     A->ops->mult              = MatMult_SeqAIJ;
4115     A->ops->sor               = MatSOR_SeqAIJ;
4116     A->ops->multadd           = MatMultAdd_SeqAIJ;
4117     A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4118     A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4119     A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4120     A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4121     A->ops->coloringpatch     = 0;
4122     A->ops->multdiagonalblock = 0;
4123 
4124     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4125   } else {
4126     if (!A->factortype) {
4127       A->ops->mult              = MatMult_SeqAIJ_Inode;
4128       A->ops->sor               = MatSOR_SeqAIJ_Inode;
4129       A->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4130       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4131       if (A->rmap->n == A->cmap->n) {
4132         A->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4133         A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4134         A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4135         A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4136         A->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4137       }
4138     } else {
4139       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4140     }
4141     a->inode.node_count = node_count;
4142     a->inode.size       = ns;
4143     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4144   }
4145   a->inode.checked          = PETSC_TRUE;
4146   a->inode.mat_nonzerostate = A->nonzerostate;
4147   PetscFunctionReturn(0);
4148 }
4149 
4150 #undef __FUNCT__
4151 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode"
4152 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4153 {
4154   Mat            B =*C;
4155   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4156   PetscErrorCode ierr;
4157   PetscInt       m=A->rmap->n;
4158 
4159   PetscFunctionBegin;
4160   c->inode.use       = a->inode.use;
4161   c->inode.limit     = a->inode.limit;
4162   c->inode.max_limit = a->inode.max_limit;
4163   if (a->inode.size) {
4164     ierr                = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4165     c->inode.node_count = a->inode.node_count;
4166     ierr                = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
4167     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4168     if (!B->factortype) {
4169       B->ops->mult              = MatMult_SeqAIJ_Inode;
4170       B->ops->sor               = MatSOR_SeqAIJ_Inode;
4171       B->ops->multadd           = MatMultAdd_SeqAIJ_Inode;
4172       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4173       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4174       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4175       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4176       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4177       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4178     } else {
4179       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4180     }
4181   } else {
4182     c->inode.size       = 0;
4183     c->inode.node_count = 0;
4184   }
4185   c->inode.ibdiagvalid = PETSC_FALSE;
4186   c->inode.ibdiag      = 0;
4187   c->inode.bdiag       = 0;
4188   PetscFunctionReturn(0);
4189 }
4190 
4191 #undef __FUNCT__
4192 #define __FUNCT__ "MatGetRow_FactoredLU"
4193 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4194 {
4195   PetscInt       k;
4196   const PetscInt *vi;
4197 
4198   PetscFunctionBegin;
4199   vi = aj + ai[row];
4200   for (k=0; k<nzl; k++) cols[k] = vi[k];
4201   vi        = aj + adiag[row];
4202   cols[nzl] = vi[0];
4203   vi        = aj + adiag[row+1]+1;
4204   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4205   PetscFunctionReturn(0);
4206 }
4207 /*
4208    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4209    Modified from MatSeqAIJCheckInode().
4210 
4211    Input Parameters:
4212 .  Mat A - ILU or LU matrix factor
4213 
4214 */
4215 #undef __FUNCT__
4216 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU"
4217 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4218 {
4219   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4220   PetscErrorCode ierr;
4221   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4222   PetscInt       *cols1,*cols2,*ns;
4223   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4224   PetscBool      flag;
4225 
4226   PetscFunctionBegin;
4227   if (!a->inode.use)    PetscFunctionReturn(0);
4228   if (a->inode.checked) PetscFunctionReturn(0);
4229 
4230   m = A->rmap->n;
4231   if (a->inode.size) ns = a->inode.size;
4232   else {
4233     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4234   }
4235 
4236   i          = 0;
4237   node_count = 0;
4238   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4239   while (i < m) {                /* For each row */
4240     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4241     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4242     nzx  = nzl1 + nzu1 + 1;
4243     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4244 
4245     /* Limits the number of elements in a node to 'a->inode.limit' */
4246     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4247       nzl2 = ai[j+1] - ai[j];
4248       nzu2 = adiag[j] - adiag[j+1] - 1;
4249       nzy  = nzl2 + nzu2 + 1;
4250       if (nzy != nzx) break;
4251       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4252       ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr);
4253       if (!flag) break;
4254     }
4255     ns[node_count++] = blk_size;
4256     i                = j;
4257   }
4258   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4259   /* If not enough inodes found,, do not use inode version of the routines */
4260   if (!m || node_count > .8*m) {
4261     ierr = PetscFree(ns);CHKERRQ(ierr);
4262 
4263     a->inode.node_count = 0;
4264     a->inode.size       = NULL;
4265     a->inode.use        = PETSC_FALSE;
4266 
4267     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4268   } else {
4269     A->ops->mult              = 0;
4270     A->ops->sor               = 0;
4271     A->ops->multadd           = 0;
4272     A->ops->getrowij          = 0;
4273     A->ops->restorerowij      = 0;
4274     A->ops->getcolumnij       = 0;
4275     A->ops->restorecolumnij   = 0;
4276     A->ops->coloringpatch     = 0;
4277     A->ops->multdiagonalblock = 0;
4278     a->inode.node_count       = node_count;
4279     a->inode.size             = ns;
4280 
4281     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4282   }
4283   a->inode.checked = PETSC_TRUE;
4284   PetscFunctionReturn(0);
4285 }
4286 
4287 #undef __FUNCT__
4288 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode"
4289 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4290 {
4291   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4292 
4293   PetscFunctionBegin;
4294   a->inode.ibdiagvalid = PETSC_FALSE;
4295   PetscFunctionReturn(0);
4296 }
4297 
4298 /*
4299      This is really ugly. if inodes are used this replaces the
4300   permutations with ones that correspond to rows/cols of the matrix
4301   rather then inode blocks
4302 */
4303 #undef __FUNCT__
4304 #define __FUNCT__ "MatInodeAdjustForInodes"
4305 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4306 {
4307   PetscErrorCode ierr;
4308 
4309   PetscFunctionBegin;
4310   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4311   PetscFunctionReturn(0);
4312 }
4313 
4314 #undef __FUNCT__
4315 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode"
4316 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4317 {
4318   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4319   PetscErrorCode ierr;
4320   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4321   const PetscInt *ridx,*cidx;
4322   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4323   PetscInt       nslim_col,*ns_col;
4324   IS             ris = *rperm,cis = *cperm;
4325 
4326   PetscFunctionBegin;
4327   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4328   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4329 
4330   ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4331   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4332   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4333 
4334   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4335   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4336 
4337   /* Form the inode structure for the rows of permuted matric using inv perm*/
4338   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4339 
4340   /* Construct the permutations for rows*/
4341   for (i=0,row = 0; i<nslim_row; ++i) {
4342     indx      = ridx[i];
4343     start_val = tns[indx];
4344     end_val   = tns[indx + 1];
4345     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4346   }
4347 
4348   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4349   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4350 
4351   /* Construct permutations for columns */
4352   for (i=0,col=0; i<nslim_col; ++i) {
4353     indx      = cidx[i];
4354     start_val = tns[indx];
4355     end_val   = tns[indx + 1];
4356     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4357   }
4358 
4359   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4360   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4361   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4362   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4363 
4364   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4365   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4366 
4367   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4368   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4369   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4370   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4371   ierr = PetscFree(tns);CHKERRQ(ierr);
4372   PetscFunctionReturn(0);
4373 }
4374 
4375 #undef __FUNCT__
4376 #define __FUNCT__ "MatInodeGetInodeSizes"
4377 /*@C
4378    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4379 
4380    Not Collective
4381 
4382    Input Parameter:
4383 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4384 
4385    Output Parameter:
4386 +  node_count - no of inodes present in the matrix.
4387 .  sizes      - an array of size node_count,with sizes of each inode.
4388 -  limit      - the max size used to generate the inodes.
4389 
4390    Level: advanced
4391 
4392    Notes: This routine returns some internal storage information
4393    of the matrix, it is intended to be used by advanced users.
4394    It should be called after the matrix is assembled.
4395    The contents of the sizes[] array should not be changed.
4396    NULL may be passed for information not requested.
4397 
4398 .keywords: matrix, seqaij, get, inode
4399 
4400 .seealso: MatGetInfo()
4401 @*/
4402 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4403 {
4404   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4405 
4406   PetscFunctionBegin;
4407   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4408   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4409   if (f) {
4410     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4411   }
4412   PetscFunctionReturn(0);
4413 }
4414 
4415 #undef __FUNCT__
4416 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode"
4417 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4418 {
4419   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4420 
4421   PetscFunctionBegin;
4422   if (node_count) *node_count = a->inode.node_count;
4423   if (sizes)      *sizes      = a->inode.size;
4424   if (limit)      *limit      = a->inode.limit;
4425   PetscFunctionReturn(0);
4426 }
4427